# Fuzzy Matching with Soundex and Levenshtein

#   _____           _              _____        __      
#  |  __ \         (_)            |_   _|      / _|     
#  | |__) |___  ___ _ _ __   ___    | |  _ __ | |_ ___  
#  |  _  // _ \/ __| | '_ \ / _ \   | | | '_ \|  _/ _ \ 
#  | | \ \  __/ (__| | |_) |  __/  _| |_| | | | || (_) |
#  |_|  \_\___|\___|_| .__/ \___| |_____|_| |_|_| \___/ 
#                    | |                                
#                    |_|

# Project Name - FUZZY MATCHING WITH SOUNDEX AND LEVENSHTEIN

# Recipe URL - https://app.lingk.io/a/10932/tf/17861

# Description - This recipe demonstrates fuzzy matching in Spark with Soundex and Levenshtein distance.
#               The Soundex algorithm is often used to compare first names that are spelled differently.
#               Use the Levenshtein distance when joining two DataFrames if you don't want to require exact string matches.
#               Minimizing false positives is always a struggle with fuzzy joins, so run multiple tests and join on multiple columns to improve results.
#               To run this recipe, choose your environment and click Run!

# Industry - Higher Ed
# Business Process - Graduate Reporting

# Systems - 
# Connectors - JSON
# Data Flows - Single Direction
# Connection Type - JSON

# Add Recipe notes / Change log information here!

#    _____                            _                 
#   / ____|                          | |                
#  | |     ___  _ __  _ __   ___  ___| |_ ___  _ __ ___ 
#  | |    / _ \| '_ \| '_ \ / _ \/ __| __/ _ \| '__/ __|
#  | |___| (_) | | | | | | |  __/ (__| || (_) | |  \__ \
#   \_____\___/|_| |_|_| |_|\___|\___|\__\___/|_|  |___/
#

# CONNECTORS specify what data will be pulled into the in-memory database during processing

connectors:

# JSON Setup - https://help.lingk.io/en/articles/74-json-connector-reference

###### Start: JSON Connectors #######

  # Reference list: five inline records, joined to customerAddressList2 on "id".
  # Every record uses the address "Main Street", so it acts as the baseline
  # the second list's address variants are compared against.
  - name: customerAddressList1
    type: json
    properties:
      # Literal block scalar (|): the JSON is passed through exactly as written.
      # Keep the lines free of trailing whitespace, which would otherwise become
      # part of the string value.
      jsonObject: |
        [
          { "id":1, "Address1":"Main Street", "firstName":"Will" },
          { "id":2, "Address1":"Main Street", "firstName":"Crystal" },
          { "id":3, "Address1":"Main Street", "firstName":"Will" },
          { "id":4, "Address1":"Main Street", "firstName":"Will" },
          { "id":5, "Address1":"Main Street", "firstName":"Will" }
        ]

  # Comparison list: same ids, with address and first-name variations ranging
  # from identical (id 1) through abbreviated/misspelled (id 2) to unrelated
  # (id 5).
  - name: customerAddressList2
    type: json
    properties:
      jsonObject: |
        [
          { "id":1, "Address1":"Main Street", "firstName":"Will" },
          { "id":2, "Address1":"Main Str", "firstName":"Cristall" },
          { "id":3, "Address1":"Front Street", "firstName":"Billy" },
          { "id":4, "Address1":"West Main Street", "firstName":"Thomas" },
          { "id":5, "Address1":"Circle Av", "firstName":"Sandra" }
        ]

###### End: JSON Connectors #######

#   ______                         _       
#  |  ____|                       | |      
#  | |__ ___  _ __ _ __ ___   __ _| |_ ___ 
#  |  __/ _ \| '__| '_ ` _ \ / _` | __/ __|
#  | | | (_) | |  | | | | | | (_| | |_\__ \
#  |_|  \___/|_|  |_| |_| |_|\__,_|\__|___/

readFormats:
  # Comma-delimited text format with a header row; fields are not force-quoted.
  # NOTE(review): none of the JSON connectors visible in this recipe reference
  # this format - presumably template boilerplate; confirm before removing.
  - name: csv
    type: delimited
    properties:
      header: true
      delimiter: ","
      quoteAllFields: false

#    _____ _        _                            _       
#   / ____| |      | |                          | |      
#  | (___ | |_ __ _| |_ ___ _ __ ___   ___ _ __ | |_ ___ 
#   \___ \| __/ _` | __/ _ \ '_ ` _ \ / _ \ '_ \| __/ __|
#   ____) | || (_| | ||  __/ | | | | |  __/ | | | |_\__ \
#  |_____/ \__\__,_|\__\___|_| |_| |_|\___|_| |_|\__|___/

# STATEMENTS specify how the data should be processed while in memory

statements:

# ********************************** D I S C L A I M E R **********************************
#
# Note that in an effort to keep recipes optimized for DPH (Data Processing Hours), print
# statements should be commented out after development has concluded for a recipe.
# For more information on DPH optimization, please visit the following help article -
# https://help.lingk.io/en/articles/212-minimizing-data-processing-hours-on-the-lingk-platform
#
# ********************************** D I S C L A I M E R **********************************

# Baseline Levenshtein test: pair the two address lists by id and report, for every pair,
# the edit distance between the two Address1 values plus a 0-100 similarity score.
# No rows are filtered out here.
#
# The "levenshtein" distance is a string metric: the number of single-character edits
# needed to turn one string into the other.
#
# Confidence is normalized by the LONGER of the two strings. The distance can never exceed
# that length, so the score stays within 0-100 and does not depend on which list is "a".
# Dividing by length(a.Address1) alone goes negative whenever the distance exceeds it.
# NOTE(review): if both addresses are empty the divisor is 0 - presumably yields NULL in
# Spark's default (non-ANSI) mode; confirm if empty addresses are possible.
  - statement: |
      (customerquery) =>
        SELECT
          a.ID,
          levenshtein(a.Address1, b.Address1) distance,
          a.Address1 Address1a,
          b.Address1 Address1b,
          ((greatest(length(a.Address1), length(b.Address1)) - levenshtein(a.Address1, b.Address1))
            / greatest(length(a.Address1), length(b.Address1))) * 100 Confidence
        FROM customerAddressList1 a
          INNER JOIN customerAddressList2 b ON a.id = b.id
        ORDER BY a.ID
  #- statement: print customerquery

# Thresholded Levenshtein test (distance < 5): pair the two address lists by id and keep
# only the pairs whose Address1 values are fewer than 5 single-character edits apart.
# With the sample data this keeps ids 1-3 (distances 0, 3, 4) and drops
# "West Main Street" (distance 5) and "Circle Av".
#
# Confidence is normalized by the LONGER of the two strings. The distance can never exceed
# that length, so the score stays within 0-100 and does not depend on which list is "a".
# Dividing by length(a.Address1) alone goes negative whenever the distance exceeds it.
  - statement: |
      (customerquery) =>
        SELECT
          a.ID,
          a.Address1 Address1a,
          b.Address1 Address1b,
          levenshtein(a.Address1, b.Address1) distance,
          ((greatest(length(a.Address1), length(b.Address1)) - levenshtein(a.Address1, b.Address1))
            / greatest(length(a.Address1), length(b.Address1))) * 100 Confidence
        FROM customerAddressList1 a
          INNER JOIN customerAddressList2 b
            ON a.id = b.id
           AND levenshtein(a.Address1, b.Address1) < 5
        ORDER BY a.ID
  #- statement: print customerquery

# Stricter thresholded Levenshtein test (distance < 4): pair the two address lists by id
# and keep only the pairs whose Address1 values are fewer than 4 single-character edits
# apart. With the sample data this keeps ids 1 and 2 (distances 0 and 3) and additionally
# drops "Front Street" (distance 4).
#
# Confidence is normalized by the LONGER of the two strings. The distance can never exceed
# that length, so the score stays within 0-100 and does not depend on which list is "a".
# Dividing by length(a.Address1) alone goes negative whenever the distance exceeds it.
  - statement: |
      (customerquery) =>
        SELECT
          a.ID,
          a.Address1 Address1a,
          b.Address1 Address1b,
          levenshtein(a.Address1, b.Address1) distance,
          ((greatest(length(a.Address1), length(b.Address1)) - levenshtein(a.Address1, b.Address1))
            / greatest(length(a.Address1), length(b.Address1))) * 100 Confidence
        FROM customerAddressList1 a
          INNER JOIN customerAddressList2 b
            ON a.id = b.id
           AND levenshtein(a.Address1, b.Address1) < 4
        ORDER BY a.ID
  #- statement: print customerquery

# Soundex test to compare first names. Soundex is a phonetic algorithm that indexes names
# by their sound when pronounced in English, so differently spelled names that sound
# alike get the same code.
#
# Output columns: the raw names (FirstNameA / FirstNameB), their Soundex codes
# (SoundexA / SoundexB) and Match, which is true when the two codes are equal.
# The Soundex codes get their own aliases: reusing FirstNameA / FirstNameB for them
# produces duplicate column names, which cannot be referenced unambiguously afterwards.
  - statement: |
      (customerquery) =>
        SELECT
          a.ID,
          a.FirstName FirstNameA,
          b.FirstName FirstNameB,
          soundex(a.FirstName) SoundexA,
          soundex(b.FirstName) SoundexB,
          (soundex(a.FirstName) = soundex(b.FirstName)) Match
        FROM customerAddressList1 a
          INNER JOIN customerAddressList2 b ON a.id = b.id
        ORDER BY a.ID
  #- statement: print customerquery

# Add more statements to convert, join, aggregate, transform, and integrate your data