Data Diff Simplified
Publisher
: Lingk
Description
A data diff on JSON structures using the JSON connector. It will also work with ANY other connector, because all connector outputs look the same to Lingk!
#  _____           _                 _____        __
# |  __ \         (_)               |_   _|      / _|
# | |__) |___  ___ _ _ __   ___       | |  _ __ | |_ ___
# |  _  // _ \/ __| | '_ \ / _ \      | |  | '_ \|  _/ _ \
# | | \ \  __/ (__| | |_) |  __/     _| |_ | | | | || (_) |
# |_|  \_\___|\___|_| .__/ \___|    |_____||_| |_|_| \___/
#                   | |
#                   |_|
#
# Project Name - DATA DIFF SIMPLIFIED
# Recipe URL - https://app.lingk.io/a/10932/tf/17854
# Description -
# This recipe does a data diff on JSON structures in the JSON connector
# (though it will work with ANY connector because all outputs are the same for Lingk).
# To use this recipe, just click Run! To see changes, delete, change, and add records and run again to see the output.
# Lingk stores a hashed version of the records and their keys in a storage bucket.
# Industry - Higher Ed
# Business Process - Graduate Reporting
# Systems - Amazon S3 Bucket
# Connectors - S3Bucket, JSON
# Data Flows - Single Direction
# Connection Type - JSON

# Add Recipe notes / Change log information here!

#   _____                            _
#  / ____|                          | |
# | |     ___  _ __  _ __   ___  ___| |_ ___  _ __ ___
# | |    / _ \| '_ \| '_ \ / _ \/ __| __/ _ \| '__/ __|
# | |___| (_) | | | | | | |  __/ (__| || (_) | |  \__ \
#  \_____\___/|_| |_|_| |_|\___|\___|\__\___/|_|  |___/
#
# CONNECTORS specify what data will be pulled into the in-memory database during processing
connectors:
  # Configure S3Bucket credentials in your Environment before running this recipe
  # S3Bucket Setup - https://help.lingk.io/en/articles/298-amazon-s3-connector-setup-guide
  # JSON Setup - https://help.lingk.io/en/articles/74-json-connector-reference

  ###### Start: JSON Connectors ######
  - name: newCourse10
    type: json
    postprocessor: courseDataDiff
    properties:
      jsonObject: >
        [
          { "courseId": "phys15a", "creditHours": 52,  "newColumn": "newValue",  "description": "daf" },
          { "courseId": "phys25r", "creditHours": 22,  "newColumn": "newValue1", "description": "ADDED" },
          { "courseId": "phys20r", "creditHours": 22,  "newColumn": "newValue1", "description": "dfadf" },
          { "courseId": "phys21r", "creditHours": 800, "newColumn": "newValuex", "description": "dd" },
          { "courseId": "phys24r", "creditHours": 224, "newColumn": "newValuey", "description": "This is df" },
          { "courseId": "phys213", "creditHours": 224, "newColumn": "newValuey", "description": "This is added" }
        ]
    schema:
      fields:
        - name: courseId
          type: String
        - name: creditHours
          type: Long
        - name: description
          type: String
        - name: accountId
          type: String
        - name: notInData
          type: String
  ###### End: JSON Connectors ######

###### Start: S3Bucket Connectors ######
# Storage buckets can be either Lingk-hosted or AWS S3
# For on-premise/private instance SIRE, these can be configured per installation
storageBuckets:
  - name: myProviderS3Diff
    type: s3Bucket
    properties:
      bucketName: datadiffv2
###### End: S3Bucket Connectors ######

# Postprocessors handle the diffing.
# Specify the primary keys so records are diffed properly.
# As a result of the post process, an additional field, "__change", is added to the data;
# only changed data is returned.
postprocessors:
  - name: courseDataDiff
    type: diffprocessor
    properties:
      primaryKeys: [ "courseId" ]
      storageBucketRef: myProviderS3Diff

#   _____ _        _                            _
#  / ____| |      | |                          | |
# | (___ | |_ __ _| |_ ___ _ __ ___   ___ _ __ | |_ ___
#  \___ \| __/ _` | __/ _ \ '_ ` _ \ / _ \ '_ \| __/ __|
#  ____) | || (_| | || __/ | | | | | |  __/ | | | |_\__ \
# |_____/ \__\__,_|\__\___|_| |_| |_|\___|_| |_|\__|___/
#
# STATEMENTS specify how the data should be processed while in memory
statements:
  #*************************************** D I S C L A I M E R ****************************************
  # Note that in an effort to keep recipes optimized for DPH (Data Processing Hours), print
  # statements should be commented out after development has concluded for a recipe.
  # For more information on DPH optimization, please visit the following help article -
  # https://help.lingk.io/en/articles/212-minimizing-data-processing-hours-on-the-lingk-platform
  #****************************************************************************************************
  - statement: (changeDataset) => select * from newCourse10
  #- statement: print changeDataset
  - statement: (deleted) => select * from changeDataset where __change = 'd' # deleted
  #- statement: print deleted
  - statement: (created) => select * from changeDataset where __change = 'a' # added
  #- statement: print created
  - statement: (changed) => select * from changeDataset where __change = 'u' # updated
  #- statement: print changed
  - statement: (ch) => select creditHours from changed
  #- statement: print ch
  # Add more statements to convert, join, aggregate, transform, and integrate your data
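To make the mechanics concrete, here is a minimal Python sketch of what a hash-based diff processor like the one above conceptually does: hash each record keyed by its primary keys, compare against the previously stored snapshot, and tag each changed record with a "__change" flag of 'a' (added), 'u' (updated), or 'd' (deleted). All function and variable names here are illustrative assumptions, not Lingk's actual internals.

```python
import hashlib
import json

def diff(previous, current, primary_keys):
    """Return only changed records, each annotated with a "__change" flag."""
    def key_of(rec):
        # Identity of a record is the tuple of its primary key values
        return tuple(rec[k] for k in primary_keys)

    def hash_of(rec):
        # Stable hash of the whole record, independent of field order
        return hashlib.sha256(
            json.dumps(rec, sort_keys=True).encode()
        ).hexdigest()

    prev = {key_of(r): hash_of(r) for r in previous}
    curr = {key_of(r): r for r in current}

    changes = []
    for key, rec in curr.items():
        if key not in prev:
            changes.append({**rec, "__change": "a"})   # added
        elif hash_of(rec) != prev[key]:
            changes.append({**rec, "__change": "u"})   # updated
    for rec in previous:
        if key_of(rec) not in curr:
            changes.append({**rec, "__change": "d"})   # deleted
    return changes

old = [{"courseId": "phys15a", "creditHours": 52},
       {"courseId": "phys20r", "creditHours": 22}]
new = [{"courseId": "phys15a", "creditHours": 60},   # updated
       {"courseId": "phys25r", "creditHours": 22}]   # added; phys20r deleted

for rec in diff(old, new, ["courseId"]):
    print(rec["courseId"], rec["__change"])
```

Unchanged records produce no output at all, which is why the recipe's statements can filter on `__change = 'a'`, `'u'`, and `'d'` to get only the deltas. In the real recipe the previous snapshot's hashes live in the S3 storage bucket rather than in memory.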