Transformation Examples

#   _____           _              _____        __      
#  |  __ \         (_)            |_   _|      / _|     
#  | |__) |___  ___ _ _ __   ___    | |  _ __ | |_ ___  
#  |  _  // _ \/ __| | '_ \ / _ \   | | | '_ \|  _/ _ \ 
#  | | \ \  __/ (__| | |_) |  __/  _| |_| | | | || (_) |
#  |_|  \_\___|\___|_| .__/ \___| |_____|_| |_|_| \___/ 
#                    | |                                
#                    |_|

# Project Name - TRANSFORMATION EXAMPLES

# Recipe URL - https://app.lingk.io/a/10932/tf/17909

# Description - # This recipe demonstrates how to transform data from static JSONs using different spark functions to accomplish it

# Industry - Higher Ed
# Business Process - Graduate Reporting

# Systems - 
# Connectors - JSON
# Data Flows - Single Direction
# Connection Type - JSON

# Add Recipe notes / Change log information here!

#    _____                            _                 
#   / ____|                          | |                
#  | |     ___  _ __  _ __   ___  ___| |_ ___  _ __ ___ 
#  | |    / _ \| '_ \| '_ \ / _ \/ __| __/ _ \| '__/ __|
#  | |___| (_) | | | | | | |  __/ (__| || (_) | |  \__ \
#   \_____\___/|_| |_|_| |_|\___|\___|\__\___/|_|  |___/
#

# CONNECTORS specify what data will be pulled into the in-memory database during processing

connectors:

# JSON Setup - https://help.lingk.io/en/articles/74-json-connector-reference

###### Start: JSON Connectors #######

-
# JSON data representing a subset of courses
    name: courses
    type: json
    properties:
      jsonObject: >
       [
         { "courseId":"CIT163","creditHours":3,"title":"Intro to Programming w/C++"},
         { "courseId":"CIT313","creditHours":3,"title":"Web Programming II"},
         { "courseId":"CIT416","creditHours":3,"title":"Advanced Web Programming"},
         { "courseId":"CHM218","creditHours":2.67,"title":"Chem Lab II"},
         { "courseId":"CIT410","creditHours":3,"title":"E-Commerce"}
       ]

-
# JSON data representing a subset of students
    name: students
    type: json
    properties:
      jsonObject: >
       [
         {"username":"riley","lastname":"Testman","firstname":"Myke","email":"RILEY@someschool.edu","idnumber":"2710418"},
         {"username":"caldwelld","lastname":"Caldwell","firstname":"Donna","email":"caldwelld@someschool.edu","idnumber":"2565460"},
         {"username":"rahall","lastname":"Rahall","firstname":"Eileen","email":"rahall@someschool.edu","idnumber":"1156610"},
         {"username":"mccormin","lastname":"McCormick","firstname":"Nancy","email":"mccormin@someschool.edu","idnumber":"6277323"},
         {"username":"smith1878","lastname":"Smith","firstname":"Roderick","email":"smith1878@someschool.edu","idnumber":"34496"},
         {"username":"martin257","lastname":"Martin","firstname":"Wayne","email":"MARTIN257@someschool.edu","idnumber":"3896281"}
       ]

-
# JSON data representing a subset of registrations involving the students and courses above
    name: registrations
    type: json
    properties:
      jsonObject: >
       [
         {"courseId":"CIT163","idnumber":"2710418","regDate":"2018-03-13","grade":"A"},
         {"courseId":"CIT163","idnumber":"2565460","regDate":"2018-04-01","grade":"B"},
         {"courseId":"CIT163","idnumber":"6277323","regDate":"2018-03-09","grade":"A"},
         {"courseId":"CIT163","idnumber":"3896281","regDate":"2018-03-12","grade":"C"},
         {"courseId":"CIT410","idnumber":"2710418","regDate":"2018-03-13","grade":"B"},
         {"courseId":"CIT313","idnumber":"2710418","regDate":"2018-04-03","grade":"A"}
       ]

###### End: JSON Connectors #######

#    _____ _        _                            _       
#   / ____| |      | |                          | |      
#  | (___ | |_ __ _| |_ ___ _ __ ___   ___ _ __ | |_ ___ 
#   \___ \| __/ _` | __/ _ \ '_ ` _ \ / _ \ '_ \| __/ __|
#   ____) | || (_| | ||  __/ | | | | |  __/ | | | |_\__ \
#  |_____/ \__\__,_|\__\___|_| |_| |_|\___|_| |_|\__|___/

# STATEMENTS specify how the data should be processed while in memory

statements:

#******************************************************************** D I S C L A I M E R ***********************************************************************************************
#                                                                                                                                                                                       * 
# Note that in an effort to keep recipes optimized for DPH (Data Processing Hours), print statements should be commented out after development has concluded for a recipe.              *
# For more information on DPH optimization, please visit the following help article - https://help.lingk.io/en/articles/212-minimizing-data-processing-hours-on-the-lingk-platform      *
#                                                                                                                                                                                       * 
#******************************************************************** D I S C L A I M E R ***********************************************************************************************

# Example demonstrating various string functions available in Spark
- statement: |
    (strings) => Select 
                    upper(courseId)                               upperCase,
                    lower(courseId)                               lowerCase,
                    substring(courseId, 1, 2)                     substring,
                    format_string ("Student: %s earned the grade of %s in %s", 
                        idnumber, grade, courseId)                C_Style_Format_String,
                    split(regDate, "-")                           split_String,
                    regexp_replace(regDate, "%-04-%", " APRIL ")  regular_Expression_Replacement,
                    initcap(courseId)                             first_Letter_Capitalized,
                    lpad(grade, 10, ' ')                          left_Padding,
                    length(idnumber)                              length_of_String
                from 
                    registrations
#- statement: print strings

# Example demonstrating various date functions available in Spark
- statement: |
    (dates) => Select 
                  current_date                                  current_Date,
                  current_timestamp                             current_Timestamp,
                  dayofyear(regDate)                            day_of_the_Year,
                  dayofmonth(regDate)                           day_of_the_Month,
                  datediff(current_date, regDate)               days_Between_Dates,
                  month(regDate)                                month_from_Date,
                  to_utc_timestamp(regDate, 'EST')              convert_Date_to_Timestamp,
                  year(regDate)                                 year_of_Date,
                  date_format(regDate, "M d Y")                 format_Date
              from 
                  registrations
#- statement: print dates

# Example demonstrating various number functions available in Spark
- statement: |
    (numbers) => Select 
                    isnan (creditHours)                         not_A_Number_Check,
                    nanvl (creditHours, title)                  return_Col1_if_num_Col2_if_Col1_NAN,
                    length(title)                               number_Characters,
                    levenshtein(courseId, title)                levenshtein_distance_two_columns,
                    least(3, 4, 7, 1, 8)                        least_Value,
                    greatest(3, 4, 7, 1, 8)                     greatest_Value,
                    rand()                                      random_Number,
                    ceil(creditHours)                           ceiling_Value,
                    floor(creditHours)                          floor_Value,
                    cast("1.68E+7" as Decimal(10,1))             scientific_notation_Value,
                    int(1.68E+7)                                scientific_notation_int_Value
                from 
                    courses
#- statement: print numbers

# Example demonstrating various Boolean functions available in Spark
- statement: |
    (bools) => Select 
                    not (true)                                returns_False,
                    not (false)                               returns_True,
                    isnull (grade)                            checks_for_Null_Values,
                    case 
                        when courseId='CIT163' then 'C++' 
                        when courseId='CIT313' 
                              OR courseId='CIT410' then 'WEB' 
                        end                                   case_Demo 
                from 
                    registrations
#- statement: print bools

# The following examples demonstrate various spark functions that may be used to transform data into information.
# You can uncomment the various statement blocks (remove the #) to see any of these in action.  
# Each statement: | and statement: print... combination will generate and output data

# Example demonstrating the use of a general select * from a data provider above
#- statement: |
#     (course) => select 
#                   * 
#                 from 
#                   courses
#- statement: print course

# Example demonstrating the use of a general select * with a where clause using a LIKE from a data provider above
#- statement: |
#      (course) => select 
#                    * 
#                  from 
#                    courses 
#                  where 
#                     title LIKE '%Web%'
#- statement: print course

# Example demonstrating the use of a general select * with multiple criteria in a where clause with a substr function from a data provider above
#- statement: |
#       (UpperLevelCredit3HourCourses) => select 
#                                            * 
#                                         from 
#                                            courses 
#                                         where 
#                                            creditHours=3 
#                                            and substr(courseId, 4, 3) >= "300"
#- statement: print UpperLevelCredit3HourCourses

# Example demonstrating the aggregate sum function, totally the # of credit hours students are enrolled in
#- statement: |
#       (SCHs) => select 
#                     sum(creditHours) as Total_SCHs 
#                 from 
#                     registrations, courses 
#                 where 
#                     courses.courseId=registrations.courseId
# OR
#- statement: |
#        (SCHs) => select 
#                      sum(creditHours) as Total_SCHs 
#                  from 
#                      registrations r 
#                  join 
#                      courses c 
#                  on 
#                      r.courseId=c.courseId
#- statement: print SCHs

# Example demonstrating string concatenation and spark aliasing to output student names as a single field
#- statement: |
#        (students) => select 
#                          concat (lastname, ', ', firstname) as Student_Name 
#                      from 
#                          students
#- statement: print students

# Example demonstrating the use of sum and count aggregate functions as well as a case statement to translate letter grades to numbers while calculating each students' GPA
#- statement: |
#         (gpa) => select 
#                     (sum(case 
#                              when r.grade='A' then 4 
#                              when r.grade='B' then 3 
#                              when r.grade='C' then 2 
#                              when r.grade='D' then 1 
#                              when r.grade='F' then 0 
#                          end) 
#                          / count(*)) as GPA, 
#                      r.idnumber as ID 
#                   from 
#                      registrations r 
#                   group by 
#                      r.idnumber
#- statement: print gpa

# Example demonstrating the use of max, min and datediff to find the # of days between first and last registration
#- statement: |
#        (days) => select 
#                      datediff(max(regDate), 
#                      min(regDate)) as days 
#                  from 
#                      registrations 
#- statement: print days

# Example demonstrating the use of collect_list to return a list of course IDs 
#- statement: |
#        (multipleregs) => select 
#                              collect_list(distinct(courseId)) as courses_with_registrations 
#                          from 
#                              registrations 
#- statement: print multipleregs

# Example demonstrating aggregate count and group by listing # of registrations per course 
#- statement: |
#        (regnumbers) => select 
#                            c.courseId as Course, 
#                            title as Title, 
#                            count(*) as Registrants 
#                        from 
#                            courses c 
#                        join 
#                            registrations r 
#                        on 
#                            c.courseId=r.courseId 
#                        group by 
#                            c.courseId, title 
#- statement: print regnumbers

# Add more statements to convert, join, aggregrate, transform, and integrate your data