diff --git a/.github/workflows/cd-terraform-core.yml b/.github/workflows/cd-terraform-core.yml index aa6612325..4345d81f7 100644 --- a/.github/workflows/cd-terraform-core.yml +++ b/.github/workflows/cd-terraform-core.yml @@ -48,6 +48,7 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} pre-production: needs: ["test", "validate"] uses: ./.github/workflows/deploy_terraform.yml @@ -79,6 +80,7 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} production: needs: [ "pre-production" ] uses: ./.github/workflows/deploy_terraform.yml @@ -110,3 +112,5 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} + diff --git a/.github/workflows/cd-terraform-etl.yml b/.github/workflows/cd-terraform-etl.yml index 2e89e59d5..ff1816123 100644 --- a/.github/workflows/cd-terraform-etl.yml +++ b/.github/workflows/cd-terraform-etl.yml @@ -48,6 +48,7 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} 
PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} pre-production: needs: ["test", "validate"] uses: ./.github/workflows/deploy_terraform.yml @@ -79,6 +80,7 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} production: needs: [ "pre-production" ] uses: ./.github/workflows/deploy_terraform.yml @@ -110,3 +112,4 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} diff --git a/.github/workflows/cd-terraform-networking.yml b/.github/workflows/cd-terraform-networking.yml index b292e34dd..c7a46f373 100644 --- a/.github/workflows/cd-terraform-networking.yml +++ b/.github/workflows/cd-terraform-networking.yml @@ -52,6 +52,7 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} pre-production: needs: ["validate"] uses: ./.github/workflows/deploy_terraform_networking.yml @@ -80,6 +81,7 @@ jobs: AWS_MOSAIC_VPC_ID: ${{ secrets.AWS_MOSAIC_VPC_ID }} AWS_DP_VPC_ID: ${{ secrets.AWS_DP_DEV_VPC_ID }} GOOGLE_CREDENTIALS: ${{ 
secrets.GOOGLE_CREDENTIALS_STG }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} production: needs: [ "pre-production" ] uses: ./.github/workflows/deploy_terraform_networking.yml @@ -108,3 +110,4 @@ jobs: AWS_MOSAIC_VPC_ID: ${{ secrets.AWS_MOSAIC_VPC_ID }} AWS_DP_VPC_ID: ${{ secrets.AWS_DP_STG_VPC_ID }} GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_PROD }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} diff --git a/.github/workflows/ci-terraform-backend-setup.yml b/.github/workflows/ci-terraform-backend-setup.yml index 208f1845c..29e145b84 100644 --- a/.github/workflows/ci-terraform-backend-setup.yml +++ b/.github/workflows/ci-terraform-backend-setup.yml @@ -42,3 +42,4 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} diff --git a/.github/workflows/ci-terraform-core.yml b/.github/workflows/ci-terraform-core.yml index e74716951..50788a328 100644 --- a/.github/workflows/ci-terraform-core.yml +++ b/.github/workflows/ci-terraform-core.yml @@ -42,6 +42,7 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} CI-Production-Plan: name: "Production" @@ -71,6 +72,8 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ 
secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} + CI-Staging-lint: name: "Lint" uses: ./.github/workflows/lint-terraform.yml @@ -99,3 +102,4 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} diff --git a/.github/workflows/ci-terraform-etl.yml b/.github/workflows/ci-terraform-etl.yml index a8fb4e70a..d66e998a4 100644 --- a/.github/workflows/ci-terraform-etl.yml +++ b/.github/workflows/ci-terraform-etl.yml @@ -42,6 +42,8 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} + ETL-Plan-Production: name: "Production" uses: ./.github/workflows/plan-terraform.yml @@ -70,6 +72,8 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} + ETL-Lint: name: "Lint" uses: ./.github/workflows/lint-terraform.yml @@ -98,3 +102,4 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} 
+ TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} diff --git a/.github/workflows/ci-terraform-networking.yml b/.github/workflows/ci-terraform-networking.yml index 3642655e1..ce9f3659f 100644 --- a/.github/workflows/ci-terraform-networking.yml +++ b/.github/workflows/ci-terraform-networking.yml @@ -43,6 +43,8 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} + CI-Production-Plan: name: "Production" uses: ./.github/workflows/plan-terraform.yml @@ -71,6 +73,8 @@ jobs: COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} PRODUCTION_FIREWALL_IP: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} + CI-Staging-Lint: name: "Lint" uses: ./.github/workflows/lint-terraform.yml @@ -99,3 +103,4 @@ jobs: GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS_STG }} COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 407d91440..6f650e1c2 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -72,6 +72,8 @@ on: required: true PRODUCTION_FIREWALL_IP: required: true + TERRAFORM_SECRET_TOKEN: + required: true jobs: deploy: name: Terraform Apply @@ -81,6 +83,10 @@ 
jobs: - name: Checkout Source uses: actions/checkout@v3 + - name: Set Github Auth + run: git config --global url."https://oauth2:${{ secrets.TERRAFORM_SECRET_TOKEN}}@github.com".insteadOf https://github.com + shell: bash + - name: Install Terraform uses: hashicorp/setup-terraform@v2.0.3 with: diff --git a/.github/workflows/deploy_terraform_networking.yml b/.github/workflows/deploy_terraform_networking.yml index 9d3e4d308..dea4187ac 100644 --- a/.github/workflows/deploy_terraform_networking.yml +++ b/.github/workflows/deploy_terraform_networking.yml @@ -66,6 +66,8 @@ on: required: true GOOGLE_CREDENTIALS: required: true + TERRAFORM_SECRET_TOKEN: + required: true jobs: @@ -77,6 +79,10 @@ jobs: - name: Checkout Source uses: actions/checkout@v3 + - name: Set Github Auth + run: git config --global url."https://oauth2:${{ secrets.TERRAFORM_SECRET_TOKEN}}@github.com".insteadOf https://github.com + shell: bash + - name: Install Terraform uses: hashicorp/setup-terraform@v2.0.3 with: diff --git a/.github/workflows/lint-terraform.yml b/.github/workflows/lint-terraform.yml index 85c3c4b9f..cb1b7ad0f 100644 --- a/.github/workflows/lint-terraform.yml +++ b/.github/workflows/lint-terraform.yml @@ -60,6 +60,8 @@ on: required: true PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: required: true + TERRAFORM_SECRET_TOKEN: + required: true jobs: lint: @@ -69,6 +71,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 + - name: Set Github Auth + run: git config --global url."https://oauth2:${{ secrets.TERRAFORM_SECRET_TOKEN}}@github.com".insteadOf https://github.com + shell: bash + - name: Install Terraform uses: hashicorp/setup-terraform@v2.0.3 with: diff --git a/.github/workflows/plan-terraform.yml b/.github/workflows/plan-terraform.yml index 27db8e629..fdc7808a5 100644 --- a/.github/workflows/plan-terraform.yml +++ b/.github/workflows/plan-terraform.yml @@ -60,6 +60,8 @@ on: required: true PRODUCTION_FIREWALL_IP: required: true + 
TERRAFORM_SECRET_TOKEN: + required: true jobs: plan: @@ -70,7 +72,11 @@ jobs: - name: Checkout uses: actions/checkout@v3 - + + - name: Set Github Auth + run: git config --global url."https://oauth2:${{ secrets.TERRAFORM_SECRET_TOKEN}}@github.com".insteadOf https://github.com + shell: bash + - name: Install Terraform uses: hashicorp/setup-terraform@v2.0.3 with: @@ -123,6 +129,7 @@ jobs: copy_liberator_to_pre_prod_lambda_execution_role: ${{ secrets.COPY_LIBERATOR_TO_PRE_PROD_LAMBDA_EXECUTION_ROLE }} pre_production_liberator_data_storage_kms_key_arn: ${{ secrets.PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN }} production_firewall_ip: ${{ secrets.PRODUCTION_FIREWALL_IP }} + TERRAFORM_SECRET_TOKEN: ${{ secrets.TERRAFORM_SECRET_TOKEN }} - name: Terraform Compliance if: ${{ (success()) && (inputs.build_path != './terraform/etl') }} id: terraform-compliance diff --git a/.github/workflows/validate-and-lint-terraform.yml b/.github/workflows/validate-and-lint-terraform.yml index 94d7b925d..3e378d913 100644 --- a/.github/workflows/validate-and-lint-terraform.yml +++ b/.github/workflows/validate-and-lint-terraform.yml @@ -60,6 +60,8 @@ on: required: true PRE_PRODUCTION_LIBERATOR_DATA_STORAGE_KMS_KEY_ARN: required: true + TERRAFORM_SECRET_TOKEN: + required: true jobs: validate: @@ -69,6 +71,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 + - name: Set Github Auth + run: git config --global url."https://oauth2:${{ secrets.TERRAFORM_SECRET_TOKEN}}@github.com".insteadOf https://github.com + shell: bash + - name: Install Terraform uses: hashicorp/setup-terraform@v2.0.3 with: @@ -107,6 +113,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 + - name: Set Github Auth + run: git config --global url."https://oauth2:${{ secrets.TERRAFORM_SECRET_TOKEN}}@github.com".insteadOf https://github.com + shell: bash + - name: Install Terraform uses: hashicorp/setup-terraform@v2.0.3 with: diff --git 
a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 03cfa0921..ae352c3e1 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -807,11 +807,11 @@ def standardize_schools_admissions_data(schools_admissions_cleaned: DataFrame) - * address_line_4: Fourth line of the address. Should be of type string and can be blank. * full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. Should be of type string and can be blank. - * source_filter: Field to contain additional information on parking permits (only contains holding string for now). + * source_filter: Field to contain additional information on schools admissions (only contains holding string for now). Should be of type string and can be blank. Args: - schools_admissions_cleaned: parking permit DataFrame after preparing and cleaning it. + schools_admissions_cleaned: schools admissions DataFrame after preparing and cleaning it. Returns: A schools admissions DataFrame with all the standard column listed above. @@ -841,6 +841,146 @@ def standardize_schools_admissions_data(schools_admissions_cleaned: DataFrame) - return schools_admissions +def prepare_clean_freedom_pass_admissions_data(freedom_df: DataFrame) -> DataFrame: + """A function to prepare and clean freedom pass applications data. Splits out middle name from first name. Sorts address + columns so that they are consistent with other datasets. + + Args: + freedom_df (Dataframe): Dataframe containing freedom pass applications data. + + Returns: + freedom_cleaned (Dataframe): A DataFrame after preparing data from multiple sources and cleaning it. 
+ """ + + address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] + + freedom_cleaned = freedom_df \ + .withColumn("source", lit("freedom_passes")) \ + .withColumn("source_id", col("applicantid")) \ + .withColumn("first_name", col("forename")) \ + .withColumn("middle_name", lit("")) \ + .withColumn("last_name", col("surname")) \ + .withColumn("name", regexp_replace(concat_ws(" ", col("first_name"), col("last_name")), r"\s+", " ")) \ + .withColumnRenamed("house_name_number", "address_line_1") \ + .withColumnRenamed("building_name", "address_line_2") \ + .withColumnRenamed("street", "address_line_3") \ + .withColumnRenamed("district", "address_line_4") \ + .withColumnRenamed("postcode", "post_code") \ + .withColumnRenamed("email_address", "email") \ + .withColumn("date_of_birth", to_date(col("date_of_birth"), format="dd/MM/yyyy"))\ + .withColumn("uprn", lit("")) \ + .withColumn("source_filter", lit("freedom_passes_2024")) \ + .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), + col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter")) + + # create a zip of address line arrays, sorted in the order of not null (False), column order + freedom_cleaned = freedom_cleaned.select( + col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), + col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter"), + array_sort( + arrays_zip( + array([col(c).isNull() for c in address_cols]), + array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]) + ) + ).alias('address_sorted')) + + # disaggregate address_sorted arrays into columns + freedom_cleaned = 
freedom_cleaned.select( + col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), + col("source_filter"), + *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + + # rejig address lines + freedom_cleaned = freedom_cleaned \ + .withColumn("address_line_1", when(col("address_line_1").rlike(r"\d+[a-z]$") + & col("address_line_2").rlike(r"^[A-Za-z]"), + concat_ws(" ", col("address_line_1"), col("address_line_2"))) + .otherwise(col("address_line_1"))) \ + .withColumn("address_line_2", when(col("address_line_1").contains(col("address_line_2")), + col("address_line_3")) + .otherwise(concat_ws(" ", col("address_line_2"), col("address_line_3")))) \ + .withColumn("address_line_2", when(col("address_line_2").rlike(r"\d+$"), + concat_ws(" ", col("address_line_2"), col("address_line_4"))) + .otherwise(col("address_line_2"))) \ + .withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), lit("london"))) \ + .withColumn("address_line_2", when(col("address_line_2").isNull(), lit("hackney")) + .otherwise(col("address_line_2"))) \ + .withColumn("address_line_3", when(col("address_line_3").isNull(), lit("london")) + .otherwise(col("address_line_3"))) \ + .withColumn("address_line_4", lit("")) \ + .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), + col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter")) + + return freedom_cleaned + + +def standardize_freedom_pass_data(freedom_cleaned: DataFrame) -> DataFrame: + """Standardize freedom pass data. 
This function converts all the custom names (coming from their respective + sources) to standard names that will be used by various other functions like feature engineering etc. + The DataFrame returned will have the following columns: + + * source: Source of the data like parking, tax etc. Should be of type string and cannot be blank. + * source_id: Unique ID for each record. It's ok to have same person with different source_id. Should be of type + string and cannot be blank. + * uprn: UPRN of the address. Should be of type string and can be blank. + * title: Title of the person. Should be of type string and can be blank. + * first_name: First name of the person. Should be of type string and can be blank. + * middle_name: Middle name of the person. Should be of type string and can be blank. + * last_name: Last name of the person. Should be of type string and can be blank. + * name: Concatenation of first and last name after sorting alphabetically of the person. Should be of type + string and can be blank. + * date_of_birth: Date of birth of the person. Should be of type Date and can be blank. + * post_code: Postal code of the address. Should be of type string and can be blank. + * address_line_1: First line of the address. Should be of type string and can be blank. If this is empty then check + if other address lines contain a value, and shift if necessary. + * address_line_2: Second line of the address. Should be of type string and can be blank. If this is empty then check + if other address lines contain a value, and shift if necessary. + * address_line_3: Third line of the address. Should be of type string and can be blank. + * address_line_4: Fourth line of the address. Should be of type string and can be blank. + * full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. + Should be of type string and can be blank. 
+ * source_filter: Field to contain additional information on freedom pass dataset e.g year (only contains holding string for now). + Should be of type string and can be blank. + + Args: + freedom_cleaned (Dataframe): Freedom pass dataframe after preparing and cleaning it. + + Returns: + freedom_passes (Dataframe): Freedom pass dataframe with all the standardised columns listed above. + + """ + freedom_passes = freedom_cleaned \ + .withColumn("source_id", col("source_id")) \ + .withColumn("title", categorise_title(lower(trim(col("title"))))) \ + .withColumn("first_name", standardize_name(trim(col("first_name")))) \ + .withColumn("middle_name", standardize_name(trim(col("middle_name")))) \ + .withColumn("last_name", standardize_name(trim(col("last_name")))) \ + .withColumn("name", standardize_name(trim(col("name")))) \ + .withColumn("post_code", lower(trim(col("post_code")))) \ + .withColumn("address_line_1", standardize_address_line(trim(col("address_line_1")))) \ + .withColumn("address_line_2", standardize_address_line(trim(col("address_line_2")))) \ + .withColumn("address_line_3", standardize_address_line(trim(col("address_line_3")))) \ + .withColumn("address_line_4", standardize_address_line(trim(col("address_line_4")))) \ + .withColumn("full_address1", full_address(trim(col("address_line_1")), trim(col("address_line_2")), + trim(col("address_line_3")), + trim(col("address_line_4")))) \ + .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) \ + .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), + col("full_address"), col("source_filter")) + + return freedom_passes + + def prepare_clean_electoral_register_data(electoral_register_df: DataFrame) -> DataFrame: """ This function cleans raw electoral register data from 
Xpress read for standardising. diff --git a/scripts/jobs/parking/parking_cycle_hangar_allocation.py b/scripts/jobs/parking/parking_cycle_hangar_allocation.py new file mode 100644 index 000000000..9f6a9eed6 --- /dev/null +++ b/scripts/jobs/parking/parking_cycle_hangar_allocation.py @@ -0,0 +1,312 @@ +import sys +from awsglue.transforms import * +from awsglue.utils import getResolvedOptions +from pyspark.context import SparkContext +from awsglue.context import GlueContext +from awsglue.job import Job +from awsglue import DynamicFrame +from scripts.helpers.helpers import get_glue_env_var, get_latest_partitions, PARTITION_KEYS,create_pushdown_predicate_for_max_date_partition_value + +def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame: + for alias, frame in mapping.items(): + frame.toDF().createOrReplaceTempView(alias) + result = spark.sql(query) + return DynamicFrame.fromDF(result, glueContext, transformation_ctx) + + +args = getResolvedOptions(sys.argv, ["JOB_NAME"]) +sc = SparkContext() +glueContext = GlueContext(sc) +spark = glueContext.spark_session +job = Job(glueContext) +job.init(args["JOB_NAME"], args) + +environment = get_glue_env_var("environment") + +# Script generated for node Amazon S3 +AmazonS3_node1658997944648 = glueContext.create_dynamic_frame.from_catalog( + database="parking-raw-zone", + table_name="parking_parking_ops_cycle_hangar_list", + transformation_ctx="AmazonS3_node1658997944648", +) + +# Script generated for node Amazon S3 +AmazonS3_node1697705005761 = glueContext.create_dynamic_frame.from_catalog( + database="dataplatform-" + environment + "-liberator-raw-zone", + table_name="liberator_permit_llpg", + transformation_ctx="AmazonS3_node1697705005761", +) + +# Script generated for node Amazon S3 +AmazonS3_node1697704537304 = glueContext.create_dynamic_frame.from_catalog( + database="dataplatform-" + environment + "-liberator-refined-zone", + table_name="parking_cycle_hangars_denormalisation", + 
transformation_ctx="AmazonS3_node1697704537304", +) + +# Script generated for node Amazon S3 +AmazonS3_node1697705499200 = glueContext.create_dynamic_frame.from_catalog( + database="dataplatform-" + environment + "-liberator-raw-zone", + table_name="liberator_hangar_allocations", + transformation_ctx="AmazonS3_node1697705499200", +) + +# Script generated for node Amazon S3 +AmazonS3_node1697704672904 = glueContext.create_dynamic_frame.from_catalog( + database="dataplatform-" + environment + "-liberator-raw-zone", + table_name="liberator_hangar_waiting_list", + transformation_ctx="AmazonS3_node1697704672904", +) + +# Script generated for node Amazon S3 +AmazonS3_node1697704891824 = glueContext.create_dynamic_frame.from_catalog( + database="dataplatform-" + environment + "-liberator-raw-zone", + table_name="liberator_licence_party", + transformation_ctx="AmazonS3_node1697704891824", +) + +# Script generated for node SQL +SqlQuery343 = """ +/******************************************************************************************************************** +parking_cycle_hangar_allocation + +The SQL details the number of cycle spaces that are occupied +in each cycle hangar. It also identifies the number of Parties +that are on the waiting list. 
The code has been amended to use +Tom's hangar list + +19/10/2023 - Create Query +*******************************************************************************************************************/ +/************************************************************ +Create a comparison between Toms Hangar list and EStreet +************************************************************/ +With TomHangar as ( + SELECT + asset_no, asset_type, street_or_estate, zone, status, key_number, fob, location_description, + road, postcode, date_installed, easting, northing, road_or_pavement, + case + When asset_no like '%Bikehangar_1577%' Then '1577' + When asset_no like '%Bikehangar_H1439%' Then 'H1439' + When asset_no like '%Bikehangar_H1440%' Then 'Hangar_H1440' + When asset_no like '%Bikehangar_1435%' Then 'Bikehangar_H1435' + ELSE replace(asset_no, ' ','_') + END as HangarID + from parking_parking_ops_cycle_hangar_list + WHERE import_date = (Select MAX(import_date) + FROM parking_parking_ops_cycle_hangar_list) + AND asset_type = 'Hangar' AND status = 'Active '), + +Hanger as ( + SELECT hanger_id, + ROW_NUMBER() OVER ( PARTITION BY hanger_id ORDER BY hanger_id DESC) H1 + From parking_cycle_hangars_denormalisation + WHERE import_date = (Select MAX(import_date) from parking_cycle_hangars_denormalisation)), + +Hangar_Comp as ( + SELECT + asset_no, HangarID, B.hanger_id + FROM TomHangar as A + LEFT JOIN Hanger as B ON A.HangarID = B.hanger_id AND H1 = 1 + UNION ALL + SELECT 'new_only','new_only','new_only'), +/************************************************************ +Create the Waiting list - unique "party_id" +************************************************************/ +waiting_list as ( + SELECT *, + ROW_NUMBER() OVER ( PARTITION BY party_id, hanger_id ORDER BY party_id, hanger_id DESC) row1 + FROM liberator_hangar_waiting_list + WHERE Import_Date = (Select MAX(Import_Date) from + liberator_hangar_waiting_list)), +/*** Party List ***/ +Licence_Party as ( + SELECT * from 
liberator_licence_party + WHERE Import_Date = (Select MAX(Import_Date) from + liberator_licence_party)), +/*** STREET ***/ +LLPG as ( + SELECT * + FROM liberator_permit_llpg + WHERE import_date = (Select MAX(import_date) from + liberator_permit_llpg)), +/******************************************************************************* +Cycle Hangar allocation details +*******************************************************************************/ +Cycle_Hangar_allocation as ( + SELECT + *, + ROW_NUMBER() OVER ( PARTITION BY party_id + ORDER BY party_id, date_of_allocation DESC) row_num + FROM liberator_hangar_allocations + WHERE Import_Date = (Select MAX(Import_Date) from + liberator_hangar_allocations) + AND allocation_status IN ('live')), + +Street_Rec as ( + SELECT * + FROM liberator_permit_llpg + WHERE import_date = (Select MAX(import_date) from + liberator_permit_llpg) + AND address1 = 'STREET RECORD'), + +Cycle_Hangar_Wait_List as ( + SELECT + A.party_id, first_name, surname, B.uprn as USER_UPRN, + B.address1, B.address2, B.address3, B.postcode, B.telephone_number, D.Address2 as Street,registration_date + ,A.hanger_id, E.party_id Allocated_Party_ID + FROM waiting_list as A + LEFT JOIN Licence_Party as B ON A.party_id = B.business_party_id + LEFT JOIN LLPG as C ON B.uprn = cast(C.UPRN as string) + LEFT JOIN Street_Rec as D ON C.USRN = D.USRN + LEFT JOIN Cycle_Hangar_allocation as E ON A.party_id = E.party_id AND row_num = 1 + WHERE row1= 1 AND E.party_id is NULL and D.Address2 is not NULL), + +/************************************************************ +Waiting List CREATED +************************************************************/ +Estreet_Hanger as ( + SELECT hanger_id, space, hangar_location, + ROW_NUMBER() OVER ( PARTITION BY hanger_id, space, hangar_location + ORDER BY hanger_id, space, hangar_location DESC) H1 + From parking_cycle_hangars_denormalisation + WHERE import_date = (Select MAX(import_date) from parking_cycle_hangars_denormalisation) and + 
allocation_status = 'live' and key_issued = 'Y' + UNION ALL + SELECT 'new_only', ' ', 'NEWONLY', 1), + +Wait_List_Hangar as ( + SELECT A.party_id, A.hanger_id, + ROW_NUMBER() OVER ( PARTITION BY A.party_id, A.hanger_id + ORDER BY A.party_id, A.hanger_id DESC) H2 + FROM liberator_hangar_waiting_list as A + INNER JOIN Cycle_Hangar_Wait_List as B ON A.party_id = B.party_id + WHERE import_date = (Select MAX(import_date) + FROM liberator_hangar_waiting_list)), + +Wait_List_Earlist_Latest as ( + SELECT A.hanger_id, max(A.registration_date) as Max_Date, min(A.registration_date) as Min_Date + FROM liberator_hangar_waiting_list as A + INNER JOIN Cycle_Hangar_Wait_List as B ON A.party_id = B.party_id + WHERE import_date = (Select MAX(import_date) + FROM liberator_hangar_waiting_list) + AND A.registration_date not + IN ('2000-01-01','1900-12-13','1000-04-02','1100-04-02', + '1200-04-02','1300-04-02','1400-04-02','2000-12-17','1200-03-24') + GROUP BY A.hanger_id), + +Wait_total as ( + SELECT hanger_id, count(*) as Wait_Total + FROM Wait_List_Hangar + WHERE H2 = 1 + GROUP BY hanger_id), + +allocated_Total as ( + SELECT hanger_id, hangar_location, count(*) as Total_Allocated + FROM Estreet_Hanger + WHERE H1 = 1 + GROUP BY hanger_id,hangar_location), + +Full_Hangar_Data as ( + SELECT + A.hanger_id, A.hangar_location, + CASE + When A.hanger_id = 'new_only' Then 0 + ELSE Total_Allocated + END as Total_Allocated, + Wait_Total, + CASE + When A.hanger_id = 'new_only' Then 0 + ELSE ( 6 - Total_Allocated) + END as free_spaces, + Min_Date as Earlist_Registration_Date, + Max_Date as Latest_Registration_Date + FROM allocated_Total as A + LEFT JOIN Wait_total as B ON A.hanger_id = B.hanger_id + LEFT JOIN Wait_List_Earlist_Latest as C ON A.hanger_id = C.hanger_id), + +Hangar_WAit_List as ( + SELECT + A.asset_no as Tom_Asset_No, B.hanger_id as HangarID, street_or_estate, zone, location_description, + postcode, date_installed, + CASE + When Total_Allocated is NULL Then 0 + ELSE Total_Allocated 
+ END as Total_Allocated, + CASE + When Wait_Total is NULL Then 0 + ELSE Wait_Total + END as Wait_Total, + CASE + When free_spaces is NULL Then 6 + ELSE free_spaces + END as free_spaces, + Earlist_Registration_Date, Latest_Registration_Date + FROM TomHangar as A + LEFT JOIN Hangar_Comp as B ON A.asset_no = B.asset_no + LEFT JOIN Full_Hangar_Data as C ON B.hanger_id = C.hanger_id), + +/*** Output the data ***/ +Output as ( + SELECT *, + CASE + When Total_Allocated = 6 Then 'N/A' + When Wait_Total >= free_spaces Then 'Yes' + Else 'No' + END as hangar_can_be_filled + FROM Hangar_WAit_List + WHERE HangarID is not NULL + UNION ALL + SELECT A.Tom_Asset_No, A.HangarID, A.street_or_estate, A.zone, A.location_description, + A.postcode, A.date_installed, A.Total_Allocated, TotalWatch, A.free_spaces, + C.Earlist_Registration_Date, C.Latest_Registration_Date, + CASE + When A.Total_Allocated = 6 Then 'N/A' + When A.Wait_Total >= A.free_spaces Then 'Yes' + Else 'No' + END as hangar_can_be_filled + FROM Hangar_WAit_List as A + LEFT JOIN (SELECT hanger_id, count(*) as TotalWatch FROM waiting_list + GROUP BY hanger_id) B ON replace(A.Tom_Asset_No,' ','_') = B.hanger_id + LEFT JOIN Full_Hangar_Data as C ON B.hanger_id = C.hanger_id + WHERE A.HangarID is NULL) + +SELECT *, + current_timestamp() as ImportDateTime, + replace(cast(current_date() as string),'-','') as import_date, + + cast(Year(current_date) as string) as import_year, + cast(month(current_date) as string) as import_month, + cast(day(current_date) as string) as import_day +FROM Output +""" +SQL_node1658765472050 = sparkSqlQuery( + glueContext, + query=SqlQuery343, + mapping={ + "parking_parking_ops_cycle_hangar_list": AmazonS3_node1658997944648, + "parking_cycle_hangars_denormalisation": AmazonS3_node1697704537304, + "liberator_hangar_waiting_list": AmazonS3_node1697704672904, + "liberator_licence_party": AmazonS3_node1697704891824, + "liberator_permit_llpg": AmazonS3_node1697705005761, + "liberator_hangar_allocations": 
AmazonS3_node1697705499200, + }, + transformation_ctx="SQL_node1658765472050", +) + +# Script generated for node Amazon S3 +AmazonS3_node1658765590649 = glueContext.getSink( + path="s3://dataplatform-" + environment + "-refined-zone/parking/parking_cycle_hangar_allocation/", + connection_type="s3", + updateBehavior="UPDATE_IN_DATABASE", + partitionKeys=PARTITION_KEYS, + compression="snappy", + enableUpdateCatalog=True, + transformation_ctx="AmazonS3_node1658765590649", +) +AmazonS3_node1658765590649.setCatalogInfo( + catalogDatabase="dataplatform-" + environment + "-liberator-refined-zone", + catalogTableName="parking_cycle_hangar_allocation", +) +AmazonS3_node1658765590649.setFormat("glueparquet") +AmazonS3_node1658765590649.writeFrame(SQL_node1658765472050) +job.commit() diff --git a/terraform/backend-setup/03-input-derived.tf b/terraform/backend-setup/03-input-derived.tf index 19d9a2167..db66b3410 100644 --- a/terraform/backend-setup/03-input-derived.tf +++ b/terraform/backend-setup/03-input-derived.tf @@ -1,6 +1,6 @@ # Any internal local variables should be declared here. We also import the tag module for convenience module "tags" { - source = "git@github.com:LBHackney-IT/infrastructure.git//modules/aws-tags-lbh/module?ref=master" + source = "github.com/LBHackney-IT/aws-tags-lbh.git?ref=v1.1.1" application = var.application automation_build_url = var.automation_build_url diff --git a/terraform/core/03-input-derived.tf b/terraform/core/03-input-derived.tf index a928bcf2e..3d1818503 100644 --- a/terraform/core/03-input-derived.tf +++ b/terraform/core/03-input-derived.tf @@ -1,6 +1,6 @@ # Any internal local variables should be declared here. 
We also import the tag module for convenience module "tags" { - source = "git@github.com:LBHackney-IT/infrastructure.git//modules/aws-tags-lbh/module?ref=master" + source = "github.com/LBHackney-IT/aws-tags-lbh.git?ref=v1.1.1" application = var.application automation_build_url = var.automation_build_url diff --git a/terraform/etl/03-input-derived.tf b/terraform/etl/03-input-derived.tf index 1232cfac7..1fd9aa9c8 100644 --- a/terraform/etl/03-input-derived.tf +++ b/terraform/etl/03-input-derived.tf @@ -1,6 +1,6 @@ # Any internal local variables should be declared here. We also import the tag module for convenience module "tags" { - source = "git@github.com:LBHackney-IT/infrastructure.git//modules/aws-tags-lbh/module?ref=master" + source = "github.com/LBHackney-IT/aws-tags-lbh.git?ref=v1.1.1" application = var.application automation_build_url = var.automation_build_url @@ -47,8 +47,8 @@ data "aws_ssm_parameter" "aws_vpc_id" { data "aws_subnets" "network" { filter { - name = "vpc-id" - values = [data.aws_ssm_parameter.aws_vpc_id.value] + name = "vpc-id" + values = [data.aws_ssm_parameter.aws_vpc_id.value] } } @@ -58,4 +58,4 @@ data "aws_vpc" "network" { data "aws_iam_role" "glue_role" { name = "${local.identifier_prefix}-glue-role" -} \ No newline at end of file +} diff --git a/terraform/networking/03-input-derived.tf b/terraform/networking/03-input-derived.tf index 9e6740ccb..d188f6e82 100644 --- a/terraform/networking/03-input-derived.tf +++ b/terraform/networking/03-input-derived.tf @@ -1,6 +1,6 @@ # General module "tags" { - source = "git@github.com:LBHackney-IT/infrastructure.git//modules/aws-tags-lbh/module?ref=master" + source = "github.com/LBHackney-IT/aws-tags-lbh.git?ref=v1.1.1" application = var.application automation_build_url = var.automation_build_url