Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Two new tables to test for data quality with Great Expectations #2038

Merged
merged 2 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions scripts/helpers/housing_gx_dq_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
'sql': """SELECT * FROM "housing-refined-zone"."tenure_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."tenure_reshape") and description in ('Secure', 'Introductory', 'Mesne Profit Ac', 'Non-Secure') and (endoftenuredate is null or substr(endoftenuredate, 1, 11) = '1900-01-01')""",
'id_field': 'tenancy_id'},
'contacts_reshape': {
'sql': """SELECT id, targetid, createdat, contacttype, subtype, value, lastmodified, targettype, isactive, person_id, import_date FROM "housing-refined-zone"."contacts_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."contacts_reshape") and isactive=True""",
'sql': """SELECT id, targetid, substr(createdat, 1, 10) as createdat, contacttype, subtype, value, substr(lastmodified, 1, 10) as lastmodified, targettype, isactive, person_id, import_date FROM "housing-refined-zone"."contacts_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."contacts_reshape") and isactive=True""",
'id_field': 'id'},
'housing_homeowner_record_sheet': {
'sql': """SELECT * FROM "housing-raw-zone"."housing_homeowner_record_sheet" where import_date=(select max(import_date) from "housing-raw-zone"."housing_homeowner_record_sheet")""",
Expand All @@ -16,11 +16,17 @@
'id_field': 'property_dwelling_reference_number'},
'assets_reshape': {
'sql': """SELECT * FROM "housing-refined-zone"."assets_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."assets_reshape") and assettype = 'Dwelling'""",
'id_field': 'asset_id'}
'id_field': 'asset_id'},
'matenancyagreement': {
'sql': """SELECT *, substr(cast(eot as varchar), 1, 10) as eot_parsed, substr(cast(cot as varchar), 1, 10) as cot_parsed FROM "housing-raw-zone"."sow2b_dbo_matenancyagreement" where import_date=(select max(import_date) FROM "housing-raw-zone"."sow2b_dbo_matenancyagreement")""",
'id_field': 'tag_ref'},
'maproperty': {
'sql': """SELECT * FROM "housing-raw-zone"."sow2b_dbo_maproperty" where import_date=(select max(import_date) FROM "housing-raw-zone"."sow2b_dbo_maproperty")""",
'id_field': 'prop_ref'}
}

table_list = ['person_reshape', 'tenure_reshape', 'contacts_reshape', 'housing_homeowner_record_sheet',
'housing_dwellings_list', 'assets_reshape']
'housing_dwellings_list', 'assets_reshape', 'matenancyagreement', 'maproperty']

partition_keys = ['import_year', 'import_month', 'import_day', 'import_date']

Expand All @@ -39,10 +45,12 @@
'expect_contact_value_column_values_to_be_unique': 'UNIQUENESS',
'expect_contact_value_column_values_to_not_be_null': 'COMPLETENESS',
'expect_date_of_birth_column_values_to_not_be_null': 'COMPLETENESS',
'expect_date_of_birth_to_be_between': 'VALIDITY',
'expect_date_of_birth_to_be_between': 'TIMELINESS',
'expect_description_values_to_be_in_set': 'CONSISTENCY',
'expect_estate_ref_no_column_values_to_match_regex': 'VALIDITY',
'expect_first_name_column_value_length': 'VALIDITY',
'expect_is_organisation_column_values_to_not_be_null': 'COMPLETENESS',
'expect_is_organisation_values_to_be_in_set': 'CONSISTENCY',
'expect_llpg_and_prop_ref_column_values_to_be_unique_within_record': 'UNIQUENESS',
'expect_llpg_column_value_lengths_between': 'VALIDITY',
'expect_llpg_column_values_to_be_unique': 'UNIQUENESS',
Expand Down Expand Up @@ -70,6 +78,7 @@
'expect_sub_type_column_values_to_not_be_null': 'COMPLETENESS',
'expect_surname_column_value_length': 'VALIDITY',
'expect_firstname_column_value_length': 'VALIDITY',
'expect_tag_ref_column_not_to_be_null': 'COMPLETENESS',
'expect_target_id_and_value_column_values_to_be_unique_within_record': 'UNIQUENESS',
'expect_target_id_column_values_to_not_be_null': 'COMPLETENESS',
'expect_target_type_column_values_to_be_in_set': 'CONSISTENCY',
Expand All @@ -78,6 +87,7 @@
'expect_tenancy_id_column_not_to_be_null': 'COMPLETENESS',
'expect_tenure_code_column_not_to_be_null': 'COMPLETENESS',
'expect_tenure_type_column_values_to_be_in_set': 'CONSISTENCY',
'expect_tenure_code_values_to_be_in_set': 'CONSISTENCY',
'expect_uprn_column_value_lengths_between': 'VALIDITY',
'expect_uprn_column_values_to_match_regex': 'VALIDITY',
'expect_uprn_column_values_to_not_be_null': 'COMPLETENESS',
Expand Down
37 changes: 37 additions & 0 deletions scripts/jobs/housing/housing_maproperty_gx_suite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# flake8: noqa: F821
import sys

from awsglue.utils import getResolvedOptions
import great_expectations as gx
import great_expectations.expectations as gxe

# Resolve the Glue job argument that names the Great Expectations project root.
arg_key = ['s3_target_location']
args = getResolvedOptions(sys.argv, arg_key)
# Bind the single argument this script reads by name, instead of injecting every
# resolved key into the module namespace via locals().update(args); the explicit
# assignment keeps the variable visible to readers and linters.
s3_target_location = args['s3_target_location']


# Uniqueness expectation bound to the prop_ref column.
class ExpectPropRefColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
    # Column the uniqueness check is applied to.
    column: str = "prop_ref"
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Prop Ref field to be unique for a property type'


# Completeness expectation bound to the arr_patch column.
class ExpectArrPatchNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'arr_patch'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Arrears Patch column to be complete with no missing values'


# Completeness expectation bound to the prop_ref column.
class ExpectPropRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'prop_ref'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Prop Ref column to be complete with no missing values'


# Open the file-mode GX context rooted at the location passed to the job.
context = gx.get_context(mode="file", project_root_dir=s3_target_location)

# Build the maproperty suite; expectations are added in the order listed.
suite = gx.ExpectationSuite(name='maproperty_suite')

for expectation_cls in (
    ExpectPropRefColumnValuesToBeUnique,
    ExpectArrPatchNotToBeNull,
    ExpectPropRefNotToBeNull,
):
    suite.add_expectation(expectation_cls())

# Register the suite with the context and keep the object it returns.
suite = context.suites.add(suite)
82 changes: 82 additions & 0 deletions scripts/jobs/housing/housing_matenancyagreement_gx_suite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# flake8: noqa: F821
from datetime import datetime
import sys

from awsglue.utils import getResolvedOptions
import great_expectations as gx
import great_expectations.expectations as gxe

# Resolve the Glue job argument that names the Great Expectations project root.
arg_key = ['s3_target_location']
args = getResolvedOptions(sys.argv, arg_key)
# Bind the single argument this script reads by name, instead of injecting every
# resolved key into the module namespace via locals().update(args); the explicit
# assignment keeps the variable visible to readers and linters.
s3_target_location = args['s3_target_location']


# Uniqueness expectation bound to the tag_ref column.
class ExpectTagRefColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
    # Column the uniqueness check is applied to.
    column: str = "tag_ref"
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Tag Ref field to be unique for a tenancy'


# Completeness expectation bound to the tag_ref column.
class ExpectTagRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'tag_ref'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Tag Ref column to be complete with no missing values'


# Completeness expectation bound to the prop_ref column.
class ExpectPropRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'prop_ref'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Prop Ref column to be complete with no missing values'


# Completeness expectation bound to the cot column.
class ExpectCoTNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'cot'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Tenancy start date column (cot) to be complete with no missing values'


# Completeness expectation bound to the tenure column.
class ExpectTenureNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'tenure'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect tenure to be complete with no missing values'


# Completeness expectation bound to the u_saff_rentacc column.
class ExpectSaffRentAccNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'u_saff_rentacc'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Saff rent account (payment ref) to be complete with no missing values'


# Completeness expectation bound to the rentgrp_ref column.
class ExpectRentGroupRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Column that must not contain nulls.
    column: str = 'rentgrp_ref'
    # Human-readable summary stored with the expectation.
    description: str = 'Expect Rent Group ref column to be complete with no missing values'


# Range expectation on eot_parsed, applied only to rows where it is populated.
#
# The GX input query builds eot_parsed as substr(cast(eot as varchar), 1, 10),
# i.e. a string of at most 10 characters (presumably YYYY-MM-DD -- confirm
# against the source data). The bounds are therefore date-only ISO strings of
# the same shape: a bound carrying a time part ("1920-01-01T00:00:00") sorts
# after the bare date when compared as a string, which would reject a value
# equal to the documented lower bound.
# NOTE(review): assumes the execution engine compares these str bounds
# lexically against the string column -- confirm.
class ExpectEoTToBeBetween(gxe.ExpectColumnValuesToBeBetween):
    column: str = 'eot_parsed'
    # Inclusive lower bound: "1920-01-01".
    min_value: str = datetime(1920, 1, 1).date().isoformat()
    # Upper bound is today's date, computed once when this class body executes.
    max_value: str = datetime.today().date().isoformat()
    description: str = "Expect eot_parsed be between 1920-01-01 and today's date"
    condition_parser: str = 'great_expectations'
    # Skip rows with no parsed end date.
    row_condition: str = 'col("eot_parsed").notNull()'


# Range expectation on cot_parsed, applied only to rows where it is populated.
#
# The GX input query builds cot_parsed as substr(cast(cot as varchar), 1, 10),
# i.e. a string of at most 10 characters (presumably YYYY-MM-DD -- confirm
# against the source data). The bounds are therefore date-only ISO strings of
# the same shape: a bound carrying a time part ("1920-01-01T00:00:00") sorts
# after the bare date when compared as a string, which would reject a value
# equal to the documented lower bound.
# NOTE(review): assumes the execution engine compares these str bounds
# lexically against the string column -- confirm.
class ExpectCoTToBeBetween(gxe.ExpectColumnValuesToBeBetween):
    column: str = 'cot_parsed'
    # Inclusive lower bound: "1920-01-01".
    min_value: str = datetime(1920, 1, 1).date().isoformat()
    # Upper bound is today's date, computed once when this class body executes.
    max_value: str = datetime.today().date().isoformat()
    description: str = "Expect cot_parsed be between 1920-01-01 and today's date"
    condition_parser: str = 'great_expectations'
    # Filter on the same column that is being range-checked (cot_parsed is
    # derived from cot in the input query), rather than on the raw cot column.
    row_condition: str = 'col("cot_parsed").notNull()'


# Open the file-mode GX context rooted at the location passed to the job.
context = gx.get_context(mode="file", project_root_dir=s3_target_location)

# Build the matenancyagreement suite; expectations are added in the order listed.
suite = gx.ExpectationSuite(name='matenancyagreement_suite')

for expectation_cls in (
    ExpectTagRefColumnValuesToBeUnique,
    ExpectTagRefNotToBeNull,
    ExpectPropRefNotToBeNull,
    ExpectCoTNotToBeNull,
    ExpectTenureNotToBeNull,
    ExpectSaffRentAccNotToBeNull,
    ExpectRentGroupRefNotToBeNull,
    ExpectEoTToBeBetween,
    ExpectCoTToBeBetween,
):
    suite.add_expectation(expectation_cls())

# Register the suite with the context and keep the object it returns.
suite = context.suites.add(suite)
Loading