From 46eaa5ef9ad280ab8b8ca240b8433470935f755e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ben=20Hammond=20=F0=9F=8E=9B=EF=B8=8F?=
Date: Mon, 23 Dec 2024 16:31:26 -0700
Subject: [PATCH] BACKEND: Double quotes and linting for all python files (#3893)

---
 .github/linters/black.ini | 1 -
 airflow/dags/acs_condition.py | 82 +-
 airflow/dags/acs_population.py | 94 +-
 airflow/dags/bjs_incarceration.py | 26 +-
 airflow/dags/cawp_time.py | 18 +-
 airflow/dags/cdc_hiv.py | 98 +-
 airflow/dags/cdc_restricted.py | 100 +-
 airflow/dags/cdc_vaccination_county.py | 20 +-
 airflow/dags/cdc_vaccination_national.py | 32 +-
 airflow/dags/cdc_wisqars.py | 66 +-
 airflow/dags/cdc_wisqars_black_men.py | 46 +-
 airflow/dags/cdc_wisqars_youth.py | 28 +-
 airflow/dags/cdc_wonder.py | 62 +-
 airflow/dags/census_pop_estimates.py | 18 +-
 airflow/dags/census_pop_estimates_sc.py | 18 +-
 airflow/dags/chr.py | 24 +-
 .../dags/decia_2010_territory_population.py | 34 +-
 .../dags/decia_2020_territory_population.py | 40 +-
 airflow/dags/geo_context.py | 18 +-
 airflow/dags/graphql_ahr_behavioral_health.py | 76 +-
 .../dags/graphql_ahr_non-behavioral_health.py | 76 +-
 airflow/dags/kff_vaccination.py | 22 +-
 airflow/dags/maternal_mortality.py | 24 +-
 airflow/dags/phrma.py | 126 +--
 airflow/dags/phrma_brfss.py | 112 +--
 airflow/dags/sanity_check.py | 14 +-
 airflow/dags/test_sanity_check.py | 45 +-
 airflow/dags/util.py | 68 +-
 airflow/dags/vera_incarceration_county.py | 36 +-
 data_server/main.py | 52 +-
 data_server/test_data_server.py | 85 +-
 e2e_tests/data_serving.py | 28 +-
 e2e_tests/scripts/ensure_datasets_equal.py | 44 +-
 exporter/main.py | 76 +-
 exporter/test_exporter.py | 126 +--
 pyproject.toml | 1 -
 python/data_server/dataset_cache.py | 2 +-
 python/data_server/setup.py | 9 +-
 python/datasources/acs_condition.py | 202 ++--
 python/datasources/acs_population.py | 338 +++----
 python/datasources/age_adjust_cdc_hiv.py | 28 +-
 .../datasources/age_adjust_cdc_restricted.py | 40 +-
 python/datasources/bjs_incarceration.py | 34 +-
 python/datasources/cawp_time.py | 52 +-
 python/datasources/cdc_hiv.py | 2 +-
 python/datasources/cdc_restricted.py | 68 +-
 python/datasources/cdc_restricted_local.py | 50 +-
 python/datasources/cdc_vaccination_county.py | 18 +-
 .../datasources/cdc_vaccination_national.py | 86 +-
 python/datasources/cdc_wisqars.py | 38 +-
 python/datasources/cdc_wisqars_black_men.py | 10 +-
 python/datasources/cdc_wisqars_youth.py | 24 +-
 python/datasources/cdc_wonder.py | 22 +-
 python/datasources/census_pop_estimates.py | 62 +-
 python/datasources/census_pop_estimates_sc.py | 152 ++-
 python/datasources/chr.py | 90 +-
 python/datasources/data_source.py | 10 +-
 .../decia_2010_territory_population.py | 18 +-
 .../decia_2020_territory_population.py | 38 +-
 python/datasources/geo_context.py | 12 +-
 python/datasources/graphql_ahr.py | 72 +-
 python/datasources/kff_vaccination.py | 140 +--
 python/datasources/maternal_mortality.py | 44 +-
 python/datasources/phrma.py | 46 +-
 python/datasources/phrma_brfss.py | 26 +-
 python/datasources/setup.py | 9 +-
 .../datasources/vera_incarceration_county.py | 12 +-
 python/ingestion/bjs_utils.py | 54 +-
 python/ingestion/cdc_wisqars_utils.py | 66 +-
 python/ingestion/cdc_wonder_utils.py | 32 +-
 python/ingestion/census.py | 34 +-
 python/ingestion/constants.py | 4 +-
 python/ingestion/dataset_utils.py | 64 +-
 python/ingestion/gcs_to_bq_util.py | 32 +-
 python/ingestion/github_util.py | 4 +-
 python/ingestion/graphql_ahr_utils.py | 58 +-
 python/ingestion/het_types.py | 58 +-
 python/ingestion/local_pipeline_utils.py | 8 +-
 python/ingestion/merge_utils.py | 176 ++-
 python/ingestion/phrma_utils.py | 76 +-
 python/ingestion/pubsub_publisher.py | 6 +-
 python/ingestion/setup.py | 18 +-
 python/ingestion/standardized_columns.py | 60 +-
 python/ingestion/url_file_to_gcs.py | 6 +-
 python/run_local_pipelines.py | 4 +-
 .../tests/data_server/test_dataset_cache.py | 68 +-
 .../tests/datasources/test_acs_condition.py | 262 ++---
 .../tests/datasources/test_acs_population.py | 253 ++---
 .../tests/datasources/test_age_adjustment.py | 213 ++--
 .../test_age_adjustment_cdc_hiv.py | 34 +-
 .../datasources/test_bjs_incarceration.py | 90 +-
 python/tests/datasources/test_cawp_time.py | 26 +-
 python/tests/datasources/test_cdc_hiv.py | 40 +-
 .../tests/datasources/test_cdc_restricted.py | 210 ++--
 .../datasources/test_cdc_restricted_local.py | 28 +-
 .../test_cdc_vaccination_county.py | 24 +-
 .../test_cdc_vaccination_national.py | 52 +-
 .../datasources/test_cdc_wisqars_black_men.py | 16 +-
 .../datasources/test_cdc_wisqars_youth.py | 12 +-
 python/tests/datasources/test_cdc_wonder.py | 34 +-
 .../datasources/test_census_pop_estimates.py | 24 +-
 .../test_census_pop_estimates_sc.py | 94 +-
 python/tests/datasources/test_data_source.py | 4 +-
 .../test_decia_2020_territory_population.py | 34 +-
 python/tests/datasources/test_geo_context.py | 48 +-
 python/tests/datasources/test_graphql_ahr.py | 102 +-
 .../tests/datasources/test_kff_vaccination.py | 50 +-
 .../datasources/test_maternal_mortality.py | 40 +-
 python/tests/datasources/test_phrma.py | 80 +-
 python/tests/datasources/test_phrma_brfss.py | 100 +-
 python/tests/datasources/test_utils.py | 14 +-
 .../test_vera_incarceration_county.py | 38 +-
 python/tests/ingestion/test_bjs_utils.py | 68 +-
 .../tests/ingestion/test_cdc_wisqars_utils.py | 54 +-
 python/tests/ingestion/test_census.py | 2 +-
 python/tests/ingestion/test_dataset_utils.py | 938 +++++++++---------
 python/tests/ingestion/test_gcs_to_bq.py | 66 +-
 .../ingestion/test_graphql_ahr_measure_ids.py | 8 +-
 python/tests/ingestion/test_merge_utils.py | 588 +++++------
 .../ingestion/test_standardized_columns.py | 8 +-
 .../tests/ingestion/test_url_file_to_gcs.py | 49 +-
 run_gcs_to_bq/main.py | 38 +-
 run_gcs_to_bq/test_run_gcs_to_bq.py | 24 +-
 run_ingestion/main.py | 44 +-
 run_ingestion/test_run_ingestion.py | 12 +-
 125 files changed, 4080 insertions(+), 4159 deletions(-)

diff --git a/.github/linters/black.ini b/.github/linters/black.ini
index 4603580112..841d5258df 100644
--- a/.github/linters/black.ini
+++ b/.github/linters/black.ini
@@ -1,4 +1,3 @@
 # configuration settings for black python linter when run by SuperLinter on GitHub actions CI
 [tool.black]
-skip-string-normalization = true
 line-length = 120
\ No newline at end of file
diff --git a/airflow/dags/acs_condition.py b/airflow/dags/acs_condition.py
index 0b1b80bb87..1cfc4e4a0a 100644
--- a/airflow/dags/acs_condition.py
+++ b/airflow/dags/acs_condition.py
@@ -9,8 +9,8 @@
 _ACS_DATASET_NAME = "acs_condition"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
@@ -22,122 +22,122 @@

 # CACHE ACS SOURCE INTO TMP JSON IN BUCKETS
-acs_condition_gcs_payload_2012 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2012')
+acs_condition_gcs_payload_2012 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2012")
 acs_condition_gcs_operator_2012 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2012', acs_condition_gcs_payload_2012, data_ingestion_dag
+    "acs_condition_to_gcs_2012", acs_condition_gcs_payload_2012, data_ingestion_dag
 )

-acs_condition_gcs_payload_2013 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2013')
+acs_condition_gcs_payload_2013 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2013")
 acs_condition_gcs_operator_2013 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2013', acs_condition_gcs_payload_2013, data_ingestion_dag
+    "acs_condition_to_gcs_2013", acs_condition_gcs_payload_2013, data_ingestion_dag
 )

-acs_condition_gcs_payload_2014 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2014')
+acs_condition_gcs_payload_2014 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2014")
 acs_condition_gcs_operator_2014 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2014', acs_condition_gcs_payload_2014, data_ingestion_dag
+    "acs_condition_to_gcs_2014", acs_condition_gcs_payload_2014, data_ingestion_dag
 )

-acs_condition_gcs_payload_2015 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2015')
+acs_condition_gcs_payload_2015 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2015")
 acs_condition_gcs_operator_2015 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2015', acs_condition_gcs_payload_2015, data_ingestion_dag
+    "acs_condition_to_gcs_2015", acs_condition_gcs_payload_2015, data_ingestion_dag
 )

-acs_condition_gcs_payload_2016 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2016')
+acs_condition_gcs_payload_2016 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2016")
 acs_condition_gcs_operator_2016 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2016', acs_condition_gcs_payload_2016, data_ingestion_dag
+    "acs_condition_to_gcs_2016", acs_condition_gcs_payload_2016, data_ingestion_dag
 )

-acs_condition_gcs_payload_2017 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2017')
+acs_condition_gcs_payload_2017 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2017")
 acs_condition_gcs_operator_2017 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2017', acs_condition_gcs_payload_2017, data_ingestion_dag
+    "acs_condition_to_gcs_2017", acs_condition_gcs_payload_2017, data_ingestion_dag
 )

-acs_condition_gcs_payload_2018 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2018')
+acs_condition_gcs_payload_2018 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2018")
 acs_condition_gcs_operator_2018 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2018', acs_condition_gcs_payload_2018, data_ingestion_dag
+    "acs_condition_to_gcs_2018", acs_condition_gcs_payload_2018, data_ingestion_dag
 )

-acs_condition_gcs_payload_2019 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2019')
+acs_condition_gcs_payload_2019 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2019")
 acs_condition_gcs_operator_2019 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2019', acs_condition_gcs_payload_2019, data_ingestion_dag
+    "acs_condition_to_gcs_2019", acs_condition_gcs_payload_2019, data_ingestion_dag
 )

-acs_condition_gcs_payload_2020 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2020')
+acs_condition_gcs_payload_2020 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2020")
 acs_condition_gcs_operator_2020 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2020', acs_condition_gcs_payload_2020, data_ingestion_dag
+    "acs_condition_to_gcs_2020", acs_condition_gcs_payload_2020, data_ingestion_dag
 )

-acs_condition_gcs_payload_2021 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2021')
+acs_condition_gcs_payload_2021 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2021")
 acs_condition_gcs_operator_2021 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2021', acs_condition_gcs_payload_2021, data_ingestion_dag
+    "acs_condition_to_gcs_2021", acs_condition_gcs_payload_2021, data_ingestion_dag
 )

-acs_condition_gcs_payload_2022 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year='2022')
+acs_condition_gcs_payload_2022 = util.generate_gcs_payload(_ACS_WORKFLOW_ID, year="2022")
 acs_condition_gcs_operator_2022 = util.create_gcs_ingest_operator(
-    'acs_condition_to_gcs_2022', acs_condition_gcs_payload_2022, data_ingestion_dag
+    "acs_condition_to_gcs_2022", acs_condition_gcs_payload_2022, data_ingestion_dag
 )

 # PROCESS AND WRITE TO BQ
-acs_condition_bq_payload_2012 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2012')
+acs_condition_bq_payload_2012 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2012")
 acs_condition_bq_operator_2012 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2012", acs_condition_bq_payload_2012, data_ingestion_dag
 )

-acs_condition_bq_payload_2013 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2013')
+acs_condition_bq_payload_2013 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2013")
 acs_condition_bq_operator_2013 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2013", acs_condition_bq_payload_2013, data_ingestion_dag
 )

-acs_condition_bq_payload_2014 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2014')
+acs_condition_bq_payload_2014 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2014")
 acs_condition_bq_operator_2014 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2014", acs_condition_bq_payload_2014, data_ingestion_dag
 )

-acs_condition_bq_payload_2015 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2015')
+acs_condition_bq_payload_2015 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2015")
 acs_condition_bq_operator_2015 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2015", acs_condition_bq_payload_2015, data_ingestion_dag
 )

-acs_condition_bq_payload_2016 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2016')
+acs_condition_bq_payload_2016 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2016")
 acs_condition_bq_operator_2016 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2016", acs_condition_bq_payload_2016, data_ingestion_dag
 )

-acs_condition_bq_payload_2017 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2017')
+acs_condition_bq_payload_2017 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2017")
 acs_condition_bq_operator_2017 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2017", acs_condition_bq_payload_2017, data_ingestion_dag
 )

-acs_condition_bq_payload_2018 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2018')
+acs_condition_bq_payload_2018 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2018")
 acs_condition_bq_operator_2018 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2018", acs_condition_bq_payload_2018, data_ingestion_dag
 )

-acs_condition_bq_payload_2019 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2019')
+acs_condition_bq_payload_2019 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2019")
 acs_condition_bq_operator_2019 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2019", acs_condition_bq_payload_2019, data_ingestion_dag
 )

-acs_condition_bq_payload_2020 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2020')
+acs_condition_bq_payload_2020 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2020")
 acs_condition_bq_operator_2020 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2020", acs_condition_bq_payload_2020, data_ingestion_dag
 )

-acs_condition_bq_payload_2021 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2021')
+acs_condition_bq_payload_2021 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2021")
 acs_condition_bq_operator_2021 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2021", acs_condition_bq_payload_2021, data_ingestion_dag
 )

-acs_condition_bq_payload_2022 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2022')
+acs_condition_bq_payload_2022 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2022")
 acs_condition_bq_operator_2022 = util.create_bq_ingest_operator(
     "acs_condition_to_bq_2022", acs_condition_bq_payload_2022, data_ingestion_dag
 )

 # EXPORT FROM BQ TO BUCKETS
 acs_condition_exporter_payload_race = {
-    'dataset_name': _ACS_DATASET_NAME,
-    'demographic': "by_race",
+    "dataset_name": _ACS_DATASET_NAME,
+    "demographic": "by_race",
 }
 acs_condition_exporter_operator_race = util.create_exporter_operator(
     "acs_condition_exporter_race",
@@ -146,16 +146,16 @@
 )

 acs_condition_exporter_payload_age = {
-    'dataset_name': _ACS_DATASET_NAME,
-    'demographic': "by_age",
+    "dataset_name": _ACS_DATASET_NAME,
+    "demographic": "by_age",
 }
 acs_condition_exporter_operator_age = util.create_exporter_operator(
     "acs_condition_exporter_age", acs_condition_exporter_payload_age, data_ingestion_dag
 )

 acs_condition_exporter_payload_sex = {
-    'dataset_name': _ACS_DATASET_NAME,
-    'demographic': "by_sex",
+    "dataset_name": _ACS_DATASET_NAME,
+    "demographic": "by_sex",
 }
 acs_condition_exporter_operator_sex = util.create_exporter_operator(
     "acs_condition_exporter_sex", acs_condition_exporter_payload_sex, data_ingestion_dag
diff --git a/airflow/dags/acs_population.py b/airflow/dags/acs_population.py
index 0dc57765c9..97d3bcbe64 100644
--- a/airflow/dags/acs_population.py
+++ b/airflow/dags/acs_population.py
@@ -6,124 +6,124 @@
 from datetime import timedelta
 import util

-_ACS_WORKFLOW_ID = 'ACS_POPULATION'
-_ACS_DATASET_NAME = 'acs_population'
+_ACS_WORKFLOW_ID = "ACS_POPULATION"
+_ACS_DATASET_NAME = "acs_population"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'acs_population_ingestion_dag',
+    "acs_population_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for ACS Population',
+    description="Ingestion configuration for ACS Population",
 )

 acs_pop_gcs_payload = util.generate_gcs_payload(_ACS_WORKFLOW_ID)
-acs_pop_gcs_operator = util.create_gcs_ingest_operator('acs_population_to_gcs', acs_pop_gcs_payload, data_ingestion_dag)
+acs_pop_gcs_operator = util.create_gcs_ingest_operator("acs_population_to_gcs", acs_pop_gcs_payload, data_ingestion_dag)

-acs_pop_bq_payload_2009 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2009')
+acs_pop_bq_payload_2009 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2009")
 acs_pop_bq_operator_2009 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2009', acs_pop_bq_payload_2009, data_ingestion_dag
+    "acs_population_to_bq_2009", acs_pop_bq_payload_2009, data_ingestion_dag
 )

-acs_pop_bq_payload_2010 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2010')
+acs_pop_bq_payload_2010 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2010")
 acs_pop_bq_operator_2010 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2010', acs_pop_bq_payload_2010, data_ingestion_dag
+    "acs_population_to_bq_2010", acs_pop_bq_payload_2010, data_ingestion_dag
 )

-acs_pop_bq_payload_2011 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2011')
+acs_pop_bq_payload_2011 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2011")
 acs_pop_bq_operator_2011 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2011', acs_pop_bq_payload_2011, data_ingestion_dag
+    "acs_population_to_bq_2011", acs_pop_bq_payload_2011, data_ingestion_dag
 )

-acs_pop_bq_payload_2012 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2012')
+acs_pop_bq_payload_2012 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2012")
 acs_pop_bq_operator_2012 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2012', acs_pop_bq_payload_2012, data_ingestion_dag
+    "acs_population_to_bq_2012", acs_pop_bq_payload_2012, data_ingestion_dag
 )

-acs_pop_bq_payload_2013 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2013')
+acs_pop_bq_payload_2013 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2013")
 acs_pop_bq_operator_2013 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2013', acs_pop_bq_payload_2013, data_ingestion_dag
+    "acs_population_to_bq_2013", acs_pop_bq_payload_2013, data_ingestion_dag
 )

-acs_pop_bq_payload_2014 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2014')
+acs_pop_bq_payload_2014 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2014")
 acs_pop_bq_operator_2014 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2014', acs_pop_bq_payload_2014, data_ingestion_dag
+    "acs_population_to_bq_2014", acs_pop_bq_payload_2014, data_ingestion_dag
 )

-acs_pop_bq_payload_2015 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2015')
+acs_pop_bq_payload_2015 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2015")
 acs_pop_bq_operator_2015 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2015', acs_pop_bq_payload_2015, data_ingestion_dag
+    "acs_population_to_bq_2015", acs_pop_bq_payload_2015, data_ingestion_dag
 )

-acs_pop_bq_payload_2016 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2016')
+acs_pop_bq_payload_2016 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2016")
 acs_pop_bq_operator_2016 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2016', acs_pop_bq_payload_2016, data_ingestion_dag
+    "acs_population_to_bq_2016", acs_pop_bq_payload_2016, data_ingestion_dag
 )

-acs_pop_bq_payload_2017 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2017')
+acs_pop_bq_payload_2017 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2017")
 acs_pop_bq_operator_2017 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2017', acs_pop_bq_payload_2017, data_ingestion_dag
+    "acs_population_to_bq_2017", acs_pop_bq_payload_2017, data_ingestion_dag
 )

-acs_pop_bq_payload_2018 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2018')
+acs_pop_bq_payload_2018 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2018")
 acs_pop_bq_operator_2018 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2018', acs_pop_bq_payload_2018, data_ingestion_dag
+    "acs_population_to_bq_2018", acs_pop_bq_payload_2018, data_ingestion_dag
 )

-acs_pop_bq_payload_2019 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2019')
+acs_pop_bq_payload_2019 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2019")
 acs_pop_bq_operator_2019 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2019', acs_pop_bq_payload_2019, data_ingestion_dag
+    "acs_population_to_bq_2019", acs_pop_bq_payload_2019, data_ingestion_dag
 )

-acs_pop_bq_payload_2020 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2020')
+acs_pop_bq_payload_2020 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2020")
 acs_pop_bq_operator_2020 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2020', acs_pop_bq_payload_2020, data_ingestion_dag
+    "acs_population_to_bq_2020", acs_pop_bq_payload_2020, data_ingestion_dag
 )

-acs_pop_bq_payload_2021 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2021')
+acs_pop_bq_payload_2021 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2021")
 acs_pop_bq_operator_2021 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2021', acs_pop_bq_payload_2021, data_ingestion_dag
+    "acs_population_to_bq_2021", acs_pop_bq_payload_2021, data_ingestion_dag
 )

-acs_pop_bq_payload_2022 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year='2022')
+acs_pop_bq_payload_2022 = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, year="2022")
 acs_pop_bq_operator_2022 = util.create_bq_ingest_operator(
-    'acs_population_to_bq_2022', acs_pop_bq_payload_2022, data_ingestion_dag
+    "acs_population_to_bq_2022", acs_pop_bq_payload_2022, data_ingestion_dag
 )

 acs_pop_exporter_payload_race = {
-    'dataset_name': _ACS_DATASET_NAME,
-    'demographic': "by_race",
+    "dataset_name": _ACS_DATASET_NAME,
+    "demographic": "by_race",
 }
 acs_pop_exporter_operator_race = util.create_exporter_operator(
-    'acs_population_exporter_race', acs_pop_exporter_payload_race, data_ingestion_dag
+    "acs_population_exporter_race", acs_pop_exporter_payload_race, data_ingestion_dag
 )

 acs_pop_exporter_payload_age = {
-    'dataset_name': _ACS_DATASET_NAME,
-    'demographic': "by_age",
+    "dataset_name": _ACS_DATASET_NAME,
+    "demographic": "by_age",
 }
 acs_pop_exporter_operator_age = util.create_exporter_operator(
-    'acs_population_exporter_age', acs_pop_exporter_payload_age, data_ingestion_dag
+    "acs_population_exporter_age", acs_pop_exporter_payload_age, data_ingestion_dag
 )

 acs_pop_exporter_payload_sex = {
-    'dataset_name': _ACS_DATASET_NAME,
-    'demographic': "by_sex",
+    "dataset_name": _ACS_DATASET_NAME,
+    "demographic": "by_sex",
 }
 acs_pop_exporter_operator_sex = util.create_exporter_operator(
-    'acs_population_exporter_sex', acs_pop_exporter_payload_sex, data_ingestion_dag
+    "acs_population_exporter_sex", acs_pop_exporter_payload_sex, data_ingestion_dag
 )

-connector1 = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id='connector1')
-connector2 = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id='connector2')
-connector3 = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id='connector3')
+connector1 = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id="connector1")
+connector2 = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id="connector2")
+connector3 = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id="connector3")

 # ensure CACHING step runs, then 2009 to make new BQ tables
 # then run the rest of the years in parallel chunks
diff --git a/airflow/dags/bjs_incarceration.py b/airflow/dags/bjs_incarceration.py
index 4c8243b840..34b8758a51 100644
--- a/airflow/dags/bjs_incarceration.py
+++ b/airflow/dags/bjs_incarceration.py
@@ -5,40 +5,40 @@
 from datetime import timedelta
 import util

-_BJS_INCARCERATION_WORKFLOW_ID = 'BJS_INCARCERATION_DATA'
-_BJS_INCARCERATION_DATASET_NAME = 'bjs_incarceration_data'
+_BJS_INCARCERATION_WORKFLOW_ID = "BJS_INCARCERATION_DATA"
+_BJS_INCARCERATION_DATASET_NAME = "bjs_incarceration_data"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'bjs_incarceration_ingestion_dag',
+    "bjs_incarceration_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for BJS',
+    description="Ingestion configuration for BJS",
 )

 bjs_incarceration_bq_payload = util.generate_bq_payload(_BJS_INCARCERATION_WORKFLOW_ID, _BJS_INCARCERATION_DATASET_NAME)
 bjs_incarceration_bq_operator = util.create_bq_ingest_operator(
-    'bjs_incarceration_to_bq', bjs_incarceration_bq_payload, data_ingestion_dag
+    "bjs_incarceration_to_bq", bjs_incarceration_bq_payload, data_ingestion_dag
 )

-payload_race = {'dataset_name': _BJS_INCARCERATION_DATASET_NAME, 'demographic': "race_and_ethnicity"}
+payload_race = {"dataset_name": _BJS_INCARCERATION_DATASET_NAME, "demographic": "race_and_ethnicity"}
 bjs_incarceration_exporter_operator_race = util.create_exporter_operator(
-    'bjs_incarceration_exporter_race', payload_race, data_ingestion_dag
+    "bjs_incarceration_exporter_race", payload_race, data_ingestion_dag
 )

-payload_age = {'dataset_name': _BJS_INCARCERATION_DATASET_NAME, 'demographic': "age"}
+payload_age = {"dataset_name": _BJS_INCARCERATION_DATASET_NAME, "demographic": "age"}
 bjs_incarceration_exporter_operator_age = util.create_exporter_operator(
-    'bjs_incarceration_exporter_age', payload_age, data_ingestion_dag
+    "bjs_incarceration_exporter_age", payload_age, data_ingestion_dag
 )

-payload_sex = {'dataset_name': _BJS_INCARCERATION_DATASET_NAME, 'demographic': "sex"}
+payload_sex = {"dataset_name": _BJS_INCARCERATION_DATASET_NAME, "demographic": "sex"}
 bjs_incarceration_exporter_operator_sex = util.create_exporter_operator(
-    'bjs_incarceration_exporter_sex', payload_sex, data_ingestion_dag
+    "bjs_incarceration_exporter_sex", payload_sex, data_ingestion_dag
 )

 # Ingestion DAG
diff --git a/airflow/dags/cawp_time.py b/airflow/dags/cawp_time.py
index ac5762d37c..9fb4e51180 100644
--- a/airflow/dags/cawp_time.py
+++ b/airflow/dags/cawp_time.py
@@ -7,27 +7,27 @@

 # NEW FLOW - TIME SERIES for US CONGRESS
-_CAWP_TIME_WORKFLOW_ID = 'CAWP_TIME_DATA'
-_CAWP_TIME_DATASET_NAME = 'cawp_time_data'
+_CAWP_TIME_WORKFLOW_ID = "CAWP_TIME_DATA"
+_CAWP_TIME_DATASET_NAME = "cawp_time_data"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cawp_time_ingestion_dag',
+    "cawp_time_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CAWP_TIME',
+    description="Ingestion configuration for CAWP_TIME",
 )

 cawp_time_bq_payload = util.generate_bq_payload(_CAWP_TIME_WORKFLOW_ID, _CAWP_TIME_DATASET_NAME)
-cawp_time_pop_bq_operator = util.create_bq_ingest_operator('cawp_time_to_bq', cawp_time_bq_payload, data_ingestion_dag)
+cawp_time_pop_bq_operator = util.create_bq_ingest_operator("cawp_time_to_bq", cawp_time_bq_payload, data_ingestion_dag)

-cawp_time_exporter_payload_race = {'dataset_name': _CAWP_TIME_DATASET_NAME, 'demographic': "race_and_ethnicity"}
+cawp_time_exporter_payload_race = {"dataset_name": _CAWP_TIME_DATASET_NAME, "demographic": "race_and_ethnicity"}
 cawp_time_exporter_operator_race = util.create_exporter_operator(
-    'cawp_time_exporter_race', cawp_time_exporter_payload_race, data_ingestion_dag
+    "cawp_time_exporter_race", cawp_time_exporter_payload_race, data_ingestion_dag
 )
diff --git a/airflow/dags/cdc_hiv.py b/airflow/dags/cdc_hiv.py
index 3cd16daaf6..524b4f6cbf 100644
--- a/airflow/dags/cdc_hiv.py
+++ b/airflow/dags/cdc_hiv.py
@@ -5,47 +5,47 @@
 import util
 from datetime import timedelta

-_CDC_HIV_WORKFLOW_ID = 'CDC_HIV_DATA'
-_CDC_HIV_DATASET_NAME = 'cdc_hiv_data'
-_HIV_AGE_ADJUST_WORKFLOW_ID = 'AGE_ADJUST_CDC_HIV'
+_CDC_HIV_WORKFLOW_ID = "CDC_HIV_DATA"
+_CDC_HIV_DATASET_NAME = "cdc_hiv_data"
+_HIV_AGE_ADJUST_WORKFLOW_ID = "AGE_ADJUST_CDC_HIV"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_hiv_ingestion_dag',
+    "cdc_hiv_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for HIV',
+    description="Ingestion configuration for HIV",
 )

 # RACE NATIONAL
 cdc_hiv_bq_payload_race_national = util.generate_bq_payload(
     _CDC_HIV_WORKFLOW_ID,
     _CDC_HIV_DATASET_NAME,
-    demographic='race',
-    geographic='national',
+    demographic="race",
+    geographic="national",
 )
 cdc_hiv_bq_operator_race_national = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_race_national', cdc_hiv_bq_payload_race_national, data_ingestion_dag
+    "cdc_hiv_to_bq_race_national", cdc_hiv_bq_payload_race_national, data_ingestion_dag
 )

 # RACE STATE
 cdc_hiv_bq_payload_race_state = util.generate_bq_payload(
-    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic='race', geographic='state'
+    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic="race", geographic="state"
 )
 cdc_hiv_bq_operator_race_state = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_race_state', cdc_hiv_bq_payload_race_state, data_ingestion_dag
+    "cdc_hiv_to_bq_race_state", cdc_hiv_bq_payload_race_state, data_ingestion_dag
 )

 # RACE COUNTY
 cdc_hiv_bq_payload_race_county = util.generate_bq_payload(
-    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic='race', geographic='county'
+    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic="race", geographic="county"
 )
 cdc_hiv_bq_operator_race_county = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_race_county', cdc_hiv_bq_payload_race_county, data_ingestion_dag
+    "cdc_hiv_to_bq_race_county", cdc_hiv_bq_payload_race_county, data_ingestion_dag
 )

@@ -53,65 +53,65 @@
 cdc_hiv_bq_payload_sex_national = util.generate_bq_payload(
     _CDC_HIV_WORKFLOW_ID,
     _CDC_HIV_DATASET_NAME,
-    demographic='sex',
-    geographic='national',
+    demographic="sex",
+    geographic="national",
 )
 cdc_hiv_bq_operator_sex_national = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_sex_national', cdc_hiv_bq_payload_sex_national, data_ingestion_dag
+    "cdc_hiv_to_bq_sex_national", cdc_hiv_bq_payload_sex_national, data_ingestion_dag
 )

 # SEX STATE
 cdc_hiv_bq_payload_sex_state = util.generate_bq_payload(
-    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic='sex', geographic='state'
+    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic="sex", geographic="state"
 )
 cdc_hiv_bq_operator_sex_state = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_sex_state', cdc_hiv_bq_payload_sex_state, data_ingestion_dag
+    "cdc_hiv_to_bq_sex_state", cdc_hiv_bq_payload_sex_state, data_ingestion_dag
 )

 # SEX COUNTY
 cdc_hiv_bq_payload_sex_county = util.generate_bq_payload(
-    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic='sex', geographic='county'
+    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic="sex", geographic="county"
 )
 cdc_hiv_bq_operator_sex_county = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_sex_county', cdc_hiv_bq_payload_sex_county, data_ingestion_dag
+    "cdc_hiv_to_bq_sex_county", cdc_hiv_bq_payload_sex_county, data_ingestion_dag
 )

 # AGE NATIONAL
 cdc_hiv_bq_payload_age_national = util.generate_bq_payload(
     _CDC_HIV_WORKFLOW_ID,
     _CDC_HIV_DATASET_NAME,
-    demographic='age',
-    geographic='national',
+    demographic="age",
+    geographic="national",
 )
 cdc_hiv_bq_operator_age_national = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_age_national', cdc_hiv_bq_payload_age_national, data_ingestion_dag
+    "cdc_hiv_to_bq_age_national", cdc_hiv_bq_payload_age_national, data_ingestion_dag
 )

 # AGE STATE
 cdc_hiv_bq_payload_age_state = util.generate_bq_payload(
-    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic='age', geographic='state'
+    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic="age", geographic="state"
 )
 cdc_hiv_bq_operator_age_state = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_age_state', cdc_hiv_bq_payload_age_state, data_ingestion_dag
+    "cdc_hiv_to_bq_age_state", cdc_hiv_bq_payload_age_state, data_ingestion_dag
 )

 # AGE COUNTY
 cdc_hiv_bq_payload_age_county = util.generate_bq_payload(
-    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic='age', geographic='county'
+    _CDC_HIV_WORKFLOW_ID, _CDC_HIV_DATASET_NAME, demographic="age", geographic="county"
 )
 cdc_hiv_bq_operator_age_county = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_age_county', cdc_hiv_bq_payload_age_county, data_ingestion_dag
+    "cdc_hiv_to_bq_age_county", cdc_hiv_bq_payload_age_county, data_ingestion_dag
 )

 # BLACK WOMEN NATIONAL
 cdc_hiv_bq_payload_black_women_national = util.generate_bq_payload(
     _CDC_HIV_WORKFLOW_ID,
     _CDC_HIV_DATASET_NAME,
-    demographic='black_women',
-    geographic='national',
+    demographic="black_women",
+    geographic="national",
 )
 cdc_hiv_bq_operator_black_women_national = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_black_women_national',
+    "cdc_hiv_to_bq_black_women_national",
     cdc_hiv_bq_payload_black_women_national,
     data_ingestion_dag,
 )

@@ -120,11 +120,11 @@
 cdc_hiv_bq_payload_black_women_state = util.generate_bq_payload(
     _CDC_HIV_WORKFLOW_ID,
     _CDC_HIV_DATASET_NAME,
-    demographic='black_women',
-    geographic='state',
+    demographic="black_women",
+    geographic="state",
 )
 cdc_hiv_bq_operator_black_women_state = util.create_bq_ingest_operator(
-    'cdc_hiv_to_bq_black_women_state',
+    "cdc_hiv_to_bq_black_women_state",
     cdc_hiv_bq_payload_black_women_state,
     data_ingestion_dag,
 )

@@ -136,41 +136,41 @@
     _CDC_HIV_DATASET_NAME,
 )
 cdc_hiv_age_adjust_op = util.create_bq_ingest_operator(
-    'cdc_hiv_age_adjust', cdc_hiv_age_adjust_payload, data_ingestion_dag
+    "cdc_hiv_age_adjust", cdc_hiv_age_adjust_payload, data_ingestion_dag
 )

 # EXPORTERS
 payload_race = {
-    'dataset_name': _CDC_HIV_DATASET_NAME,
-    'demographic': "race_and_ethnicity",
+    "dataset_name": _CDC_HIV_DATASET_NAME,
+    "demographic": "race_and_ethnicity",
 }
 cdc_hiv_exporter_operator_race = util.create_exporter_operator(
-    'cdc_hiv_exporter_race', payload_race, data_ingestion_dag
+    "cdc_hiv_exporter_race", payload_race, data_ingestion_dag
 )

-payload_age = {'dataset_name': _CDC_HIV_DATASET_NAME, 'demographic': "age"}
-cdc_hiv_exporter_operator_age = util.create_exporter_operator('cdc_hiv_exporter_age', payload_age, data_ingestion_dag)
+payload_age = {"dataset_name": _CDC_HIV_DATASET_NAME, "demographic": "age"}
+cdc_hiv_exporter_operator_age = util.create_exporter_operator("cdc_hiv_exporter_age", payload_age, data_ingestion_dag)

-payload_sex = {'dataset_name': _CDC_HIV_DATASET_NAME, 'demographic': "sex", 'should_export_as_alls': True}
-cdc_hiv_exporter_operator_sex = util.create_exporter_operator('cdc_hiv_exporter_sex', payload_sex, data_ingestion_dag)
+payload_sex = {"dataset_name": _CDC_HIV_DATASET_NAME, "demographic": "sex", "should_export_as_alls": True}
+cdc_hiv_exporter_operator_sex = util.create_exporter_operator("cdc_hiv_exporter_sex", payload_sex, data_ingestion_dag)

 payload_black_women = {
-    'dataset_name': _CDC_HIV_DATASET_NAME,
-    'demographic': "black_women",
-    'should_export_as_alls': True,
+    "dataset_name": _CDC_HIV_DATASET_NAME,
+    "demographic": "black_women",
+    "should_export_as_alls": True,
 }
 cdc_hiv_exporter_operator_black_women = util.create_exporter_operator(
-    'cdc_hiv_exporter_black_women', payload_black_women, data_ingestion_dag
+    "cdc_hiv_exporter_black_women", payload_black_women, data_ingestion_dag
 )

 payload_race_with_age_adjust = {
-    'dataset_name': _CDC_HIV_DATASET_NAME,
-    'demographic': "race_and_ethnicity",
+    "dataset_name": _CDC_HIV_DATASET_NAME,
+    "demographic": "race_and_ethnicity",
 }
 cdc_hiv_exporter_operator_race_with_age_adjust = util.create_exporter_operator(
-    'cdc_hiv_exporter_race_with_age_adjust',
+    "cdc_hiv_exporter_race_with_age_adjust",
     payload_race_with_age_adjust,
     data_ingestion_dag,
 )
diff --git a/airflow/dags/cdc_restricted.py b/airflow/dags/cdc_restricted.py
index 38031e7a38..f3258086b6 100644
--- a/airflow/dags/cdc_restricted.py
+++ b/airflow/dags/cdc_restricted.py
@@ -5,123 +5,123 @@
 from datetime import timedelta
 import util

-_CDC_RESTRICTED_WORKFLOW_ID = 'CDC_RESTRICTED_DATA'
-_AGE_ADJUST_WORKFLOW_ID = 'AGE_ADJUST_CDC_RESTRICTED'
-_CDC_RESTRICTED_DATASET = 'cdc_restricted_data'
+_CDC_RESTRICTED_WORKFLOW_ID = "CDC_RESTRICTED_DATA"
+_AGE_ADJUST_WORKFLOW_ID = "AGE_ADJUST_CDC_RESTRICTED"
+_CDC_RESTRICTED_DATASET = "cdc_restricted_data"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_restricted_data_dag',
+    "cdc_restricted_data_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CDC Restricted Data',
+    description="Ingestion configuration for CDC Restricted Data",
 )

 # COUNTY
 cdc_bq_payload_race_county = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='county',
-    demographic='race',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="county",
+    demographic="race",
 )
 cdc_restricted_bq_op_race_county = util.create_bq_ingest_operator(
-    'cdc_restricted_race_county_gcs_to_bq', cdc_bq_payload_race_county, data_ingestion_dag
+    "cdc_restricted_race_county_gcs_to_bq", cdc_bq_payload_race_county, data_ingestion_dag
 )

 cdc_bq_payload_sex_county = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='county',
-    demographic='sex',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="county",
+    demographic="sex",
 )
 cdc_restricted_bq_op_sex_county = util.create_bq_ingest_operator(
-    'cdc_restricted_sex_county_gcs_to_bq', cdc_bq_payload_sex_county, data_ingestion_dag
+    "cdc_restricted_sex_county_gcs_to_bq", cdc_bq_payload_sex_county, data_ingestion_dag
 )

 cdc_bq_payload_age_county = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='county',
-    demographic='age',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="county",
+    demographic="age",
 )
 cdc_restricted_bq_op_age_county = util.create_bq_ingest_operator(
-    'cdc_restricted_age_county_gcs_to_bq', cdc_bq_payload_age_county, data_ingestion_dag
+    "cdc_restricted_age_county_gcs_to_bq", cdc_bq_payload_age_county, data_ingestion_dag
 )

 # STATE
 cdc_bq_payload_race_state = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='state',
-    demographic='race',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="state",
+    demographic="race",
 )
 cdc_restricted_bq_op_race_state = util.create_bq_ingest_operator(
-    'cdc_restricted_race_state_gcs_to_bq', cdc_bq_payload_race_state, data_ingestion_dag
+    "cdc_restricted_race_state_gcs_to_bq", cdc_bq_payload_race_state, data_ingestion_dag
 )

 cdc_bq_payload_sex_state = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='state',
-    demographic='sex',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="state",
+    demographic="sex",
 )
 cdc_restricted_bq_op_sex_state = util.create_bq_ingest_operator(
-    'cdc_restricted_sex_state_gcs_to_bq', cdc_bq_payload_sex_state, data_ingestion_dag
+    "cdc_restricted_sex_state_gcs_to_bq", cdc_bq_payload_sex_state, data_ingestion_dag
 )

 cdc_bq_payload_age_state = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='state',
-    demographic='age',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="state",
+    demographic="age",
 )
 cdc_restricted_bq_op_age_state = util.create_bq_ingest_operator(
-    'cdc_restricted_age_state_gcs_to_bq', cdc_bq_payload_age_state, data_ingestion_dag
+    "cdc_restricted_age_state_gcs_to_bq", cdc_bq_payload_age_state, data_ingestion_dag
 )

 # NATIONAL
 cdc_bq_payload_race_national = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='national',
-    demographic='race',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="national",
+    demographic="race",
 )
 cdc_restricted_bq_op_race_national = util.create_bq_ingest_operator(
-    'cdc_restricted_race_national_gcs_to_bq', cdc_bq_payload_race_national, data_ingestion_dag
+    "cdc_restricted_race_national_gcs_to_bq", cdc_bq_payload_race_national, data_ingestion_dag
 )

 cdc_bq_payload_sex_national = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='national',
-    demographic='sex',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="national",
+    demographic="sex",
 )
 cdc_restricted_bq_op_sex_national = util.create_bq_ingest_operator(
-    'cdc_restricted_sex_national_gcs_to_bq', cdc_bq_payload_sex_national, data_ingestion_dag
+    "cdc_restricted_sex_national_gcs_to_bq", cdc_bq_payload_sex_national, data_ingestion_dag
 )

 cdc_bq_payload_age_national = util.generate_bq_payload(
     _CDC_RESTRICTED_WORKFLOW_ID,
     _CDC_RESTRICTED_DATASET,
-    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
-    geographic='national',
-    demographic='age',
+    gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"),
+    geographic="national",
+    demographic="age",
 )
 cdc_restricted_bq_op_age_national = util.create_bq_ingest_operator(
-    'cdc_restricted_age_national_gcs_to_bq', cdc_bq_payload_age_national, data_ingestion_dag
+    "cdc_restricted_age_national_gcs_to_bq", cdc_bq_payload_age_national, data_ingestion_dag
 )

@@ -131,26 +131,26 @@
 )

 cdc_restricted_age_adjust_op = util.create_bq_ingest_operator(
-    'cdc_restricted_age_adjust', cdc_age_adjust_payload, data_ingestion_dag
+    "cdc_restricted_age_adjust", cdc_age_adjust_payload, data_ingestion_dag
 )

 # sanity_check = util.sanity_check_operator('sanity_check', _CDC_RESTRICTED_DATASET, data_ingestion_dag)

-cdc_restricted_exporter_payload_race = {'dataset_name': _CDC_RESTRICTED_DATASET, 'demographic': "by_race"}
+cdc_restricted_exporter_payload_race = {"dataset_name": _CDC_RESTRICTED_DATASET, "demographic": "by_race"}
 cdc_restricted_exporter_operator_race = util.create_exporter_operator(
-    'cdc_restricted_exporter_race', cdc_restricted_exporter_payload_race, data_ingestion_dag
+    "cdc_restricted_exporter_race", cdc_restricted_exporter_payload_race, data_ingestion_dag
 )

-cdc_restricted_exporter_payload_age = {'dataset_name': _CDC_RESTRICTED_DATASET, 'demographic': "by_age"}
+cdc_restricted_exporter_payload_age = {"dataset_name": _CDC_RESTRICTED_DATASET, "demographic": "by_age"}
 cdc_restricted_exporter_operator_age = util.create_exporter_operator(
-    'cdc_restricted_exporter_age', cdc_restricted_exporter_payload_age, data_ingestion_dag
+    "cdc_restricted_exporter_age", cdc_restricted_exporter_payload_age, data_ingestion_dag
 )

-cdc_restricted_exporter_payload_sex = {'dataset_name': _CDC_RESTRICTED_DATASET, 'demographic': "by_sex"}
+cdc_restricted_exporter_payload_sex = {"dataset_name": _CDC_RESTRICTED_DATASET, "demographic": "by_sex"}
 cdc_restricted_exporter_operator_sex = util.create_exporter_operator(
-    'cdc_restricted_exporter_sex', cdc_restricted_exporter_payload_sex, data_ingestion_dag
+    "cdc_restricted_exporter_sex", cdc_restricted_exporter_payload_sex, data_ingestion_dag
 )

 # CDC Restricted Data Ingestion DAG (
diff --git a/airflow/dags/cdc_vaccination_county.py b/airflow/dags/cdc_vaccination_county.py
index a79fb9a90a..814422a38c 100644
--- a/airflow/dags/cdc_vaccination_county.py
+++ b/airflow/dags/cdc_vaccination_county.py
@@ -5,34 +5,34 @@
 from datetime import timedelta
 import util

-_CDC_VACCINATION_COUNTY_WORKFLOW_ID = 'CDC_VACCINATION_COUNTY'
-_CDC_VACCINATION_COUNTY_DATASET_NAME = 'cdc_vaccination_county'
+_CDC_VACCINATION_COUNTY_WORKFLOW_ID = "CDC_VACCINATION_COUNTY"
+_CDC_VACCINATION_COUNTY_DATASET_NAME = "cdc_vaccination_county"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_vaccination_county_ingestion_dag',
+    "cdc_vaccination_county_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CDC Vaccination County',
+    description="Ingestion configuration for CDC Vaccination County",
 )

 cdc_vaccination_county_bq_payload = util.generate_bq_payload(
     _CDC_VACCINATION_COUNTY_WORKFLOW_ID, _CDC_VACCINATION_COUNTY_DATASET_NAME
 )
 cdc_vaccination_county_bq_operator = util.create_bq_ingest_operator(
-    'cdc_vaccination_county_to_bq', cdc_vaccination_county_bq_payload, data_ingestion_dag
+    "cdc_vaccination_county_to_bq", cdc_vaccination_county_bq_payload, data_ingestion_dag
 )

 cdc_vaccination_county_exporter_payload_alls = {
-    'dataset_name': _CDC_VACCINATION_COUNTY_DATASET_NAME,
-    'demographic': "alls",
+    "dataset_name": _CDC_VACCINATION_COUNTY_DATASET_NAME,
+    "demographic": "alls",
 }
 cdc_vaccination_county_exporter_operator_alls = util.create_exporter_operator(
-    'cdc_vaccination_county_exporter_alls', cdc_vaccination_county_exporter_payload_alls, data_ingestion_dag
+    "cdc_vaccination_county_exporter_alls", cdc_vaccination_county_exporter_payload_alls, data_ingestion_dag
 )

 # Ingestion DAG
diff --git a/airflow/dags/cdc_vaccination_national.py b/airflow/dags/cdc_vaccination_national.py
index 0b8895e5e6..c86a204832 100644
--- a/airflow/dags/cdc_vaccination_national.py
+++ b/airflow/dags/cdc_vaccination_national.py
@@ -5,51 +5,51 @@
 from datetime import timedelta
 import util

-_CDC_VACCINATION_NATIONAL_WORKFLOW_ID = 'CDC_VACCINATION_NATIONAL'
-_CDC_VACCINATION_NATIONAL_DATASET_NAME = 'cdc_vaccination_national'
+_CDC_VACCINATION_NATIONAL_WORKFLOW_ID = "CDC_VACCINATION_NATIONAL"
+_CDC_VACCINATION_NATIONAL_DATASET_NAME = "cdc_vaccination_national"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_vaccination_national_ingestion_dag',
+    "cdc_vaccination_national_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CDC Vaccination National',
+    description="Ingestion configuration for CDC Vaccination National",
 )

 cdc_vaccination_national_bq_payload = util.generate_bq_payload(
     _CDC_VACCINATION_NATIONAL_WORKFLOW_ID, _CDC_VACCINATION_NATIONAL_DATASET_NAME
 )
 cdc_vaccination_national_bq_operator = util.create_bq_ingest_operator(
-    'cdc_vaccination_national_to_bq', cdc_vaccination_national_bq_payload, data_ingestion_dag
+    "cdc_vaccination_national_to_bq", cdc_vaccination_national_bq_payload, data_ingestion_dag
 )

 cdc_vaccination_national_exporter_payload_race = {
-    'dataset_name': _CDC_VACCINATION_NATIONAL_DATASET_NAME,
-    'demographic': "race",
+    "dataset_name": _CDC_VACCINATION_NATIONAL_DATASET_NAME,
+    "demographic": "race",
 }
 cdc_vaccination_national_exporter_operator_race = util.create_exporter_operator(
-    'cdc_vaccination_national_exporter_race', cdc_vaccination_national_exporter_payload_race, data_ingestion_dag
+    "cdc_vaccination_national_exporter_race", cdc_vaccination_national_exporter_payload_race, data_ingestion_dag
 )

 cdc_vaccination_national_exporter_payload_age = {
-    'dataset_name': _CDC_VACCINATION_NATIONAL_DATASET_NAME,
-    'demographic': "age",
+    "dataset_name": _CDC_VACCINATION_NATIONAL_DATASET_NAME,
+    "demographic": "age",
 }
 cdc_vaccination_national_exporter_operator_age = util.create_exporter_operator(
-    'cdc_vaccination_national_exporter_age', cdc_vaccination_national_exporter_payload_age, data_ingestion_dag
+    "cdc_vaccination_national_exporter_age", cdc_vaccination_national_exporter_payload_age, data_ingestion_dag
 )

 cdc_vaccination_national_exporter_payload_sex = {
-    'dataset_name': _CDC_VACCINATION_NATIONAL_DATASET_NAME,
-    'demographic': "sex",
+    "dataset_name": _CDC_VACCINATION_NATIONAL_DATASET_NAME,
+    "demographic": "sex",
 }
 cdc_vaccination_national_exporter_operator_sex = util.create_exporter_operator(
-    'cdc_vaccination_national_exporter_sex', cdc_vaccination_national_exporter_payload_sex, data_ingestion_dag
+    "cdc_vaccination_national_exporter_sex", cdc_vaccination_national_exporter_payload_sex, data_ingestion_dag
 )
diff --git a/airflow/dags/cdc_wisqars.py b/airflow/dags/cdc_wisqars.py
index c9c5295ecb..2432dcec2d 100644
--- a/airflow/dags/cdc_wisqars.py
+++ b/airflow/dags/cdc_wisqars.py
@@ -6,30 +6,30 @@
 from datetime import timedelta

-_CDC_WISQARS_WORKFLOW_ID = 'CDC_WISQARS_DATA'
-_CDC_WISQARS_DATASET_NAME = 'cdc_wisqars_data'
+_CDC_WISQARS_WORKFLOW_ID = "CDC_WISQARS_DATA"
+_CDC_WISQARS_DATASET_NAME = "cdc_wisqars_data"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_wisqars_ingestion_dag',
+    "cdc_wisqars_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CDC_WISQARS',
+    description="Ingestion configuration for CDC_WISQARS",
 )

 # AGE NATIONAL
 cdc_wisqars_bq_payload_age_national = util.generate_bq_payload(
     _CDC_WISQARS_WORKFLOW_ID,
     _CDC_WISQARS_DATASET_NAME,
-    demographic='age',
-    geographic='national',
+    demographic="age",
+    geographic="national",
 )
 cdc_wisqars_bq_operator_age_national = util.create_bq_ingest_operator(
-    'cdc_wisqars_to_bq_age_national',
+    "cdc_wisqars_to_bq_age_national",
     cdc_wisqars_bq_payload_age_national,
     data_ingestion_dag,
 )

@@ -38,11 +38,11 @@
 cdc_wisqars_bq_payload_age_state = util.generate_bq_payload(
     _CDC_WISQARS_WORKFLOW_ID,
     _CDC_WISQARS_DATASET_NAME,
-    demographic='age',
-    geographic='state',
+    demographic="age",
+    geographic="state",
 )
 cdc_wisqars_bq_operator_age_state = util.create_bq_ingest_operator(
-    'cdc_wisqars_to_bq_age_state',
+    "cdc_wisqars_to_bq_age_state",
     cdc_wisqars_bq_payload_age_state,
     data_ingestion_dag,
 )

@@ -51,11 +51,11 @@
 cdc_wisqars_bq_payload_race_national = util.generate_bq_payload(
     _CDC_WISQARS_WORKFLOW_ID,
     _CDC_WISQARS_DATASET_NAME,
-    demographic='race_and_ethnicity',
-    geographic='national',
+    demographic="race_and_ethnicity",
+    geographic="national",
 )
 cdc_wisqars_bq_operator_race_national = util.create_bq_ingest_operator(
-    'cdc_wisqars_to_bq_race_national',
+    "cdc_wisqars_to_bq_race_national",
     cdc_wisqars_bq_payload_race_national,
     data_ingestion_dag,
 )

@@ -64,11 +64,11 @@
 cdc_wisqars_bq_payload_race_state = util.generate_bq_payload(
     _CDC_WISQARS_WORKFLOW_ID,
     _CDC_WISQARS_DATASET_NAME,
-    demographic='race_and_ethnicity',
-    geographic='state',
+    demographic="race_and_ethnicity",
+    geographic="state",
 )
 cdc_wisqars_bq_operator_race_state = util.create_bq_ingest_operator(
-    'cdc_wisqars_to_bq_race_state',
+    "cdc_wisqars_to_bq_race_state",
     cdc_wisqars_bq_payload_race_state,
     data_ingestion_dag,
 )

@@ -77,11 +77,11 @@
 cdc_wisqars_bq_payload_sex_national = util.generate_bq_payload(
     _CDC_WISQARS_WORKFLOW_ID,
     _CDC_WISQARS_DATASET_NAME,
-    demographic='sex',
-    geographic='national',
+    demographic="sex",
+    geographic="national",
 )
 cdc_wisqars_bq_operator_sex_national = util.create_bq_ingest_operator(
-    'cdc_wisqars_to_bq_sex_national',
+    "cdc_wisqars_to_bq_sex_national",
     cdc_wisqars_bq_payload_sex_national,
     data_ingestion_dag,
 )

@@ -90,36 +90,36 @@
 cdc_wisqars_bq_payload_sex_state = util.generate_bq_payload(
     _CDC_WISQARS_WORKFLOW_ID,
     _CDC_WISQARS_DATASET_NAME,
-    demographic='sex',
-    geographic='state',
+    demographic="sex",
+    geographic="state",
 )
 cdc_wisqars_bq_operator_sex_state = util.create_bq_ingest_operator(
-    'cdc_wisqars_to_bq_sex_state',
+    "cdc_wisqars_to_bq_sex_state",
     cdc_wisqars_bq_payload_sex_state,
     data_ingestion_dag,
 )

 # EXPORTERS
-payload_age = {'dataset_name': _CDC_WISQARS_DATASET_NAME, 'demographic': "age"}
+payload_age = {"dataset_name": _CDC_WISQARS_DATASET_NAME, "demographic": "age"}
 cdc_wisqars_exporter_operator_age = util.create_exporter_operator(
-    'cdc_wisqars_exporter_age', payload_age, data_ingestion_dag
+    "cdc_wisqars_exporter_age", payload_age, data_ingestion_dag
 )

 payload_race = {
-    'dataset_name': _CDC_WISQARS_DATASET_NAME,
-    'demographic': "race_and_ethnicity",
+    "dataset_name": _CDC_WISQARS_DATASET_NAME,
+    "demographic": "race_and_ethnicity",
 }
 cdc_wisqars_exporter_operator_race = util.create_exporter_operator(
-    'cdc_wisqars_exporter_race', payload_race, data_ingestion_dag
+    "cdc_wisqars_exporter_race", payload_race, data_ingestion_dag
 )

 payload_sex = {
-    'dataset_name': _CDC_WISQARS_DATASET_NAME,
-    'demographic': "sex",
-    'should_export_as_alls': True,
+    "dataset_name": _CDC_WISQARS_DATASET_NAME,
+    "demographic": "sex",
+    "should_export_as_alls": True,
 }
 cdc_wisqars_exporter_operator_sex = util.create_exporter_operator(
-    'cdc_wisqars_exporter_sex', payload_sex, data_ingestion_dag
+    "cdc_wisqars_exporter_sex", payload_sex, data_ingestion_dag
 )

 # Ingestion DAG
diff --git a/airflow/dags/cdc_wisqars_black_men.py b/airflow/dags/cdc_wisqars_black_men.py
index a10dc874f4..2c9ac16fa9 100644
--- a/airflow/dags/cdc_wisqars_black_men.py
+++ b/airflow/dags/cdc_wisqars_black_men.py
@@ -8,26 +8,26 @@
 _CDC_WISQARS_BLACK_MEN_DATASET_NAME = "cdc_wisqars_black_men_data"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_wisqars_black_men_ingestion_dag',
+    "cdc_wisqars_black_men_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CDC_WISQARS Black Men',
+    description="Ingestion configuration for CDC_WISQARS Black Men",
 )

 # URBANICITY NATIONAL
 cdc_wisqars_black_men_bq_payload_urbanicity_national = util.generate_bq_payload(
     _CDC_WISQARS_BLACK_MEN_WORKFLOW_ID,
     _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
-    demographic='urbanicity',
-    geographic='national',
+    demographic="urbanicity",
+    geographic="national",
 )
 cdc_wisqars_black_men_bq_operator_urbanicity_national = util.create_bq_ingest_operator(
-    'cdc_wisqars_black_men_to_bq_urbanicity_national',
+    "cdc_wisqars_black_men_to_bq_urbanicity_national",
     cdc_wisqars_black_men_bq_payload_urbanicity_national,
     data_ingestion_dag,
 )

@@ -36,11 +36,11 @@
 cdc_wisqars_black_men_bq_payload_urbanicity_state = util.generate_bq_payload(
     _CDC_WISQARS_BLACK_MEN_WORKFLOW_ID,
     _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
-    demographic='urbanicity',
-    geographic='state',
+    demographic="urbanicity",
+    geographic="state",
 )
 cdc_wisqars_black_men_bq_operator_urbanicity_state = util.create_bq_ingest_operator(
-    'cdc_wisqars_black_men_to_bq_urbanicity_state',
+    "cdc_wisqars_black_men_to_bq_urbanicity_state",
     cdc_wisqars_black_men_bq_payload_urbanicity_state,
     data_ingestion_dag,
 )

@@ -49,11 +49,11 @@
 cdc_wisqars_black_men_bq_payload_age_national = util.generate_bq_payload(
     _CDC_WISQARS_BLACK_MEN_WORKFLOW_ID,
     _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
-    demographic='age',
-    geographic='national',
+    demographic="age",
+    geographic="national",
 )
 cdc_wisqars_black_men_bq_operator_age_national = util.create_bq_ingest_operator(
-    'cdc_wisqars_black_men_to_bq_age_national',
+    "cdc_wisqars_black_men_to_bq_age_national",
     cdc_wisqars_black_men_bq_payload_age_national,
     data_ingestion_dag,
 )

@@ -62,31 +62,31 @@
 cdc_wisqars_black_men_bq_payload_age_state = util.generate_bq_payload(
     _CDC_WISQARS_BLACK_MEN_WORKFLOW_ID,
     _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
-    demographic='age',
-    geographic='state',
+    demographic="age",
+    geographic="state",
 )
 cdc_wisqars_black_men_bq_operator_age_state = util.create_bq_ingest_operator(
-    'cdc_wisqars_black_men_to_bq_age_state',
+    "cdc_wisqars_black_men_to_bq_age_state",
     cdc_wisqars_black_men_bq_payload_age_state,
     data_ingestion_dag,
 )

 # Exporters
 payload_urbanicity = {
-    'dataset_name': _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
-    'demographic': "urbanicity",
-    'should_export_as_alls': True,
+    "dataset_name": _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
+    "demographic": "urbanicity",
+    "should_export_as_alls": True,
 }
 cdc_wisqars_black_men_exporter_operator_urbanicity = util.create_exporter_operator(
-    'cdc_wisqars_black_men_exporter_urbanicity', payload_urbanicity, data_ingestion_dag
+    "cdc_wisqars_black_men_exporter_urbanicity", payload_urbanicity, data_ingestion_dag
 )

 payload_age = {
-    'dataset_name': _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
-    'demographic': "age",
+    "dataset_name": _CDC_WISQARS_BLACK_MEN_DATASET_NAME,
+    "demographic": "age",
 }
 cdc_wisqars_black_men_exporter_operator_age = util.create_exporter_operator(
-    'cdc_wisqars_black_men_exporter_age', payload_age, data_ingestion_dag
+    "cdc_wisqars_black_men_exporter_age", payload_age, data_ingestion_dag
 )

 # Ingestion DAG
diff --git a/airflow/dags/cdc_wisqars_youth.py b/airflow/dags/cdc_wisqars_youth.py
index 865857bdfd..fff91dc7ca 100644
--- a/airflow/dags/cdc_wisqars_youth.py
+++ b/airflow/dags/cdc_wisqars_youth.py
@@ -10,26 +10,26 @@
 _CDC_WISQARS_YOUTH_DATASET_NAME = "cdc_wisqars_youth_data"

 default_args = {
-    'start_date': days_ago(0),
-    'execution_timeout': timedelta(minutes=15),
+    "start_date": days_ago(0),
+    "execution_timeout": timedelta(minutes=15),
 }

 data_ingestion_dag = DAG(
-    'cdc_wisqars_youth_ingestion_dag',
+    "cdc_wisqars_youth_ingestion_dag",
     default_args=default_args,
     schedule_interval=None,
-    description='Ingestion configuration for CDC_WISQARS Youth',
+    description="Ingestion configuration for CDC_WISQARS Youth",
 )

 # RACE NATIONAL
 cdc_wisqars_youth_bq_payload_race_national = util.generate_bq_payload(
     _CDC_WISQARS_YOUTH_WORKFLOW_ID,
     _CDC_WISQARS_YOUTH_DATASET_NAME,
-    demographic='race_and_ethnicity',
-    geographic='national',
+    demographic="race_and_ethnicity",
+    geographic="national",
 )
 cdc_wisqars_youth_bq_operator_race_national = util.create_bq_ingest_operator(
-    'cdc_wisqars_youth_to_bq_race_national',
+    "cdc_wisqars_youth_to_bq_race_national",
     cdc_wisqars_youth_bq_payload_race_national,
     data_ingestion_dag,
 )

@@ -38,23 +38,23 @@
 cdc_wisqars_youth_bq_payload_race_state = util.generate_bq_payload(
     _CDC_WISQARS_YOUTH_WORKFLOW_ID,
     _CDC_WISQARS_YOUTH_DATASET_NAME,
-    demographic='race_and_ethnicity',
-    geographic='state',
+    demographic="race_and_ethnicity",
+    geographic="state",
) cdc_wisqars_youth_bq_operator_race_state = util.create_bq_ingest_operator( - 'cdc_wisqars_youth_to_bq_race_state', + "cdc_wisqars_youth_to_bq_race_state", cdc_wisqars_youth_bq_payload_race_state, data_ingestion_dag, ) # Exporters payload_race = { - 'dataset_name': _CDC_WISQARS_YOUTH_DATASET_NAME, - 'demographic': "race_and_ethnicity", - 'should_export_as_alls': True, + "dataset_name": _CDC_WISQARS_YOUTH_DATASET_NAME, + "demographic": "race_and_ethnicity", + "should_export_as_alls": True, } cdc_wisqars_youth_exporter_operator_race = util.create_exporter_operator( - 'cdc_wisqars_youth_exporter_race', payload_race, data_ingestion_dag + "cdc_wisqars_youth_exporter_race", payload_race, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/cdc_wonder.py b/airflow/dags/cdc_wonder.py index db7294339b..d6b274b172 100644 --- a/airflow/dags/cdc_wonder.py +++ b/airflow/dags/cdc_wonder.py @@ -4,19 +4,19 @@ import util from datetime import timedelta -_CDC_WONDER_WORKFLOW_ID = 'CDC_WONDER_DATA' -_CDC_WONDER_DATASET_NAME = 'cdc_wonder_data' +_CDC_WONDER_WORKFLOW_ID = "CDC_WONDER_DATA" +_CDC_WONDER_DATASET_NAME = "cdc_wonder_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'cdc_wonder_ingestion_dag', + "cdc_wonder_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for CDC Wonder Data', + description="Ingestion configuration for CDC Wonder Data", ) # INGEST BY GEO / DEMO @@ -25,86 +25,86 @@ cdc_wonder_bq_payload_race_national = util.generate_bq_payload( _CDC_WONDER_WORKFLOW_ID, _CDC_WONDER_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='national', + demographic="race_and_ethnicity", + geographic="national", ) cdc_wonder_bq_operator_race_national = util.create_bq_ingest_operator( - 'cdc_wonder_to_bq_race_national', cdc_wonder_bq_payload_race_national, data_ingestion_dag + "cdc_wonder_to_bq_race_national", cdc_wonder_bq_payload_race_national, data_ingestion_dag ) # age_national cdc_wonder_bq_payload_age_national = util.generate_bq_payload( _CDC_WONDER_WORKFLOW_ID, _CDC_WONDER_DATASET_NAME, - demographic='age', - geographic='national', + demographic="age", + geographic="national", ) cdc_wonder_bq_operator_age_national = util.create_bq_ingest_operator( - 'cdc_wonder_to_bq_age_national', cdc_wonder_bq_payload_age_national, data_ingestion_dag + "cdc_wonder_to_bq_age_national", cdc_wonder_bq_payload_age_national, data_ingestion_dag ) # sex_national cdc_wonder_bq_payload_sex_national = util.generate_bq_payload( _CDC_WONDER_WORKFLOW_ID, _CDC_WONDER_DATASET_NAME, - demographic='sex', - geographic='national', + demographic="sex", + geographic="national", ) cdc_wonder_bq_operator_sex_national = util.create_bq_ingest_operator( - 'cdc_wonder_to_bq_sex_national', cdc_wonder_bq_payload_sex_national, data_ingestion_dag + "cdc_wonder_to_bq_sex_national", cdc_wonder_bq_payload_sex_national, data_ingestion_dag ) # race_state cdc_wonder_bq_payload_race_state = util.generate_bq_payload( _CDC_WONDER_WORKFLOW_ID, _CDC_WONDER_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='state', + demographic="race_and_ethnicity", + geographic="state", ) cdc_wonder_bq_operator_race_state = util.create_bq_ingest_operator( - 'cdc_wonder_to_bq_race_state', cdc_wonder_bq_payload_race_state, data_ingestion_dag + "cdc_wonder_to_bq_race_state", cdc_wonder_bq_payload_race_state, data_ingestion_dag ) # 
age_state cdc_wonder_bq_payload_age_state = util.generate_bq_payload( _CDC_WONDER_WORKFLOW_ID, _CDC_WONDER_DATASET_NAME, - demographic='age', - geographic='state', + demographic="age", + geographic="state", ) cdc_wonder_bq_operator_age_state = util.create_bq_ingest_operator( - 'cdc_wonder_to_bq_age_state', cdc_wonder_bq_payload_age_state, data_ingestion_dag + "cdc_wonder_to_bq_age_state", cdc_wonder_bq_payload_age_state, data_ingestion_dag ) # sex_state cdc_wonder_bq_payload_sex_state = util.generate_bq_payload( _CDC_WONDER_WORKFLOW_ID, _CDC_WONDER_DATASET_NAME, - demographic='sex', - geographic='state', + demographic="sex", + geographic="state", ) cdc_wonder_bq_operator_sex_state = util.create_bq_ingest_operator( - 'cdc_wonder_to_bq_sex_state', cdc_wonder_bq_payload_sex_state, data_ingestion_dag + "cdc_wonder_to_bq_sex_state", cdc_wonder_bq_payload_sex_state, data_ingestion_dag ) # EXPORT BY DEMOGRAPHIC payload_race = { - 'dataset_name': _CDC_WONDER_DATASET_NAME, - 'demographic': "race_and_ethnicity", + "dataset_name": _CDC_WONDER_DATASET_NAME, + "demographic": "race_and_ethnicity", } cdc_wonder_exporter_operator_race = util.create_exporter_operator( - 'cdc_wonder_exporter_race', payload_race, data_ingestion_dag + "cdc_wonder_exporter_race", payload_race, data_ingestion_dag ) -payload_age = {'dataset_name': _CDC_WONDER_DATASET_NAME, 'demographic': "age"} +payload_age = {"dataset_name": _CDC_WONDER_DATASET_NAME, "demographic": "age"} cdc_wonder_exporter_operator_age = util.create_exporter_operator( - 'cdc_wonder_exporter_age', payload_age, data_ingestion_dag + "cdc_wonder_exporter_age", payload_age, data_ingestion_dag ) -payload_sex = {'dataset_name': _CDC_WONDER_DATASET_NAME, 'demographic': "sex", 'should_export_as_alls': True} +payload_sex = {"dataset_name": _CDC_WONDER_DATASET_NAME, "demographic": "sex", "should_export_as_alls": True} cdc_wonder_exporter_operator_sex = util.create_exporter_operator( - 'cdc_wonder_exporter_sex', payload_sex, data_ingestion_dag + "cdc_wonder_exporter_sex", payload_sex, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/census_pop_estimates.py b/airflow/dags/census_pop_estimates.py index f9b8586db3..90c02fd844 100644 --- a/airflow/dags/census_pop_estimates.py +++ b/airflow/dags/census_pop_estimates.py @@ -6,31 +6,31 @@ import util -_CENSUS_POP_ESTIMATES_WORKFLOW_ID = 'CENSUS_POP_ESTIMATES' -_CENSUS_POP_ESTIMATES_DATASET_NAME = 'census_pop_estimates' +_CENSUS_POP_ESTIMATES_WORKFLOW_ID = "CENSUS_POP_ESTIMATES" +_CENSUS_POP_ESTIMATES_DATASET_NAME = "census_pop_estimates" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'census_pop_estimates_ingestion_dag', + "census_pop_estimates_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for Census Population Estimates', + description="Ingestion configuration for Census Population Estimates", ) census_pop_estimates_bq_payload = util.generate_bq_payload( _CENSUS_POP_ESTIMATES_WORKFLOW_ID, _CENSUS_POP_ESTIMATES_DATASET_NAME ) census_pop_estimates_bq_operator = util.create_bq_ingest_operator( - 'census_pop_estimates_to_bq', census_pop_estimates_bq_payload, data_ingestion_dag + "census_pop_estimates_to_bq", census_pop_estimates_bq_payload, data_ingestion_dag ) -census_pop_estimates_exporter_payload_race = {'dataset_name': _CENSUS_POP_ESTIMATES_DATASET_NAME, 'demographic': "race"} 
+census_pop_estimates_exporter_payload_race = {"dataset_name": _CENSUS_POP_ESTIMATES_DATASET_NAME, "demographic": "race"} census_pop_estimates_exporter_operator_race = util.create_exporter_operator( - 'census_pop_estimates_exporter_race', census_pop_estimates_exporter_payload_race, data_ingestion_dag + "census_pop_estimates_exporter_race", census_pop_estimates_exporter_payload_race, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/census_pop_estimates_sc.py b/airflow/dags/census_pop_estimates_sc.py index 7d350692f2..d7a87e7957 100644 --- a/airflow/dags/census_pop_estimates_sc.py +++ b/airflow/dags/census_pop_estimates_sc.py @@ -6,31 +6,31 @@ import util -_CENSUS_POP_ESTIMATES_SC_WORKFLOW_ID = 'CENSUS_POP_ESTIMATES_SC' -_CENSUS_POP_ESTIMATES_SC_DATASET_NAME = 'census_pop_estimates_sc' +_CENSUS_POP_ESTIMATES_SC_WORKFLOW_ID = "CENSUS_POP_ESTIMATES_SC" +_CENSUS_POP_ESTIMATES_SC_DATASET_NAME = "census_pop_estimates_sc" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'census_pop_estimates_sc_ingestion_dag', + "census_pop_estimates_sc_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for Census Population Estimates SC', + description="Ingestion configuration for Census Population Estimates SC", ) census_pop_estimates_sc_bq_payload = util.generate_bq_payload( _CENSUS_POP_ESTIMATES_SC_WORKFLOW_ID, _CENSUS_POP_ESTIMATES_SC_DATASET_NAME ) census_pop_estimates_sc_bq_operator = util.create_bq_ingest_operator( - 'census_pop_estimates_sc_to_bq', census_pop_estimates_sc_bq_payload, data_ingestion_dag + "census_pop_estimates_sc_to_bq", census_pop_estimates_sc_bq_payload, data_ingestion_dag ) -census_pop_estimates_sc_exporter_payload = {'dataset_name': _CENSUS_POP_ESTIMATES_SC_DATASET_NAME} +census_pop_estimates_sc_exporter_payload = {"dataset_name": _CENSUS_POP_ESTIMATES_SC_DATASET_NAME} census_pop_estimates_sc_exporter_operator = util.create_exporter_operator( - 'census_pop_estimates_sc_exporter', census_pop_estimates_sc_exporter_payload, data_ingestion_dag + "census_pop_estimates_sc_exporter", census_pop_estimates_sc_exporter_payload, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/chr.py b/airflow/dags/chr.py index 5b344c7f88..0496c46dc8 100644 --- a/airflow/dags/chr.py +++ b/airflow/dags/chr.py @@ -5,31 +5,31 @@ from datetime import timedelta import util -_CHR_WORKFLOW_ID = 'CHR_DATA' -_CHR_DATASET_NAME = 'chr_data' +_CHR_WORKFLOW_ID = "CHR_DATA" +_CHR_DATASET_NAME = "chr_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'chr_ingestion_dag', + "chr_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for CHR', + description="Ingestion configuration for CHR", ) -chr_bq_payload_race = util.generate_bq_payload(_CHR_WORKFLOW_ID, _CHR_DATASET_NAME, demographic='race') -chr_pop_bq_operator_race = util.create_bq_ingest_operator('chr_to_bq', chr_bq_payload_race, data_ingestion_dag) +chr_bq_payload_race = util.generate_bq_payload(_CHR_WORKFLOW_ID, _CHR_DATASET_NAME, demographic="race") +chr_pop_bq_operator_race = util.create_bq_ingest_operator("chr_to_bq", chr_bq_payload_race, data_ingestion_dag) chr_exporter_payload_race = { - 'dataset_name': _CHR_DATASET_NAME, - 'demographic': 
"race_and_ethnicity", - 'should_export_as_alls': True, + "dataset_name": _CHR_DATASET_NAME, + "demographic": "race_and_ethnicity", + "should_export_as_alls": True, } chr_exporter_operator_race = util.create_exporter_operator( - 'chr_exporter', chr_exporter_payload_race, data_ingestion_dag + "chr_exporter", chr_exporter_payload_race, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/decia_2010_territory_population.py b/airflow/dags/decia_2010_territory_population.py index 790bda14c2..ec21523203 100644 --- a/airflow/dags/decia_2010_territory_population.py +++ b/airflow/dags/decia_2010_territory_population.py @@ -7,47 +7,47 @@ # one very long comma separated string _DECIA_2010_POPULATION_GCS_FILENAMES: str = ( - 'decia_2010_territory_population-by_race_and_ethnicity_territory.json,' - 'decia_2010_territory_population-by_sex_territory.json,' - 'decia_2010_territory_population-by_age_territory.json' + "decia_2010_territory_population-by_race_and_ethnicity_territory.json," + "decia_2010_territory_population-by_sex_territory.json," + "decia_2010_territory_population-by_age_territory.json" ) -_DECIA_2010_POPULATION_WORKFLOW_ID = 'DECIA_2010_POPULATION' -_DECIA_2010_POPULATION_DATASET = 'decia_2010_territory_population' +_DECIA_2010_POPULATION_WORKFLOW_ID = "DECIA_2010_POPULATION" +_DECIA_2010_POPULATION_DATASET = "decia_2010_territory_population" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'decia_2010_population_data_ingestion_dag', + "decia_2010_population_data_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for ACS 2010 Population Data', + description="Ingestion configuration for ACS 2010 Population Data", ) decia_2010_bq_payload = util.generate_bq_payload( _DECIA_2010_POPULATION_WORKFLOW_ID, _DECIA_2010_POPULATION_DATASET, - gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'), + gcs_bucket=Variable.get("GCS_MANUAL_UPLOADS_BUCKET"), filename=_DECIA_2010_POPULATION_GCS_FILENAMES, ) -decia_2010_bq_op = util.create_bq_ingest_operator('decia_2010_gcs_to_bq', decia_2010_bq_payload, data_ingestion_dag) +decia_2010_bq_op = util.create_bq_ingest_operator("decia_2010_gcs_to_bq", decia_2010_bq_payload, data_ingestion_dag) -decia_2010_exporter_payload_race = {'dataset_name': _DECIA_2010_POPULATION_DATASET, 'demographic': "race"} +decia_2010_exporter_payload_race = {"dataset_name": _DECIA_2010_POPULATION_DATASET, "demographic": "race"} decia_2010_exporter_operator_race = util.create_exporter_operator( - 'decia_2010_exporter_race', decia_2010_exporter_payload_race, data_ingestion_dag + "decia_2010_exporter_race", decia_2010_exporter_payload_race, data_ingestion_dag ) -decia_2010_exporter_payload_age = {'dataset_name': _DECIA_2010_POPULATION_DATASET, 'demographic': "age"} +decia_2010_exporter_payload_age = {"dataset_name": _DECIA_2010_POPULATION_DATASET, "demographic": "age"} decia_2010_exporter_operator_age = util.create_exporter_operator( - 'decia_2010_exporter_age', decia_2010_exporter_payload_age, data_ingestion_dag + "decia_2010_exporter_age", decia_2010_exporter_payload_age, data_ingestion_dag ) -decia_2010_exporter_payload_sex = {'dataset_name': _DECIA_2010_POPULATION_DATASET, 'demographic': "sex"} +decia_2010_exporter_payload_sex = {"dataset_name": _DECIA_2010_POPULATION_DATASET, "demographic": "sex"} decia_2010_exporter_operator_sex = util.create_exporter_operator( - 
'decia_2010_exporter_sex', decia_2010_exporter_payload_sex, data_ingestion_dag + "decia_2010_exporter_sex", decia_2010_exporter_payload_sex, data_ingestion_dag ) # Data Ingestion DAG diff --git a/airflow/dags/decia_2020_territory_population.py b/airflow/dags/decia_2020_territory_population.py index c335b08b47..17fc6424c1 100644 --- a/airflow/dags/decia_2020_territory_population.py +++ b/airflow/dags/decia_2020_territory_population.py @@ -7,19 +7,19 @@ import util -_DECIA_2020_TERRITORY_POPULATION_WORKFLOW_ID = 'DECIA_2020_TERRITORY_POPULATION_DATA' -_DECIA_2020_TERRITORY_POPULATION_DATASET_NAME = 'decia_2020_territory_population' +_DECIA_2020_TERRITORY_POPULATION_WORKFLOW_ID = "DECIA_2020_TERRITORY_POPULATION_DATA" +_DECIA_2020_TERRITORY_POPULATION_DATASET_NAME = "decia_2020_territory_population" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'decia_2020_population_data_ingestion_dag', + "decia_2020_population_data_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for Island Areas 2020 population data', + description="Ingestion configuration for Island Areas 2020 population data", ) @@ -31,7 +31,7 @@ geographic="state", ) decia_2020_bq_ingest_race_state = util.create_bq_ingest_operator( - 'decia_2020_pop_to_bq_race_state', bq_payload_race_state, data_ingestion_dag + "decia_2020_pop_to_bq_race_state", bq_payload_race_state, data_ingestion_dag ) bq_payload_age_state = util.generate_bq_payload( @@ -41,7 +41,7 @@ geographic="state", ) decia_2020_bq_ingest_age_state = util.create_bq_ingest_operator( - 'decia_2020_pop_to_bq_age_state', bq_payload_age_state, data_ingestion_dag + "decia_2020_pop_to_bq_age_state", bq_payload_age_state, data_ingestion_dag ) bq_payload_sex_state = util.generate_bq_payload( @@ -51,7 +51,7 @@ geographic="state", ) decia_2020_bq_ingest_sex_state = util.create_bq_ingest_operator( - 'decia_2020_pop_to_bq_sex_state', bq_payload_sex_state, data_ingestion_dag + "decia_2020_pop_to_bq_sex_state", bq_payload_sex_state, data_ingestion_dag ) bq_payload_race_county = util.generate_bq_payload( @@ -61,7 +61,7 @@ geographic="county", ) decia_2020_bq_ingest_race_county = util.create_bq_ingest_operator( - 'decia_2020_pop_to_bq_race_county', bq_payload_race_county, data_ingestion_dag + "decia_2020_pop_to_bq_race_county", bq_payload_race_county, data_ingestion_dag ) bq_payload_age_county = util.generate_bq_payload( @@ -71,7 +71,7 @@ geographic="county", ) decia_2020_bq_ingest_age_county = util.create_bq_ingest_operator( - 'decia_2020_pop_to_bq_age_county', bq_payload_age_county, data_ingestion_dag + "decia_2020_pop_to_bq_age_county", bq_payload_age_county, data_ingestion_dag ) bq_payload_sex_county = util.generate_bq_payload( @@ -81,29 +81,29 @@ geographic="county", ) decia_2020_bq_ingest_sex_county = util.create_bq_ingest_operator( - 'decia_2020_pop_to_bq_sex_county', bq_payload_sex_county, data_ingestion_dag + "decia_2020_pop_to_bq_sex_county", bq_payload_sex_county, data_ingestion_dag ) exporter_payload_race = { - 'dataset_name': _DECIA_2020_TERRITORY_POPULATION_DATASET_NAME, - 'demographic': "race_and_ethnicity", + "dataset_name": _DECIA_2020_TERRITORY_POPULATION_DATASET_NAME, + "demographic": "race_and_ethnicity", } decia_2020_population_data_exporter_operator_race = util.create_exporter_operator( - 'decia_2020_population_data_exporter_race', exporter_payload_race, 
data_ingestion_dag + "decia_2020_population_data_exporter_race", exporter_payload_race, data_ingestion_dag ) -exporter_payload_age = {'dataset_name': _DECIA_2020_TERRITORY_POPULATION_DATASET_NAME, 'demographic': "age"} +exporter_payload_age = {"dataset_name": _DECIA_2020_TERRITORY_POPULATION_DATASET_NAME, "demographic": "age"} decia_2020_population_data_exporter_operator_age = util.create_exporter_operator( - 'decia_2020_population_data_exporter_age', exporter_payload_age, data_ingestion_dag + "decia_2020_population_data_exporter_age", exporter_payload_age, data_ingestion_dag ) -exporter_payload_sex = {'dataset_name': _DECIA_2020_TERRITORY_POPULATION_DATASET_NAME, 'demographic': "sex"} +exporter_payload_sex = {"dataset_name": _DECIA_2020_TERRITORY_POPULATION_DATASET_NAME, "demographic": "sex"} decia_2020_population_data_exporter_operator_sex = util.create_exporter_operator( - 'decia_2020_population_data_exporter_sex', exporter_payload_sex, data_ingestion_dag + "decia_2020_population_data_exporter_sex", exporter_payload_sex, data_ingestion_dag ) -connector = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id='connector') +connector = DummyOperator(default_args=default_args, dag=data_ingestion_dag, task_id="connector") # Ingestion DAG ( diff --git a/airflow/dags/geo_context.py b/airflow/dags/geo_context.py index 7a71629689..ca252c308b 100644 --- a/airflow/dags/geo_context.py +++ b/airflow/dags/geo_context.py @@ -6,29 +6,29 @@ import util -_GEO_CONTEXT_WORKFLOW_ID = 'GEO_CONTEXT' -_GEO_CONTEXT_DATASET_NAME = 'geo_context' +_GEO_CONTEXT_WORKFLOW_ID = "GEO_CONTEXT" +_GEO_CONTEXT_DATASET_NAME = "geo_context" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'geo_context_ingestion_dag', + "geo_context_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for GEO CONTEXT', + description="Ingestion configuration for GEO CONTEXT", ) geo_context_bq_payload = util.generate_bq_payload(_GEO_CONTEXT_WORKFLOW_ID, _GEO_CONTEXT_DATASET_NAME) geo_context_pop_bq_operator = util.create_bq_ingest_operator( - 'geo_context_to_bq', geo_context_bq_payload, data_ingestion_dag + "geo_context_to_bq", geo_context_bq_payload, data_ingestion_dag ) -geo_context_exporter_payload = {'dataset_name': _GEO_CONTEXT_DATASET_NAME} +geo_context_exporter_payload = {"dataset_name": _GEO_CONTEXT_DATASET_NAME} geo_context_exporter_operator = util.create_exporter_operator( - 'geo_context_exporter', geo_context_exporter_payload, data_ingestion_dag + "geo_context_exporter", geo_context_exporter_payload, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/graphql_ahr_behavioral_health.py b/airflow/dags/graphql_ahr_behavioral_health.py index 07a192c169..93f45d8006 100644 --- a/airflow/dags/graphql_ahr_behavioral_health.py +++ b/airflow/dags/graphql_ahr_behavioral_health.py @@ -5,55 +5,55 @@ from datetime import timedelta -_GRAPHQL_AHR_WORKFLOW_ID = 'GRAPHQL_AHR_DATA' -_GRAPHQL_AHR_DATASET_NAME = 'graphql_ahr_data' +_GRAPHQL_AHR_WORKFLOW_ID = "GRAPHQL_AHR_DATA" +_GRAPHQL_AHR_DATASET_NAME = "graphql_ahr_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'graphql_ahr_behavioral_health_ingestion_dag', + "graphql_ahr_behavioral_health_ingestion_dag", 
default_args=default_args, schedule_interval=None, - description='Ingestion configuration for all behavioral health topics fromGRAPHQL AHR', + description="Ingestion configuration for all behavioral health topics from GRAPHQL AHR", ) # AGE NATIONAL graphql_ahr_bq_payload_age_national = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='behavioral_health', - demographic='age', - geographic='national', + category="behavioral_health", + demographic="age", + geographic="national", ) graphql_ahr_bq_operator_age_national = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_age_national', graphql_ahr_bq_payload_age_national, data_ingestion_dag + "graphql_ahr_to_bq_age_national", graphql_ahr_bq_payload_age_national, data_ingestion_dag ) # AGE STATE graphql_ahr_bq_payload_age_state = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='behavioral_health', - demographic='age', - geographic='state', + category="behavioral_health", + demographic="age", + geographic="state", ) graphql_ahr_bq_operator_age_state = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_age_state', graphql_ahr_bq_payload_age_state, data_ingestion_dag + "graphql_ahr_to_bq_age_state", graphql_ahr_bq_payload_age_state, data_ingestion_dag ) # RACE NATIONAL graphql_ahr_bq_payload_race_national = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='behavioral_health', - demographic='race_and_ethnicity', - geographic='national', + category="behavioral_health", + demographic="race_and_ethnicity", + geographic="national", ) graphql_ahr_bq_operator_race_national = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_race_national', graphql_ahr_bq_payload_race_national, data_ingestion_dag + "graphql_ahr_to_bq_race_national", graphql_ahr_bq_payload_race_national, data_ingestion_dag ) @@ -61,56 +61,56 @@ graphql_ahr_bq_payload_race_state = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='behavioral_health', - demographic='race_and_ethnicity', - geographic='state', + category="behavioral_health", + demographic="race_and_ethnicity", + geographic="state", ) graphql_ahr_bq_operator_race_state = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_race_state', graphql_ahr_bq_payload_race_state, data_ingestion_dag + "graphql_ahr_to_bq_race_state", graphql_ahr_bq_payload_race_state, data_ingestion_dag ) # SEX NATIONAL graphql_ahr_bq_payload_sex_national = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='behavioral_health', - demographic='sex', - geographic='national', + category="behavioral_health", + demographic="sex", + geographic="national", ) graphql_ahr_bq_operator_sex_national = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_sex_national', graphql_ahr_bq_payload_sex_national, data_ingestion_dag + "graphql_ahr_to_bq_sex_national", graphql_ahr_bq_payload_sex_national, data_ingestion_dag ) # SEX STATE graphql_ahr_bq_payload_sex_state = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='behavioral_health', - demographic='sex', - geographic='state', + category="behavioral_health", + demographic="sex", + geographic="state", ) graphql_ahr_bq_operator_sex_state = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_sex_state', graphql_ahr_bq_payload_sex_state, data_ingestion_dag + "graphql_ahr_to_bq_sex_state", graphql_ahr_bq_payload_sex_state, data_ingestion_dag ) # EXPORTERS -payload_age =
{'dataset_name': _GRAPHQL_AHR_DATASET_NAME, 'category': 'behavioral_health', 'demographic': "age"} +payload_age = {"dataset_name": _GRAPHQL_AHR_DATASET_NAME, "category": "behavioral_health", "demographic": "age"} graphql_ahr_exporter_operator_age = util.create_exporter_operator( - 'graphql_ahr_exporter_age', payload_age, data_ingestion_dag + "graphql_ahr_exporter_age", payload_age, data_ingestion_dag ) payload_race = { - 'dataset_name': _GRAPHQL_AHR_DATASET_NAME, - 'category': 'behavioral_health', - 'demographic': "race_and_ethnicity", + "dataset_name": _GRAPHQL_AHR_DATASET_NAME, + "category": "behavioral_health", + "demographic": "race_and_ethnicity", } graphql_ahr_exporter_operator_race = util.create_exporter_operator( - 'graphql_ahr_exporter_race', payload_race, data_ingestion_dag + "graphql_ahr_exporter_race", payload_race, data_ingestion_dag ) -payload_sex = {'dataset_name': _GRAPHQL_AHR_DATASET_NAME, 'category': 'behavioral_health', 'demographic': "sex"} +payload_sex = {"dataset_name": _GRAPHQL_AHR_DATASET_NAME, "category": "behavioral_health", "demographic": "sex"} graphql_ahr_exporter_operator_sex = util.create_exporter_operator( - 'graphql_ahr_exporter_sex', payload_sex, data_ingestion_dag + "graphql_ahr_exporter_sex", payload_sex, data_ingestion_dag ) # Ingestion Dag diff --git a/airflow/dags/graphql_ahr_non-behavioral_health.py b/airflow/dags/graphql_ahr_non-behavioral_health.py index 12f7e6a060..0e03f3ab4b 100644 --- a/airflow/dags/graphql_ahr_non-behavioral_health.py +++ b/airflow/dags/graphql_ahr_non-behavioral_health.py @@ -5,55 +5,55 @@ from datetime import timedelta -_GRAPHQL_AHR_WORKFLOW_ID = 'GRAPHQL_AHR_DATA' -_GRAPHQL_AHR_DATASET_NAME = 'graphql_ahr_data' +_GRAPHQL_AHR_WORKFLOW_ID = "GRAPHQL_AHR_DATA" +_GRAPHQL_AHR_DATASET_NAME = "graphql_ahr_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'graphql_ahr_non-behavioral_health_ingestion_dag', + "graphql_ahr_non-behavioral_health_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for all non-behavioral health topics from GRAPHQL AHR', + description="Ingestion configuration for all non-behavioral health topics from GRAPHQL AHR", ) # AGE NATIONAL graphql_ahr_bq_payload_age_national = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='non-behavioral_health', - demographic='age', - geographic='national', + category="non-behavioral_health", + demographic="age", + geographic="national", ) graphql_ahr_bq_operator_age_national = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_age_national', graphql_ahr_bq_payload_age_national, data_ingestion_dag + "graphql_ahr_to_bq_age_national", graphql_ahr_bq_payload_age_national, data_ingestion_dag ) # AGE STATE graphql_ahr_bq_payload_age_state = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='non-behavioral_health', - demographic='age', - geographic='state', + category="non-behavioral_health", + demographic="age", + geographic="state", ) graphql_ahr_bq_operator_age_state = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_age_state', graphql_ahr_bq_payload_age_state, data_ingestion_dag + "graphql_ahr_to_bq_age_state", graphql_ahr_bq_payload_age_state, data_ingestion_dag ) # RACE NATIONAL graphql_ahr_bq_payload_race_national = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, 
_GRAPHQL_AHR_DATASET_NAME, - category='non-behavioral_health', - demographic='race_and_ethnicity', - geographic='national', + category="non-behavioral_health", + demographic="race_and_ethnicity", + geographic="national", ) graphql_ahr_bq_operator_race_national = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_race_national', graphql_ahr_bq_payload_race_national, data_ingestion_dag + "graphql_ahr_to_bq_race_national", graphql_ahr_bq_payload_race_national, data_ingestion_dag ) @@ -61,56 +61,56 @@ graphql_ahr_bq_payload_race_state = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='non-behavioral_health', - demographic='race_and_ethnicity', - geographic='state', + category="non-behavioral_health", + demographic="race_and_ethnicity", + geographic="state", ) graphql_ahr_bq_operator_race_state = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_race_state', graphql_ahr_bq_payload_race_state, data_ingestion_dag + "graphql_ahr_to_bq_race_state", graphql_ahr_bq_payload_race_state, data_ingestion_dag ) # SEX NATIONAL graphql_ahr_bq_payload_sex_national = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='non-behavioral_health', - demographic='sex', - geographic='national', + category="non-behavioral_health", + demographic="sex", + geographic="national", ) graphql_ahr_bq_operator_sex_national = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_sex_national', graphql_ahr_bq_payload_sex_national, data_ingestion_dag + "graphql_ahr_to_bq_sex_national", graphql_ahr_bq_payload_sex_national, data_ingestion_dag ) # SEX STATE graphql_ahr_bq_payload_sex_state = util.generate_bq_payload( _GRAPHQL_AHR_WORKFLOW_ID, _GRAPHQL_AHR_DATASET_NAME, - category='non-behavioral_health', - demographic='sex', - geographic='state', + category="non-behavioral_health", + demographic="sex", + geographic="state", ) graphql_ahr_bq_operator_sex_state = util.create_bq_ingest_operator( - 'graphql_ahr_to_bq_sex_state', graphql_ahr_bq_payload_sex_state, data_ingestion_dag + "graphql_ahr_to_bq_sex_state", graphql_ahr_bq_payload_sex_state, data_ingestion_dag ) # EXPORTERS -payload_age = {'dataset_name': _GRAPHQL_AHR_DATASET_NAME, 'category': 'non-behavioral_health', 'demographic': "age"} +payload_age = {"dataset_name": _GRAPHQL_AHR_DATASET_NAME, "category": "non-behavioral_health", "demographic": "age"} graphql_ahr_exporter_operator_age = util.create_exporter_operator( - 'graphql_ahr_exporter_age', payload_age, data_ingestion_dag + "graphql_ahr_exporter_age", payload_age, data_ingestion_dag ) payload_race = { - 'dataset_name': _GRAPHQL_AHR_DATASET_NAME, - 'category': 'non-behavioral_health', - 'demographic': "race_and_ethnicity", + "dataset_name": _GRAPHQL_AHR_DATASET_NAME, + "category": "non-behavioral_health", + "demographic": "race_and_ethnicity", } graphql_ahr_exporter_operator_race = util.create_exporter_operator( - 'graphql_ahr_exporter_race', payload_race, data_ingestion_dag + "graphql_ahr_exporter_race", payload_race, data_ingestion_dag ) -payload_sex = {'dataset_name': _GRAPHQL_AHR_DATASET_NAME, 'category': 'non-behavioral_health', 'demographic': "sex"} +payload_sex = {"dataset_name": _GRAPHQL_AHR_DATASET_NAME, "category": "non-behavioral_health", "demographic": "sex"} graphql_ahr_exporter_operator_sex = util.create_exporter_operator( - 'graphql_ahr_exporter_sex', payload_sex, data_ingestion_dag + "graphql_ahr_exporter_sex", payload_sex, data_ingestion_dag ) # Ingestion Dag diff --git a/airflow/dags/kff_vaccination.py 
b/airflow/dags/kff_vaccination.py index d6c55d5200..e4a23f4f9e 100644 --- a/airflow/dags/kff_vaccination.py +++ b/airflow/dags/kff_vaccination.py @@ -7,34 +7,34 @@ import util -_KFF_VACCINATION_WORKFLOW_ID = 'KFF_VACCINATION' -_KFF_VACCINATION_DATASET_NAME = 'kff_vaccination' +_KFF_VACCINATION_WORKFLOW_ID = "KFF_VACCINATION" +_KFF_VACCINATION_DATASET_NAME = "kff_vaccination" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'kff_vaccination_ingestion_dag', + "kff_vaccination_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for CDC Vaccination National', + description="Ingestion configuration for KFF Vaccination", ) kff_vaccination_bq_payload = util.generate_bq_payload(_KFF_VACCINATION_WORKFLOW_ID, _KFF_VACCINATION_DATASET_NAME) kff_vaccination_bq_operator = util.create_bq_ingest_operator( - 'kff_vaccination_to_bq', kff_vaccination_bq_payload, data_ingestion_dag + "kff_vaccination_to_bq", kff_vaccination_bq_payload, data_ingestion_dag ) -kff_vaccination_exporter_payload_race = {'dataset_name': _KFF_VACCINATION_DATASET_NAME, 'demographic': "race"} +kff_vaccination_exporter_payload_race = {"dataset_name": _KFF_VACCINATION_DATASET_NAME, "demographic": "race"} kff_vaccination_exporter_operator_race = util.create_exporter_operator( - 'kff_vaccination_exporter_race', kff_vaccination_exporter_payload_race, data_ingestion_dag + "kff_vaccination_exporter_race", kff_vaccination_exporter_payload_race, data_ingestion_dag ) -kff_vaccination_exporter_payload_alls = {'dataset_name': _KFF_VACCINATION_DATASET_NAME, 'demographic': "alls"} +kff_vaccination_exporter_payload_alls = {"dataset_name": _KFF_VACCINATION_DATASET_NAME, "demographic": "alls"} kff_vaccination_exporter_operator_alls = util.create_exporter_operator( - 'kff_vaccination_exporter_alls', kff_vaccination_exporter_payload_alls, data_ingestion_dag + "kff_vaccination_exporter_alls", kff_vaccination_exporter_payload_alls, data_ingestion_dag ) # Ingestion DAG diff --git a/airflow/dags/maternal_mortality.py b/airflow/dags/maternal_mortality.py index 80ff3d8ed0..76a695196d 100644 --- a/airflow/dags/maternal_mortality.py +++ b/airflow/dags/maternal_mortality.py @@ -6,28 +6,28 @@ from datetime import timedelta -_MM_WORKFLOW_ID = 'MATERNAL_MORTALITY_DATA' -_MM_DATASET_NAME = 'maternal_mortality_data' +_MM_WORKFLOW_ID = "MATERNAL_MORTALITY_DATA" +_MM_DATASET_NAME = "maternal_mortality_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'maternal_mortality_ingestion_dag', + "maternal_mortality_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for MATERNAL_MORTALITY', + description="Ingestion configuration for MATERNAL_MORTALITY", ) -maternal_mortality_bq_payload = util.generate_bq_payload(_MM_WORKFLOW_ID, _MM_DATASET_NAME, demographic='race') +maternal_mortality_bq_payload = util.generate_bq_payload(_MM_WORKFLOW_ID, _MM_DATASET_NAME, demographic="race") maternal_mortality_bq_operator = util.create_bq_ingest_operator( - 'maternal_mortality_to_bq', maternal_mortality_bq_payload, data_ingestion_dag + "maternal_mortality_to_bq", maternal_mortality_bq_payload, data_ingestion_dag ) maternal_mortality_exporter_payload_race = { - 'dataset_name':
_MM_DATASET_NAME, - 'demographic': 'race', - 'should_export_as_alls': True, + "dataset_name": _MM_DATASET_NAME, + "demographic": "race", + "should_export_as_alls": True, } maternal_mortality_exporter_operator_race = util.create_exporter_operator( - 'maternal_mortality_exporter_race', maternal_mortality_exporter_payload_race, data_ingestion_dag + "maternal_mortality_exporter_race", maternal_mortality_exporter_payload_race, data_ingestion_dag ) (maternal_mortality_bq_operator >> maternal_mortality_exporter_operator_race) diff --git a/airflow/dags/phrma.py b/airflow/dags/phrma.py index 7f4d8c5ffa..1db66e8ad2 100644 --- a/airflow/dags/phrma.py +++ b/airflow/dags/phrma.py @@ -4,19 +4,19 @@ import util from datetime import timedelta -_PHRMA_WORKFLOW_ID = 'PHRMA_DATA' -_PHRMA_DATASET_NAME = 'phrma_data' +_PHRMA_WORKFLOW_ID = "PHRMA_DATA" +_PHRMA_DATASET_NAME = "phrma_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'phrma_ingestion_dag', + "phrma_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for Phrma', + description="Ingestion configuration for Phrma", ) # INGEST BY GEO / DEMO @@ -25,55 +25,55 @@ phrma_bq_payload_sex_national = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='sex', - geographic='national', + demographic="sex", + geographic="national", ) phrma_bq_operator_sex_national = util.create_bq_ingest_operator( - 'phrma_to_bq_sex_national', phrma_bq_payload_sex_national, data_ingestion_dag + "phrma_to_bq_sex_national", phrma_bq_payload_sex_national, data_ingestion_dag ) # race_national phrma_bq_payload_race_national = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='national', + demographic="race_and_ethnicity", + geographic="national", ) phrma_bq_operator_race_national = util.create_bq_ingest_operator( - 'phrma_to_bq_race_national', phrma_bq_payload_race_national, data_ingestion_dag + "phrma_to_bq_race_national", phrma_bq_payload_race_national, data_ingestion_dag ) # age_national phrma_bq_payload_age_national = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='age', - geographic='national', + demographic="age", + geographic="national", ) phrma_bq_operator_age_national = util.create_bq_ingest_operator( - 'phrma_to_bq_age_national', phrma_bq_payload_age_national, data_ingestion_dag + "phrma_to_bq_age_national", phrma_bq_payload_age_national, data_ingestion_dag ) # lis_national phrma_bq_payload_lis_national = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='lis', - geographic='national', + demographic="lis", + geographic="national", ) phrma_bq_operator_lis_national = util.create_bq_ingest_operator( - 'phrma_to_bq_lis_national', phrma_bq_payload_lis_national, data_ingestion_dag + "phrma_to_bq_lis_national", phrma_bq_payload_lis_national, data_ingestion_dag ) # elig_national phrma_bq_payload_elig_national = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='eligibility', - geographic='national', + demographic="eligibility", + geographic="national", ) phrma_bq_operator_elig_national = util.create_bq_ingest_operator( - 'phrma_to_bq_elig_national', phrma_bq_payload_elig_national, data_ingestion_dag + "phrma_to_bq_elig_national", phrma_bq_payload_elig_national, data_ingestion_dag ) @@ -81,55 
+81,55 @@ phrma_bq_payload_sex_state = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='sex', - geographic='state', + demographic="sex", + geographic="state", ) phrma_bq_operator_sex_state = util.create_bq_ingest_operator( - 'phrma_to_bq_sex_state', phrma_bq_payload_sex_state, data_ingestion_dag + "phrma_to_bq_sex_state", phrma_bq_payload_sex_state, data_ingestion_dag ) # race_state phrma_bq_payload_race_state = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='state', + demographic="race_and_ethnicity", + geographic="state", ) phrma_bq_operator_race_state = util.create_bq_ingest_operator( - 'phrma_to_bq_race_state', phrma_bq_payload_race_state, data_ingestion_dag + "phrma_to_bq_race_state", phrma_bq_payload_race_state, data_ingestion_dag ) # age_state phrma_bq_payload_age_state = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='age', - geographic='state', + demographic="age", + geographic="state", ) phrma_bq_operator_age_state = util.create_bq_ingest_operator( - 'phrma_to_bq_age_state', phrma_bq_payload_age_state, data_ingestion_dag + "phrma_to_bq_age_state", phrma_bq_payload_age_state, data_ingestion_dag ) # lis_state phrma_bq_payload_lis_state = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='lis', - geographic='state', + demographic="lis", + geographic="state", ) phrma_bq_operator_lis_state = util.create_bq_ingest_operator( - 'phrma_to_bq_lis_state', phrma_bq_payload_lis_state, data_ingestion_dag + "phrma_to_bq_lis_state", phrma_bq_payload_lis_state, data_ingestion_dag ) # elig_state phrma_bq_payload_elig_state = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='eligibility', - geographic='state', + demographic="eligibility", + geographic="state", ) phrma_bq_operator_elig_state = util.create_bq_ingest_operator( - 'phrma_to_bq_elig_state', phrma_bq_payload_elig_state, data_ingestion_dag + "phrma_to_bq_elig_state", phrma_bq_payload_elig_state, data_ingestion_dag ) @@ -137,84 +137,84 @@ phrma_bq_payload_sex_county = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='sex', - geographic='county', + demographic="sex", + geographic="county", ) phrma_bq_operator_sex_county = util.create_bq_ingest_operator( - 'phrma_to_bq_sex_county', phrma_bq_payload_sex_county, data_ingestion_dag + "phrma_to_bq_sex_county", phrma_bq_payload_sex_county, data_ingestion_dag ) # race_county phrma_bq_payload_race_county = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='county', + demographic="race_and_ethnicity", + geographic="county", ) phrma_bq_operator_race_county = util.create_bq_ingest_operator( - 'phrma_to_bq_race_county', phrma_bq_payload_race_county, data_ingestion_dag + "phrma_to_bq_race_county", phrma_bq_payload_race_county, data_ingestion_dag ) # age_county phrma_bq_payload_age_county = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='age', - geographic='county', + demographic="age", + geographic="county", ) phrma_bq_operator_age_county = util.create_bq_ingest_operator( - 'phrma_to_bq_age_county', phrma_bq_payload_age_county, data_ingestion_dag + "phrma_to_bq_age_county", phrma_bq_payload_age_county, data_ingestion_dag ) # lis_county phrma_bq_payload_lis_county = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='lis', - geographic='county', + 
demographic="lis", + geographic="county", ) phrma_bq_operator_lis_county = util.create_bq_ingest_operator( - 'phrma_to_bq_lis_county', phrma_bq_payload_lis_county, data_ingestion_dag + "phrma_to_bq_lis_county", phrma_bq_payload_lis_county, data_ingestion_dag ) # elig_county phrma_bq_payload_elig_county = util.generate_bq_payload( _PHRMA_WORKFLOW_ID, _PHRMA_DATASET_NAME, - demographic='eligibility', - geographic='county', + demographic="eligibility", + geographic="county", ) phrma_bq_operator_elig_county = util.create_bq_ingest_operator( - 'phrma_to_bq_elig_county', phrma_bq_payload_elig_county, data_ingestion_dag + "phrma_to_bq_elig_county", phrma_bq_payload_elig_county, data_ingestion_dag ) # EXPORT BY DEMOGRAPHIC payload_race = { - 'dataset_name': _PHRMA_DATASET_NAME, - 'demographic': "race_and_ethnicity", + "dataset_name": _PHRMA_DATASET_NAME, + "demographic": "race_and_ethnicity", } -phrma_exporter_operator_race = util.create_exporter_operator('phrma_exporter_race', payload_race, data_ingestion_dag) +phrma_exporter_operator_race = util.create_exporter_operator("phrma_exporter_race", payload_race, data_ingestion_dag) -payload_age = {'dataset_name': _PHRMA_DATASET_NAME, 'demographic': "age"} -phrma_exporter_operator_age = util.create_exporter_operator('phrma_exporter_age', payload_age, data_ingestion_dag) +payload_age = {"dataset_name": _PHRMA_DATASET_NAME, "demographic": "age"} +phrma_exporter_operator_age = util.create_exporter_operator("phrma_exporter_age", payload_age, data_ingestion_dag) -payload_sex = {'dataset_name': _PHRMA_DATASET_NAME, 'demographic': "sex", 'should_export_as_alls': True} -phrma_exporter_operator_sex = util.create_exporter_operator('phrma_exporter_sex', payload_sex, data_ingestion_dag) +payload_sex = {"dataset_name": _PHRMA_DATASET_NAME, "demographic": "sex", "should_export_as_alls": True} +phrma_exporter_operator_sex = util.create_exporter_operator("phrma_exporter_sex", payload_sex, data_ingestion_dag) -payload_lis = {'dataset_name': _PHRMA_DATASET_NAME, 'demographic': "lis"} -phrma_exporter_operator_lis = util.create_exporter_operator('phrma_exporter_lis', payload_lis, data_ingestion_dag) +payload_lis = {"dataset_name": _PHRMA_DATASET_NAME, "demographic": "lis"} +phrma_exporter_operator_lis = util.create_exporter_operator("phrma_exporter_lis", payload_lis, data_ingestion_dag) payload_eligibility = { - 'dataset_name': _PHRMA_DATASET_NAME, - 'demographic': "eligibility", + "dataset_name": _PHRMA_DATASET_NAME, + "demographic": "eligibility", } phrma_exporter_operator_eligibility = util.create_exporter_operator( - 'phrma_exporter_eligibility', payload_eligibility, data_ingestion_dag + "phrma_exporter_eligibility", payload_eligibility, data_ingestion_dag ) # Ingestion DAG ( diff --git a/airflow/dags/phrma_brfss.py b/airflow/dags/phrma_brfss.py index 1b67633976..81c5db40f7 100644 --- a/airflow/dags/phrma_brfss.py +++ b/airflow/dags/phrma_brfss.py @@ -5,19 +5,19 @@ from datetime import timedelta -_PHRMA_BRFSS_WORKFLOW_ID = 'PHRMA_BRFSS_DATA' -_PHRMA_BRFSS_DATASET_NAME = 'phrma_brfss_data' +_PHRMA_BRFSS_WORKFLOW_ID = "PHRMA_BRFSS_DATA" +_PHRMA_BRFSS_DATASET_NAME = "phrma_brfss_data" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'phrma_brfss_ingestion_dag', + "phrma_brfss_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for Phrma Brfss', + description="Ingestion 
configuration for Phrma Brfss", ) # INGEST BY GEO / DEMO @@ -27,66 +27,66 @@ phrma_brfss_bq_payload_race_national = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='national', + demographic="race_and_ethnicity", + geographic="national", ) phrma_brfss_bq_operator_race_national = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_race_national', phrma_brfss_bq_payload_race_national, data_ingestion_dag + "phrma_brfss_to_bq_race_national", phrma_brfss_bq_payload_race_national, data_ingestion_dag ) # age_national phrma_brfss_bq_payload_age_national = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='age', - geographic='national', + demographic="age", + geographic="national", ) phrma_brfss_bq_operator_age_national = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_age_national', phrma_brfss_bq_payload_age_national, data_ingestion_dag + "phrma_brfss_to_bq_age_national", phrma_brfss_bq_payload_age_national, data_ingestion_dag ) # sex_national phrma_brfss_bq_payload_sex_national = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='sex', - geographic='national', + demographic="sex", + geographic="national", ) phrma_brfss_bq_operator_sex_national = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_sex_national', phrma_brfss_bq_payload_sex_national, data_ingestion_dag + "phrma_brfss_to_bq_sex_national", phrma_brfss_bq_payload_sex_national, data_ingestion_dag ) # education_national phrma_brfss_bq_payload_education_national = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='education', - geographic='national', + demographic="education", + geographic="national", ) phrma_brfss_bq_operator_education_national = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_education_national', phrma_brfss_bq_payload_education_national, data_ingestion_dag + "phrma_brfss_to_bq_education_national", phrma_brfss_bq_payload_education_national, data_ingestion_dag ) # income_national phrma_brfss_bq_payload_income_national = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='income', - geographic='national', + demographic="income", + geographic="national", ) phrma_brfss_bq_operator_income_national = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_income_national', phrma_brfss_bq_payload_income_national, data_ingestion_dag + "phrma_brfss_to_bq_income_national", phrma_brfss_bq_payload_income_national, data_ingestion_dag ) # insurance_status_national phrma_brfss_bq_payload_insurance_status_national = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='insurance_status', - geographic='national', + demographic="insurance_status", + geographic="national", ) phrma_brfss_bq_operator_insurance_status_national = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_insurance_status_national', phrma_brfss_bq_payload_insurance_status_national, data_ingestion_dag + "phrma_brfss_to_bq_insurance_status_national", phrma_brfss_bq_payload_insurance_status_national, data_ingestion_dag ) @@ -94,107 +94,107 @@ phrma_brfss_bq_payload_race_state = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='race_and_ethnicity', - geographic='state', + demographic="race_and_ethnicity", + geographic="state", ) phrma_brfss_bq_operator_race_state = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_race_state', 
phrma_brfss_bq_payload_race_state, data_ingestion_dag + "phrma_brfss_to_bq_race_state", phrma_brfss_bq_payload_race_state, data_ingestion_dag ) # age_state phrma_brfss_bq_payload_age_state = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='age', - geographic='state', + demographic="age", + geographic="state", ) phrma_brfss_bq_operator_age_state = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_age_state', phrma_brfss_bq_payload_age_state, data_ingestion_dag + "phrma_brfss_to_bq_age_state", phrma_brfss_bq_payload_age_state, data_ingestion_dag ) # sex_state phrma_brfss_bq_payload_sex_state = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='sex', - geographic='state', + demographic="sex", + geographic="state", ) phrma_brfss_bq_operator_sex_state = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_sex_state', phrma_brfss_bq_payload_sex_state, data_ingestion_dag + "phrma_brfss_to_bq_sex_state", phrma_brfss_bq_payload_sex_state, data_ingestion_dag ) # education_state phrma_brfss_bq_payload_education_state = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='education', - geographic='state', + demographic="education", + geographic="state", ) phrma_brfss_bq_operator_education_state = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_education_state', phrma_brfss_bq_payload_education_state, data_ingestion_dag + "phrma_brfss_to_bq_education_state", phrma_brfss_bq_payload_education_state, data_ingestion_dag ) # income_state phrma_brfss_bq_payload_income_state = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='income', - geographic='state', + demographic="income", + geographic="state", ) phrma_brfss_bq_operator_income_state = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_income_state', phrma_brfss_bq_payload_income_state, data_ingestion_dag + "phrma_brfss_to_bq_income_state", phrma_brfss_bq_payload_income_state, data_ingestion_dag ) # insurance_status_state phrma_brfss_bq_payload_insurance_status_state = util.generate_bq_payload( _PHRMA_BRFSS_WORKFLOW_ID, _PHRMA_BRFSS_DATASET_NAME, - demographic='insurance_status', - geographic='state', + demographic="insurance_status", + geographic="state", ) phrma_brfss_bq_operator_insurance_status_state = util.create_bq_ingest_operator( - 'phrma_brfss_to_bq_insurance_status_state', phrma_brfss_bq_payload_insurance_status_state, data_ingestion_dag + "phrma_brfss_to_bq_insurance_status_state", phrma_brfss_bq_payload_insurance_status_state, data_ingestion_dag ) # EXPORT BY DEMOGRAPHIC payload_race = { - 'dataset_name': _PHRMA_BRFSS_DATASET_NAME, - 'demographic': "race_and_ethnicity", + "dataset_name": _PHRMA_BRFSS_DATASET_NAME, + "demographic": "race_and_ethnicity", } phrma_brfss_exporter_operator_race = util.create_exporter_operator( - 'phrma_brfss_exporter_race', payload_race, data_ingestion_dag + "phrma_brfss_exporter_race", payload_race, data_ingestion_dag ) -payload_age = {'dataset_name': _PHRMA_BRFSS_DATASET_NAME, 'demographic': "age"} +payload_age = {"dataset_name": _PHRMA_BRFSS_DATASET_NAME, "demographic": "age"} phrma_brfss_exporter_operator_age = util.create_exporter_operator( - 'phrma_brfss_exporter_age', payload_age, data_ingestion_dag + "phrma_brfss_exporter_age", payload_age, data_ingestion_dag ) -payload_sex = {'dataset_name': _PHRMA_BRFSS_DATASET_NAME, 'demographic': "sex", 'should_export_as_alls': True} +payload_sex = {"dataset_name": 
_PHRMA_BRFSS_DATASET_NAME, "demographic": "sex", "should_export_as_alls": True} phrma_brfss_exporter_operator_sex = util.create_exporter_operator( - 'phrma_brfss_exporter_sex', payload_sex, data_ingestion_dag + "phrma_brfss_exporter_sex", payload_sex, data_ingestion_dag ) -payload_insurance_status = {'dataset_name': _PHRMA_BRFSS_DATASET_NAME, 'demographic': "insurance_status"} +payload_insurance_status = {"dataset_name": _PHRMA_BRFSS_DATASET_NAME, "demographic": "insurance_status"} phrma_brfss_exporter_operator_insurance_status = util.create_exporter_operator( - 'phrma_brfss_exporter_insurance_status', payload_insurance_status, data_ingestion_dag + "phrma_brfss_exporter_insurance_status", payload_insurance_status, data_ingestion_dag ) -payload_education = {'dataset_name': _PHRMA_BRFSS_DATASET_NAME, 'demographic': "education"} +payload_education = {"dataset_name": _PHRMA_BRFSS_DATASET_NAME, "demographic": "education"} phrma_brfss_exporter_operator_education = util.create_exporter_operator( - 'phrma_brfss_exporter_education', payload_education, data_ingestion_dag + "phrma_brfss_exporter_education", payload_education, data_ingestion_dag ) payload_income = { - 'dataset_name': _PHRMA_BRFSS_DATASET_NAME, - 'demographic': "income", + "dataset_name": _PHRMA_BRFSS_DATASET_NAME, + "demographic": "income", } phrma_brfss_exporter_operator_income = util.create_exporter_operator( - 'phrma_brfss_exporter_income', payload_income, data_ingestion_dag + "phrma_brfss_exporter_income", payload_income, data_ingestion_dag ) # Ingestion DAG ( diff --git a/airflow/dags/sanity_check.py b/airflow/dags/sanity_check.py index 71598a57c0..59d94a5b30 100644 --- a/airflow/dags/sanity_check.py +++ b/airflow/dags/sanity_check.py @@ -5,14 +5,14 @@ AGE_COL = "age" COUNTY_FIPS_COL = "county_fips" STATE_FIPS_COL = "state_fips" -TIME_PERIOD_COL = 'time_period' +TIME_PERIOD_COL = "time_period" def generate_cols(df: pd.DataFrame): - share_cols = df.columns.to_series().loc[df.columns.str.contains('share')].tolist() + share_cols = df.columns.to_series().loc[df.columns.str.contains("share")].tolist() # determine demographic column - dem_col = '' + dem_col = "" if RACE_CATEGORY_ID_COL in df.columns: dem_col = RACE_CATEGORY_ID_COL elif SEX_COL in df.columns: @@ -24,7 +24,7 @@ def generate_cols(df: pd.DataFrame): if COUNTY_FIPS_COL in df.columns: std_cols = [COUNTY_FIPS_COL] else: - df = df[df[STATE_FIPS_COL] != 'Unknown'] + df = df[df[STATE_FIPS_COL] != "Unknown"] std_cols = [STATE_FIPS_COL] # determine if standard columns @@ -41,7 +41,7 @@ def check_pct_values(df, table_name): df = df[cols] # remove rows with 'All' & 'Unknown' as values/only known values are considered for pct share calc - options = ['All', 'Unknown', 'UNKNOWN', 'ALL'] + options = ["All", "Unknown", "UNKNOWN", "ALL"] df = df[-df[dem_col].isin(options)] # group and sum rows @@ -58,8 +58,8 @@ def check_pct_values(df, table_name): if len(bad_fips_df) > 0: fip_list = [*set(bad_fips_df[std_cols[0]].tolist())] - errors = {'table': table_name, 'fips': fip_list} + errors = {"table": table_name, "fips": fip_list} return [False, errors] - return [True, f'No errors detected on table, {table_name}'] + return [True, f"No errors detected on table, {table_name}"] diff --git a/airflow/dags/test_sanity_check.py b/airflow/dags/test_sanity_check.py index 73f24e0311..ede1c66e72 100644 --- a/airflow/dags/test_sanity_check.py +++ b/airflow/dags/test_sanity_check.py @@ -2,45 +2,46 @@ import pandas as pd from sanity_check import check_pct_values -TEST_DIR = os.path.join(os.getcwd(), 
'python', 'tests', - 'data', 'sanity_checks') +TEST_DIR = os.path.join(os.getcwd(), "python", "tests", "data", "sanity_checks") -CDC_RESTRICTED = {'age_county': os.path.join(TEST_DIR, 'cdc_restricted_age_county.json'), - 'sex_county_time': os.path.join(TEST_DIR, 'cdc_restricted_sex_county_time.json'), - 'sex_county': os.path.join(TEST_DIR, 'cdc_restricted_sex_county.json'), - 'sex_national': os.path.join(TEST_DIR, 'cdc_restricted_sex_national.json'), - 'sex_state': os.path.join(TEST_DIR, 'cdc_restricted_sex_state.json'), } +CDC_RESTRICTED = { + "age_county": os.path.join(TEST_DIR, "cdc_restricted_age_county.json"), + "sex_county_time": os.path.join(TEST_DIR, "cdc_restricted_sex_county_time.json"), + "sex_county": os.path.join(TEST_DIR, "cdc_restricted_sex_county.json"), + "sex_national": os.path.join(TEST_DIR, "cdc_restricted_sex_national.json"), + "sex_state": os.path.join(TEST_DIR, "cdc_restricted_sex_state.json"), +} -test_dtype = {'county_fips': str, - 'state_fips': str, } +test_dtype = { + "county_fips": str, + "state_fips": str, +} def testGenerateCountyDatasetAge(): - df = pd.read_json(CDC_RESTRICTED['age_county'], dtype=test_dtype) - assert check_pct_values(df, 'by_cdc_restricted_age_county') + df = pd.read_json(CDC_RESTRICTED["age_county"], dtype=test_dtype) + assert check_pct_values(df, "by_cdc_restricted_age_county") def testGenerateCountyDatasetSexTime(): - df = pd.read_json(CDC_RESTRICTED['sex_county_time'], dtype=test_dtype) - assert check_pct_values(df, 'by_cdc_restricted_sex_county_time') + df = pd.read_json(CDC_RESTRICTED["sex_county_time"], dtype=test_dtype) + assert check_pct_values(df, "by_cdc_restricted_sex_county_time") def testGenerateNationalDatasetSex(): - df = pd.read_json(CDC_RESTRICTED['sex_national'], dtype={ - 'state_fips': str}) - assert check_pct_values(df, 'by_cdc_restricted_sex_national') + df = pd.read_json(CDC_RESTRICTED["sex_national"], dtype={"state_fips": str}) + assert check_pct_values(df, "by_cdc_restricted_sex_national") def testGenerateStateDatasetSex(): - df = pd.read_json(CDC_RESTRICTED['sex_state'], dtype={'state_fips': str}) - assert check_pct_values(df, 'by_cdc_restricted_sex_state') + df = pd.read_json(CDC_RESTRICTED["sex_state"], dtype={"state_fips": str}) + assert check_pct_values(df, "by_cdc_restricted_sex_state") def testGenerateCountyDatasetSexError(): - df = pd.read_json(CDC_RESTRICTED['sex_county'], dtype=test_dtype) + df = pd.read_json(CDC_RESTRICTED["sex_county"], dtype=test_dtype) - output = check_pct_values(df, 'cdc_restricted_sex_county') - expected_output = [ - False, {'table': 'cdc_restricted_sex_county', 'fips': ['06123']}] + output = check_pct_values(df, "cdc_restricted_sex_county") + expected_output = [False, {"table": "cdc_restricted_sex_county", "fips": ["06123"]}] assert output == expected_output diff --git a/airflow/dags/util.py b/airflow/dags/util.py index b238671a8d..9f8bb9341b 100644 --- a/airflow/dags/util.py +++ b/airflow/dags/util.py @@ -1,4 +1,4 @@ -'''Collection of shared Airflow functionality.''' +"""Collection of shared Airflow functionality.""" import os import pandas as pd @@ -22,11 +22,11 @@ def get_required_attrs(workflow_id: str, gcs_bucket: str | None = None) -> dict: gcs_bucket: GCS bucket to write to. 
Defaults to the GCS_LANDING_BUCKET env var.""" if gcs_bucket is None: - gcs_bucket = Variable.get('GCS_LANDING_BUCKET') + gcs_bucket = Variable.get("GCS_LANDING_BUCKET") return { - 'is_airflow_run': True, - 'id': workflow_id, - 'gcs_bucket': gcs_bucket, + "is_airflow_run": True, + "id": workflow_id, + "gcs_bucket": gcs_bucket, } @@ -47,12 +47,12 @@ def generate_gcs_payload( var.""" message = get_required_attrs(workflow_id, gcs_bucket=gcs_bucket) if filename is not None: - message['filename'] = filename + message["filename"] = filename if url is not None: - message['url'] = url + message["url"] = url if year is not None: - message['year'] = year - return {'message': message} + message["year"] = year + return {"message": message} def generate_bq_payload( @@ -87,64 +87,64 @@ def generate_bq_payload( """ message = get_required_attrs(workflow_id, gcs_bucket=gcs_bucket) - message['dataset'] = dataset + message["dataset"] = dataset if filename is not None: - message['filename'] = filename + message["filename"] = filename if url is not None: - message['url'] = url + message["url"] = url if demographic is not None: - message['demographic'] = demographic + message["demographic"] = demographic if geographic is not None: - message['geographic'] = geographic + message["geographic"] = geographic if year is not None: - message['year'] = year + message["year"] = year if category is not None: - message['category'] = category + message["category"] = category - return {'message': message} + return {"message": message} def create_gcs_ingest_operator(task_id: str, payload: dict, dag: DAG) -> PythonOperator: - return create_request_operator(task_id, Variable.get('INGEST_TO_GCS_SERVICE_ENDPOINT'), payload, dag) + return create_request_operator(task_id, Variable.get("INGEST_TO_GCS_SERVICE_ENDPOINT"), payload, dag) def create_bq_ingest_operator(task_id: str, payload: dict, dag: DAG) -> PythonOperator: - return create_request_operator(task_id, Variable.get('GCS_TO_BQ_SERVICE_ENDPOINT'), payload, dag) + return create_request_operator(task_id, Variable.get("GCS_TO_BQ_SERVICE_ENDPOINT"), payload, dag) def create_exporter_operator(task_id: str, payload: dict, dag: DAG) -> PythonOperator: - return create_request_operator(task_id, Variable.get('EXPORTER_SERVICE_ENDPOINT'), payload, dag) + return create_request_operator(task_id, Variable.get("EXPORTER_SERVICE_ENDPOINT"), payload, dag) def service_request(url: str, data: dict, **kwargs): # pylint: disable=unused-argument receiving_service_headers = {} - environment = os.getenv('ENV') + environment = os.getenv("ENV") - if environment == 'local': + if environment == "local": # Obtain the identity token for local environment using the gcloud command identity_token = subprocess.check_output(["gcloud", "auth", "print-identity-token"]).strip().decode("utf-8") - receiving_service_headers = {'Authorization': f'Bearer {identity_token}'} + receiving_service_headers = {"Authorization": f"Bearer {identity_token}"} - elif environment != 'dev': + elif environment != "dev": # Set up metadata server request # See https://cloud.google.com/compute/docs/instances/verifying-instance-identity#request_signature - token_url = 'http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=' + token_url = "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=" token_request_url = token_url + url - token_request_headers = {'Metadata-Flavor': 'Google'} + token_request_headers = {"Metadata-Flavor": "Google"} # Fetch the token for the default 
compute service account token_response = requests.get(token_request_url, headers=token_request_headers, timeout=600) jwt = token_response.content.decode("utf-8") - receiving_service_headers = {'Authorization': f'Bearer {jwt}'} + receiving_service_headers = {"Authorization": f"Bearer {jwt}"} try: resp = requests.post(url, json=data, headers=receiving_service_headers, timeout=600) resp.raise_for_status() # Allow the most recent response code to be accessed by a downstream task for possible short circuiting. except requests.exceptions.HTTPError as err: - raise Exception(f'Failed response code: {err}') + raise Exception(f"Failed response code: {err}") def sanity_check_request(dataset_id: str): @@ -154,9 +154,9 @@ def sanity_check_request(dataset_id: str): tables = bq_client.list_tables(dataset_id) for table in tables: - table_name = f'{table.project}.{table.dataset_id}.{table.table_id}' + table_name = f"{table.project}.{table.dataset_id}.{table.table_id}" - query_string = f'SELECT * FROM `{table_name}`' + query_string = f"SELECT * FROM `{table_name}`" df: pd.DataFrame = bq_client.query(query_string).result().to_dataframe() @@ -165,9 +165,9 @@ def sanity_check_request(dataset_id: str): failing_tables.append(output[1]) if len(failing_tables) > 0: - raise RuntimeError(f'These percent share values do not equal 100% {failing_tables}') + raise RuntimeError(f"These percent share values do not equal 100% {failing_tables}") - print('All checks have passed. No errors detected.') + print("All checks have passed. No errors detected.") def create_request_operator( @@ -177,7 +177,7 @@ def create_request_operator( task_id=task_id, provide_context=provide_context, python_callable=service_request, - op_kwargs={'url': url, 'data': payload}, + op_kwargs={"url": url, "data": payload}, dag=dag, ) @@ -186,6 +186,6 @@ def sanity_check_operator(task_id: str, dataset_id: str, dag: DAG) -> PythonOper return PythonOperator( task_id=task_id, python_callable=sanity_check_request, - op_kwargs={'dataset_id': dataset_id}, + op_kwargs={"dataset_id": dataset_id}, dag=dag, ) diff --git a/airflow/dags/vera_incarceration_county.py b/airflow/dags/vera_incarceration_county.py index 7460cb827e..b8324e797c 100644 --- a/airflow/dags/vera_incarceration_county.py +++ b/airflow/dags/vera_incarceration_county.py @@ -6,58 +6,58 @@ from datetime import timedelta -_VERA_WORKFLOW_ID = 'VERA_INCARCERATION_COUNTY' -_VERA_DATASET_NAME = 'vera_incarceration_county' +_VERA_WORKFLOW_ID = "VERA_INCARCERATION_COUNTY" +_VERA_DATASET_NAME = "vera_incarceration_county" default_args = { - 'start_date': days_ago(0), - 'execution_timeout': timedelta(minutes=15), + "start_date": days_ago(0), + "execution_timeout": timedelta(minutes=15), } data_ingestion_dag = DAG( - 'vera_incarceration_county_ingestion_dag', + "vera_incarceration_county_ingestion_dag", default_args=default_args, schedule_interval=None, - description='Ingestion configuration for VERA', + description="Ingestion configuration for VERA", ) vera_bq_payload_race = util.generate_bq_payload(_VERA_WORKFLOW_ID, _VERA_DATASET_NAME, demographic="race_and_ethnicity") vera_bq_operator_race = util.create_bq_ingest_operator( - 'vera_incarceration_race_county_to_bq', vera_bq_payload_race, data_ingestion_dag + "vera_incarceration_race_county_to_bq", vera_bq_payload_race, data_ingestion_dag ) vera_bq_payload_age = util.generate_bq_payload(_VERA_WORKFLOW_ID, _VERA_DATASET_NAME, demographic="age") vera_bq_operator_age = util.create_bq_ingest_operator( - 'vera_incarceration_age_county_to_bq', 
vera_bq_payload_age, data_ingestion_dag + "vera_incarceration_age_county_to_bq", vera_bq_payload_age, data_ingestion_dag ) vera_bq_payload_sex = util.generate_bq_payload(_VERA_WORKFLOW_ID, _VERA_DATASET_NAME, demographic="sex") vera_bq_operator_sex = util.create_bq_ingest_operator( - 'vera_incarceration_sex_county_to_bq', vera_bq_payload_sex, data_ingestion_dag + "vera_incarceration_sex_county_to_bq", vera_bq_payload_sex, data_ingestion_dag ) vera_exporter_payload_race = { - 'dataset_name': _VERA_DATASET_NAME, - 'demographic': "race_and_ethnicity", + "dataset_name": _VERA_DATASET_NAME, + "demographic": "race_and_ethnicity", } vera_exporter_operator_race = util.create_exporter_operator( - 'vera_incarceration_county_exporter_race', vera_exporter_payload_race, data_ingestion_dag + "vera_incarceration_county_exporter_race", vera_exporter_payload_race, data_ingestion_dag ) -vera_exporter_payload_age = {'dataset_name': _VERA_DATASET_NAME, 'demographic': "age"} +vera_exporter_payload_age = {"dataset_name": _VERA_DATASET_NAME, "demographic": "age"} vera_exporter_operator_age = util.create_exporter_operator( - 'vera_incarceration_county_exporter_age', vera_exporter_payload_age, data_ingestion_dag + "vera_incarceration_county_exporter_age", vera_exporter_payload_age, data_ingestion_dag ) vera_exporter_payload_sex = { - 'dataset_name': _VERA_DATASET_NAME, - 'demographic': "sex", - 'should_export_as_alls': True, + "dataset_name": _VERA_DATASET_NAME, + "demographic": "sex", + "should_export_as_alls": True, } vera_exporter_operator_sex = util.create_exporter_operator( - 'vera_incarceration_county_exporter_sex', vera_exporter_payload_sex, data_ingestion_dag + "vera_incarceration_county_exporter_sex", vera_exporter_payload_sex, data_ingestion_dag ) # Ingestion DAG diff --git a/data_server/main.py b/data_server/main.py index 942a2607ec..9750ecee17 100644 --- a/data_server/main.py +++ b/data_server/main.py @@ -12,66 +12,66 @@ cache = DatasetCache() -@app.route('/', methods=['GET']) +@app.route("/", methods=["GET"]) def get_program_name(): - return 'Running data server.' + return "Running data server." 
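# (Editor's note: a hedged sketch of how a client exercises the routes in this
# file; host and port are the defaults from the __main__ block at the bottom of
# main.py, and the dataset name is the one used by the e2e tests.)
#
#   curl "http://0.0.0.0:8080/metadata"
#   curl "http://0.0.0.0:8080/dataset?name=acs_population-by_sex_state.json"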
-@app.route('/metadata', methods=['GET']) +@app.route("/metadata", methods=["GET"]) def get_metadata(): """Downloads and returns metadata about available download files.""" try: - metadata = cache.getDataset(os.environ.get('GCS_BUCKET'), os.environ.get('METADATA_FILENAME')) + metadata = cache.getDataset(os.environ.get("GCS_BUCKET"), os.environ.get("METADATA_FILENAME")) except Exception as err: logging.error(err) - return f'Internal server error: {err}', 500 + return f"Internal server error: {err}", 500 def generate_response(data: bytes): - next_row = b'[' + next_row = b"[" for row in data.splitlines(): yield next_row - next_row = row + b',' - yield next_row.rstrip(b',') + b']' + next_row = row + b"," + yield next_row.rstrip(b",") + b"]" headers = Headers() - headers.add('Content-Disposition', 'attachment', filename=os.environ.get('METADATA_FILENAME')) - headers.add('Vary', 'Accept-Encoding') - return Response(generate_response(metadata), mimetype='application/json', headers=headers) + headers.add("Content-Disposition", "attachment", filename=os.environ.get("METADATA_FILENAME")) + headers.add("Vary", "Accept-Encoding") + return Response(generate_response(metadata), mimetype="application/json", headers=headers) -@app.route('/dataset', methods=['GET']) +@app.route("/dataset", methods=["GET"]) def get_dataset(): """Downloads and returns the requested dataset if it exists.""" - dataset_name = request.args.get('name') + dataset_name = request.args.get("name") if dataset_name is None: - return 'Request missing required url param \'name\'', 400 + return "Request missing required url param 'name'", 400 try: - dataset = cache.getDataset(os.environ.get('GCS_BUCKET'), dataset_name) + dataset = cache.getDataset(os.environ.get("GCS_BUCKET"), dataset_name) except Exception as err: logging.error(err) - return f'Internal server error: {err}', 500 + return f"Internal server error: {err}", 500 def generate_response(): - next_row = b'[' + next_row = b"[" for row in dataset.splitlines(): yield next_row - next_row = row + b',' - yield next_row.rstrip(b',') + b']' + next_row = row + b"," + yield next_row.rstrip(b",") + b"]" headers = Headers() - headers.add('Content-Disposition', 'attachment', filename=dataset_name) - headers.add('Vary', 'Accept-Encoding') + headers.add("Content-Disposition", "attachment", filename=dataset_name) + headers.add("Vary", "Accept-Encoding") # Allow browsers to cache datasets for 2 hours, the same as the DatasetCache # TODO: If we want to make sure this stays in sync with the DatasetCache # TTL, move this to a constant that's shared between them. 
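# (Editor's note: a minimal sketch of the TODO above — hoisting the 2-hour TTL
# into one shared constant; the constant name and module placement are
# hypothetical, not part of this PR.)
#
#   # data_server/constants.py
#   DATASET_TTL_SECONDS = 2 * 60 * 60  # 2 hours
#
#   # main.py
#   headers.add("Cache-Control", f"public, max-age={DATASET_TTL_SECONDS}")
#
#   # dataset_cache.py
#   cachetools.TTLCache(maxsize=max_size, ttl=DATASET_TTL_SECONDS)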
- headers.add('Cache-Control', 'public, max-age=7200') + headers.add("Cache-Control", "public, max-age=7200") - if dataset_name.endswith('.csv'): - return Response(dataset, mimetype='text/csv', headers=headers) + if dataset_name.endswith(".csv"): + return Response(dataset, mimetype="text/csv", headers=headers) - return Response(generate_response(), mimetype='application/json', headers=headers) + return Response(generate_response(), mimetype="application/json", headers=headers) if __name__ == "__main__": - app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) + app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) diff --git a/data_server/test_data_server.py b/data_server/test_data_server.py index 9add612f73..f051c6e656 100644 --- a/data_server/test_data_server.py +++ b/data_server/test_data_server.py @@ -1,16 +1,15 @@ +# pylint: disable=unused-argument import json import os from unittest import mock - import google.cloud.exceptions import pytest from flask.testing import FlaskClient - from data_server.dataset_cache import DatasetCache -from main import app, cache +from main import app, cache # pylint: disable=no-name-in-module -os.environ['GCS_BUCKET'] = 'test' -os.environ['METADATA_FILENAME'] = 'test_data.ndjson' +os.environ["GCS_BUCKET"] = "test" +os.environ["METADATA_FILENAME"] = "test_data.ndjson" test_data = ( b'{"label1":"value1","label2":["value2a","value2b"],"label3":"value3"}\n' @@ -30,7 +29,7 @@ b'{"label1":"value6","label2":["value7a","value2b"],"label3":"value18"}]' ) -test_data_csv = b'label1,label2,label3\nvalueA,valueB,valueC\nvalueD,valueE,valueF\n' +test_data_csv = b"label1,label2,label3\nvalueA,valueB,valueC\nvalueD,valueE,valueF\n" def get_test_data(gcs_bucket: str, filename: str): @@ -54,24 +53,24 @@ def reset_cache(): @pytest.fixture def client(): """Creates a Flask test client for each test.""" - app.config['TESTING'] = True + app.config["TESTING"] = True with app.test_client() as app_client: yield app_client def testGetProgramName(client: FlaskClient): - response = client.get('/') - assert b'Running data server.' in response.data + response = client.get("/") + assert b"Running data server." 
in response.data -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetMetadata(mock_func: mock.MagicMock, client: FlaskClient): - response = client.get('/metadata') - mock_func.assert_called_once_with('test', 'test_data.ndjson') + response = client.get("/metadata") + mock_func.assert_called_once_with("test", "test_data.ndjson") assert response.status_code == 200 - assert response.headers.get('Content-Disposition') == 'attachment; filename=test_data.ndjson' - assert response.headers.get('Access-Control-Allow-Origin') == '*' - assert response.headers.get('Vary') == 'Accept-Encoding' + assert response.headers.get("Content-Disposition") == "attachment; filename=test_data.ndjson" + assert response.headers.get("Access-Control-Allow-Origin") == "*" + assert response.headers.get("Vary") == "Accept-Encoding" assert response.data == test_data_json # Make sure that the response is valid json try: @@ -80,36 +79,36 @@ def testGetMetadata(mock_func: mock.MagicMock, client: FlaskClient): pytest.fail(err.msg) -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetMetadata_FromCache(mock_func: mock.MagicMock, client: FlaskClient): # Make the first request, which will incur an API call. - response = client.get('/metadata') - mock_func.assert_called_once_with('test', 'test_data.ndjson') + response = client.get("/metadata") + mock_func.assert_called_once_with("test", "test_data.ndjson") assert response.status_code == 200 # Make the second request, which should not incur an API call. - response = client.get('/metadata') + response = client.get("/metadata") assert response.status_code == 200 mock_func.assert_called_once() -@mock.patch.object(DatasetCache, 'getDataset') +@mock.patch.object(DatasetCache, "getDataset") def testGetMetadata_InternalError(mock_func: mock.MagicMock, client: FlaskClient): - mock_func.side_effect = google.cloud.exceptions.NotFound('File not found') + mock_func.side_effect = google.cloud.exceptions.NotFound("File not found") - response = client.get('/metadata') + response = client.get("/metadata") assert response.status_code == 500 - assert b'Internal server error: 404 File not found' in response.data + assert b"Internal server error: 404 File not found" in response.data -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetDataset_DataExists(mock_func: mock.MagicMock, client: FlaskClient): - response = client.get('/dataset?name=test_dataset') - mock_func.assert_called_once_with('test', 'test_dataset') + response = client.get("/dataset?name=test_dataset") + mock_func.assert_called_once_with("test", "test_dataset") assert response.status_code == 200 - assert response.headers.get('Content-Disposition') == 'attachment; filename=test_dataset' - assert response.headers.get('Access-Control-Allow-Origin') == '*' - assert response.headers.get('Vary') == 'Accept-Encoding' + assert response.headers.get("Content-Disposition") == "attachment; filename=test_dataset" + assert response.headers.get("Access-Control-Allow-Origin") == "*" + assert response.headers.get("Vary") == "Accept-Encoding" assert response.data == test_data_json # Make sure that the response is valid json try: @@ -119,32 +118,32 @@ def 
testGetDataset_DataExists(mock_func: mock.MagicMock, client: FlaskClient): @mock.patch( - 'data_server.gcs_utils.download_blob_as_bytes', side_effect=google.cloud.exceptions.NotFound('File not found') + "data_server.gcs_utils.download_blob_as_bytes", side_effect=google.cloud.exceptions.NotFound("File not found") ) def testGetDataset_DatasetNotFound(mock_func: mock.MagicMock, client: FlaskClient): - response = client.get('/dataset?name=not_found') - mock_func.assert_called_once_with('test', 'not_found') - assert response.headers.get('Access-Control-Allow-Origin') == '*' + response = client.get("/dataset?name=not_found") + mock_func.assert_called_once_with("test", "not_found") + assert response.headers.get("Access-Control-Allow-Origin") == "*" assert response.status_code == 500 - assert b'Internal server error: 404 File not found' in response.data + assert b"Internal server error: 404 File not found" in response.data def testGetDataset_UrlParamMissing(client: FlaskClient): - response = client.get('/dataset') + response = client.get("/dataset") assert response.status_code == 400 - assert b'Request missing required url param \'name\'' in response.data + assert b"Request missing required url param 'name'" in response.data - response = client.get('/dataset?random_param=stuff') + response = client.get("/dataset?random_param=stuff") assert response.status_code == 400 - assert b'Request missing required url param \'name\'' in response.data + assert b"Request missing required url param 'name'" in response.data -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', side_effect=get_test_data_csv) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data_csv) def testGetDataset_csvType(mock_func: mock.MagicMock, client: FlaskClient): - response = client.get('/dataset?name=test_dataset.csv') - mock_func.assert_called_once_with('test', 'test_dataset.csv') + response = client.get("/dataset?name=test_dataset.csv") + mock_func.assert_called_once_with("test", "test_dataset.csv") assert response.status_code == 200 - assert response.mimetype == 'text/csv' - assert response.headers.get('Content-Disposition') == 'attachment; filename=test_dataset.csv' + assert response.mimetype == "text/csv" + assert response.headers.get("Content-Disposition") == "attachment; filename=test_dataset.csv" # Make sure that the response hasn't changed assert response.data == test_data_csv diff --git a/e2e_tests/data_serving.py b/e2e_tests/data_serving.py index 55681327b1..4864ff08f6 100644 --- a/e2e_tests/data_serving.py +++ b/e2e_tests/data_serving.py @@ -7,8 +7,8 @@ def testUnauthed_permissionDenied(): # Get the url of the service. - service_url = os.environ.get('SERVICE_URL').strip('"') - print(f'SERVICE_URL={service_url}') + service_url = os.environ.get("SERVICE_URL").strip('"') + print(f"SERVICE_URL={service_url}") resp = requests.get(service_url, timeout=600) assert resp.status_code == 200 # this service used to require authorization but not anymore @@ -16,31 +16,31 @@ def testUnauthed_permissionDenied(): def testDataServerDataServing(): # Get the url of the service. 
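# (Editor's note: os.environ.get() returns None when a variable is unset, so
# the .strip('"') calls in these tests would raise AttributeError rather than a
# clear error. A hedged sketch of a stricter module-level helper; the name
# require_env is hypothetical, not part of this PR.)
def require_env(name: str) -> str:
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value.strip('"')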
- service_url = os.environ.get('SERVICE_URL').strip('"') - print(f'SERVICE_URL={service_url}') + service_url = os.environ.get("SERVICE_URL").strip('"') + print(f"SERVICE_URL={service_url}") # Get service account credentials to make request to private URL creds = service_account.IDTokenCredentials.from_service_account_file( - os.environ.get('PATH_TO_SA_CREDS'), target_audience=service_url + os.environ.get("PATH_TO_SA_CREDS"), target_audience=service_url ) authed_session = AuthorizedSession(creds) resp = authed_session.get(service_url) assert resp.ok - assert b'Running data server.' in resp.content + assert b"Running data server." in resp.content def testDataServingThroughFrontend(): # Get the url of the frontend. - frontend_url = os.environ.get('FRONTEND_URL').strip('"') + '/api/dataset?name=acs_population-by_sex_state.json' - print(f'FRONTEND_URL={frontend_url}') + frontend_url = os.environ.get("FRONTEND_URL").strip('"') + "/api/dataset?name=acs_population-by_sex_state.json" + print(f"FRONTEND_URL={frontend_url}") - frame = pd.DataFrame(pd.read_json(frontend_url, orient='values')) + frame = pd.DataFrame(pd.read_json(frontend_url, orient="values")) assert len(frame.index) == 156 assert frame.columns.size == 5 - assert frame.columns[0] == 'state_fips' - assert frame.columns[1] == 'state_name' - assert frame.columns[2] == 'sex' - assert frame.columns[3] == 'population' - assert frame.columns[4] == 'population_pct' + assert frame.columns[0] == "state_fips" + assert frame.columns[1] == "state_name" + assert frame.columns[2] == "sex" + assert frame.columns[3] == "population" + assert frame.columns[4] == "population_pct" diff --git a/e2e_tests/scripts/ensure_datasets_equal.py b/e2e_tests/scripts/ensure_datasets_equal.py index 0edf07a341..2e89cb3226 100644 --- a/e2e_tests/scripts/ensure_datasets_equal.py +++ b/e2e_tests/scripts/ensure_datasets_equal.py @@ -23,12 +23,9 @@ parser = argparse.ArgumentParser() -parser.add_argument("-ep", "--expectedproject", - help="GCP project name with the reference data") -parser.add_argument("-tp", "--testproject", - help="GCP project name with the test data") -parser.add_argument("-d", "--dataset", - help="Big query dataset id name to compare") +parser.add_argument("-ep", "--expectedproject", help="GCP project name with the reference data") +parser.add_argument("-tp", "--testproject", help="GCP project name with the test data") +parser.add_argument("-d", "--dataset", help="Big query dataset id name to compare") def main(): @@ -38,40 +35,33 @@ def main(): dataset = args.dataset bq_client = bigquery.Client() - test_tables = bq_client.list_tables('%s.%s' % (test_project, dataset)) + test_tables = bq_client.list_tables(f"{test_project}.{dataset}") for table in test_tables: - test_table_name = "{}.{}.{}".format( - table.project, table.dataset_id, table.table_id) - expected_table_name = "{}.{}.{}".format( - expected_project, table.dataset_id, table.table_id) + test_table_name = f"{table.project}.{table.dataset_id}.{table.table_id}" + expected_table_name = f"{expected_project}.{table.dataset_id}.{table.table_id}" - print('checking %s against %s' % - (test_table_name, expected_table_name)) + print(f"checking {test_table_name} against {expected_table_name}") - query_string_test = 'SELECT * FROM `%s`' % test_table_name - query_string_expected = 'SELECT * FROM `%s`' % expected_table_name + query_string_test = f"SELECT * FROM `{test_table_name}`" + query_string_expected = f"SELECT * FROM `{expected_table_name}`" test_df = 
bq_client.query(query_string_test).result().to_dataframe() sort_values = list(test_df.columns) - if 'ingestion_ts' in sort_values: - sort_values.remove('ingestion_ts') + if "ingestion_ts" in sort_values: + sort_values.remove("ingestion_ts") - if 'ingestion_ts' in test_df.columns: - test_df = test_df.drop( - columns=['ingestion_ts']).reset_index(drop=True) + if "ingestion_ts" in test_df.columns: + test_df = test_df.drop(columns=["ingestion_ts"]).reset_index(drop=True) test_df = test_df.sort_values(by=sort_values).reset_index(drop=True) - expected_df = bq_client.query( - query_string_expected).result().to_dataframe() - if 'ingestion_ts' in expected_df.columns: - expected_df = expected_df.drop( - columns=['ingestion_ts']).reset_index(drop=True) + expected_df = bq_client.query(query_string_expected).result().to_dataframe() + if "ingestion_ts" in expected_df.columns: + expected_df = expected_df.drop(columns=["ingestion_ts"]).reset_index(drop=True) - expected_df = expected_df.sort_values( - by=sort_values).reset_index(drop=True) + expected_df = expected_df.sort_values(by=sort_values).reset_index(drop=True) assert_frame_equal(test_df, expected_df, check_like=True) diff --git a/exporter/main.py b/exporter/main.py index a8821751ec..ab355c87c7 100644 --- a/exporter/main.py +++ b/exporter/main.py @@ -1,34 +1,34 @@ import logging import os from flask import Flask, request -from google.cloud import bigquery, storage +from google.cloud import bigquery, storage # type: ignore app = Flask(__name__) -@app.route('/', methods=['POST']) +@app.route("/", methods=["POST"]) def export_dataset_tables(): """Exports the tables in the given dataset to GCS. Request form must include the dataset name.""" data = request.get_json() - if data.get('dataset_name') is None: - return ('Request must include dataset name.', 400) + if data.get("dataset_name") is None: + return ("Request must include dataset name.", 400) demographic = None - if data.get('demographic') is not None: - demographic = data.get('demographic') + if data.get("demographic") is not None: + demographic = data.get("demographic") category = None - if data.get('category') is not None: - category = data.get('category') - should_export_as_alls = data.get('should_export_as_alls', False) - dataset_name = data['dataset_name'] - project_id = os.environ.get('PROJECT_ID') - export_bucket = os.environ.get('EXPORT_BUCKET') - dataset_id = f'{project_id}.{dataset_name}' + if data.get("category") is not None: + category = data.get("category") + should_export_as_alls = data.get("should_export_as_alls", False) + dataset_name = data["dataset_name"] + project_id = os.environ.get("PROJECT_ID") + export_bucket = os.environ.get("EXPORT_BUCKET") + dataset_id = f"{project_id}.{dataset_name}" bq_client = bigquery.Client() dataset = bq_client.get_dataset(dataset_id) tables = list(bq_client.list_tables(dataset)) @@ -61,30 +61,30 @@ def export_dataset_tables(): export_split_county_tables(bq_client, table, export_bucket) # export the full table - dest_uri = f'gs://{export_bucket}/{dataset_name}-{table.table_id}.json' + dest_uri = f"gs://{export_bucket}/{dataset_name}-{table.table_id}.json" table_ref = dataset.table(table.table_id) try: - export_table(bq_client, table_ref, dest_uri, 'NEWLINE_DELIMITED_JSON') + export_table(bq_client, table_ref, dest_uri, "NEWLINE_DELIMITED_JSON") except Exception as err: logging.error(err) return ( - f'Error exporting table {table.table_id} to {dest_uri}:\n{err}', + f"Error exporting table {table.table_id} to {dest_uri}:\n{err}", 500, ) if 
should_export_as_alls: export_alls(bq_client, table, export_bucket, demographic) - return ('', 204) + return ("", 204) def export_table(bq_client, table_ref, dest_uri, dest_fmt): """Run the extract job to export the given table to the given destination and wait for completion""" job_config = bigquery.ExtractJobConfig(destination_format=dest_fmt) - extract_job = bq_client.extract_table(table_ref, dest_uri, location='US', job_config=job_config) + extract_job = bq_client.extract_table(table_ref, dest_uri, location="US", job_config=job_config) extract_job.result() - logging.info(f'Exported {table_ref.table_id} to {dest_uri}') + logging.info(f"Exported {table_ref.table_id} to {dest_uri}") def export_split_county_tables(bq_client: bigquery.Client, table: bigquery.Table, export_bucket: str): @@ -95,11 +95,11 @@ def export_split_county_tables(bq_client: bigquery.Client, table: bigquery.Table if "county" not in table_name: return - logging.info(f'Exporting county-level data from {table_name} into additional files, split by state/territory.') + logging.info(f"Exporting county-level data from {table_name} into additional files, split by state/territory.") bucket = prepare_bucket(export_bucket) for fips in STATE_LEVEL_FIPS_LIST: - state_file_name = f'{table.dataset_id}-{table.table_id}-{fips}.json' + state_file_name = f"{table.dataset_id}-{table.table_id}-{fips}.json" query = f""" SELECT * FROM {table_name} @@ -115,7 +115,7 @@ def export_split_county_tables(bq_client: bigquery.Client, table: bigquery.Table except Exception as err: logging.error(err) return ( - f'Error splitting county-level table {table_name} into {state_file_name}:\n {err}', + f"Error splitting county-level table {table_name} into {state_file_name}:\n {err}", 500, ) @@ -131,7 +131,7 @@ def has_multi_demographics(table_id: str): boolean of whether there is more than one demographic substring found""" # Age adjusted tables are still just by race/eth. 
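# (Editor's note: a hedged illustration of the intended behavior, per the
# docstring above — true when more than one demographic substring appears in
# the table id; the table ids below are made up:)
#   has_multi_demographics("condition_race_age_county")      -> True
#   has_multi_demographics("condition_race_county")          -> False
#   has_multi_demographics("condition_race_with_age_adjust") -> False  (early return below)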
- if table_id.endswith('with_age_adjust'): + if table_id.endswith("with_age_adjust"): return False return ( @@ -145,20 +145,20 @@ def export_alls(bq_client: bigquery.Client, table: bigquery.Table, export_bucket """Export json file with just the ALLS rows from the given table, frontend can use as a fallback in compare mode""" table_name = get_table_name(table) demo_cols = [] - demo_to_replace = demographic if demographic != 'black_women' else 'age' + demo_to_replace = demographic if demographic != "black_women" else "age" demo_col = demographic - if demographic == 'black_women': - demo_col = 'age' - if demographic == 'race': - demo_col = 'race_and_ethnicity' - - alls_table_id = table.table_id.replace(demo_to_replace, 'alls') - logging.info(f'Exporting ALLs data {alls_table_id} from {table_name}.') - alls_file_name = f'{table.dataset_id}-{alls_table_id}.json' + if demographic == "black_women": + demo_col = "age" + if demographic == "race": + demo_col = "race_and_ethnicity" + + alls_table_id = table.table_id.replace(demo_to_replace, "alls") + logging.info(f"Exporting ALLs data {alls_table_id} from {table_name}.") + alls_file_name = f"{table.dataset_id}-{alls_table_id}.json" demo_cols = [demo_col] - if demographic == 'race': - demo_cols.append('race_category_id') + if demographic == "race": + demo_cols.append("race_category_id") bucket = prepare_bucket(export_bucket) query = f""" @@ -177,13 +177,13 @@ def export_alls(bq_client: bigquery.Client, table: bigquery.Table, export_bucket except Exception as err: logging.error(err) return ( - f'Error extracting the ALLS rows from table {table_name} into {alls_file_name}:\n {err}', + f"Error extracting the ALLS rows from table {table_name} into {alls_file_name}:\n {err}", 500, ) def get_table_name(table): - return f'{table.project}.{table.dataset_id}.{table.table_id}' + return f"{table.project}.{table.dataset_id}.{table.table_id}" def get_query_results_as_df(bq_client, query): @@ -201,11 +201,11 @@ def prepare_blob(bucket, state_file_name): def export_nd_json_to_blob(blob, nd_json): - blob.upload_from_string(nd_json, content_type='application/octet-stream') + blob.upload_from_string(nd_json, content_type="application/octet-stream") if __name__ == "__main__": - app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) + app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) STATE_LEVEL_FIPS_LIST = [ diff --git a/exporter/test_exporter.py b/exporter/test_exporter.py index db130a5faf..aed0f1f1bb 100644 --- a/exporter/test_exporter.py +++ b/exporter/test_exporter.py @@ -1,3 +1,4 @@ +# pylint: disable=unused-argument from unittest import mock from unittest.mock import Mock import pytest @@ -25,42 +26,37 @@ NUM_STATES_AND_TERRITORIES = len(STATE_LEVEL_FIPS_LIST) -TEST_TABLES = [bigquery.Table("my-project.my-dataset.t1-sex"), - bigquery.Table("my-project.my-dataset.t2-age"), - bigquery.Table("my-project.my-dataset.t3-age"), - bigquery.Table("my-project.my-county-dataset.t4-age"), - ] +TEST_TABLES = [ + bigquery.Table("my-project.my-dataset.t1-sex"), + bigquery.Table("my-project.my-dataset.t2-age"), + bigquery.Table("my-project.my-dataset.t3-age"), + bigquery.Table("my-project.my-county-dataset.t4-age"), +] -os.environ['PROJECT_ID'] = 'my-project' -os.environ['EXPORT_BUCKET'] = 'my-bucket' +os.environ["PROJECT_ID"] = "my-project" +os.environ["EXPORT_BUCKET"] = "my-bucket" @pytest.fixture def client(): """Creates a Flask test client for each test.""" - app.config['TESTING'] = True + app.config["TESTING"] = True with 
app.test_client() as app_client: yield app_client # TEST FULL FILE EXTRACT CALLS -@mock.patch('main.export_split_county_tables') -@mock.patch('google.cloud.bigquery.Client') -def testExportDatasetTables( - mock_bq_client: mock.MagicMock, - mock_split_county: mock.MagicMock, - client: FlaskClient -): + +@mock.patch("main.export_split_county_tables") +@mock.patch("google.cloud.bigquery.Client") +def testExportDatasetTables(mock_bq_client: mock.MagicMock, mock_split_county: mock.MagicMock, client: FlaskClient): # Set up mocks mock_bq_instance = mock_bq_client.return_value mock_bq_instance.list_tables.return_value = TEST_TABLES - payload = { - 'dataset_name': 'my-dataset', - 'demographic': 'age' - } - response = client.post('/', json=payload) + payload = {"dataset_name": "my-dataset", "demographic": "age"} + response = client.post("/", json=payload) assert response.status_code == 204 # called once per "age" table @@ -69,59 +65,46 @@ def testExportDatasetTables( assert mock_split_county.call_count == 3 -@mock.patch('main.export_split_county_tables') -@mock.patch('google.cloud.bigquery.Client') +@mock.patch("main.export_split_county_tables") +@mock.patch("google.cloud.bigquery.Client") def testExportDatasetTables_InvalidInput( - mock_bq_client: mock.MagicMock, - mock_split_county: mock.MagicMock, - client: FlaskClient + mock_bq_client: mock.MagicMock, mock_split_county: mock.MagicMock, client: FlaskClient ): - response = client.post('/', json={}) + response = client.post("/", json={}) assert response.status_code == 400 assert mock_split_county.call_count == 0 -@mock.patch('main.export_split_county_tables') -@mock.patch('google.cloud.bigquery.Client') +@mock.patch("main.export_split_county_tables") +@mock.patch("google.cloud.bigquery.Client") def testExportDatasetTables_NoTables( - mock_bq_client: mock.MagicMock, - mock_split_county: mock.MagicMock, - client: FlaskClient + mock_bq_client: mock.MagicMock, mock_split_county: mock.MagicMock, client: FlaskClient ): # Set up mocks mock_bq_instance = mock_bq_client.return_value mock_bq_instance.list_tables.return_value = iter(()) - payload = { - 'dataset_name': 'my-dataset', - 'demographic': 'age' - } - response = client.post('/', json=payload) + payload = {"dataset_name": "my-dataset", "demographic": "age"} + response = client.post("/", json=payload) assert response.status_code == 500 assert mock_split_county.call_count == 0 -@mock.patch('main.export_split_county_tables') -@mock.patch('google.cloud.bigquery.Client') +@mock.patch("main.export_split_county_tables") +@mock.patch("google.cloud.bigquery.Client") def testExportDatasetTables_ExtractJobFailure( - mock_bq_client: mock.MagicMock, - mock_split_county: mock.MagicMock, - client: FlaskClient + mock_bq_client: mock.MagicMock, mock_split_county: mock.MagicMock, client: FlaskClient ): # Set up mocks mock_bq_instance = mock_bq_client.return_value mock_bq_instance.list_tables.return_value = TEST_TABLES mock_extract_job = Mock() mock_bq_instance.extract_table.return_value = mock_extract_job - mock_extract_job.result.side_effect = google.cloud.exceptions.InternalServerError( - 'Internal') + mock_extract_job.result.side_effect = google.cloud.exceptions.InternalServerError("Internal") - payload = { - 'dataset_name': 'my-dataset', - 'demographic': 'age' - } - response = client.post('/', json=payload) + payload = {"dataset_name": "my-dataset", "demographic": "age"} + response = client.post("/", json=payload) assert response.status_code == 500 assert mock_split_county.call_count == 1 @@ -129,34 +112,33 @@ def 
testExportDatasetTables_ExtractJobFailure( # TEST ADDITIONAL COUNTY-LEVEL DATASET SPLIT FUNCTIONS -_test_query_results_df = pd.DataFrame({ - 'county_fips': ["01001", "01002", "01003"], - 'some_condition_per_100k': [None, 1, 2], -}) +_test_query_results_df = pd.DataFrame( + { + "county_fips": ["01001", "01002", "01003"], + "some_condition_per_100k": [None, 1, 2], + } +) -@mock.patch('main.export_nd_json_to_blob') -@mock.patch('main.prepare_blob') -@mock.patch('main.prepare_bucket') -@mock.patch('main.get_query_results_as_df', return_value=_test_query_results_df) -@mock.patch('google.cloud.bigquery.Client') +@mock.patch("main.export_nd_json_to_blob") +@mock.patch("main.prepare_blob") +@mock.patch("main.prepare_bucket") +@mock.patch("main.get_query_results_as_df", return_value=_test_query_results_df) +@mock.patch("google.cloud.bigquery.Client") def testExportSplitCountyTables( - mock_bq_client: mock.MagicMock, - mock_query_df: mock.MagicMock, - mock_prepare_bucket: mock.MagicMock, - mock_prepare_blob: mock.MagicMock, - mock_export: mock.MagicMock, - client: FlaskClient + mock_bq_client: mock.MagicMock, + mock_query_df: mock.MagicMock, + mock_prepare_bucket: mock.MagicMock, + mock_prepare_blob: mock.MagicMock, + mock_export: mock.MagicMock, + client: FlaskClient, ): mock_bq_instance = mock_bq_client.return_value mock_bq_instance.list_tables.return_value = TEST_TABLES - payload = { - 'dataset_name': 'my-dataset', - 'demographic': 'age' - } - client.post('/', json=payload) + payload = {"dataset_name": "my-dataset", "demographic": "age"} + client.post("/", json=payload) # ensure initial call to bq client and county-level calls per state/terr assert mock_bq_client.call_count == 1 @@ -166,12 +148,10 @@ def testExportSplitCountyTables( # ensure generated ndjson for bq.storage matches expected ndjson generated_nd_json = mock_export.call_args[0][1] - assert (sorted(generated_nd_json) == - sorted(_test_query_results_df.to_json(orient="records", - lines=True))) + assert sorted(generated_nd_json) == sorted(_test_query_results_df.to_json(orient="records", lines=True)) bucket_name = mock_prepare_bucket.call_args[0][0] - assert bucket_name == os.environ['EXPORT_BUCKET'] + assert bucket_name == os.environ["EXPORT_BUCKET"] # for each state/terr for i, fips in enumerate(STATE_LEVEL_FIPS_LIST): @@ -191,5 +171,5 @@ def testExportSplitCountyTables( # ensure county level files are named as expected state_file_name = mock_prepare_blob.call_args_list[i][0][1] table = TEST_TABLES[3] - expected_file_name = f'{table.dataset_id}-{table.table_id}-{fips}.json' + expected_file_name = f"{table.dataset_id}-{table.table_id}-{fips}.json" assert state_file_name == expected_file_name diff --git a/pyproject.toml b/pyproject.toml index b70efa61fe..de6d86390c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ # Configuration settings for python linters when run locally / pre-commit # SUPER LINTER settings are stored in .github/workflows/ [tool.black] -skip-string-normalization = true line-length = 120 [tool.pytest.ini_options] diff --git a/python/data_server/dataset_cache.py b/python/data_server/dataset_cache.py index 2c89d02a1c..99ee4c5307 100644 --- a/python/data_server/dataset_cache.py +++ b/python/data_server/dataset_cache.py @@ -5,7 +5,7 @@ from data_server import gcs_utils -class DatasetCache(): +class DatasetCache: """DatasetCache manages and stores datasets accessed through GCS. 
DatasetCache is a thin, thread-safe wrapper around cachetools.TTLCache.""" diff --git a/python/data_server/setup.py b/python/data_server/setup.py index be1f91ec43..bec43cc476 100644 --- a/python/data_server/setup.py +++ b/python/data_server/setup.py @@ -1,6 +1,7 @@ from setuptools import setup -setup(name='data_server', - package_dir={'data_server': ''}, - packages=['data_server'], - ) +setup( + name="data_server", + package_dir={"data_server": ""}, + packages=["data_server"], +) diff --git a/python/datasources/acs_condition.py b/python/datasources/acs_condition.py index c9e9fdcc26..e576e858a0 100644 --- a/python/datasources/acs_condition.py +++ b/python/datasources/acs_condition.py @@ -37,103 +37,103 @@ generate_column_name, ) -EARLIEST_ACS_CONDITION_YEAR = '2012' -CURRENT_ACS_CONDITION_YEAR = '2022' +EARLIEST_ACS_CONDITION_YEAR = "2012" +CURRENT_ACS_CONDITION_YEAR = "2022" # available years with all topics working ACS_URLS_MAP = { - EARLIEST_ACS_CONDITION_YEAR: 'https://api.census.gov/data/2012/acs/acs5', - '2013': 'https://api.census.gov/data/2013/acs/acs5', - '2014': 'https://api.census.gov/data/2014/acs/acs5', - '2015': 'https://api.census.gov/data/2015/acs/acs5', - '2016': 'https://api.census.gov/data/2016/acs/acs5', - '2017': 'https://api.census.gov/data/2017/acs/acs5', - '2018': 'https://api.census.gov/data/2018/acs/acs5', - '2019': 'https://api.census.gov/data/2019/acs/acs5', - '2020': 'https://api.census.gov/data/2020/acs/acs5', - '2021': 'https://api.census.gov/data/2021/acs/acs5', - CURRENT_ACS_CONDITION_YEAR: 'https://api.census.gov/data/2022/acs/acs5', + EARLIEST_ACS_CONDITION_YEAR: "https://api.census.gov/data/2012/acs/acs5", + "2013": "https://api.census.gov/data/2013/acs/acs5", + "2014": "https://api.census.gov/data/2014/acs/acs5", + "2015": "https://api.census.gov/data/2015/acs/acs5", + "2016": "https://api.census.gov/data/2016/acs/acs5", + "2017": "https://api.census.gov/data/2017/acs/acs5", + "2018": "https://api.census.gov/data/2018/acs/acs5", + "2019": "https://api.census.gov/data/2019/acs/acs5", + "2020": "https://api.census.gov/data/2020/acs/acs5", + "2021": "https://api.census.gov/data/2021/acs/acs5", + CURRENT_ACS_CONDITION_YEAR: "https://api.census.gov/data/2022/acs/acs5", } HEALTH_INSURANCE_RACE_TO_CONCEPT_CAPS = { - Race.AIAN.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)', - Race.ASIAN.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (ASIAN ALONE)', - Race.HISP.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (HISPANIC OR LATINO)', - Race.BLACK.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (BLACK OR AFRICAN AMERICAN ALONE)', - Race.NHPI.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)', - Race.WHITE.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (WHITE ALONE)', - Race.OTHER_STANDARD.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (SOME OTHER RACE ALONE)', - Race.MULTI.value: 'HEALTH INSURANCE COVERAGE STATUS BY AGE (TWO OR MORE RACES)', + Race.AIAN.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)", + Race.ASIAN.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (ASIAN ALONE)", + Race.HISP.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (HISPANIC OR LATINO)", + Race.BLACK.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (BLACK OR AFRICAN AMERICAN ALONE)", + Race.NHPI.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)", + Race.WHITE.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (WHITE 
ALONE)", + Race.OTHER_STANDARD.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (SOME OTHER RACE ALONE)", + Race.MULTI.value: "HEALTH INSURANCE COVERAGE STATUS BY AGE (TWO OR MORE RACES)", } HEALTH_INSURANCE_RACE_TO_CONCEPT_TITLE = { - Race.AIAN.value: 'Health Insurance Coverage Status by Age (American Indian and Alaska Native Alone)', - Race.ASIAN.value: 'Health Insurance Coverage Status by Age (Asian Alone)', - Race.HISP.value: 'Health Insurance Coverage Status by Age (Hispanic or Latino)', - Race.BLACK.value: 'Health Insurance Coverage Status by Age (Black or African American Alone)', - Race.NHPI.value: 'Health Insurance Coverage Status by Age (Native Hawaiian and Other Pacific Islander Alone)', - Race.WHITE.value: 'Health Insurance Coverage Status by Age (White Alone)', - Race.OTHER_STANDARD.value: 'Health Insurance Coverage Status by Age (Some Other Race Alone)', - Race.MULTI.value: 'Health Insurance Coverage Status by Age (Two or More Races)', + Race.AIAN.value: "Health Insurance Coverage Status by Age (American Indian and Alaska Native Alone)", + Race.ASIAN.value: "Health Insurance Coverage Status by Age (Asian Alone)", + Race.HISP.value: "Health Insurance Coverage Status by Age (Hispanic or Latino)", + Race.BLACK.value: "Health Insurance Coverage Status by Age (Black or African American Alone)", + Race.NHPI.value: "Health Insurance Coverage Status by Age (Native Hawaiian and Other Pacific Islander Alone)", + Race.WHITE.value: "Health Insurance Coverage Status by Age (White Alone)", + Race.OTHER_STANDARD.value: "Health Insurance Coverage Status by Age (Some Other Race Alone)", + Race.MULTI.value: "Health Insurance Coverage Status by Age (Two or More Races)", } POVERTY_RACE_TO_CONCEPT_CAPS = { - Race.AIAN.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)', - Race.ASIAN.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)', - Race.HISP.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)', - Race.BLACK.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)', + Race.AIAN.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)", + Race.ASIAN.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)", + Race.HISP.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)", + Race.BLACK.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)", Race.NHPI.value: ( - 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)' + "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)" ), - Race.WHITE.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)', - Race.OTHER_STANDARD.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)', - Race.MULTI.value: 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)', + Race.WHITE.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)", + Race.OTHER_STANDARD.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)", + Race.MULTI.value: "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)", } POVERTY_RACE_TO_CONCEPT_TITLE = { - Race.AIAN.value: 'Poverty Status in the Past 12 Months by Sex by Age (American Indian and Alaska Native Alone)', - Race.ASIAN.value: 'Poverty Status in the Past 12 
Months by Sex by Age (Asian Alone)', - Race.HISP.value: 'Poverty Status in the Past 12 Months by Sex by Age (Hispanic or Latino)', - Race.BLACK.value: 'Poverty Status in the Past 12 Months by Sex by Age (Black or African American Alone)', + Race.AIAN.value: "Poverty Status in the Past 12 Months by Sex by Age (American Indian and Alaska Native Alone)", + Race.ASIAN.value: "Poverty Status in the Past 12 Months by Sex by Age (Asian Alone)", + Race.HISP.value: "Poverty Status in the Past 12 Months by Sex by Age (Hispanic or Latino)", + Race.BLACK.value: "Poverty Status in the Past 12 Months by Sex by Age (Black or African American Alone)", Race.NHPI.value: ( - 'Poverty Status in the Past 12 Months by Sex by Age (Native Hawaiian and Other Pacific Islander Alone)' + "Poverty Status in the Past 12 Months by Sex by Age (Native Hawaiian and Other Pacific Islander Alone)" ), - Race.WHITE.value: 'Poverty Status in the Past 12 Months by Sex by Age (White Alone)', - Race.OTHER_STANDARD.value: 'Poverty Status in the Past 12 Months by Sex by Age (Some Other Race Alone)', - Race.MULTI.value: 'Poverty Status in the Past 12 Months by Sex by Age (Two or More Races)', + Race.WHITE.value: "Poverty Status in the Past 12 Months by Sex by Age (White Alone)", + Race.OTHER_STANDARD.value: "Poverty Status in the Past 12 Months by Sex by Age (Some Other Race Alone)", + Race.MULTI.value: "Poverty Status in the Past 12 Months by Sex by Age (Two or More Races)", } # Acs variables are in the form C27001A_xxx0 C27001A_xxx2 etc # to determine age buckets. The metadata variables are merged with the suffixes to form the entire metadata. HEALTH_INSURANCE_BY_RACE_GROUP_PREFIXES = { - 'C27001A': Race.WHITE.value, - 'C27001B': Race.BLACK.value, - 'C27001C': Race.AIAN.value, - 'C27001D': Race.ASIAN.value, - 'C27001E': Race.NHPI.value, - 'C27001F': Race.OTHER_STANDARD.value, - 'C27001G': Race.MULTI.value, - 'C27001I': Race.HISP.value, + "C27001A": Race.WHITE.value, + "C27001B": Race.BLACK.value, + "C27001C": Race.AIAN.value, + "C27001D": Race.ASIAN.value, + "C27001E": Race.NHPI.value, + "C27001F": Race.OTHER_STANDARD.value, + "C27001G": Race.MULTI.value, + "C27001I": Race.HISP.value, } POVERTY_BY_RACE_SEX_AGE_GROUP_PREFIXES = { - 'B17001A': Race.WHITE.value, - 'B17001B': Race.BLACK.value, - 'B17001C': Race.AIAN.value, - 'B17001D': Race.ASIAN.value, - 'B17001E': Race.NHPI.value, - 'B17001F': Race.OTHER_STANDARD.value, - 'B17001G': Race.MULTI.value, - 'B17001I': Race.HISP.value, + "B17001A": Race.WHITE.value, + "B17001B": Race.BLACK.value, + "B17001C": Race.AIAN.value, + "B17001D": Race.ASIAN.value, + "B17001E": Race.NHPI.value, + "B17001F": Race.OTHER_STANDARD.value, + "B17001G": Race.MULTI.value, + "B17001I": Race.HISP.value, } def get_poverty_age_range(age_range): - if age_range in {'0-4', '5-5'}: - return '0-5' - elif age_range in {'12-14', '15-15', '16-17'}: - return '12-17' + if age_range in {"0-4", "5-5"}: + return "0-5" + elif age_range in {"12-14", "15-15", "16-17"}: + return "12-17" else: return age_range @@ -178,30 +178,30 @@ def __init__( # Health insurance by Sex only has one prefix, and is kept # in the form of a dict to help with standardizing code flow -HEALTH_INSURANCE_BY_SEX_GROUPS_PREFIX = 'B27001' -HEALTH_INSURANCE_SEX_BY_AGE_CONCEPT_CAPS = 'HEALTH INSURANCE COVERAGE STATUS BY SEX BY AGE' -HEALTH_INSURANCE_SEX_BY_AGE_CONCEPT_TITLE = 'Health Insurance Coverage Status by Sex by Age' +HEALTH_INSURANCE_BY_SEX_GROUPS_PREFIX = "B27001" +HEALTH_INSURANCE_SEX_BY_AGE_CONCEPT_CAPS = "HEALTH INSURANCE COVERAGE STATUS BY SEX 
BY AGE" +HEALTH_INSURANCE_SEX_BY_AGE_CONCEPT_TITLE = "Health Insurance Coverage Status by Sex by Age" -POVERTY_BY_SEX_AGE_GROUPS_PREFIX = 'B17001' -POVERTY_BY_SEX_AGE_CONCEPT_CAPS = 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE' -POVERTY_BY_SEX_AGE_CONCEPT_TITLE = 'Poverty Status in the Past 12 Months by Sex by Age' +POVERTY_BY_SEX_AGE_GROUPS_PREFIX = "B17001" +POVERTY_BY_SEX_AGE_CONCEPT_CAPS = "POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE" +POVERTY_BY_SEX_AGE_CONCEPT_TITLE = "Poverty Status in the Past 12 Months by Sex by Age" -HAS_HEALTH_INSURANCE = 'has_health_insurance' -INCOME_UNDER_POVERTY = 'under_poverty_line' +HAS_HEALTH_INSURANCE = "has_health_insurance" +INCOME_UNDER_POVERTY = "under_poverty_line" # Col names for temporary df, never written to bq -AMOUNT = 'amount' -POP_SUFFIX = 'pop' -HAS_ACS_ITEM_SUFFIX = 'has_acs_item' +AMOUNT = "amount" +POP_SUFFIX = "pop" +HAS_ACS_ITEM_SUFFIX = "has_acs_item" -HEALTH_INSURANCE_KEY = 'No health insurance coverage' -WITH_HEALTH_INSURANCE_KEY = 'With health insurance coverage' +HEALTH_INSURANCE_KEY = "No health insurance coverage" +WITH_HEALTH_INSURANCE_KEY = "With health insurance coverage" -NOT_IN_POVERTY_KEY = 'Income in the past 12 months at or above poverty level' -POVERTY_KEY = 'Income in the past 12 months below poverty level' +NOT_IN_POVERTY_KEY = "Income in the past 12 months at or above poverty level" +POVERTY_KEY = "Income in the past 12 months below poverty level" -HEALTH_INSURANCE_MEASURE = 'health_insurance' -POVERTY_MEASURE = 'poverty' +HEALTH_INSURANCE_MEASURE = "health_insurance" +POVERTY_MEASURE = "poverty" ACS_ITEMS_2021_AND_EARLIER = { HEALTH_INSURANCE_MEASURE: AcsItem( @@ -272,21 +272,21 @@ def update_col_types(df): class AcsCondition(DataSource): def get_filename_race(self, measure, race, is_county, year): - geo = 'COUNTY' if is_county else 'STATE' + geo = "COUNTY" if is_county else "STATE" race = race.replace(" ", "_").upper() - return f'{year}-{measure.upper()}_BY_RACE_{geo}_{race}.json' + return f"{year}-{measure.upper()}_BY_RACE_{geo}_{race}.json" def get_filename_sex(self, measure, is_county, year): - geo = 'COUNTY' if is_county else 'STATE' - return f'{year}-{measure.upper()}_BY_SEX_{geo}.json' + geo = "COUNTY" if is_county else "STATE" + return f"{year}-{measure.upper()}_BY_SEX_{geo}.json" @staticmethod def get_id(): - return 'ACS_CONDITION' + return "ACS_CONDITION" @staticmethod def get_table_name(): - return 'acs_condition' + return "acs_condition" # Uploads the ACS data to GCS by providing # the ACS Base URL @@ -300,7 +300,7 @@ def get_table_name(): # FileDiff = If the data has changed by diffing the old run vs the new run. 
def upload_to_gcs(self, bucket, **attrs): - year = self.get_attr(attrs, 'year') + year = self.get_attr(attrs, "year") self.year = year self.base_url = ACS_URLS_MAP[year] @@ -344,7 +344,7 @@ def upload_to_gcs(self, bucket, **attrs): def write_to_bq(self, dataset, gcs_bucket, **attrs): - year = self.get_attr(attrs, 'year') + year = self.get_attr(attrs, "year") self.year = year self.base_url = ACS_URLS_MAP[year] if int(year) < 2022: @@ -378,7 +378,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): suffixes_current_only = [ std_col.POP_PCT_SUFFIX, std_col.RAW_SUFFIX, # numerator counts - f'{POP_SUFFIX}_{std_col.RAW_SUFFIX}', # denominator counts + f"{POP_SUFFIX}_{std_col.RAW_SUFFIX}", # denominator counts ] for [demo, geo], df in dfs.items(): @@ -474,7 +474,7 @@ def get_raw_data(self, demo, geo, metadata, acs_items, gcs_bucket): concept_dfs.append(concept_df) concept_df = pd.concat(concept_dfs) - df = pd.merge(df, concept_df, on=merge_cols, how='outer') + df = pd.merge(df, concept_df, on=merge_cols, how="outer") return df @@ -495,7 +495,7 @@ def get_raw_data(self, demo, geo, metadata, acs_items, gcs_bucket): var_map, ) - df = pd.merge(df, concept_df, on=merge_cols, how='outer') + df = pd.merge(df, concept_df, on=merge_cols, how="outer") return df @@ -523,7 +523,7 @@ def generate_df_for_concept(self, measure, acs_item, df, demo, geo, concept, var # `"label": "Estimate!!Total:!!19 to 64 years:!!No health insurance coverage"` # we take the std_col.AGE_COL first, and the AMOUNT second # (The Estimate and Total keys are stripped off in the standardize frame function) - tmp_amount_key = 'tmp_amount_key' + tmp_amount_key = "tmp_amount_key" if measure == POVERTY_MEASURE: group_cols = [tmp_amount_key, std_col.SEX_COL, std_col.AGE_COL] elif measure == HEALTH_INSURANCE_MEASURE: @@ -548,7 +548,7 @@ def generate_df_for_concept(self, measure, acs_item, df, demo, geo, concept, var df_with_without[tmp_amount_key] == acs_item.does_not_have_condition_key ].reset_index(drop=True) - without_condition_raw_count = generate_column_name(measure, 'without') + without_condition_raw_count = generate_column_name(measure, "without") df_without_condition = df_without_condition.rename(columns={AMOUNT: without_condition_raw_count}) raw_count = generate_column_name(measure, HAS_ACS_ITEM_SUFFIX) @@ -565,7 +565,7 @@ def generate_df_for_concept(self, measure, acs_item, df, demo, geo, concept, var # Generate the population for each condition by adding together # the raw counts of people with and without the condition. - population_df = pd.merge(df_without_condition, df_with_condition, on=merge_cols, how='left') + population_df = pd.merge(df_without_condition, df_with_condition, on=merge_cols, how="left") population = generate_column_name(measure, POP_SUFFIX) population_df[[raw_count, without_condition_raw_count]] = population_df[ [raw_count, without_condition_raw_count] @@ -575,7 +575,7 @@ def generate_df_for_concept(self, measure, acs_item, df, demo, geo, concept, var # Merge the population df back into the df of people with the condition # to create our main df. 
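        # (Editor's note: illustrative arithmetic for the denominator merged
        # below — the counts are made up:)
        #   raw_count (with condition, e.g. uninsured)  = 120
        #   without_condition_raw_count (insured)       = 880
        #   population = 120 + 880                      = 1000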
- df = pd.merge(df_with_condition, population_df, on=merge_cols, how='left') + df = pd.merge(df_with_condition, population_df, on=merge_cols, how="left") df = df[merge_cols + [population, raw_count]] df = update_col_types(df) @@ -671,13 +671,13 @@ def post_process(self, df, demo, geo, acs_items, health_insurance_race_to_concep df = generate_pct_share_col_without_unknowns(df, pct_share_cols, demo_col, all_val) for item in acs_items.values(): - pct_rel_inequity_col = f'{item.bq_prefix}_{std_col.PCT_REL_INEQUITY_SUFFIX}' + pct_rel_inequity_col = f"{item.bq_prefix}_{std_col.PCT_REL_INEQUITY_SUFFIX}" # PCT_REL_INEQUITY df = generate_pct_rel_inequity_col( df, - f'{item.bq_prefix}_{std_col.PCT_SHARE_SUFFIX}', - f'{item.bq_prefix}_{std_col.POP_PCT_SUFFIX}', + f"{item.bq_prefix}_{std_col.PCT_SHARE_SUFFIX}", + f"{item.bq_prefix}_{std_col.POP_PCT_SUFFIX}", pct_rel_inequity_col, ) all_columns.append(pct_rel_inequity_col) @@ -690,7 +690,7 @@ def post_process(self, df, demo, geo, acs_items, health_insurance_race_to_concep acs_item.bq_prefix, std_col.RAW_SUFFIX ), # Rename denominators e.g. health_insurance_pop to uninsurance_population_estimated_total - generate_column_name(measure, POP_SUFFIX): f'{acs_item.bq_prefix}_{POP_SUFFIX}_{std_col.RAW_SUFFIX}', + generate_column_name(measure, POP_SUFFIX): f"{acs_item.bq_prefix}_{POP_SUFFIX}_{std_col.RAW_SUFFIX}", } all_columns.extend(rename_map.values()) diff --git a/python/datasources/acs_population.py b/python/datasources/acs_population.py index 5fe667f10b..4ce0ed7110 100644 --- a/python/datasources/acs_population.py +++ b/python/datasources/acs_population.py @@ -18,20 +18,20 @@ ACS_URLS_MAP = { - ACS_EARLIEST_YEAR: 'https://api.census.gov/data/2009/acs/acs5', - '2010': 'https://api.census.gov/data/2010/acs/acs5', - '2011': 'https://api.census.gov/data/2011/acs/acs5', - '2012': 'https://api.census.gov/data/2012/acs/acs5', - '2013': 'https://api.census.gov/data/2013/acs/acs5', - '2014': 'https://api.census.gov/data/2014/acs/acs5', - '2015': 'https://api.census.gov/data/2015/acs/acs5', - '2016': 'https://api.census.gov/data/2016/acs/acs5', - '2017': 'https://api.census.gov/data/2017/acs/acs5', - '2018': 'https://api.census.gov/data/2018/acs/acs5', - '2019': 'https://api.census.gov/data/2019/acs/acs5', - '2020': 'https://api.census.gov/data/2020/acs/acs5', - '2021': 'https://api.census.gov/data/2021/acs/acs5', - ACS_CURRENT_YEAR: 'https://api.census.gov/data/2022/acs/acs5', + ACS_EARLIEST_YEAR: "https://api.census.gov/data/2009/acs/acs5", + "2010": "https://api.census.gov/data/2010/acs/acs5", + "2011": "https://api.census.gov/data/2011/acs/acs5", + "2012": "https://api.census.gov/data/2012/acs/acs5", + "2013": "https://api.census.gov/data/2013/acs/acs5", + "2014": "https://api.census.gov/data/2014/acs/acs5", + "2015": "https://api.census.gov/data/2015/acs/acs5", + "2016": "https://api.census.gov/data/2016/acs/acs5", + "2017": "https://api.census.gov/data/2017/acs/acs5", + "2018": "https://api.census.gov/data/2018/acs/acs5", + "2019": "https://api.census.gov/data/2019/acs/acs5", + "2020": "https://api.census.gov/data/2020/acs/acs5", + "2021": "https://api.census.gov/data/2021/acs/acs5", + ACS_CURRENT_YEAR: "https://api.census.gov/data/2022/acs/acs5", } # For the 2022 ACS, the variable names in the metadata are title-cased, not all caps @@ -126,24 +126,24 @@ # granular age buckets when looking at all races than when breaking down by # race. 
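# (Editor's note: a hedged sketch of how these bucket helpers are applied —
# mapping the granular ACS age ranges onto coarser buckets. The DataFrame is
# illustrative, and this assumes get_decade_age_bucket is already defined.)
import pandas as pd

_example = pd.DataFrame({"age": ["0-4", "15-17", "85+"]})
_example["age"] = _example["age"].map(get_decade_age_bucket)
# -> ["0-9", "10-19", "80+"]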
def get_decade_age_bucket(age_range): - if age_range in {'0-4', '5-9'}: - return '0-9' - elif age_range in {'10-14', '15-17', '18-19'}: - return '10-19' - elif age_range in {'20-20', '21-21', '22-24', '25-29'}: - return '20-29' - elif age_range in {'30-34', '35-39'}: - return '30-39' - elif age_range in {'40-44', '45-49'}: - return '40-49' - elif age_range in {'50-54', '55-59'}: - return '50-59' - elif age_range in {'60-61', '62-64', '65-66', '67-69'}: - return '60-69' - elif age_range in {'70-74', '75-79'}: - return '70-79' - elif age_range in {'80-84', '85+'}: - return '80+' + if age_range in {"0-4", "5-9"}: + return "0-9" + elif age_range in {"10-14", "15-17", "18-19"}: + return "10-19" + elif age_range in {"20-20", "21-21", "22-24", "25-29"}: + return "20-29" + elif age_range in {"30-34", "35-39"}: + return "30-39" + elif age_range in {"40-44", "45-49"}: + return "40-49" + elif age_range in {"50-54", "55-59"}: + return "50-59" + elif age_range in {"60-61", "62-64", "65-66", "67-69"}: + return "60-69" + elif age_range in {"70-74", "75-79"}: + return "70-79" + elif age_range in {"80-84", "85+"}: + return "80+" elif age_range == std_col.ALL_VALUE: return std_col.ALL_VALUE @@ -153,119 +153,119 @@ def get_ahr_standard_age_bucket(age_range): return std_col.ALL_VALUE # buckets for most AHR topics elif age_range in { - '18-19', - '20-24', - '20-20', - '21-21', - '22-24', - '25-29', - '30-34', - '35-44', - '35-39', - '40-44', + "18-19", + "20-24", + "20-20", + "21-21", + "22-24", + "25-29", + "30-34", + "35-44", + "35-39", + "40-44", }: - return '18-44' - elif age_range in {'45-54', '45-49', '50-54', '55-64', '55-59', '60-61', '62-64'}: - return '45-64' + return "18-44" + elif age_range in {"45-54", "45-49", "50-54", "55-64", "55-59", "60-61", "62-64"}: + return "45-64" elif age_range in { - '65-74', - '65-66', - '67-69', - '70-74', - '75-84', - '75-79', - '80-84', - '85+', + "65-74", + "65-66", + "67-69", + "70-74", + "75-84", + "75-79", + "80-84", + "85+", }: - return '65+' + return "65+" def get_ahr_decade_plus_5_age_bucket(age_range): if age_range == std_col.ALL_VALUE: return std_col.ALL_VALUE # buckets for Suicide metrics - elif age_range in {'15-17', '18-19', '20-20', '21-21', '22-24'}: - return '15-24' - elif age_range in {'25-29', '30-34'}: - return '25-34' - elif age_range in {'35-39', '40-44'}: - return '35-44' - elif age_range in {'45-49', '50-54'}: - return '45-54' - elif age_range in {'55-59', '60-61', '62-64'}: - return '55-64' - elif age_range in {'65-66', '67-69', '70-74'}: - return '65-74' - elif age_range in {'75-79', '80-84'}: - return '75-84' - elif age_range in {'85+'}: - return '85+' + elif age_range in {"15-17", "18-19", "20-20", "21-21", "22-24"}: + return "15-24" + elif age_range in {"25-29", "30-34"}: + return "25-34" + elif age_range in {"35-39", "40-44"}: + return "35-44" + elif age_range in {"45-49", "50-54"}: + return "45-54" + elif age_range in {"55-59", "60-61", "62-64"}: + return "55-64" + elif age_range in {"65-66", "67-69", "70-74"}: + return "65-74" + elif age_range in {"75-79", "80-84"}: + return "75-84" + elif age_range in {"85+"}: + return "85+" def get_ahr_suicide_denominators_age_bucket(age_range): - if age_range in {'0-4', '5-9', '10-14'}: - return '0-14' + if age_range in {"0-4", "5-9", "10-14"}: + return "0-14" elif age_range in { - '15-17', - '18-19', - '20-20', - '21-21', - '22-24', - '25-29', - '30-34', - '35-39', - '40-44', - '45-49', - '50-54', - '55-59', - '60-61', - '62-64', - '65-66', - '67-69', - '70-74', - '75-79', - '80-84', - '85+', + 
"15-17", + "18-19", + "20-20", + "21-21", + "22-24", + "25-29", + "30-34", + "35-39", + "40-44", + "45-49", + "50-54", + "55-59", + "60-61", + "62-64", + "65-66", + "67-69", + "70-74", + "75-79", + "80-84", + "85+", }: - return '15+' + return "15+" def get_ahr_voter_age_bucket(age_range): if age_range == std_col.ALL_VALUE: return std_col.ALL_VALUE # buckets for Voter Participation - elif age_range in {'18-19', '20-20', '21-21', '22-24'}: - return '18-24' - elif age_range in {'25-29', '30-34'}: - return '25-34' - elif age_range in {'35-39', '40-44'}: - return '35-44' - elif age_range in {'45-49', '50-54'}: - return '45-54' - elif age_range in {'55-59', '60-61', '62-64'}: - return '55-64' + elif age_range in {"18-19", "20-20", "21-21", "22-24"}: + return "18-24" + elif age_range in {"25-29", "30-34"}: + return "25-34" + elif age_range in {"35-39", "40-44"}: + return "35-44" + elif age_range in {"45-49", "50-54"}: + return "45-54" + elif age_range in {"55-59", "60-61", "62-64"}: + return "55-64" # buckets for BJS prisoners 2020 def get_prison_age_bucket(age_range): - if age_range in {'18-19'}: + if age_range in {"18-19"}: return age_range - elif age_range in {'20-20', '21-21', '22-24'}: - return '20-24' + elif age_range in {"20-20", "21-21", "22-24"}: + return "20-24" elif age_range in { - '25-29', - '30-34', - '35-39', - '40-44', - '45-49', - '50-54', - '55-59', + "25-29", + "30-34", + "35-39", + "40-44", + "45-49", + "50-54", + "55-59", }: return age_range - elif age_range in {'60-61', '62-64'}: - return '60-64' - elif age_range in {'65-66', '67-69', '70-74', '75-79', '80-84', '85+'}: - return '65+' + elif age_range in {"60-61", "62-64"}: + return "60-64" + elif age_range in {"65-66", "67-69", "70-74", "75-79", "80-84", "85+"}: + return "65+" elif age_range == std_col.ALL_VALUE: return std_col.ALL_VALUE @@ -275,46 +275,46 @@ def get_prison_age_bucket(age_range): def get_jail_age_bucket(age_range): if age_range in { - '0-4', - '5-9', - '10-14', - '15-17', + "0-4", + "5-9", + "10-14", + "15-17", }: - return '0-17' + return "0-17" elif age_range in { - '18-19', - '20-20', - '21-21', - '22-24', - '25-29', - '30-34', - '35-39', - '40-44', - '45-49', - '50-54', - '55-59', - '60-61', - '62-64', - '65-66', - '67-69', - '70-74', - '75-79', - '80-84', - '85+', + "18-19", + "20-20", + "21-21", + "22-24", + "25-29", + "30-34", + "35-39", + "40-44", + "45-49", + "50-54", + "55-59", + "60-61", + "62-64", + "65-66", + "67-69", + "70-74", + "75-79", + "80-84", + "85+", }: - return '18+' + return "18+" elif age_range == std_col.ALL_VALUE: return std_col.ALL_VALUE def get_phrma_age_bucket(age_range): - if age_range in {'18-19', '20-20', '21-21', '22-24', '25-29', '30-34', '35-39'}: - return '18-39' - elif age_range in {'40-44', '45-49', '50-54', '55-59', '60-61', '62-64'}: - return '40-64' - elif age_range in {'65-66', '67-69'}: - return '65-69' - elif age_range in {'70-74', '75-79', '80-84', '85+', std_col.ALL_VALUE}: + if age_range in {"18-19", "20-20", "21-21", "22-24", "25-29", "30-34", "35-39"}: + return "18-39" + elif age_range in {"40-44", "45-49", "50-54", "55-59", "60-61", "62-64"}: + return "40-64" + elif age_range in {"65-66", "67-69"}: + return "65-69" + elif age_range in {"70-74", "75-79", "80-84", "85+", std_col.ALL_VALUE}: return age_range @@ -426,7 +426,7 @@ def write_to_bq(self, dataset, gcs_bucket): gcs_to_bq_util.add_df_to_bq( df_for_time_series, dataset, - f'{table_name}_time_series', + f"{table_name}_time_series", column_types=column_types, overwrite=overwrite, ) @@ -463,7 +463,7 @@ def 
build_frames_for_this_year(self, gcs_bucket: str): self.get_table_name_by_sex_age_race(): self.get_sex_by_age_and_race(var_map, sex_by_age_frames), } - frames[f'by_sex_age_{self.get_geo_name()}'] = self.get_by_sex_age( + frames[f"by_sex_age_{self.get_geo_name()}"] = self.get_by_sex_age( frames[self.get_table_name_by_sex_age_race()], get_decade_age_bucket ) @@ -498,8 +498,8 @@ def build_frames_for_this_year(self, gcs_bucket: str): ) by_sex_phrma_age = self.get_by_sex_age(frames[self.get_table_name_by_sex_age_race()], get_phrma_age_bucket) - frames[f'by_age_{self.get_geo_name()}'] = self.get_by_age( - frames[f'by_sex_age_{self.get_geo_name()}'], + frames[f"by_age_{self.get_geo_name()}"] = self.get_by_age( + frames[f"by_sex_age_{self.get_geo_name()}"], by_sex_standard_age_ahr, by_sex_suicide_denominator_ahr, by_sex_decade_plus_5_age_ahr, @@ -509,13 +509,13 @@ def build_frames_for_this_year(self, gcs_bucket: str): by_sex_phrma_age, ) - frames[f'by_sex_{self.get_geo_name()}'] = self.get_by_sex(frames[self.get_table_name_by_sex_age_race()]) + frames[f"by_sex_{self.get_geo_name()}"] = self.get_by_sex(frames[self.get_table_name_by_sex_age_race()]) # Generate national level datasets based on state datasets if not self.county_level: - for demo in ['age', 'race', 'sex']: - state_table_name = f'by_{demo}_state' - frames[f'by_{demo}_national'] = generate_national_dataset_with_all_states( + for demo in ["age", "race", "sex"]: + state_table_name = f"by_{demo}_state" + frames[f"by_{demo}_national"] = generate_national_dataset_with_all_states( frames[state_table_name], demo ) @@ -525,7 +525,7 @@ def get_table_geo_suffix(self): return "_county" if self.county_level else "_state" def get_geo_name(self): - return 'county' if self.county_level else 'state' + return "county" if self.county_level else "state" def get_fips_col(self): return std_col.COUNTY_FIPS_COL if self.county_level else std_col.STATE_FIPS_COL @@ -546,7 +546,7 @@ def get_filename(self, concept: str): filename = self.add_filename_suffix(concept.replace(" ", "_")) - return f'{self.year}-{filename}' + return f"{self.year}-{filename}" def add_filename_suffix(self, root_name): """Adds geography and file type suffix to the root name. 
@@ -577,7 +577,7 @@ def standardize_race_exclude_hispanic(self, df): return pd.DataFrame(columns=[std_col.RACE_CATEGORY_ID_COL] + self.base_group_by_cols) def get_race_category_id_exclude_hispanic(row): - if row[std_col.HISPANIC_COL] == 'Hispanic or Latino': + if row[std_col.HISPANIC_COL] == "Hispanic or Latino": return Race.HISP.value else: return RACE_STRING_TO_CATEGORY_ID_EXCLUDE_HISP[row[std_col.RACE_COL]] @@ -603,7 +603,7 @@ def standardize_race_include_hispanic(self, df): by_hispanic = by_hispanic.groupby(group_by_cols).sum(numeric_only=True).reset_index() if not by_hispanic.empty: by_hispanic[std_col.RACE_CATEGORY_ID_COL] = by_hispanic.apply( - lambda r: (Race.HISP.value if r[std_col.HISPANIC_COL] == 'Hispanic or Latino' else Race.NH.value), + lambda r: (Race.HISP.value if r[std_col.HISPANIC_COL] == "Hispanic or Latino" else Race.NH.value), axis=1, ) by_hispanic.drop(std_col.HISPANIC_COL, axis=1, inplace=True) @@ -842,7 +842,7 @@ def get_table_name(): @staticmethod def get_id(): """Returns the data source's unique id.""" - return 'ACS_POPULATION' + return "ACS_POPULATION" def upload_to_gcs(self, gcs_bucket, **attrs): @@ -860,7 +860,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): """Called once per year url from DAG, creates a county and non-county ingester to proceed with processing the time-series tables and potentially single year tables""" - year = self.get_attr(attrs, 'year') + year = self.get_attr(attrs, "year") for is_county in [True, False]: ingester = ACSPopulationIngester(is_county, year) @@ -877,9 +877,9 @@ def GENERATE_NATIONAL_DATASET(state_df, states_to_include, demographic_breakdown df = df.drop(columns=std_col.POPULATION_PCT_COL) breakdown_map = { - 'race': std_col.RACE_CATEGORY_ID_COL, - 'age': std_col.AGE_COL, - 'sex': std_col.SEX_COL, + "race": std_col.RACE_CATEGORY_ID_COL, + "age": std_col.AGE_COL, + "sex": std_col.SEX_COL, } df = df.groupby(breakdown_map[demographic_breakdown_category]).sum(numeric_only=True).reset_index() @@ -893,13 +893,13 @@ def GENERATE_NATIONAL_DATASET(state_df, states_to_include, demographic_breakdown std_col.POPULATION_COL, std_col.POPULATION_PCT_COL, ] - if demographic_breakdown_category == 'race': + if demographic_breakdown_category == "race": needed_cols.extend(std_col.RACE_COLUMNS) else: needed_cols.append(breakdown_map[demographic_breakdown_category]) total_val = std_col.ALL_VALUE - if demographic_breakdown_category == 'race': + if demographic_breakdown_category == "race": total_val = Race.ALL.value df = generate_pct_share_col_without_unknowns( @@ -909,7 +909,7 @@ def GENERATE_NATIONAL_DATASET(state_df, states_to_include, demographic_breakdown total_val, ) - if demographic_breakdown_category == 'race': + if demographic_breakdown_category == "race": std_col.add_race_columns_from_category_id(df) df[std_col.STATE_FIPS_COL] = df[std_col.STATE_FIPS_COL].astype(str) diff --git a/python/datasources/age_adjust_cdc_hiv.py b/python/datasources/age_adjust_cdc_hiv.py index f5702ff68f..3b8acc643b 100644 --- a/python/datasources/age_adjust_cdc_hiv.py +++ b/python/datasources/age_adjust_cdc_hiv.py @@ -25,30 +25,30 @@ Race.MULTI_NH.value, } -EXPECTED_DEATHS = 'expected_deaths' +EXPECTED_DEATHS = "expected_deaths" class AgeAdjustCDCHiv(DataSource): @staticmethod def get_id(): - return 'AGE_ADJUST_CDC_HIV' + return "AGE_ADJUST_CDC_HIV" @staticmethod def get_table_name(): - return 'cdc_hiv_data' + return "cdc_hiv_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for AgeAdjustCDCHiv') + 
raise NotImplementedError("upload_to_gcs should not be called for AgeAdjustCDCHiv") def write_to_bq(self, dataset, gcs_bucket, **attrs): for geo in [NATIONAL_LEVEL, STATE_LEVEL]: # only merges current year age_adjusted_df = self.generate_age_adjustment(geo) - only_race_source = f'race_and_ethnicity_{geo}_current' - table_name = f'{only_race_source}-with_age_adjust' + only_race_source = f"race_and_ethnicity_{geo}_current" + table_name = f"{only_race_source}-with_age_adjust" only_race_df = gcs_to_bq_util.load_df_from_bigquery( - 'cdc_hiv_data', + "cdc_hiv_data", only_race_source, dtype={std_col.STATE_FIPS_COL: str}, ) @@ -89,10 +89,10 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): def generate_age_adjustment(self, geo): race_age_df = gcs_to_bq_util.load_df_from_bigquery( - 'cdc_hiv_data', - f'by_race_age_{geo}', + "cdc_hiv_data", + f"by_race_age_{geo}", dtype={ - 'state_fips': str, + "state_fips": str, }, ) @@ -132,7 +132,7 @@ def merge_age_adjusted(df, age_adjusted_df): df = df.reset_index(drop=True) age_adjusted_df = age_adjusted_df.reset_index(drop=True) - return pd.merge(df, age_adjusted_df, how='left', on=merge_cols) + return pd.merge(df, age_adjusted_df, how="left", on=merge_cols) def get_expected_col(race_and_age_df, population_df, expected_col, raw_number_col): @@ -150,7 +150,7 @@ def get_expected_col(race_and_age_df, population_df, expected_col, raw_number_co raw_number_col: string column name to get the raw number of cases to age adjust from""" - this_pop_size, ref_pop_size = 'this_pop_size', 'ref_pop_size' + this_pop_size, ref_pop_size = "this_pop_size", "ref_pop_size" def get_expected(row): """Calculates the expected value of each race/age split based on the @@ -158,7 +158,7 @@ def get_expected(row): split.""" if not row[ref_pop_size]: - raise ValueError(f'Population size for {REFERENCE_POPULATION} demographic is 0 or nil') + raise ValueError(f"Population size for {REFERENCE_POPULATION} demographic is 0 or nil") if not row[raw_number_col]: return None @@ -215,7 +215,7 @@ def get_age_adjusted_ratios(row): ) return row - base_pop_expected_deaths = 'base_pop_expected_deaths' + base_pop_expected_deaths = "base_pop_expected_deaths" groupby_cols = [ std_col.STATE_FIPS_COL, diff --git a/python/datasources/age_adjust_cdc_restricted.py b/python/datasources/age_adjust_cdc_restricted.py index aa41dd47f6..af6b9468d1 100644 --- a/python/datasources/age_adjust_cdc_restricted.py +++ b/python/datasources/age_adjust_cdc_restricted.py @@ -25,21 +25,21 @@ Race.ASIAN_NH.value, } -EXPECTED_HOSPS = 'expected_hosps' -EXPECTED_DEATHS = 'expected_deaths' +EXPECTED_HOSPS = "expected_hosps" +EXPECTED_DEATHS = "expected_deaths" class AgeAdjustCDCRestricted(DataSource): @staticmethod def get_id(): - return 'AGE_ADJUST_CDC_RESTRICTED' + return "AGE_ADJUST_CDC_RESTRICTED" @staticmethod def get_table_name(): - return 'cdc_restricted_data' + return "cdc_restricted_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for AgeAdjustCDCRestricted') + raise NotImplementedError("upload_to_gcs should not be called for AgeAdjustCDCRestricted") def write_to_bq(self, dataset, gcs_bucket, **attrs): for time_series in [False, True]: @@ -47,20 +47,20 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): age_adjusted_df = self.generate_age_adjustment(geo, time_series) - only_race = f'by_race_{geo}_processed' - table_name = f'{only_race}-with_age_adjust' + only_race = f"by_race_{geo}_processed" + table_name = f"{only_race}-with_age_adjust" if time_series: - 
table_name += '_time_series' - only_race += '_time_series' + table_name += "_time_series" + only_race += "_time_series" - only_race_df = gcs_to_bq_util.load_df_from_bigquery('cdc_restricted_data', only_race) + only_race_df = gcs_to_bq_util.load_df_from_bigquery("cdc_restricted_data", only_race) df = merge_age_adjusted(only_race_df, age_adjusted_df, time_series) column_types = get_col_types(df) - column_types[std_col.COVID_HOSP_RATIO_AGE_ADJUSTED] = 'FLOAT' - column_types[std_col.COVID_DEATH_RATIO_AGE_ADJUSTED] = 'FLOAT' + column_types[std_col.COVID_HOSP_RATIO_AGE_ADJUSTED] = "FLOAT" + column_types[std_col.COVID_DEATH_RATIO_AGE_ADJUSTED] = "FLOAT" # Clean up column names. self.clean_frame_column_names(df) @@ -70,14 +70,14 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=column_types) def generate_age_adjustment(self, geo, time_series): - print(f'age adjusting {geo} with time_series= {time_series}') - with_race_age = 'by_race_age_state' + print(f"age adjusting {geo} with time_series= {time_series}") + with_race_age = "by_race_age_state" with_race_age_df = gcs_to_bq_util.load_df_from_bigquery( - 'cdc_restricted_data', with_race_age, dtype={'state_fips': str} + "cdc_restricted_data", with_race_age, dtype={"state_fips": str} ) pop_df = gcs_to_bq_util.load_df_from_bigquery( - 'census_pop_estimates', 'race_and_ethnicity', dtype={'state_fips': str} + "census_pop_estimates", "race_and_ethnicity", dtype={"state_fips": str} ) # Only get the covid data from states we have population data for @@ -142,7 +142,7 @@ def merge_age_adjusted(df, age_adjusted_df, time_series): df = df.reset_index(drop=True) age_adjusted_df = age_adjusted_df.reset_index(drop=True) - return pd.merge(df, age_adjusted_df, how='left', on=merge_cols) + return pd.merge(df, age_adjusted_df, how="left", on=merge_cols) def get_expected_col(race_and_age_df, population_df, expected_col, raw_number_col): @@ -160,7 +160,7 @@ def get_expected_col(race_and_age_df, population_df, expected_col, raw_number_co raw_number_col: string column name to get the raw number of cases to age adjust from""" - this_pop_size, ref_pop_size = 'this_pop_size', 'ref_pop_size' + this_pop_size, ref_pop_size = "this_pop_size", "ref_pop_size" def get_expected(row): """Calculates the expected value of each race/age split based on the @@ -168,7 +168,7 @@ def get_expected(row): split.""" if not row[ref_pop_size]: - raise ValueError(f'Population size for {REFERENCE_POPULATION} demographic is 0 or nil') + raise ValueError(f"Population size for {REFERENCE_POPULATION} demographic is 0 or nil") if not row[raw_number_col]: return None @@ -228,7 +228,7 @@ def get_age_adjusted_ratios(row): return row - base_pop_expected_deaths, base_pop_expected_hosps = 'base_pop_expected_deaths', 'base_pop_expected_hosps' + base_pop_expected_deaths, base_pop_expected_hosps = "base_pop_expected_deaths", "base_pop_expected_hosps" groupby_cols = [std_col.STATE_FIPS_COL, std_col.STATE_NAME_COL, std_col.RACE_CATEGORY_ID_COL] if time_series: diff --git a/python/datasources/bjs_incarceration.py b/python/datasources/bjs_incarceration.py index 4cc2f5b1fe..996c6b6b32 100644 --- a/python/datasources/bjs_incarceration.py +++ b/python/datasources/bjs_incarceration.py @@ -142,7 +142,7 @@ def generate_raw_breakdown(demo, geo_level, table_list): df_prison = df_prison.reset_index(drop=True) merge_cols = [std_col.STATE_NAME_COL, demo_for_flip] - df = pd.merge(df_prison, df_jail, how='outer', on=merge_cols) + df = pd.merge(df_prison, 
df_jail, how="outer", on=merge_cols) return df @@ -175,11 +175,11 @@ def generate_raw_national_age_breakdown(table_list): # get and store the total value from the last row total_raw_prison_value = prison_10.loc[ - prison_10[std_col.AGE_COL] == 'Number of sentenced prisoners', PRISON_PCT_SHARE_COL + prison_10[std_col.AGE_COL] == "Number of sentenced prisoners", PRISON_PCT_SHARE_COL ].values[0] # drop the last row and just keep normal rows - df_prison = prison_10.loc[prison_10[std_col.AGE_COL] != 'Number of sentenced prisoners'] + df_prison = prison_10.loc[prison_10[std_col.AGE_COL] != "Number of sentenced prisoners"] # standardize df_prison with ADULT RAW # / AGE / USA df_prison = merge_state_ids(df_prison) @@ -191,7 +191,7 @@ def generate_raw_national_age_breakdown(table_list): df_prison = df_prison[[RAW_PRISON_COL, std_col.STATE_NAME_COL, std_col.AGE_COL, PRISON_PCT_SHARE_COL]] merge_cols = [std_col.STATE_NAME_COL, std_col.AGE_COL] - df = pd.merge(df_prison, df_jail, how='outer', on=merge_cols) + df = pd.merge(df_prison, df_jail, how="outer", on=merge_cols) return df @@ -251,18 +251,18 @@ def post_process(df, breakdown, geo, children_tables): prison_13, jail_6 = children_tables # get RAW JAIL for 0-17 and melt to set as new property for "All" rows for every demo-breakdowns - jail_6 = jail_6.rename(columns={'0-17': all_val}) + jail_6 = jail_6.rename(columns={"0-17": all_val}) jail_6 = jail_6[[std_col.STATE_NAME_COL, all_val]] jail_6 = cols_to_rows(jail_6, [all_val], group_col, TOTAL_CHILDREN_COL) - jail_6 = jail_6.rename(columns={TOTAL_CHILDREN_COL: f'{TOTAL_CHILDREN_COL}_jail'}) + jail_6 = jail_6.rename(columns={TOTAL_CHILDREN_COL: f"{TOTAL_CHILDREN_COL}_jail"}) # get RAW PRISON for 0-17 and set as new property for "All" rows for every demo-breakdowns - prison_13 = prison_13.rename(columns={RAW_PRISON_COL: f'{TOTAL_CHILDREN_COL}_prison', "age": group_col}) + prison_13 = prison_13.rename(columns={RAW_PRISON_COL: f"{TOTAL_CHILDREN_COL}_prison", "age": group_col}) prison_13[group_col] = all_val # sum confined children in prison+jail df_confined = pd.merge(jail_6, prison_13, how="outer", on=[std_col.STATE_NAME_COL, group_col]) - df_confined[TOTAL_CHILDREN_COL] = df_confined[[f'{TOTAL_CHILDREN_COL}_jail', f'{TOTAL_CHILDREN_COL}_prison']].sum( + df_confined[TOTAL_CHILDREN_COL] = df_confined[[f"{TOTAL_CHILDREN_COL}_jail", f"{TOTAL_CHILDREN_COL}_prison"]].sum( axis="columns", numeric_only=True ) df_confined = df_confined[[std_col.STATE_NAME_COL, TOTAL_CHILDREN_COL, group_col]] @@ -284,14 +284,14 @@ def post_process(df, breakdown, geo, children_tables): class BJSIncarcerationData(DataSource): @staticmethod def get_id(): - return 'BJS_INCARCERATION_DATA' + return "BJS_INCARCERATION_DATA" @staticmethod def get_table_name(): - return 'bjs_incarceration_data' + return "bjs_incarceration_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for BJSIncarcerationData') + raise NotImplementedError("upload_to_gcs should not be called for BJSIncarcerationData") def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **attrs): """ @@ -314,12 +314,12 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at # BJS tables needed per breakdown table_lookup = { - f'{std_col.AGE_COL}_{NATIONAL_LEVEL}_{CURRENT}': [prisoners_10, jail_6], - f'{std_col.AGE_COL}_{STATE_LEVEL}_{CURRENT}': [prisoners_2, prisoners_23, jail_6], - f'{std_col.RACE_OR_HISPANIC_COL}_{NATIONAL_LEVEL}_{CURRENT}': [prisoners_app_2, prisoners_23, 
jail_7], - f'{std_col.RACE_OR_HISPANIC_COL}_{STATE_LEVEL}_{CURRENT}': [prisoners_app_2, prisoners_23, jail_7], - f'{std_col.SEX_COL}_{NATIONAL_LEVEL}_{CURRENT}': [prisoners_2, prisoners_23, jail_6], - f'{std_col.SEX_COL}_{STATE_LEVEL}_{CURRENT}': [prisoners_2, prisoners_23, jail_6], + f"{std_col.AGE_COL}_{NATIONAL_LEVEL}_{CURRENT}": [prisoners_10, jail_6], + f"{std_col.AGE_COL}_{STATE_LEVEL}_{CURRENT}": [prisoners_2, prisoners_23, jail_6], + f"{std_col.RACE_OR_HISPANIC_COL}_{NATIONAL_LEVEL}_{CURRENT}": [prisoners_app_2, prisoners_23, jail_7], + f"{std_col.RACE_OR_HISPANIC_COL}_{STATE_LEVEL}_{CURRENT}": [prisoners_app_2, prisoners_23, jail_7], + f"{std_col.SEX_COL}_{NATIONAL_LEVEL}_{CURRENT}": [prisoners_2, prisoners_23, jail_6], + f"{std_col.SEX_COL}_{STATE_LEVEL}_{CURRENT}": [prisoners_2, prisoners_23, jail_6], } children_tables = [prisoners_13, jail_6] diff --git a/python/datasources/cawp_time.py b/python/datasources/cawp_time.py index 368aad9062..b25b3d6f2c 100644 --- a/python/datasources/cawp_time.py +++ b/python/datasources/cawp_time.py @@ -135,29 +135,29 @@ def get_stleg_url(id: str): state info pages, for example: https://cawp.rutgers.edu/facts/state-state-information/alabama """ - return f'https://cawp.rutgers.edu/tablefield/export/paragraph/{id}/field_table/und/0' + return f"https://cawp.rutgers.edu/tablefield/export/paragraph/{id}/field_table/und/0" CAWP_MULTI = "Multiracial Alone" # CAWP labels CAWP_RACE_GROUPS_TO_STANDARD = { - 'Asian American/Pacific Islander': Race.ASIAN_PAC.value, - 'Latina': Race.HISP.value, - 'Middle Eastern/North African': Race.MENA.value, - 'Native American/Alaska Native/Native Hawaiian': Race.AIANNH.value, - 'Black': Race.BLACK.value, - 'White': Race.WHITE.value, - 'Unavailable': Race.UNKNOWN.value, - 'Other': Race.OTHER_STANDARD.value, + "Asian American/Pacific Islander": Race.ASIAN_PAC.value, + "Latina": Race.HISP.value, + "Middle Eastern/North African": Race.MENA.value, + "Native American/Alaska Native/Native Hawaiian": Race.AIANNH.value, + "Black": Race.BLACK.value, + "White": Race.WHITE.value, + "Unavailable": Race.UNKNOWN.value, + "Other": Race.OTHER_STANDARD.value, # will combine CAWP's "Multiracial Alone" with women who selected more than one specific race CAWP_MULTI: Race.MULTI.value, } AIAN_API_RACES = [ - 'Asian American/Pacific Islander', - 'Native American/Alaska Native/Native Hawaiian', + "Asian American/Pacific Islander", + "Native American/Alaska Native/Native Hawaiian", ] @@ -205,14 +205,14 @@ def get_stleg_url(id: str): class CAWPTimeData(DataSource): @staticmethod def get_id(): - return 'CAWP_TIME_DATA' + return "CAWP_TIME_DATA" @staticmethod def get_table_name(): - return 'cawp_time_data' + return "cawp_time_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for CAWPTimeData') + raise NotImplementedError("upload_to_gcs should not be called for CAWPTimeData") def write_to_bq(self, dataset, gcs_bucket, **attrs): base_df = self.generate_base_df() @@ -223,7 +223,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): gcs_to_bq_util.add_df_to_bq( df_names, dataset, - 'race_and_ethnicity_state_historical_names', + "race_and_ethnicity_state_historical_names", column_types=column_types, ) @@ -359,7 +359,7 @@ def generate_base_df(self): # Replace nulls with empty lists df.loc[df[col].isnull(), col] = df.loc[df[col].isnull(), col].apply(lambda x: []) # Convert lists to comma-separated strings - df[col] = df[col].apply(lambda item: ','.join(map(str, item))) + df[col] = 
df[col].apply(lambda item: ",".join(map(str, item))) # remove brackets and inner quotes, leaving just comma separated names df[names_cols] = df[names_cols].replace(["'", "[", "]"], "") @@ -555,9 +555,9 @@ def get_us_congress_totals_df(): for year in term_years: year = str(year) title = ( - f'{POSITION_LABELS[CONGRESS][term[TYPE]]}' if term[STATE] not in TERRITORY_POSTALS else "U.S. Del." + f"{POSITION_LABELS[CONGRESS][term[TYPE]]}" if term[STATE] not in TERRITORY_POSTALS else "U.S. Del." ) - full_name = f'{title} {legislator[NAME][FIRST]} {legislator[NAME][LAST]}' + full_name = f"{title} {legislator[NAME][FIRST]} {legislator[NAME][LAST]}" entry = { ID: legislator[ID]["govtrack"], NAME: full_name, @@ -634,7 +634,7 @@ def get_women_dfs(): columns "time_period" by year and "state_postal", "race_ethnicity" with specific CAWP race strings""" - df = gcs_to_bq_util.load_csv_as_df_from_data_dir('cawp_time', CAWP_LINE_ITEMS_FILE) + df = gcs_to_bq_util.load_csv_as_df_from_data_dir("cawp_time", CAWP_LINE_ITEMS_FILE) # keep only needed cols df = df[[ID, YEAR, STATE, FIRST_NAME, LAST_NAME, POSITION, RACE_ETH]] @@ -740,7 +740,7 @@ def get_state_leg_totals_df(): territory_dfs = [] for fips in TERRITORY_FIPS_LIST: - filename = f'cawp_state_leg_{fips}.csv' + filename = f"cawp_state_leg_{fips}.csv" territory_df = gcs_to_bq_util.load_csv_as_df_from_data_dir( "cawp_time", filename, dtype={"state_fips": str, "time_period": str} ) @@ -752,10 +752,10 @@ def get_state_leg_totals_df(): state_df = gcs_to_bq_util.load_csv_as_df_from_web(get_stleg_url(id), dtype=str) # remove weird chars from col headers - state_df.columns = state_df.columns.str.replace(r'\W', '', regex=True) + state_df.columns = state_df.columns.str.replace(r"\W", "", regex=True) # standardize the year col - state_df = state_df.rename(columns={'Year': std_col.TIME_PERIOD_COL}) + state_df = state_df.rename(columns={"Year": std_col.TIME_PERIOD_COL}) # Drop rows where year is NaN state_df = state_df.dropna(subset=[std_col.TIME_PERIOD_COL]) @@ -763,8 +763,8 @@ def get_state_leg_totals_df(): # extract totals state_df[[std_col.W_ALL_RACES_STLEG_COUNT, std_col.STLEG_COUNT]] = state_df[ - 'TotalWomenTotalLegislature' - ].str.split('/', n=1, expand=True) + "TotalWomenTotalLegislature" + ].str.split("/", n=1, expand=True) # keep only needed cols state_df = state_df[[std_col.TIME_PERIOD_COL, std_col.STLEG_COUNT]] @@ -917,7 +917,7 @@ def add_aian_api_rows(df): # re-merge with this to preserve the non-summed rows like "total_congress_count", etc # could use either Asian or AIAN, the totals would be the same orig_df = df.copy() - df_denom_cols_aian_api_rows = orig_df.copy().loc[orig_df[RACE_ETH] == 'Asian American/Pacific Islander'] + df_denom_cols_aian_api_rows = orig_df.copy().loc[orig_df[RACE_ETH] == "Asian American/Pacific Islander"] denom_cols = [std_col.TIME_PERIOD_COL, *STATE_COLS, *level_denom_cols] @@ -1041,7 +1041,7 @@ def handle_other_and_multi_races(df): are renamed, allowing these 2 types of multi- to be combined in the aggregation """ # convert comma separated names string into list, doesn't affect single race strings - df[RACE_ETH] = df[RACE_ETH].str.split(', ') + df[RACE_ETH] = df[RACE_ETH].str.split(", ") # rows with multiple specific races will sum later with # CAWP's incoming "multiracial alone" diff --git a/python/datasources/cdc_hiv.py b/python/datasources/cdc_hiv.py index e6df41992a..f6c7e4d2ac 100644 --- a/python/datasources/cdc_hiv.py +++ b/python/datasources/cdc_hiv.py @@ -232,7 +232,7 @@ def write_to_bq(self, dataset, gcs_bucket, 
**attrs): # copy so iterative changes don't interfere df_for_bq = df.copy() - table_demo = demographic if demographic != std_col.BLACK_WOMEN else 'black_women_by_age' + table_demo = demographic if demographic != std_col.BLACK_WOMEN else "black_women_by_age" table_id = gcs_to_bq_util.make_bq_table_id(table_demo, geo_level, time_view) if demographic == std_col.BLACK_WOMEN: df_for_bq.rename(columns=BW_FLOAT_COLS_RENAME_MAP, inplace=True) diff --git a/python/datasources/cdc_restricted.py b/python/datasources/cdc_restricted.py index 24f1a43929..17b616cc06 100644 --- a/python/datasources/cdc_restricted.py +++ b/python/datasources/cdc_restricted.py @@ -25,11 +25,11 @@ zero_out_pct_rel_inequity, ) -DC_COUNTY_FIPS = '11001' +DC_COUNTY_FIPS = "11001" ONLY_FIPS_FILES = { # These files only need to get their fips codes merged in - 'cdc_restricted_by_race_and_age_state.csv': 'by_race_age_state', + "cdc_restricted_by_race_and_age_state.csv": "by_race_age_state", } COVID_CONDITION_TO_PREFIX = { @@ -50,39 +50,39 @@ SEX: (std_col.SEX_COL, list(SEX_NAMES_MAPPING.values())), } -POPULATION_SUFFIX = 'population' +POPULATION_SUFFIX = "population" class CDCRestrictedData(DataSource): @staticmethod def get_id(): - return 'CDC_RESTRICTED_DATA' + return "CDC_RESTRICTED_DATA" @staticmethod def get_table_name(): - return 'cdc_restricted_data' + return "cdc_restricted_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for CDCRestrictedData') + raise NotImplementedError("upload_to_gcs should not be called for CDCRestrictedData") def write_to_bq(self, dataset, gcs_bucket, **attrs): - demo = self.get_attr(attrs, 'demographic') - geo = self.get_attr(attrs, 'geographic') + demo = self.get_attr(attrs, "demographic") + geo = self.get_attr(attrs, "geographic") geo_to_pull = STATE_LEVEL if geo == NATIONAL_LEVEL else geo - filename = f'cdc_restricted_by_{demo}_{geo_to_pull}.csv' + filename = f"cdc_restricted_by_{demo}_{geo_to_pull}.csv" df_from_gcs = gcs_to_bq_util.load_csv_as_df( gcs_bucket, filename, dtype={ - 'county_fips': str, - 'cases': 'uint32', - 'hosp_y': 'uint32', - 'hosp_n': 'uint32', - 'hosp_unknown': 'uint32', - 'death_y': 'uint32', - 'death_n': 'uint32', - 'death_unknown': 'uint32', + "county_fips": str, + "cases": "uint32", + "hosp_y": "uint32", + "hosp_n": "uint32", + "hosp_unknown": "uint32", + "death_y": "uint32", + "death_n": "uint32", + "death_unknown": "uint32", }, ) @@ -93,9 +93,9 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): column_types = get_col_types(df, add_rel_inequality_col=time_series) - table_name = f'by_{demo}_{geo}_processed' + table_name = f"by_{demo}_{geo}_processed" if time_series: - table_name += '_time_series' + table_name += "_time_series" gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=column_types) # Only do this once, open to a less weird way of doing this @@ -123,16 +123,16 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): if std_col.RACE_CATEGORY_ID_COL in df.columns: std_col.add_race_columns_from_category_id(df) - column_types = {c: 'STRING' for c in df.columns} + column_types = {c: "STRING" for c in df.columns} for col in int_cols: if col in column_types: - column_types[col] = 'FLOAT' + column_types[col] = "FLOAT" - print(f'uploading {table_name}') + print(f"uploading {table_name}") gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=column_types) def generate_breakdown(self, df, demo, geo, time_series): - print(f'processing {demo} {geo} time_series = {time_series}') +
print(f"processing {demo} {geo} time_series = {time_series}") start = time.time() demo_col = std_col.RACE_CATEGORY_ID_COL if demo == RACE else demo @@ -268,15 +268,15 @@ def get_col_types(df, add_rel_inequality_col=False): add_rel_inequality_col: Optional boolean paramater to add the `rel_inequality` parameter, defaults to False""" - column_types = {c: 'STRING' for c in df.columns} + column_types = {c: "STRING" for c in df.columns} for prefix in COVID_CONDITION_TO_PREFIX.values(): - column_types[generate_column_name(prefix, std_col.PER_100K_SUFFIX)] = 'FLOAT' - column_types[generate_column_name(prefix, std_col.SHARE_SUFFIX)] = 'FLOAT' + column_types[generate_column_name(prefix, std_col.PER_100K_SUFFIX)] = "FLOAT" + column_types[generate_column_name(prefix, std_col.SHARE_SUFFIX)] = "FLOAT" if add_rel_inequality_col: - column_types[generate_column_name(prefix, std_col.PCT_REL_INEQUITY_SUFFIX)] = 'FLOAT' + column_types[generate_column_name(prefix, std_col.PCT_REL_INEQUITY_SUFFIX)] = "FLOAT" - column_types[std_col.COVID_POPULATION_PCT] = 'FLOAT' + column_types[std_col.COVID_POPULATION_PCT] = "FLOAT" return column_types @@ -415,17 +415,17 @@ def remove_or_set_to_zero(df, geo, demographic): demog_col = DEMO_COL_MAPPING[demographic][0] grouped_df = df.groupby(geo_cols + [demog_col]).sum(min_count=1, numeric_only=True).reset_index() - grouped_df = grouped_df.rename(columns={std_col.COVID_CASES: 'grouped_cases'}) - grouped_df = grouped_df[geo_cols + [demog_col, 'grouped_cases']] + grouped_df = grouped_df.rename(columns={std_col.COVID_CASES: "grouped_cases"}) + grouped_df = grouped_df[geo_cols + [demog_col, "grouped_cases"]] # Remove all rows that have zero cases throughout the pandemic - df = pd.merge(df, grouped_df, how='left', on=geo_cols + [demog_col]) - df = df[pd.notna(df['grouped_cases'])] - df = df.drop(columns='grouped_cases') + df = pd.merge(df, grouped_df, how="left", on=geo_cols + [demog_col]) + df = df[pd.notna(df["grouped_cases"])] + df = df.drop(columns="grouped_cases") # Unknowns are a special case, we want to keep the per_100k values # as NULL no matter what - unknown = Race.UNKNOWN.value if demographic == 'race' else UNKNOWN + unknown = Race.UNKNOWN.value if demographic == "race" else UNKNOWN unknown_df = df.loc[df[demog_col] == unknown] # Set all other null conditions to zero diff --git a/python/datasources/cdc_restricted_local.py b/python/datasources/cdc_restricted_local.py index e83dc875d6..7697d845fa 100644 --- a/python/datasources/cdc_restricted_local.py +++ b/python/datasources/cdc_restricted_local.py @@ -35,12 +35,12 @@ # Geo columns (state, county) - we aggregate or groupby either state or county. # Demog columns (race, age, sex) - we groupby one of these at a time. # Outcome columns (hosp, death) - these are the measured variables we count. -STATE_COL = 'res_state' -COUNTY_FIPS_COL = 'county_fips_code' -COUNTY_COL = 'res_county' -AGE_COL = 'age_group' -OUTCOME_COLS = ['hosp_yn', 'death_yn'] -CASE_DATE_COL = 'cdc_case_earliest_dt' +STATE_COL = "res_state" +COUNTY_FIPS_COL = "county_fips_code" +COUNTY_COL = "res_county" +AGE_COL = "age_group" +OUTCOME_COLS = ["hosp_yn", "death_yn"] +CASE_DATE_COL = "cdc_case_earliest_dt" USE_COLS = [ STATE_COL, @@ -55,7 +55,7 @@ ] # column no longer provided by CDC that we need to recreate -RACE_ETH_COL = 'race_ethnicity_combined' +RACE_ETH_COL = "race_ethnicity_combined" # Convenience list for when we group the data by county. 
COUNTY_COLS = [COUNTY_FIPS_COL, COUNTY_COL, STATE_COL] @@ -86,7 +86,7 @@ "Multiple/Other": std_col.Race.MULTI_OR_OTHER_STANDARD_NH.value, "Native Hawaiian/Other Pacific Islander": std_col.Race.NHPI_NH.value, "White": std_col.Race.WHITE_NH.value, - 'Hispanic/Latino': std_col.Race.HISP.value, + "Hispanic/Latino": std_col.Race.HISP.value, } SEX_NAMES_MAPPING = { @@ -115,12 +115,12 @@ # Mapping from geo and demo to relevant column(s) in the data. The demo # mapping also includes the values mapping for transforming demographic values # to their standardized form. -GEO_COL_MAPPING = {'state': [STATE_COL], 'county': COUNTY_COLS} +GEO_COL_MAPPING = {"state": [STATE_COL], "county": COUNTY_COLS} DEMOGRAPHIC_COL_MAPPING = { - 'race': ([std_col.RACE_COL, std_col.ETH_COL], RACE_NAMES_MAPPING), - 'sex': ([std_col.SEX_COL], SEX_NAMES_MAPPING), - 'age': ([AGE_COL], AGE_NAMES_MAPPING), - 'race_and_age': ( + "race": ([std_col.RACE_COL, std_col.ETH_COL], RACE_NAMES_MAPPING), + "sex": ([std_col.SEX_COL], SEX_NAMES_MAPPING), + "age": ([AGE_COL], AGE_NAMES_MAPPING), + "race_and_age": ( [std_col.RACE_COL, std_col.ETH_COL, AGE_COL], {**AGE_NAMES_MAPPING, **RACE_NAMES_MAPPING}, ), @@ -144,12 +144,12 @@ def accumulate_data(df, geo_cols, overall_df, demog_cols, names_mapping): # Add columns for hospitalization yes/no/unknown and death yes/no/unknown, # as we aggregate and count these individually. Do a sanity check that we # covered all the data and drop the original hospitalization/death columns. - df[std_col.COVID_HOSP_Y] = df['hosp_yn'] == 'Yes' - df[std_col.COVID_HOSP_N] = df['hosp_yn'] == 'No' - df[std_col.COVID_HOSP_UNKNOWN] = (df['hosp_yn'] == 'Unknown') | (df['hosp_yn'] == 'Missing') - df[std_col.COVID_DEATH_Y] = df['death_yn'] == 'Yes' - df[std_col.COVID_DEATH_N] = df['death_yn'] == 'No' - df[std_col.COVID_DEATH_UNKNOWN] = (df['death_yn'] == 'Unknown') | (df['death_yn'] == 'Missing') + df[std_col.COVID_HOSP_Y] = df["hosp_yn"] == "Yes" + df[std_col.COVID_HOSP_N] = df["hosp_yn"] == "No" + df[std_col.COVID_HOSP_UNKNOWN] = (df["hosp_yn"] == "Unknown") | (df["hosp_yn"] == "Missing") + df[std_col.COVID_DEATH_Y] = df["death_yn"] == "Yes" + df[std_col.COVID_DEATH_N] = df["death_yn"] == "No" + df[std_col.COVID_DEATH_UNKNOWN] = (df["death_yn"] == "Unknown") | (df["death_yn"] == "Missing") check_hosp = (df[std_col.COVID_HOSP_Y] | df[std_col.COVID_HOSP_N] | df[std_col.COVID_HOSP_UNKNOWN]).all() check_deaths = (df[std_col.COVID_DEATH_Y] | df[std_col.COVID_DEATH_N] | df[std_col.COVID_DEATH_UNKNOWN]).all() @@ -157,7 +157,7 @@ def accumulate_data(df, geo_cols, overall_df, demog_cols, names_mapping): assert check_hosp, "All possible hosp_yn values are not accounted for" assert check_deaths, "All possible death_yn values are not accounted for" - df = df.drop(columns=['hosp_yn', 'death_yn']) + df = df.drop(columns=["hosp_yn", "death_yn"]) counts_cols_to_sum = [ std_col.COVID_CASES, @@ -184,7 +184,7 @@ def accumulate_data(df, geo_cols, overall_df, demog_cols, names_mapping): df, counts_cols_to_sum, RACE_NAMES_MAPPING, - ethnicity_value='Hispanic/Latino', + ethnicity_value="Hispanic/Latino", additional_group_cols=source_groupby_cols, race_eth_output_col=std_col.RACE_ETH_COL, ) @@ -315,10 +315,10 @@ def process_data(dir, files): # Slice the data and aggregate for the given dimension. 
sliced_df = df[geo_cols + demog_col + OUTCOME_COLS + [CASE_DATE_COL]] - if demo == 'race': + if demo == "race": demog_col = [RACE_ETH_COL] - if demo == 'race_and_age': + if demo == "race_and_age": demog_col = [RACE_ETH_COL, AGE_COL] all_dfs[(geo, demo)] = accumulate_data( @@ -364,8 +364,8 @@ def main(): matching_files = [] files = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))] for f in files: - filename_parts = f.split('.') - if len(filename_parts) == 2 and prefix in filename_parts[0] and filename_parts[1] == 'csv': + filename_parts = f.split(".") + if len(filename_parts) == 2 and prefix in filename_parts[0] and filename_parts[1] == "csv": matching_files.append(f) if len(matching_files) == 0: diff --git a/python/datasources/cdc_vaccination_county.py b/python/datasources/cdc_vaccination_county.py index e4f8cb8ffc..4bd2e3d020 100644 --- a/python/datasources/cdc_vaccination_county.py +++ b/python/datasources/cdc_vaccination_county.py @@ -6,14 +6,14 @@ from ingestion.standardized_columns import Race import ingestion.standardized_columns as std_col -BASE_CDC_URL = 'https://data.cdc.gov/resource/8xkx-amqh.csv' +BASE_CDC_URL = "https://data.cdc.gov/resource/8xkx-amqh.csv" FILE_SIZE_LIMIT = 5000 -CDC_COUNTY_FIPS_COL = 'fips' -CDC_COUNTY_COL = 'recip_county' -CDC_DOSE_ONE_COL = 'administered_dose1_recip' -CDC_DATE_COL = 'date' -CDC_ONE_DOSE = 'one_dose' +CDC_COUNTY_FIPS_COL = "fips" +CDC_COUNTY_COL = "recip_county" +CDC_DOSE_ONE_COL = "administered_dose1_recip" +CDC_DATE_COL = "date" +CDC_ONE_DOSE = "one_dose" COL_NAME_MAPPING = { CDC_COUNTY_FIPS_COL: std_col.COUNTY_FIPS_COL, @@ -25,14 +25,14 @@ class CDCVaccinationCounty(DataSource): @staticmethod def get_id(): - return 'CDC_VACCINATION_COUNTY' + return "CDC_VACCINATION_COUNTY" @staticmethod def get_table_name(): - return 'cdc_vaccination_county' + return "cdc_vaccination_county" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for CDCVaccinationCounty') + raise NotImplementedError("upload_to_gcs should not be called for CDCVaccinationCounty") def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **attrs): params = {"$limit": FILE_SIZE_LIMIT} diff --git a/python/datasources/cdc_vaccination_national.py b/python/datasources/cdc_vaccination_national.py index e6011da08a..e067d7bad3 100644 --- a/python/datasources/cdc_vaccination_national.py +++ b/python/datasources/cdc_vaccination_national.py @@ -7,35 +7,35 @@ from ingestion.constants import Sex, NATIONAL_LEVEL, US_FIPS, US_NAME, RACE, AGE, SEX, UNKNOWN, CURRENT CDC_SEX_GROUPS_TO_STANDARD = { - 'Sex_Female': Sex.FEMALE, - 'Sex_Male': Sex.MALE, - 'Sex_unknown': 'Unknown', - 'US': std_col.ALL_VALUE, + "Sex_Female": Sex.FEMALE, + "Sex_Male": Sex.MALE, + "Sex_unknown": "Unknown", + "US": std_col.ALL_VALUE, } CDC_RACE_GROUPS_TO_STANDARD = { - 'Race_eth_Hispanic': Race.HISP.value, - 'Race_eth_NHAIAN': Race.AIAN_NH.value, - 'Race_eth_NHAsian': Race.ASIAN_NH.value, - 'Race_eth_NHBlack': Race.BLACK_NH.value, - 'Race_eth_NHMult_Oth': Race.MULTI_OR_OTHER_STANDARD_NH.value, - 'Race_eth_NHNHOPI': Race.NHPI_NH.value, - 'Race_eth_NHWhite': Race.WHITE_NH.value, - 'Race_eth_unknown': Race.UNKNOWN.value, - 'US': Race.ALL.value, + "Race_eth_Hispanic": Race.HISP.value, + "Race_eth_NHAIAN": Race.AIAN_NH.value, + "Race_eth_NHAsian": Race.ASIAN_NH.value, + "Race_eth_NHBlack": Race.BLACK_NH.value, + "Race_eth_NHMult_Oth": Race.MULTI_OR_OTHER_STANDARD_NH.value, + "Race_eth_NHNHOPI": Race.NHPI_NH.value, + "Race_eth_NHWhite": 
Race.WHITE_NH.value, + "Race_eth_unknown": Race.UNKNOWN.value, + "US": Race.ALL.value, } CDC_AGE_GROUPS_TO_STANDARD = { - 'Ages_<2yrs': '0-1', - 'Ages_2-4_yrs': '2-4', - 'Ages_5-11_yrs': '5-11', - 'Ages_12-17_yrs': '12-17', - 'Ages_18-24_yrs': '18-24', - 'Ages_25-49_yrs': '25-49', - 'Ages_50-64_yrs': '50-64', - 'Ages_65+_yrs': '65+', - 'Age_unknown': 'Unknown', - 'US': std_col.ALL_VALUE, + "Ages_<2yrs": "0-1", + "Ages_2-4_yrs": "2-4", + "Ages_5-11_yrs": "5-11", + "Ages_12-17_yrs": "12-17", + "Ages_18-24_yrs": "18-24", + "Ages_25-49_yrs": "25-49", + "Ages_50-64_yrs": "50-64", + "Ages_65+_yrs": "65+", + "Age_unknown": "Unknown", + "US": std_col.ALL_VALUE, } @@ -44,15 +44,15 @@ # taking the population percentages directly off of the chart here: # https://covid.cdc.gov/covid-data-tracker/#vaccination-demographic AGE_GROUPS_TO_POP_PCT = { - '0-1': '2.3', - '2-4': '3.6', - '5-11': '8.7', - '12-17': '7.6', - '18-24': '9.2', - '25-49': '32.9', - '50-64': '19.2', - '65+': '16.5', - std_col.ALL_VALUE: '100', + "0-1": "2.3", + "2-4": "3.6", + "5-11": "8.7", + "12-17": "7.6", + "18-24": "9.2", + "25-49": "32.9", + "50-64": "19.2", + "65+": "16.5", + std_col.ALL_VALUE: "100", } BREAKDOWN_MAP = { @@ -69,29 +69,29 @@ class CDCVaccinationNational(DataSource): @staticmethod def get_id(): - return 'CDC_VACCINATION_NATIONAL' + return "CDC_VACCINATION_NATIONAL" @staticmethod def get_table_name(): - return 'cdc_vaccination_national' + return "cdc_vaccination_national" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for CDCVaccinationNational') + raise NotImplementedError("upload_to_gcs should not be called for CDCVaccinationNational") def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **attrs): df = gcs_to_bq_util.load_json_as_df_from_web( - BASE_CDC_URL, dtype={'administered_dose1_pct': float, 'administered_dose1': float} + BASE_CDC_URL, dtype={"administered_dose1_pct": float, "administered_dose1": float} ) - latest_date = df['date'].max() - df = df.loc[df['date'] == latest_date] + latest_date = df["date"].max() + df = df.loc[df["date"] == latest_date] for breakdown in [RACE, SEX, AGE]: breakdown_df = self.generate_breakdown(breakdown, df) if write_local_instead_of_bq: local_pipeline_utils.write_df_as_json_to_frontend_tmp( - breakdown_df, f'{self.get_table_name()}-{breakdown}_current' + breakdown_df, f"{self.get_table_name()}-{breakdown}_current" ) else: float_cols = [std_col.VACCINATED_PCT_RATE, std_col.VACCINATED_PCT_SHARE, std_col.VACCINATED_POP_PCT] @@ -103,7 +103,7 @@ def generate_breakdown(self, breakdown, df): demo_col = std_col.RACE_CATEGORY_ID_COL if breakdown == RACE else breakdown unknown = Race.UNKNOWN.value if breakdown == RACE else UNKNOWN - df = df.rename(columns={'demographic_category': demo_col, 'administered_dose1': std_col.VACCINATED_RAW}) + df = df.rename(columns={"demographic_category": demo_col, "administered_dose1": std_col.VACCINATED_RAW}) demo_rows = set(BREAKDOWN_MAP[breakdown].keys()) df = df.loc[df[demo_col].isin(demo_rows)].reset_index(drop=True) @@ -112,11 +112,11 @@ def generate_breakdown(self, breakdown, df): known_df = df.loc[df[demo_col] != unknown].reset_index(drop=True) unknown_df = df.loc[df[demo_col] == unknown].reset_index(drop=True) - known_df = known_df.rename(columns={'administered_dose1_pct_known': std_col.VACCINATED_PCT_SHARE}) - unknown_df = unknown_df.rename(columns={'administered_dose1_pct_us': std_col.VACCINATED_PCT_SHARE}) + known_df = 
known_df.rename(columns={"administered_dose1_pct_known": std_col.VACCINATED_PCT_SHARE}) + unknown_df = unknown_df.rename(columns={"administered_dose1_pct_us": std_col.VACCINATED_PCT_SHARE}) df = pd.concat([known_df, unknown_df]) - df[std_col.VACCINATED_PCT_RATE] = df['administered_dose1_pct'] + df[std_col.VACCINATED_PCT_RATE] = df["administered_dose1_pct"] df.loc[df[demo_col].isin(ALLS), std_col.VACCINATED_PCT_SHARE] = 100.0 diff --git a/python/datasources/cdc_wisqars.py b/python/datasources/cdc_wisqars.py index 69db3e5291..e26ad3d83c 100644 --- a/python/datasources/cdc_wisqars.py +++ b/python/datasources/cdc_wisqars.py @@ -76,14 +76,14 @@ COL_DICTS: List[RATE_CALC_COLS_TYPE] = [ { - 'numerator_col': 'gun_violence_homicide_estimated_total', - 'denominator_col': 'fatal_population', - 'rate_col': 'gun_violence_homicide_per_100k', + "numerator_col": "gun_violence_homicide_estimated_total", + "denominator_col": "fatal_population", + "rate_col": "gun_violence_homicide_per_100k", }, { - 'numerator_col': 'gun_violence_suicide_estimated_total', - 'denominator_col': 'fatal_population', - 'rate_col': 'gun_violence_suicide_per_100k', + "numerator_col": "gun_violence_suicide_estimated_total", + "denominator_col": "fatal_population", + "rate_col": "gun_violence_suicide_per_100k", }, ] @@ -176,9 +176,9 @@ def generate_breakdown_df(self, demographic: WISQARS_DEMO_TYPE, geo_level: GEO_T has_unknown = df.map(contains_unknown).any().any() if has_unknown: - unknown = 'Unknown' + unknown = "Unknown" if demographic == std_col.RACE_OR_HISPANIC_COL: - unknown = 'Unknown race' + unknown = "Unknown race" df = generate_pct_share_col_with_unknowns(df, PCT_SHARE_MAP, demographic, std_col.ALL_VALUE, unknown) else: @@ -203,25 +203,25 @@ def process_wisqars_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_TYPE): """ output_df = pd.DataFrame(columns=["year"]) - fatal_gun_injuries: WISQARS_VAR_TYPE = 'fatal_gun_injuries' + fatal_gun_injuries: WISQARS_VAR_TYPE = "fatal_gun_injuries" df = load_wisqars_as_df_from_data_dir(fatal_gun_injuries, geo_level, demographic) df.columns = df.columns.str.lower() - df = df[~df['intent'].isin(['Unintentional', 'Undetermined', 'Legal Intervention'])] + df = df[~df["intent"].isin(["Unintentional", "Undetermined", "Legal Intervention"])] # Reshapes df to add the intent rows as columns pivot_df = df.pivot( index=PIVOT_DEM_COLS.get(demographic, []), columns="intent", - values=['deaths', 'crude rate'], + values=["deaths", "crude rate"], ) new_columns = [ ( f"gun_violence_{col[1].lower().replace(' ', '_')}_{std_col.RAW_SUFFIX}" - if col[0] == 'deaths' + if col[0] == "deaths" else f"gun_violence_{col[1].lower().replace(' ', '_')}_{std_col.PER_100K_SUFFIX}" ) for col in pivot_df.columns @@ -234,24 +234,24 @@ def process_wisqars_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_TYPE): df.rename( columns={ "age group": std_col.AGE_COL, - 'population': 'fatal_population', - 'sex': std_col.SEX_COL, + "population": "fatal_population", + "sex": std_col.SEX_COL, }, inplace=True, ) if demographic == std_col.AGE_COL: - df[std_col.AGE_COL] = df[std_col.AGE_COL].str.replace(' to ', '-') + df[std_col.AGE_COL] = df[std_col.AGE_COL].str.replace(" to ", "-") if std_col.ETH_COL in df.columns.to_list(): - count_cols_to_sum = list(RAW_TOTALS_MAP.values()) + ['fatal_population'] + count_cols_to_sum = list(RAW_TOTALS_MAP.values()) + ["fatal_population"] df = combine_race_ethnicity( df, count_cols_to_sum, RACE_NAMES_MAPPING, - ethnicity_value='Hispanic', - additional_group_cols=['year', 'state'], + 
ethnicity_value="Hispanic", + additional_group_cols=["year", "state"], ) for raw_total_col in RAW_TOTALS_MAP.values(): @@ -259,7 +259,7 @@ def process_wisqars_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_TYPE): if raw_total_col in df.columns: topic_prefix = std_col.extract_prefix(raw_total_col) topic_rate_col = PER_100K_MAP[topic_prefix] - df = generate_per_100k_col(df, raw_total_col, 'fatal_population', topic_rate_col, decimal_places=2) + df = generate_per_100k_col(df, raw_total_col, "fatal_population", topic_rate_col, decimal_places=2) output_df = output_df.merge(df, how="outer") diff --git a/python/datasources/cdc_wisqars_black_men.py b/python/datasources/cdc_wisqars_black_men.py index 6a87fdf9f8..6c8708fe31 100644 --- a/python/datasources/cdc_wisqars_black_men.py +++ b/python/datasources/cdc_wisqars_black_men.py @@ -84,9 +84,9 @@ COL_DICTS: List[RATE_CALC_COLS_TYPE] = [ { - 'numerator_col': 'gun_homicides_black_men_estimated_total', - 'denominator_col': 'gun_homicides_black_men_population_estimated_total', - 'rate_col': 'gun_homicides_black_men_per_100k', + "numerator_col": "gun_homicides_black_men_estimated_total", + "denominator_col": "gun_homicides_black_men_population_estimated_total", + "rate_col": "gun_homicides_black_men_per_100k", } ] @@ -159,7 +159,7 @@ def process_wisqars_black_men_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_ df.insert(2, WISQARS_URBANICITY, std_col.ALL_VALUE) df.insert(3, WISQARS_AGE_GROUP, std_col.ALL_VALUE) elif demographic == std_col.AGE_COL: - df[WISQARS_AGE_GROUP] = df[WISQARS_AGE_GROUP].str.replace(' to ', '-') + df[WISQARS_AGE_GROUP] = df[WISQARS_AGE_GROUP].str.replace(" to ", "-") df.rename( columns={ @@ -170,6 +170,6 @@ def process_wisqars_black_men_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_ inplace=True, ) - output_df = output_df.merge(df, how='outer') + output_df = output_df.merge(df, how="outer") return output_df diff --git a/python/datasources/cdc_wisqars_youth.py b/python/datasources/cdc_wisqars_youth.py index 1a502b728b..2fd4dd9726 100644 --- a/python/datasources/cdc_wisqars_youth.py +++ b/python/datasources/cdc_wisqars_youth.py @@ -112,7 +112,7 @@ def generate_breakdown_df(self, demographic: WISQARS_DEMO_TYPE, geo_level: GEO_T PCT_SHARE_MAP, std_col.RACE_OR_HISPANIC_COL, std_col.ALL_VALUE, - 'Unknown race', + "Unknown race", ) for col in ESTIMATED_TOTALS_MAP.values(): @@ -127,7 +127,7 @@ def generate_breakdown_df(self, demographic: WISQARS_DEMO_TYPE, geo_level: GEO_T def process_wisqars_youth_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_TYPE): - output_df = pd.DataFrame(columns=['year', 'state', 'race']) + output_df = pd.DataFrame(columns=["year", "state", "race"]) for variable_string in [std_col.GUN_DEATHS_YOUNG_ADULTS_PREFIX, std_col.GUN_DEATHS_YOUTH_PREFIX]: @@ -142,34 +142,34 @@ def process_wisqars_youth_df(demographic: WISQARS_DEMO_TYPE, geo_level: GEO_TYPE if std_col.ETH_COL in df.columns.to_list(): df = combine_race_ethnicity( df, - ['deaths', 'population', 'crude rate'], + ["deaths", "population", "crude rate"], RACE_NAMES_MAPPING, - ethnicity_value='Hispanic', - additional_group_cols=['year', 'state'], + ethnicity_value="Hispanic", + additional_group_cols=["year", "state"], treat_zero_count_as_missing=True, ) # Identify rows where 'race' is 'HISP' or 'UNKNOWN' - subset_mask = df[std_col.RACE_CATEGORY_ID_COL].isin(['HISP', 'UNKNOWN']) + subset_mask = df[std_col.RACE_CATEGORY_ID_COL].isin(["HISP", "UNKNOWN"]) # Create a temporary DataFrame with just the subset temp_df = df[subset_mask].copy() # Apply the 
function to the temporary DataFrame - temp_df = generate_per_100k_col(temp_df, 'deaths', 'population', 'crude rate') + temp_df = generate_per_100k_col(temp_df, "deaths", "population", "crude rate") # Update the original DataFrame with the results for the 'crude rate' column - df.loc[subset_mask, 'crude rate'] = temp_df['crude rate'] + df.loc[subset_mask, "crude rate"] = temp_df["crude rate"] df.rename( columns={ - 'deaths': f'{variable_string}_{std_col.RAW_SUFFIX}', - 'population': f'{variable_string}_{std_col.POPULATION_COL}', - 'crude rate': f'{variable_string}_{std_col.PER_100K_SUFFIX}', + "deaths": f"{variable_string}_{std_col.RAW_SUFFIX}", + "population": f"{variable_string}_{std_col.POPULATION_COL}", + "crude rate": f"{variable_string}_{std_col.PER_100K_SUFFIX}", }, inplace=True, ) - output_df = output_df.merge(df, how='outer') + output_df = output_df.merge(df, how="outer") return output_df diff --git a/python/datasources/cdc_wonder.py b/python/datasources/cdc_wonder.py index c42e61d6b0..dcdc49838c 100644 --- a/python/datasources/cdc_wonder.py +++ b/python/datasources/cdc_wonder.py @@ -31,18 +31,18 @@ class CdcWonderData(DataSource): @staticmethod def get_id(): - return 'CDC_WONDER_DATA' + return "CDC_WONDER_DATA" @staticmethod def get_table_name(): - return 'cdc_wonder_data' + return "cdc_wonder_data" def upload_to_gcs(self, gcs_bucket, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for CdcWonderData') + raise NotImplementedError("upload_to_gcs should not be called for CdcWonderData") def write_to_bq(self, dataset, gcs_bucket, **attrs): - demo_type = self.get_attr(attrs, 'demographic') - geo_level = self.get_attr(attrs, 'geographic') + demo_type = self.get_attr(attrs, "demographic") + geo_level = self.get_attr(attrs, "geographic") df = self.generate_breakdown_df(demo_type, geo_level) @@ -91,11 +91,11 @@ def generate_breakdown_df( for condition in conditions: # HET cols to make cancer_type = condition.lower() - het_rate_numerator = f'{cancer_type}_count_{std_col.RAW_SUFFIX}' - het_rate_denominator = f'{cancer_type}_{std_col.RAW_POP_SUFFIX}' - het_pct_share = f'{cancer_type}_{std_col.PCT_SHARE_SUFFIX}' - het_pop_pct_share = f'{cancer_type}_{std_col.POP_PCT_SUFFIX}' - het_pct_rel_inequity = f'{cancer_type}_{std_col.PCT_REL_INEQUITY_SUFFIX}' + het_rate_numerator = f"{cancer_type}_count_{std_col.RAW_SUFFIX}" + het_rate_denominator = f"{cancer_type}_{std_col.RAW_POP_SUFFIX}" + het_pct_share = f"{cancer_type}_{std_col.PCT_SHARE_SUFFIX}" + het_pop_pct_share = f"{cancer_type}_{std_col.POP_PCT_SUFFIX}" + het_pct_rel_inequity = f"{cancer_type}_{std_col.PCT_REL_INEQUITY_SUFFIX}" # Pct share mappings count_to_pct_share_map[het_rate_numerator] = het_pct_share @@ -114,7 +114,7 @@ def generate_breakdown_df( # For age breakdowns, calculate totals from available age groups non_all_df = df[df[demo_breakdown] != ALL_VALUE] for condition in conditions: - count_col = f'{condition.lower()}_count_{std_col.RAW_SUFFIX}' + count_col = f"{condition.lower()}_count_{std_col.RAW_SUFFIX}" if count_col in df.columns: # Update the 'All' row with sum of available age groups available_total = non_all_df[count_col].sum() diff --git a/python/datasources/census_pop_estimates.py b/python/datasources/census_pop_estimates.py index b6ea11f04b..b8f5c0342c 100644 --- a/python/datasources/census_pop_estimates.py +++ b/python/datasources/census_pop_estimates.py @@ -11,39 +11,39 @@ BASE_POPULATION_URL = ( - 
'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/asrh/cc-est2019-alldata.csv' + "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/asrh/cc-est2019-alldata.csv" ) RACES_MAP = { - 'NHWA': Race.WHITE_NH.value, - 'NHBA': Race.BLACK_NH.value, - 'NHIA': Race.AIAN_NH.value, - 'NHAA': Race.ASIAN_NH.value, - 'NHNA': Race.NHPI_NH.value, - 'H': Race.HISP.value, - 'ALL': Race.ALL.value, + "NHWA": Race.WHITE_NH.value, + "NHBA": Race.BLACK_NH.value, + "NHIA": Race.AIAN_NH.value, + "NHAA": Race.ASIAN_NH.value, + "NHNA": Race.NHPI_NH.value, + "H": Race.HISP.value, + "ALL": Race.ALL.value, } AGES_MAP = { - 'All': (0,), - '0-9': (1, 2), - '10-19': (3, 4), - '20-29': (5, 6), - '30-39': (7, 8), - '40-49': (9, 10), - '50-59': (11, 12), - '60-69': (13, 14), - '70-79': (15, 16), - '80+': (17, 18), + "All": (0,), + "0-9": (1, 2), + "10-19": (3, 4), + "20-29": (5, 6), + "30-39": (7, 8), + "40-49": (9, 10), + "50-59": (11, 12), + "60-69": (13, 14), + "70-79": (15, 16), + "80+": (17, 18), } YEAR_2019 = 12 def total_race(row, race): - if race == 'ALL': - return row['TOT_POP'] + if race == "ALL": + return row["TOT_POP"] return row[f"{race}_MALE"] + row[f"{race}_FEMALE"] @@ -51,18 +51,18 @@ def total_race(row, race): class CensusPopEstimates(DataSource): @staticmethod def get_id(): - return 'CENSUS_POP_ESTIMATES' + return "CENSUS_POP_ESTIMATES" @staticmethod def get_table_name(): - return 'census_pop_estimates' + return "census_pop_estimates" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for CensusPopEstimates') + raise NotImplementedError("upload_to_gcs should not be called for CensusPopEstimates") def write_to_bq(self, dataset, gcs_bucket, **attrs): df = gcs_to_bq_util.load_csv_as_df_from_web( - BASE_POPULATION_URL, dtype={'STATE': str, 'COUNTY': str}, encoding="ISO-8859-1" + BASE_POPULATION_URL, dtype={"STATE": str, "COUNTY": str}, encoding="ISO-8859-1" ) state_df = generate_state_pop_data(df) @@ -81,9 +81,9 @@ def generate_state_pop_data(df): df: the raw census county population estimates.""" # Only get estimates from 2019 - df = df.loc[df['YEAR'] == YEAR_2019].reset_index(drop=True) + df = df.loc[df["YEAR"] == YEAR_2019].reset_index(drop=True) - groupby_cols = ['STATE', 'STNAME', 'AGEGRP'] + groupby_cols = ["STATE", "STNAME", "AGEGRP"] df = df.groupby(groupby_cols).sum(numeric_only=True).reset_index() needed_cols = groupby_cols @@ -96,19 +96,19 @@ def generate_state_pop_data(df): new_df = [] for std_age, census_age in AGES_MAP.items(): - age_df = df.loc[df['AGEGRP'].isin(census_age)] - age_df = age_df.groupby(['STATE', 'STNAME']).sum(numeric_only=True).reset_index() + age_df = df.loc[df["AGEGRP"].isin(census_age)] + age_df = age_df.groupby(["STATE", "STNAME"]).sum(numeric_only=True).reset_index() age_df[std_col.AGE_COL] = std_age - for state_fips in age_df['STATE'].drop_duplicates().to_list(): - state_name = age_df.loc[age_df['STATE'] == state_fips]['STNAME'].drop_duplicates().to_list()[0] + for state_fips in age_df["STATE"].drop_duplicates().to_list(): + state_name = age_df.loc[age_df["STATE"] == state_fips]["STNAME"].drop_duplicates().to_list()[0] for race in RACES_MAP.values(): pop_row = {} pop_row[std_col.STATE_FIPS_COL] = state_fips pop_row[std_col.STATE_NAME_COL] = state_name pop_row[std_col.AGE_COL] = std_age - pop_row[std_col.POPULATION_COL] = age_df.loc[age_df['STATE'] == state_fips][race].values[0] + pop_row[std_col.POPULATION_COL] = age_df.loc[age_df["STATE"] == state_fips][race].values[0] 
pop_row[std_col.RACE_CATEGORY_ID_COL] = race new_df.append(pop_row) diff --git a/python/datasources/census_pop_estimates_sc.py b/python/datasources/census_pop_estimates_sc.py index 9a30d41a0b..86ad6e8ab7 100644 --- a/python/datasources/census_pop_estimates_sc.py +++ b/python/datasources/census_pop_estimates_sc.py @@ -10,13 +10,14 @@ """ BASE_POPULATION_URL = ( - 'https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/state/asrh/sc-est2021-alldata6.csv') + "https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/state/asrh/sc-est2021-alldata6.csv" +) census_to_het_cols = { - 'AGE': std_col.AGE_COL, - 'SEX': std_col.SEX_COL, - 'STATE': std_col.STATE_FIPS_COL, - 'NAME': std_col.STATE_NAME_COL, + "AGE": std_col.AGE_COL, + "SEX": std_col.SEX_COL, + "STATE": std_col.STATE_FIPS_COL, + "NAME": std_col.STATE_NAME_COL, } race_map = { @@ -25,54 +26,39 @@ 3: Race.AIAN_NH.value, 4: Race.ASIAN_NH.value, 5: Race.NHPI_NH.value, - 6: Race.MULTI_OR_OTHER_STANDARD_NH.value + 6: Race.MULTI_OR_OTHER_STANDARD_NH.value, } -sex_map = { - 0: "All", - 1: "Male", - 2: "Female" -} +sex_map = {0: "All", 1: "Male", 2: "Female"} -year_map = { - "POPESTIMATE2020": "2020", - "POPESTIMATE2021": "2021" -} +year_map = {"POPESTIMATE2020": "2020", "POPESTIMATE2021": "2021"} class CensusPopEstimatesSC(DataSource): - - @ staticmethod + @staticmethod def get_id(): - return 'CENSUS_POP_ESTIMATES_SC' + return "CENSUS_POP_ESTIMATES_SC" - @ staticmethod + @staticmethod def get_table_name(): - return 'census_pop_estimates_sc' + return "census_pop_estimates_sc" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError( - 'upload_to_gcs should not be called for CensusPopEstimatesSC') + raise NotImplementedError("upload_to_gcs should not be called for CensusPopEstimatesSC") def write_to_bq(self, dataset, gcs_bucket, **attrs): - df = gcs_to_bq_util.load_csv_as_df_from_web( - BASE_POPULATION_URL, dtype={'STATE': str}, encoding="ISO-8859-1") + df = gcs_to_bq_util.load_csv_as_df_from_web(BASE_POPULATION_URL, dtype={"STATE": str}, encoding="ISO-8859-1") for geo in [STATE_LEVEL, NATIONAL_LEVEL]: - for breakdown in [ - std_col.SEX_COL, - std_col.RACE_CATEGORY_ID_COL - ]: + for breakdown in [std_col.SEX_COL, std_col.RACE_CATEGORY_ID_COL]: - breakdown_df = generate_pop_data_18plus( - df, breakdown, geo) + breakdown_df = generate_pop_data_18plus(df, breakdown, geo) col_types = gcs_to_bq_util.get_bq_column_types(breakdown_df, []) demo = breakdown if breakdown != std_col.RACE_CATEGORY_ID_COL else std_col.RACE_OR_HISPANIC_COL - gcs_to_bq_util.add_df_to_bq( - breakdown_df, dataset, f'by_{demo}_age_{geo}', column_types=col_types) + gcs_to_bq_util.add_df_to_bq(breakdown_df, dataset, f"by_{demo}_age_{geo}", column_types=col_types) def generate_pop_data_18plus(df, breakdown, geo): @@ -93,32 +79,37 @@ def generate_pop_data_18plus(df, breakdown, geo): # drop the ALL ethnicity rows to avoid double counting df = df[df["ORIGIN"] != 0] - df = df.rename(census_to_het_cols, axis='columns') + df = df.rename(census_to_het_cols, axis="columns") # calculate HET race/eth based on census race + eth columns df[std_col.RACE_CATEGORY_ID_COL] = df.apply( - lambda row: Race.HISP.value if row["ORIGIN"] == 2 else race_map[row["RACE"]], axis="columns") - - df = df[[ - std_col.AGE_COL, - std_col.SEX_COL, - std_col.RACE_CATEGORY_ID_COL, - std_col.STATE_FIPS_COL, - std_col.STATE_NAME_COL, - "POPESTIMATE2020", - "POPESTIMATE2021" - ]] + lambda row: Race.HISP.value if row["ORIGIN"] == 2 else race_map[row["RACE"]], axis="columns" + ) + + df = df[ + [ + 
std_col.AGE_COL, + std_col.SEX_COL, + std_col.RACE_CATEGORY_ID_COL, + std_col.STATE_FIPS_COL, + std_col.STATE_NAME_COL, + "POPESTIMATE2020", + "POPESTIMATE2021", + ] + ] # make two cols of pop data by year into unique rows by year - df = df.melt(id_vars=[ - std_col.AGE_COL, - std_col.SEX_COL, - std_col.RACE_CATEGORY_ID_COL, - std_col.STATE_FIPS_COL, - std_col.STATE_NAME_COL - ], + df = df.melt( + id_vars=[ + std_col.AGE_COL, + std_col.SEX_COL, + std_col.RACE_CATEGORY_ID_COL, + std_col.STATE_FIPS_COL, + std_col.STATE_NAME_COL, + ], var_name=std_col.TIME_PERIOD_COL, - value_name=std_col.POPULATION_COL) + value_name=std_col.POPULATION_COL, + ) # remove the "ALL" rows for SEX if RACE is the breakdown (to prevent double counting). # Census doesn't provide rows for "ALL" races combined so no need for the reverse @@ -129,21 +120,18 @@ def generate_pop_data_18plus(df, breakdown, geo): df = df[df[std_col.AGE_COL] >= 18] # drop unneeded columns - df = df[[ - std_col.STATE_FIPS_COL, - std_col.STATE_NAME_COL, - std_col.TIME_PERIOD_COL, - std_col.POPULATION_COL, - breakdown - ]] + df = df[ + [std_col.STATE_FIPS_COL, std_col.STATE_NAME_COL, std_col.TIME_PERIOD_COL, std_col.POPULATION_COL, breakdown] + ] # combine all year/state/group rows into one, summing the populations - df = df.groupby([ - std_col.STATE_FIPS_COL, - std_col.STATE_NAME_COL, - std_col.TIME_PERIOD_COL, - breakdown - ])[std_col.POPULATION_COL].sum().reset_index() + df = ( + df.groupby([std_col.STATE_FIPS_COL, std_col.STATE_NAME_COL, std_col.TIME_PERIOD_COL, breakdown])[ + std_col.POPULATION_COL + ] + .sum() + .reset_index() + ) if breakdown == std_col.SEX_COL: # swap census SEX number codes for HET strings @@ -153,34 +141,28 @@ def generate_pop_data_18plus(df, breakdown, geo): # need to make ALL rows for race if breakdown == std_col.RACE_CATEGORY_ID_COL: - df_alls = df[[ - std_col.STATE_FIPS_COL, - std_col.STATE_NAME_COL, - std_col.TIME_PERIOD_COL, - std_col.POPULATION_COL - ]] - df_alls = df_alls.groupby([ - std_col.STATE_FIPS_COL, - std_col.STATE_NAME_COL, - std_col.TIME_PERIOD_COL, - ])[std_col.POPULATION_COL].sum().reset_index() + df_alls = df[[std_col.STATE_FIPS_COL, std_col.STATE_NAME_COL, std_col.TIME_PERIOD_COL, std_col.POPULATION_COL]] + df_alls = ( + df_alls.groupby( + [ + std_col.STATE_FIPS_COL, + std_col.STATE_NAME_COL, + std_col.TIME_PERIOD_COL, + ] + )[std_col.POPULATION_COL] + .sum() + .reset_index() + ) df_alls[std_col.RACE_CATEGORY_ID_COL] = Race.ALL.value df = pd.concat([df, df_alls], axis=0, ignore_index=True) if geo == NATIONAL_LEVEL: # drop state cols - df = df[[ - std_col.TIME_PERIOD_COL, - std_col.POPULATION_COL, - breakdown - ]] + df = df[[std_col.TIME_PERIOD_COL, std_col.POPULATION_COL, breakdown]] # sum matching rows from all states to get national population per breakdown - df = df.groupby([ - std_col.TIME_PERIOD_COL, - breakdown - ])[std_col.POPULATION_COL].sum().reset_index() + df = df.groupby([std_col.TIME_PERIOD_COL, breakdown])[std_col.POPULATION_COL].sum().reset_index() df[std_col.STATE_FIPS_COL] = US_FIPS df[std_col.STATE_NAME_COL] = US_NAME diff --git a/python/datasources/chr.py b/python/datasources/chr.py index ee2751cc72..913700c03d 100644 --- a/python/datasources/chr.py +++ b/python/datasources/chr.py @@ -6,28 +6,28 @@ # NOTE: col values for numerator and denominator are NULL -CHR_DIR = 'chr' +CHR_DIR = "chr" het_to_source_select_topic_all_to_race_prefix_map: Dict[str, Dict[str, Optional[str]]] = { - std_col.PREVENTABLE_HOSP_PREFIX: {'Preventable Hospitalization Rate': 'Preventable Hosp.
Rate'}, - std_col.EXCESSIVE_DRINKING_PREFIX: {'% Excessive Drinking': None}, + std_col.PREVENTABLE_HOSP_PREFIX: {"Preventable Hospitalization Rate": "Preventable Hosp. Rate"}, + std_col.EXCESSIVE_DRINKING_PREFIX: {"% Excessive Drinking": None}, } het_to_source_additional_topic_all_to_race_prefix_map: Dict[str, Dict[str, Optional[str]]] = { - std_col.SUICIDE_PREFIX: {'Crude Rate': 'Suicide Rate'}, - std_col.FREQUENT_MENTAL_DISTRESS_PREFIX: {'% Frequent Mental Distress': None}, - std_col.DIABETES_PREFIX: {'% Adults with Diabetes': None}, - std_col.VOTER_PARTICIPATION_PREFIX: {'% Voter Turnout': None}, + std_col.SUICIDE_PREFIX: {"Crude Rate": "Suicide Rate"}, + std_col.FREQUENT_MENTAL_DISTRESS_PREFIX: {"% Frequent Mental Distress": None}, + std_col.DIABETES_PREFIX: {"% Adults with Diabetes": None}, + std_col.VOTER_PARTICIPATION_PREFIX: {"% Voter Turnout": None}, } # frequent mental distress source_race_to_id_map = { - '(AIAN)': std_col.Race.AIAN_NH.value, - '(Asian)': std_col.Race.API_NH.value, - '(Black)': std_col.Race.BLACK_NH.value, - '(Hispanic)': std_col.Race.HISP.value, - '(White)': std_col.Race.WHITE_NH.value, + "(AIAN)": std_col.Race.AIAN_NH.value, + "(Asian)": std_col.Race.API_NH.value, + "(Black)": std_col.Race.BLACK_NH.value, + "(Hispanic)": std_col.Race.HISP.value, + "(White)": std_col.Race.WHITE_NH.value, } # suicide 2024 @@ -43,17 +43,17 @@ # suicide 2023 source_race_code_to_id_map_2023 = { - '(AIAN)': std_col.Race.AIAN_NH.value, - '(Asian)': std_col.Race.ASIAN_NH.value, - '(Black)': std_col.Race.BLACK_NH.value, - '(Hispanic)': std_col.Race.HISP.value, - '(White)': std_col.Race.WHITE_NH.value, + "(AIAN)": std_col.Race.AIAN_NH.value, + "(Asian)": std_col.Race.ASIAN_NH.value, + "(Black)": std_col.Race.BLACK_NH.value, + "(Hispanic)": std_col.Race.HISP.value, + "(White)": std_col.Race.WHITE_NH.value, } -source_fips_col = 'FIPS' -source_per_100k = 'Rate' -source_pct_rate = '%' +source_fips_col = "FIPS" +source_per_100k = "Rate" +source_pct_rate = "%" class CHRData(DataSource): @@ -75,12 +75,12 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): dfs = [] - for year in ['2023', '2024']: + for year in ["2023", "2024"]: - main_sheet_name = 'Select Measure Data' if year == '2024' else 'Ranked Measure Data' + main_sheet_name = "Select Measure Data" if year == "2024" else "Ranked Measure Data" main_source_df = get_df_from_chr_excel_sheet(year, main_sheet_name) - additional_source_df = get_df_from_chr_excel_sheet(year, 'Additional Measure Data') - year_df = pd.merge(main_source_df, additional_source_df, how='outer', on=source_fips_col) + additional_source_df = get_df_from_chr_excel_sheet(year, "Additional Measure Data") + year_df = pd.merge(main_source_df, additional_source_df, how="outer", on=source_fips_col) year_df = year_df.rename( columns={ source_fips_col: std_col.COUNTY_FIPS_COL, @@ -88,7 +88,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): ) # # drop national and state-level rows - year_df = year_df[~year_df[std_col.COUNTY_FIPS_COL].str.endswith('000')] + year_df = year_df[~year_df[std_col.COUNTY_FIPS_COL].str.endswith("000")] melt_map = get_melt_map(year) year_df = dataset_utils.melt_to_het_style_df( year_df, std_col.RACE_CATEGORY_ID_COL, [std_col.COUNTY_FIPS_COL], melt_map @@ -119,7 +119,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): topic_prefixes = list(het_to_source_select_topic_all_to_race_prefix_map.keys()) + list( het_to_source_additional_topic_all_to_race_prefix_map.keys() ) - topic_prefixes.append('chr_population') + 
topic_prefixes.append("chr_population") df_for_bq, col_types = dataset_utils.get_timeview_df_and_cols(df_for_bq, timeview, topic_prefixes) @@ -139,12 +139,12 @@ def get_source_usecols(year: str, sheet_name: str) -> List[str]: sheet_topic_map: Dict[str, Dict[str, Optional[str]]] = {} sheet_race_map: Dict[str, str] = {} - if sheet_name in ['Ranked Measure Data', 'Select Measure Data']: + if sheet_name in ["Ranked Measure Data", "Select Measure Data"]: sheet_topic_map = het_to_source_select_topic_all_to_race_prefix_map sheet_race_map = source_race_to_id_map - if sheet_name == 'Additional Measure Data': + if sheet_name == "Additional Measure Data": sheet_topic_map = het_to_source_additional_topic_all_to_race_prefix_map - sheet_race_map = source_nh_race_code_to_id_map_2024 if year == '2024' else source_race_code_to_id_map_2023 + sheet_race_map = source_nh_race_code_to_id_map_2024 if year == "2024" else source_race_code_to_id_map_2023 for source_topic_all_to_race_prefix_map in sheet_topic_map.values(): for source_topic, source_topic_race_prefix in source_topic_all_to_race_prefix_map.items(): @@ -153,7 +153,7 @@ def get_source_usecols(year: str, sheet_name: str) -> List[str]: # some topics only have ALLs if source_topic_race_prefix is not None: for race_suffix in sheet_race_map.keys(): - source_usecols.append(f'{source_topic_race_prefix} {race_suffix}') + source_usecols.append(f"{source_topic_race_prefix} {race_suffix}") return source_usecols @@ -167,7 +167,7 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]: dict: A nested dict """ - race_code_to_id_map = source_nh_race_code_to_id_map_2024 if year == '2024' else source_race_code_to_id_map_2023 + race_code_to_id_map = source_nh_race_code_to_id_map_2024 if year == "2024" else source_race_code_to_id_map_2023 melt_map: Dict[str, Dict[str, str]] = {} # each topic get its own sub-mapping @@ -180,10 +180,10 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]: # some topics only have ALLs if source_race_prefix is not None: for source_race_suffix, het_race_id in source_race_to_id_map.items(): - select_topic_melt_map[f'{source_race_prefix} {source_race_suffix}'] = het_race_id + select_topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id # assign 100k or pct_rate as needed - rate_suffix = '' + rate_suffix = "" source_all_col = list(source_all_race_map.keys())[0] if source_per_100k in source_all_col: rate_suffix = std_col.PER_100K_SUFFIX @@ -191,7 +191,7 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]: rate_suffix = std_col.PCT_RATE_SUFFIX # set this metrics sub melt map - melt_map[f'{het_prefix}_{rate_suffix}'] = select_topic_melt_map + melt_map[f"{het_prefix}_{rate_suffix}"] = select_topic_melt_map for het_prefix, source_all_race_map in het_to_source_additional_topic_all_to_race_prefix_map.items(): additional_topic_melt_map: Dict[str, str] = {} @@ -202,10 +202,10 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]: # some topics only have ALLs if source_race_prefix is not None: for source_race_suffix, het_race_id in race_code_to_id_map.items(): - additional_topic_melt_map[f'{source_race_prefix} {source_race_suffix}'] = het_race_id + additional_topic_melt_map[f"{source_race_prefix} {source_race_suffix}"] = het_race_id # assign 100k or pct_rate as needed - rate_suffix = '' + rate_suffix = "" source_all_col = list(source_all_race_map.keys())[0] if source_per_100k in source_all_col: rate_suffix = std_col.PER_100K_SUFFIX @@ -213,7 +213,7 @@ def get_melt_map(year: str) -> Dict[str, Dict[str, str]]: 
rate_suffix = std_col.PCT_RATE_SUFFIX # set this metric's sub melt map - melt_map[f'{het_prefix}_{rate_suffix}'] = additional_topic_melt_map + melt_map[f"{het_prefix}_{rate_suffix}"] = additional_topic_melt_map return melt_map @@ -246,12 +246,12 @@ def get_float_cols() -> Dict[str, List[str]]: continue source_all_col = list(source_dict.keys())[0] - rate_suffix = '' + rate_suffix = "" if source_per_100k in source_all_col: rate_suffix = std_col.PER_100K_SUFFIX if source_pct_rate in source_all_col: rate_suffix = std_col.PCT_RATE_SUFFIX - topic_rate_col = f'{topic_prefix}_{rate_suffix}' + topic_rate_col = f"{topic_prefix}_{rate_suffix}" current_float_cols.append(topic_rate_col) historical_float_cols.append(topic_rate_col) @@ -264,8 +264,8 @@ def get_df_from_chr_excel_sheet(year: str, sheet_name: str) -> pd.DataFrame: source_usecols = get_source_usecols(year, sheet_name) file_name_lookup = { - '2024': '2024_county_health_release_data_-_v1.xlsx', - '2023': '2023 County Health Rankings Data - v2.xlsx', + "2024": "2024_county_health_release_data_-_v1.xlsx", + "2023": "2023 County Health Rankings Data - v2.xlsx", } file_name = file_name_lookup[year] @@ -277,7 +277,7 @@ def get_df_from_chr_excel_sheet(year: str, sheet_name: str) -> pd.DataFrame: header=1, usecols=source_usecols, dtype={ - source_fips_col: 'str', + source_fips_col: "str", }, ) @@ -292,9 +292,9 @@ def convert_some_pct_rate_to_100k(df: pd.DataFrame, float_cols: List[str]) -> Tu """ cols_conversion_map = { - 'excessive_drinking_pct_rate': 'excessive_drinking_per_100k', - 'frequent_mental_distress_pct_rate': 'frequent_mental_distress_per_100k', - 'diabetes_pct_rate': 'diabetes_per_100k', + "excessive_drinking_pct_rate": "excessive_drinking_per_100k", + "frequent_mental_distress_pct_rate": "frequent_mental_distress_per_100k", + "diabetes_pct_rate": "diabetes_per_100k", } # swap col names in df and float cols diff --git a/python/datasources/data_source.py b/python/datasources/data_source.py index 50e2ea88c7..8767481bf1 100644 --- a/python/datasources/data_source.py +++ b/python/datasources/data_source.py @@ -12,13 +12,13 @@ class DataSource(ABC): @staticmethod def get_id() -> str: """Returns the data source's unique id, usually all uppercase like `SOME_SOURCE_DATA`""" - return '' + return "" @staticmethod def get_table_name() -> str: """Returns the BigQuery base table name where the data source's data will be stored, usually all lowercase like `some_source_data`""" - return '' + return "" def get_attr(self, attributes: dict, key: str) -> Any: attr = attributes.get(key) @@ -41,7 +41,7 @@ def upload_to_gcs(self, gcs_bucket: str, **attrs) -> bool: return True if there is at least one file that is different.
""" return url_file_to_gcs.url_file_to_gcs( - self.get_attr(attrs, 'url'), None, gcs_bucket, self.get_attr(attrs, 'filename') + self.get_attr(attrs, "url"), None, gcs_bucket, self.get_attr(attrs, "filename") ) def write_to_bq(self, dataset: str, gcs_bucket: str, write_local_instead_of_bq=False, **attrs) -> None: @@ -54,7 +54,7 @@ def write_to_bq(self, dataset: str, gcs_bucket: str, write_local_instead_of_bq=F needed for this data source.""" if write_local_instead_of_bq: print("TODO: Writing to local file instead of BigQuery") - self.write_to_bq_table(dataset, gcs_bucket, self.get_attr(attrs, 'filename'), self.get_table_name()) + self.write_to_bq_table(dataset, gcs_bucket, self.get_attr(attrs, "filename"), self.get_table_name()) def write_to_bq_table(self, dataset: str, gcs_bucket: str, filename: str, table_name: str, project=None) -> None: """Writes source data from GCS bucket to BigQuery @@ -80,6 +80,6 @@ def clean_frame_column_names(self, frame: pd.DataFrame) -> None: frame: The pandas dataframe with unclean columns """ frame.rename( - columns=lambda col: (re.sub('[^0-9a-zA-Z_=%]+', '_', col).lower().replace('=', 'eq').replace('%', 'pct')), + columns=lambda col: (re.sub("[^0-9a-zA-Z_=%]+", "_", col).lower().replace("=", "eq").replace("%", "pct")), inplace=True, ) diff --git a/python/datasources/decia_2010_territory_population.py b/python/datasources/decia_2010_territory_population.py index 3025f7224d..312267bd98 100644 --- a/python/datasources/decia_2010_territory_population.py +++ b/python/datasources/decia_2010_territory_population.py @@ -25,26 +25,26 @@ def get_breakdown_col(df): class Decia2010TerritoryPopulationData(DataSource): @staticmethod def get_id(): - return 'DECIA_2010_POPULATION' + return "DECIA_2010_POPULATION" @staticmethod def get_table_name(): - return 'decia_2010_territory_population' + return "decia_2010_territory_population" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for Decia2010TerritoryPopulationData') + raise NotImplementedError("upload_to_gcs should not be called for Decia2010TerritoryPopulationData") def write_to_bq(self, dataset, gcs_bucket, **attrs): - gcs_files = self.get_attr(attrs, 'filename') + gcs_files = self.get_attr(attrs, "filename") # In this instance, we expect filename to be a string with # comma-separated CSV filenames. - if ',' not in gcs_files: - raise ValueError('filename passed to write_to_bq is not a ' + 'comma-separated list of files') - files = gcs_files.split(',') + if "," not in gcs_files: + raise ValueError("filename passed to write_to_bq is not a " + "comma-separated list of files") + files = gcs_files.split(",") for f in files: - df = gcs_to_bq_util.load_json_as_df_from_data_dir("decia_2010_territory_population", f, {'state_fips': str}) + df = gcs_to_bq_util.load_json_as_df_from_data_dir("decia_2010_territory_population", f, {"state_fips": str}) total_val = Race.ALL.value if get_breakdown_col(df) == std_col.RACE_CATEGORY_ID_COL else std_col.ALL_VALUE @@ -58,7 +58,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): # Clean up column names. 
self.clean_frame_column_names(df) - demo_type = next((demo for demo in ['race', 'sex', 'age', 'race_and_ethnicity'] if demo in f), None) + demo_type = next((demo for demo in ["race", "sex", "age", "race_and_ethnicity"] if demo in f), None) table_id = gcs_to_bq_util.make_bq_table_id(demo_type, constants.STATE_LEVEL, constants.CURRENT) column_types = gcs_to_bq_util.get_bq_column_types( diff --git a/python/datasources/decia_2020_territory_population.py b/python/datasources/decia_2020_territory_population.py index 3d9f96cc34..c265854df2 100644 --- a/python/datasources/decia_2020_territory_population.py +++ b/python/datasources/decia_2020_territory_population.py @@ -154,29 +154,29 @@ } # used to differentiate renamed columns before melting into HET style df -TMP_COUNT_SUFFIX: Final = '_count' +TMP_COUNT_SUFFIX: Final = "_count" COUNT_CHAR: Final = "C" -TMP_PCT_SHARE_SUFFIX: Final = '_pct_share' +TMP_PCT_SHARE_SUFFIX: Final = "_pct_share" PCT_CHAR: Final = "P" class Decia2020TerritoryPopulationData(DataSource): @staticmethod def get_id(): - return 'DECIA_2020_TERRITORY_POPULATION_DATA' + return "DECIA_2020_TERRITORY_POPULATION_DATA" @staticmethod def get_table_name(): - return 'decia_2020_territory_population_data' + return "decia_2020_territory_population_data" def upload_to_gcs(self, gcs_bucket, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for Decia2020TerritoryPopulationData') + raise NotImplementedError("upload_to_gcs should not be called for Decia2020TerritoryPopulationData") def write_to_bq(self, dataset, gcs_bucket, **attrs): # get GEO and DEMO from DAG payload - breakdown = self.get_attr(attrs, 'demographic') - geo_level = self.get_attr(attrs, 'geographic') + breakdown = self.get_attr(attrs, "demographic") + geo_level = self.get_attr(attrs, "geographic") raw_dfs_by_postal_map = load_source_dfs() df = self.generate_breakdown_df(raw_dfs_by_postal_map, breakdown, geo_level) float_cols = [std_col.POPULATION_COL, std_col.POPULATION_PCT_COL] @@ -221,7 +221,7 @@ def generate_breakdown_df( rename_map = get_rename_map(RACE_CODES_TO_STD[postal]) # cleanup and store raw dfs - raw_df.loc[:, value_cols] = raw_df[value_cols].replace(['-', '(X)'], np.nan).astype(float) + raw_df.loc[:, value_cols] = raw_df[value_cols].replace(["-", "(X)"], np.nan).astype(float) needed_cols = [geo_col] + value_cols raw_df = raw_df[needed_cols] raw_df = raw_df.rename(columns=rename_map) @@ -295,7 +295,7 @@ def get_source_col_names(source_codes_map: Dict[str, str], metric: Literal["_cou suffix_char = COUNT_CHAR if metric == TMP_PCT_SHARE_SUFFIX: suffix_char = PCT_CHAR - return [f'{code}{suffix_char}' for code in list(source_codes_map.keys())] + return [f"{code}{suffix_char}" for code in list(source_codes_map.keys())] def generate_summed_age_cols(df: pd.DataFrame) -> pd.DataFrame: @@ -311,8 +311,8 @@ def generate_summed_age_cols(df: pd.DataFrame) -> pd.DataFrame: for buckets_to_sum_tuple, summed_bucket in STD_AGES_SUM_MAP.items(): for metric_suffix in [TMP_COUNT_SUFFIX, TMP_PCT_SHARE_SUFFIX]: - cols_to_sum = [f'{bucket}{metric_suffix}' for bucket in buckets_to_sum_tuple] - df[f'{summed_bucket}{metric_suffix}'] = df[cols_to_sum].sum(min_count=1, axis=1) + cols_to_sum = [f"{bucket}{metric_suffix}" for bucket in buckets_to_sum_tuple] + df[f"{summed_bucket}{metric_suffix}"] = df[cols_to_sum].sum(min_count=1, axis=1) return df @@ -321,7 +321,7 @@ def format_fips_col(df: pd.DataFrame, geo_col: str) -> pd.DataFrame: ("state_fips" or "county_fips")""" # FIPS codes are at the end of the string - 
df[geo_col] = df["GEO_ID"].str.split('US').str[1] + df[geo_col] = df["GEO_ID"].str.split("US").str[1] # only keep the requested geo level rows if geo_col == std_col.STATE_FIPS_COL: df = df[df[geo_col].str.len() == 2] @@ -342,8 +342,8 @@ def get_rename_map(code_map: Dict) -> Dict[str, str]: to temporary, pre-melt, HET-group col names""" rename_map = {} for code, group in code_map.items(): - rename_map[f'{code}{COUNT_CHAR}'] = f'{group}{TMP_COUNT_SUFFIX}' - rename_map[f'{code}{PCT_CHAR}'] = f'{group}{TMP_PCT_SHARE_SUFFIX}' + rename_map[f"{code}{COUNT_CHAR}"] = f"{group}{TMP_COUNT_SUFFIX}" + rename_map[f"{code}{PCT_CHAR}"] = f"{group}{TMP_PCT_SHARE_SUFFIX}" return rename_map @@ -354,7 +354,7 @@ def get_melt_map(code_map: Dict, metric_suffix: Literal["_count", "_pct_share"]) Returns a map for melting the temporary, pre-melt, HET-group metric col names into final HET groups used per row in the metric col """ - return {f'{group}{metric_suffix}': group for group in code_map.values()} + return {f"{group}{metric_suffix}": group for group in code_map.values()} def use_nonNH_as_NH(df: pd.DataFrame) -> pd.DataFrame: @@ -365,8 +365,8 @@ def use_nonNH_as_NH(df: pd.DataFrame) -> pd.DataFrame: the non-NH races as the NH races when they are not provided (as in VI)""" for non_nh_col, nh_col in NON_NH_TO_NH_RACE_MAP.items(): - df[f'{nh_col}_count'] = df[f'{non_nh_col}_count'] - df[f'{nh_col}_pct_share'] = df[f'{non_nh_col}_pct_share'] + df[f"{nh_col}_count"] = df[f"{non_nh_col}_count"] + df[f"{nh_col}_pct_share"] = df[f"{non_nh_col}_pct_share"] return df @@ -379,7 +379,7 @@ def add_combo_race_cols(df: pd.DataFrame) -> pd.DataFrame: for suffix in ["_count", "_pct_share"]: for races_to_sum_tuple, combo_race in COMBO_RACES_SUM_MAP.items(): - race_cols_to_sum = [f'{race}{suffix}' for race in races_to_sum_tuple if f'{race}{suffix}' in df.columns] - df[f'{combo_race}{suffix}'] = df[race_cols_to_sum].sum(axis=1) + race_cols_to_sum = [f"{race}{suffix}" for race in races_to_sum_tuple if f"{race}{suffix}" in df.columns] + df[f"{combo_race}{suffix}"] = df[race_cols_to_sum].sum(axis=1) return df diff --git a/python/datasources/geo_context.py b/python/datasources/geo_context.py index bcfb892d3a..c95414b7f1 100644 --- a/python/datasources/geo_context.py +++ b/python/datasources/geo_context.py @@ -26,20 +26,20 @@ def format_svi(value: float) -> float: return np.nan if 0 <= value <= 1: return round(value, 2) - raise ValueError(f'The provided SVI: {value} is not an expected number between 0.0-1.0') + raise ValueError(f"The provided SVI: {value} is not an expected number between 0.0-1.0") class GeoContext(DataSource): @staticmethod def get_id(): - return 'GEO_CONTEXT' + return "GEO_CONTEXT" @staticmethod def get_table_name(): - return 'geo_context' + return "geo_context" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for GeoContext') + raise NotImplementedError("upload_to_gcs should not be called for GeoContext") def write_to_bq(self, dataset, gcs_bucket, **attrs): @@ -49,7 +49,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): if geo_level == COUNTY_LEVEL: float_cols.append(std_col.SVI) column_types = gcs_to_bq_util.get_bq_column_types(df, float_cols=float_cols) - table_id = gcs_to_bq_util.make_bq_table_id('alls', geo_level, CURRENT) + table_id = gcs_to_bq_util.make_bq_table_id("alls", geo_level, CURRENT) gcs_to_bq_util.add_df_to_bq(df, dataset, table_id, column_types=column_types) def generate_breakdown(self, geo_level: Literal["national", "state", "county"]) -> 
pd.DataFrame: @@ -80,7 +80,7 @@ def merge_svi_data(df): original df with added std_col.SVI column of floats """ svi_df = gcs_to_bq_util.load_csv_as_df_from_data_dir( - 'cdc_svi_county', "cdc_svi_county_totals.csv", dtype={'FIPS': str} + "cdc_svi_county", "cdc_svi_county_totals.csv", dtype={"FIPS": str} ) columns_to_standard = {"FIPS": std_col.COUNTY_FIPS_COL, "RPL_THEMES": std_col.SVI} svi_df = svi_df.rename(columns=columns_to_standard) diff --git a/python/datasources/graphql_ahr.py b/python/datasources/graphql_ahr.py index c52738f5ea..dc00910088 100644 --- a/python/datasources/graphql_ahr.py +++ b/python/datasources/graphql_ahr.py @@ -29,36 +29,36 @@ from ingestion.merge_utils import merge_state_ids, merge_yearly_pop_numbers, merge_intersectional_pop # String constants from AHR source data -AHR_MEASURE = 'measure' -AHR_VALUE = 'value' +AHR_MEASURE = "measure" +AHR_VALUE = "value" LAST_COMPLETE_DATA_YEAR = 2022 AGE_GROUPS_TO_STANDARD = { - 'Ages 15-24': '15-24', - 'Ages 18-24': '18-24', - 'Ages 18-44': '18-44', - 'Ages 25-34': '25-34', - 'Ages 35-44': '35-44', - 'Ages 45-54': '45-54', - 'Ages 45-64': '45-64', - 'Ages 55-64': '55-64', - 'Age 65+': '65+', - 'Ages 65-74': '65-74', - 'Ages 75-84': '75-84', - 'Age 85+': '85+', + "Ages 15-24": "15-24", + "Ages 18-24": "18-24", + "Ages 18-44": "18-44", + "Ages 25-34": "25-34", + "Ages 35-44": "35-44", + "Ages 45-54": "45-54", + "Ages 45-64": "45-64", + "Ages 55-64": "55-64", + "Age 65+": "65+", + "Ages 65-74": "65-74", + "Ages 75-84": "75-84", + "Age 85+": "85+", } RACE_GROUPS_TO_STANDARD = { - 'American Indian/Alaska Native': std_col.Race.AIAN_NH.value, - 'Asian': std_col.Race.ASIAN_NH.value, - 'Asian/Pacific Islander': std_col.Race.API_NH.value, - 'Black': std_col.Race.BLACK_NH.value, - 'Hispanic': std_col.Race.HISP.value, - 'Hawaiian/Pacific Islander': std_col.Race.NHPI_NH.value, - 'Other Race': std_col.Race.OTHER_STANDARD_NH.value, - 'White': std_col.Race.WHITE_NH.value, - 'Multiracial': std_col.Race.MULTI_NH.value, - 'All': std_col.Race.ALL.value, + "American Indian/Alaska Native": std_col.Race.AIAN_NH.value, + "Asian": std_col.Race.ASIAN_NH.value, + "Asian/Pacific Islander": std_col.Race.API_NH.value, + "Black": std_col.Race.BLACK_NH.value, + "Hispanic": std_col.Race.HISP.value, + "Hawaiian/Pacific Islander": std_col.Race.NHPI_NH.value, + "Other Race": std_col.Race.OTHER_STANDARD_NH.value, + "White": std_col.Race.WHITE_NH.value, + "Multiracial": std_col.Race.MULTI_NH.value, + "All": std_col.Race.ALL.value, } AHR_AGE_GROUPS = list(AGE_GROUPS_TO_STANDARD.keys()) @@ -66,22 +66,22 @@ AHR_SEX_GROUPS = [Sex.FEMALE, Sex.MALE] RATE_TO_RAW_18PLUS_MAP = { - rate_col: f'{std_col.extract_prefix(rate_col)}_{std_col.RAW_SUFFIX}' + rate_col: f"{std_col.extract_prefix(rate_col)}_{std_col.RAW_SUFFIX}" for rate_col in AHR_MEASURES_TO_RATES_MAP_18PLUS.values() } RATE_TO_RAW_ALL_AGES_MAP = { - rate_col: f'{std_col.extract_prefix(rate_col)}_{std_col.RAW_SUFFIX}' + rate_col: f"{std_col.extract_prefix(rate_col)}_{std_col.RAW_SUFFIX}" for rate_col in AHR_MEASURES_TO_RATES_MAP_ALL_AGES.values() } RAW_TO_SHARE_ALL_AGES_MAP = { - raw_col: f'{std_col.extract_prefix(raw_col)}_{std_col.PCT_SHARE_SUFFIX}' + raw_col: f"{std_col.extract_prefix(raw_col)}_{std_col.PCT_SHARE_SUFFIX}" for raw_col in RATE_TO_RAW_ALL_AGES_MAP.values() } RAW_TO_SHARE_18PLUS_MAP = { - raw_col: f'{std_col.extract_prefix(raw_col)}_{std_col.PCT_SHARE_SUFFIX}' + raw_col: f"{std_col.extract_prefix(raw_col)}_{std_col.PCT_SHARE_SUFFIX}" for raw_col in RATE_TO_RAW_18PLUS_MAP.values() } @@ -92,14 +92,14 @@ 
def __init__(self) -> None: @staticmethod def get_id(): - return 'GRAPHQL_AHR_DATA' + return "GRAPHQL_AHR_DATA" @staticmethod def get_table_name(): - return 'graphql_ahr_data' + return "graphql_ahr_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for AHRData') + raise NotImplementedError("upload_to_gcs should not be called for AHRData") def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **attrs): demographic = self.get_attr(attrs, "demographic") @@ -116,7 +116,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at for rate_col in AHR_BASE_MEASURES_TO_RATES_MAP.values() if rate_col in df.columns ] - topic_prefixes.append('ahr') + topic_prefixes.append("ahr") df = df[df[std_col.TIME_PERIOD_COL].astype(int) <= LAST_COMPLETE_DATA_YEAR] df_for_bq, col_types = get_timeview_df_and_cols(df, time_view, topic_prefixes) first_two_columns = df_for_bq.columns[:2].tolist() @@ -197,7 +197,7 @@ def post_process(self, df: pd.DataFrame, demographic: DEMOGRAPHIC_TYPE, geo_leve if demographic != std_col.AGE_COL: breakdown_df, pop_18plus_col = merge_intersectional_pop( - breakdown_df, geo_level, demographic, age_specific_group='18+' + breakdown_df, geo_level, demographic, age_specific_group="18+" ) rate_to_raw_18plus_map = { @@ -214,7 +214,7 @@ def post_process(self, df: pd.DataFrame, demographic: DEMOGRAPHIC_TYPE, geo_leve ) # all columns need to be provider-specific for the frontend - ahr_pop18plus_col = 'ahr_' + pop_18plus_col + ahr_pop18plus_col = "ahr_" + pop_18plus_col breakdown_df = breakdown_df.rename( columns={ pop_18plus_col: ahr_pop18plus_col, @@ -287,7 +287,7 @@ def parse_raw_data(df: pd.DataFrame, breakdown_col: DEMOGRAPHIC_TYPE): index=[std_col.TIME_PERIOD_COL, std_col.STATE_POSTAL_COL, breakdown_col], columns=AHR_MEASURE, values=AHR_VALUE, - aggfunc='first', + aggfunc="first", ).reset_index() pivot_df = pivot_df.sort_values(by=std_col.TIME_PERIOD_COL, ascending=False) @@ -296,7 +296,7 @@ def parse_raw_data(df: pd.DataFrame, breakdown_col: DEMOGRAPHIC_TYPE): def get_float_cols( - time_type: Literal['current', 'historical'], demo_col: DEMOGRAPHIC_TYPE, intersectional_pop_cols: List[str] + time_type: Literal["current", "historical"], demo_col: DEMOGRAPHIC_TYPE, intersectional_pop_cols: List[str] ) -> List[str]: """Builds a list of col names representing numerical data per breakdown. 
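Note: nearly every hunk in this patch is mechanical. Black's default string normalization rewrites Python string literals to prefer double quotes, keeping single quotes only where switching would force extra escaping; the remaining hunks are black's standard line-wrapping. A minimal sketch of the quoting rule, assuming black's default behavior (the variable names below are illustrative only, not from this codebase):

# before normalization
greeting = 'hello'
nested = 'say "hi"'

# after black rewrites the file
greeting = "hello"  # plain literals become double-quoted
nested = 'say "hi"'  # left single-quoted, since double quotes inside would need escaping

The hunks that follow for kff_vaccination.py and the remaining datasources are largely this same substitution applied file by file.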
diff --git a/python/datasources/kff_vaccination.py b/python/datasources/kff_vaccination.py index 823eea3562..ad537b20b0 100644 --- a/python/datasources/kff_vaccination.py +++ b/python/datasources/kff_vaccination.py @@ -9,68 +9,68 @@ from ingestion.constants import STATE_LEVEL, RACE, CURRENT BASE_KFF_URL_TOTALS_STATE = ( - 'https://raw.githubusercontent.com/KFFData/COVID-19-Data/kff_master/State%20Trend%20Data/State_Trend_Data.csv' + "https://raw.githubusercontent.com/KFFData/COVID-19-Data/kff_master/State%20Trend%20Data/State_Trend_Data.csv" ) BASE_GITHUB_API_URL = "https://api.github.com/repos/KFFData/COVID-19-Data/git/trees/kff_master?recursive=1" -TOTAL_KEY = 'one_dose' +TOTAL_KEY = "one_dose" UNKNOWN_TO_STANDARD = { - '% of Vaccinations with Unknown Race': Race.UNKNOWN.value, - '% of Vaccinations with Unknown Ethnicity': Race.ETHNICITY_UNKNOWN.value, + "% of Vaccinations with Unknown Race": Race.UNKNOWN.value, + "% of Vaccinations with Unknown Ethnicity": Race.ETHNICITY_UNKNOWN.value, } KFF_RACES_PCT_SHARE = [ - 'White', - 'Black', - 'Hispanic', - 'Asian', - 'American Indian or Alaska Native', - 'Native Hawaiian or Other Pacific Islander', - 'Other', + "White", + "Black", + "Hispanic", + "Asian", + "American Indian or Alaska Native", + "Native Hawaiian or Other Pacific Islander", + "Other", ] -KFF_RACES_PCT_TOTAL = ['White', 'Black', 'Hispanic', 'Asian'] +KFF_RACES_PCT_TOTAL = ["White", "Black", "Hispanic", "Asian"] KFF_RACES_TO_STANDARD_NH = { - 'White': Race.WHITE_NH.value, - 'Black': Race.BLACK_NH.value, - 'Hispanic': Race.HISP.value, - 'Asian': Race.ASIAN_NH.value, - 'American Indian or Alaska Native': Race.AIAN_NH.value, - 'Native Hawaiian or Other Pacific Islander': Race.NHPI_NH.value, - 'AAPI': Race.API_NH.value, - 'Other': Race.OTHER_NONSTANDARD_NH.value, + "White": Race.WHITE_NH.value, + "Black": Race.BLACK_NH.value, + "Hispanic": Race.HISP.value, + "Asian": Race.ASIAN_NH.value, + "American Indian or Alaska Native": Race.AIAN_NH.value, + "Native Hawaiian or Other Pacific Islander": Race.NHPI_NH.value, + "AAPI": Race.API_NH.value, + "Other": Race.OTHER_NONSTANDARD_NH.value, } KFF_RACES_TO_STANDARD = { - 'White': Race.WHITE.value, - 'Black': Race.BLACK.value, - 'Hispanic': Race.HISP.value, - 'Asian': Race.ASIAN.value, - 'American Indian or Alaska Native': Race.AIAN.value, - 'Native Hawaiian or Other Pacific Islander': Race.NHPI.value, - 'AAPI': Race.API.value, - 'Other': Race.OTHER_NONSTANDARD.value, + "White": Race.WHITE.value, + "Black": Race.BLACK.value, + "Hispanic": Race.HISP.value, + "Asian": Race.ASIAN.value, + "American Indian or Alaska Native": Race.AIAN.value, + "Native Hawaiian or Other Pacific Islander": Race.NHPI.value, + "AAPI": Race.API.value, + "Other": Race.OTHER_NONSTANDARD.value, } AAPI_STATES = { - 'Arizona', - 'Connecticut', - 'District of Columbia', - 'Michigan', - 'Minnesota', - 'Nevada', - 'New Mexico', - 'North Carolina', - 'Oklahoma', - 'South Carolina', - 'Virginia', + "Arizona", + "Connecticut", + "District of Columbia", + "Michigan", + "Minnesota", + "Nevada", + "New Mexico", + "North Carolina", + "Oklahoma", + "South Carolina", + "Virginia", } -KFF_TERRITORIES = ['Guam', 'Puerto Rico', 'Northern Mariana Islands'] -VACCINATED_FIRST_DOSE = 'one_dose' +KFF_TERRITORIES = ["Guam", "Puerto Rico", "Northern Mariana Islands"] +VACCINATED_FIRST_DOSE = "one_dose" def get_data_url(data_type): @@ -81,15 +81,15 @@ def get_data_url(data_type): or 'pct_population' """ data_types_to_strings = { - 'pct_total': 'Percent of Total Population that has Received a 
COVID-19 Vaccine by RaceEthnicity', - 'pct_share': 'COVID19 Vaccinations by RE', - 'pct_population': 'Distribution of Vaccinations, Cases, Deaths', + "pct_total": "Percent of Total Population that has Received a COVID-19 Vaccine by RaceEthnicity", + "pct_share": "COVID19 Vaccinations by RE", + "pct_population": "Distribution of Vaccinations, Cases, Deaths", } df = gcs_to_bq_util.load_json_as_df_from_web_based_on_key(BASE_GITHUB_API_URL, "tree") - df = df.loc[df['path'].str.contains(data_types_to_strings[data_type])] + df = df.loc[df["path"].str.contains(data_types_to_strings[data_type])] - urls = df.loc[df['path'] == df['path'].max()].url + urls = df.loc[df["path"] == df["path"].max()].url if len(urls) != 1: raise ValueError(f"Found {len(urls)} urls, should have only found 1") @@ -98,15 +98,15 @@ def get_data_url(data_type): def generate_total_pct_key(race): - return f'% of Total {race} Population Vaccinated' + return f"% of Total {race} Population Vaccinated" def generate_pct_share_key(race): - return f'{race} % of Vaccinations' + return f"{race} % of Vaccinations" def generate_pct_of_population_key(race): - return f'{race} Percent of Total Population' + return f"{race} Percent of Total Population" def get_unknown_rows(df, state): @@ -140,7 +140,7 @@ def generate_output_row(state_row_pct_share, state_row_pct_total, state_row_pct_ """ races_map = KFF_RACES_TO_STANDARD - if state_row_pct_share['Race Categories Include Hispanic Individuals'].values[0] != 'Yes': + if state_row_pct_share["Race Categories Include Hispanic Individuals"].values[0] != "Yes": races_map = KFF_RACES_TO_STANDARD_NH output_row = {} @@ -154,7 +154,7 @@ def generate_output_row(state_row_pct_share, state_row_pct_total, state_row_pct_ ) if race == "Asian" and state in AAPI_STATES: - race = 'AAPI' + race = "AAPI" output_row[std_col.RACE_CATEGORY_ID_COL] = races_map[race] @@ -171,8 +171,8 @@ def generate_total_row(state_row_totals, state): output_row[std_col.STATE_NAME_COL] = state output_row[std_col.RACE_CATEGORY_ID_COL] = Race.ALL.value - state_row_totals = state_row_totals.loc[~state_row_totals['one_dose'].isnull()] - latest_row = state_row_totals.loc[state_row_totals['date'] == state_row_totals['date'].max()] + state_row_totals = state_row_totals.loc[~state_row_totals["one_dose"].isnull()] + latest_row = state_row_totals.loc[state_row_totals["date"] == state_row_totals["date"].max()] output_row[VACCINATED_FIRST_DOSE] = str(latest_row[TOTAL_KEY].values[0]) output_row[std_col.VACCINATED_POP_PCT] = "1.0" return output_row @@ -181,26 +181,26 @@ def generate_total_row(state_row_totals, state): class KFFVaccination(DataSource): @staticmethod def get_id(): - return 'KFF_VACCINATION' + return "KFF_VACCINATION" @staticmethod def get_table_name(): - return 'kff_vaccination' + return "kff_vaccination" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for KFFVaccination') + raise NotImplementedError("upload_to_gcs should not be called for KFFVaccination") def parse_data(self): """Parses vaccine data from all needed data sources and places all needed info into HET style df.""" - percentage_of_total_url = get_data_url('pct_total') + percentage_of_total_url = get_data_url("pct_total") percentage_of_total_df = github_util.decode_json_from_url_into_df(percentage_of_total_url) - pct_share_url = get_data_url('pct_share') + pct_share_url = get_data_url("pct_share") pct_share_df = github_util.decode_json_from_url_into_df(pct_share_url) - pct_population_url = get_data_url('pct_population') + 
pct_population_url = get_data_url("pct_population") pct_population_df = github_util.decode_json_from_url_into_df(pct_population_url) total_df = gcs_to_bq_util.load_csv_as_df_from_web(BASE_KFF_URL_TOTALS_STATE, dtype={TOTAL_KEY: str}) @@ -215,14 +215,14 @@ def parse_data(self): std_col.VACCINATED_POP_PCT, ] - states = percentage_of_total_df['Location'].drop_duplicates().to_list() - states.remove('United States') + states = percentage_of_total_df["Location"].drop_duplicates().to_list() + states.remove("United States") for state in states: - state_row_pct_share = pct_share_df.loc[pct_share_df['Location'] == state] - state_row_pct_total = percentage_of_total_df.loc[percentage_of_total_df['Location'] == state] - state_row_totals = total_df.loc[total_df['state'] == state] - state_row_pct_population = pct_population_df.loc[pct_population_df['State'] == state] + state_row_pct_share = pct_share_df.loc[pct_share_df["Location"] == state] + state_row_pct_total = percentage_of_total_df.loc[percentage_of_total_df["Location"] == state] + state_row_totals = total_df.loc[total_df["state"] == state] + state_row_pct_population = pct_population_df.loc[pct_population_df["State"] == state] output.extend(get_unknown_rows(state_row_pct_share, state)) @@ -241,7 +241,7 @@ def parse_data(self): output.append(generate_total_row(state_row_totals, state)) for territory in KFF_TERRITORIES: - state_row_totals = total_df.loc[total_df['state'] == territory] + state_row_totals = total_df.loc[total_df["state"] == territory] output.append(generate_total_row(state_row_totals, territory)) return pd.DataFrame(output, columns=columns) @@ -322,7 +322,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at df.loc[:, std_col.SEX_COL] = std_col.ALL_VALUE df.loc[:, std_col.AGE_COL] = std_col.ALL_VALUE col_types = gcs_to_bq_util.get_bq_column_types(df, float_cols) - gcs_to_bq_util.add_df_to_bq(df, dataset, 'alls_state_current', column_types=col_types) + gcs_to_bq_util.add_df_to_bq(df, dataset, "alls_state_current", column_types=col_types) def clean_row(df, column): @@ -334,8 +334,8 @@ def clean_row(df, column): column: Column name to clean.""" df[column] = df[column].fillna(np.nan) df[column] = df[column].replace(0, np.nan) - df[column] = df[column].replace('<0.01', np.nan) - df[column] = df[column].replace('NR', np.nan) - df[column] = df[column].replace('>.99', 1.0) + df[column] = df[column].replace("<0.01", np.nan) + df[column] = df[column].replace("NR", np.nan) + df[column] = df[column].replace(">.99", 1.0) df[column] = df[column].astype(float) return df diff --git a/python/datasources/maternal_mortality.py b/python/datasources/maternal_mortality.py index bcd1fc9753..3e8d0abdf8 100644 --- a/python/datasources/maternal_mortality.py +++ b/python/datasources/maternal_mortality.py @@ -19,12 +19,12 @@ JAMA_CURRENT_YEAR = "2019" JAMA_RACE_GROUPS_TO_STANDARD = { - 'Non-Hispanic American Indian and Alaska Native': std_col.Race.AIAN_NH.value, - 'Non-Hispanic Asian, Native Hawaiian, or Other Pacific Islander': std_col.Race.API_NH.value, - 'Non-Hispanic Black': std_col.Race.BLACK_NH.value, - 'Non-Hispanic White': std_col.Race.WHITE_NH.value, - 'Hispanic and any race': std_col.Race.HISP.value, - 'All racial and ethnic groups': std_col.Race.ALL.value, + "Non-Hispanic American Indian and Alaska Native": std_col.Race.AIAN_NH.value, + "Non-Hispanic Asian, Native Hawaiian, or Other Pacific Islander": std_col.Race.API_NH.value, + "Non-Hispanic Black": std_col.Race.BLACK_NH.value, + "Non-Hispanic White": 
std_col.Race.WHITE_NH.value, + "Hispanic and any race": std_col.Race.HISP.value, + "All racial and ethnic groups": std_col.Race.ALL.value, } # Constants from the CDC Natality data @@ -49,9 +49,9 @@ # DATA FOR NATIONAL AND REGIONAL COUNTS ARE FROM THE IMAGE IN THE # ORIGINAL STUDY LABELED "Table" AND MANUALLY INPUTTED TO /data -JAMA_RACE = 'race_group' -JAMA_STATE_NAME = 'location_name' -JAMA_TIME_PERIOD = 'year_id' +JAMA_RACE = "race_group" +JAMA_STATE_NAME = "location_name" +JAMA_TIME_PERIOD = "year_id" COLS_TO_STANDARD = { JAMA_RACE: std_col.RACE_CATEGORY_ID_COL, @@ -59,20 +59,20 @@ JAMA_TIME_PERIOD: std_col.TIME_PERIOD_COL, } -RATE_COLS_TO_STANDARD = {'val': std_col.MM_PER_100K, **COLS_TO_STANDARD} +RATE_COLS_TO_STANDARD = {"val": std_col.MM_PER_100K, **COLS_TO_STANDARD} class MaternalMortalityData(DataSource): @staticmethod def get_id(): - return 'MATERNAL_MORTALITY_DATA' + return "MATERNAL_MORTALITY_DATA" @staticmethod def get_table_name(): - return 'maternal_mortality_data' + return "maternal_mortality_data" def upload_to_gcs(self, _, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for MaternalMortalityData') + raise NotImplementedError("upload_to_gcs should not be called for MaternalMortalityData") def write_to_bq(self, dataset, gcs_bucket, **attrs): @@ -125,7 +125,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): ).reset_index(drop=True) for time_type in [HISTORICAL, CURRENT]: - table_id = gcs_to_bq_util.make_bq_table_id('race', geo_level, time_type) + table_id = gcs_to_bq_util.make_bq_table_id("race", geo_level, time_type) float_cols = get_float_cols(time_type, geo_level) df_for_bq = df.copy()[keep_string_cols + float_cols] if time_type == CURRENT: @@ -140,8 +140,8 @@ def preprocess_source_rates() -> pd.DataFrame: pandas.DataFrame: preprocessed source data including state and national rows """ df = gcs_to_bq_util.load_csv_as_df_from_data_dir( - 'maternal_mortality', - 'IHME_USA_MMR_STATE_RACE_ETHN_1999_2019_ESTIMATES_Y2023M07D03.CSV', + "maternal_mortality", + "IHME_USA_MMR_STATE_RACE_ETHN_1999_2019_ESTIMATES_Y2023M07D03.CSV", dtype={JAMA_TIME_PERIOD: str}, usecols=RATE_COLS_TO_STANDARD.keys(), ) @@ -163,8 +163,8 @@ def merge_national_counts(df: pd.DataFrame) -> pd.DataFrame: """ jama_national_counts_df = gcs_to_bq_util.load_csv_as_df_from_data_dir( - 'maternal_mortality', - 'Table.csv', + "maternal_mortality", + "Table.csv", dtype={JAMA_TIME_PERIOD: str}, usecols=[ JAMA_RACE, @@ -245,9 +245,9 @@ def read_live_births_denominators() -> pd.DataFrame: usecols = [CDC_RACE, CDC_ETH, CDC_BIRTHS, CDC_STATE_FIPS] df = gcs_to_bq_util.load_tsv_as_df_from_data_dir( - 'maternal_mortality', - 'Natality, 2016-2022 expanded.txt', - delimiter='\t', + "maternal_mortality", + "Natality, 2016-2022 expanded.txt", + delimiter="\t", skipinitialspace=True, dtype={CDC_STATE_FIPS: str}, usecols=usecols, @@ -290,7 +290,7 @@ def merge_state_counts(df: pd.DataFrame) -> pd.DataFrame: df, live_births_df, on=[std_col.STATE_FIPS_COL, std_col.TIME_PERIOD_COL, std_col.RACE_CATEGORY_ID_COL], - how='left', + how="left", ) # estimate the number of maternal deaths using the rate per 100k and the original denominator of live births diff --git a/python/datasources/phrma.py b/python/datasources/phrma.py index 221c4e6f6e..24d0d937a9 100644 --- a/python/datasources/phrma.py +++ b/python/datasources/phrma.py @@ -53,18 +53,18 @@ class PhrmaData(DataSource): @staticmethod def get_id(): - return 'PHRMA_DATA' + return "PHRMA_DATA" @staticmethod def get_table_name(): - return 'phrma_data' + 
return "phrma_data" def upload_to_gcs(self, gcs_bucket, **attrs): - raise NotImplementedError('upload_to_gcs should not be called for PhrmaData') + raise NotImplementedError("upload_to_gcs should not be called for PhrmaData") def write_to_bq(self, dataset, gcs_bucket, **attrs): - demo_type = self.get_attr(attrs, 'demographic') - geo_level = self.get_attr(attrs, 'geographic') + demo_type = self.get_attr(attrs, "demographic") + geo_level = self.get_attr(attrs, "geographic") alls_df = load_phrma_df_from_data_dir(geo_level, TMP_ALL, PHRMA_MEDICARE, PHRMA_MEDICARE_CONDITIONS) @@ -74,8 +74,8 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): # POP COMPARE FOR 100K float_cols = [ - f'{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}_{std_col.PCT_SHARE_SUFFIX}', - f'{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}', + f"{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}_{std_col.PCT_SHARE_SUFFIX}", + f"{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}", ] # PCT_RATE CONDITIONS @@ -86,9 +86,9 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): std_col.PCT_SHARE_SUFFIX, std_col.RAW_SUFFIX, ]: - float_cols.append(f'{condition}_{ADHERENCE}_{metric}') + float_cols.append(f"{condition}_{ADHERENCE}_{metric}") # valid-population comparison pct_share and count cols - float_cols.append(f'{condition}_{BENEFICIARIES}_{std_col.RAW_SUFFIX}') + float_cols.append(f"{condition}_{BENEFICIARIES}_{std_col.RAW_SUFFIX}") # PER_100K CONDITIONS for condition in PHRMA_100K_CONDITIONS: @@ -98,7 +98,7 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs): std_col.PCT_SHARE_SUFFIX, std_col.RAW_SUFFIX, ]: - float_cols.append(f'{condition}_{metric}') + float_cols.append(f"{condition}_{metric}") col_types = gcs_to_bq_util.get_bq_column_types(df, float_cols) @@ -134,13 +134,13 @@ def generate_breakdown_df( # ADHERENCE rate for condition in PHRMA_PCT_CONDITIONS: - source_col_name = f'{condition}_{ADHERENCE_RATE}' - het_col_name = f'{condition}_{ADHERENCE}_{std_col.PCT_RATE_SUFFIX}' + source_col_name = f"{condition}_{ADHERENCE_RATE}" + het_col_name = f"{condition}_{ADHERENCE}_{std_col.PCT_RATE_SUFFIX}" df[het_col_name] = df[source_col_name].multiply(100).round() for condition in PHRMA_100K_CONDITIONS: - source_col_name = f'{condition}_{PER_100K}' - het_col_name = f'{condition}_{std_col.PER_100K_SUFFIX}' + source_col_name = f"{condition}_{PER_100K}" + het_col_name = f"{condition}_{std_col.PER_100K_SUFFIX}" df[het_col_name] = df[source_col_name].round() if geo_level == COUNTY_LEVEL: @@ -155,16 +155,16 @@ def generate_breakdown_df( count_to_share_map = { # Pct share of adherence **{ - f'{condition}_{COUNT_YES}': f'{condition}_{ADHERENCE}_{std_col.PCT_SHARE_SUFFIX}' + f"{condition}_{COUNT_YES}": f"{condition}_{ADHERENCE}_{std_col.PCT_SHARE_SUFFIX}" for condition in PHRMA_PCT_CONDITIONS }, # Pct Share for disease **{ - f'{condition}_{MEDICARE_DISEASE_COUNT}': f'{condition}_{std_col.PCT_SHARE_SUFFIX}' + f"{condition}_{MEDICARE_DISEASE_COUNT}": f"{condition}_{std_col.PCT_SHARE_SUFFIX}" for condition in PHRMA_100K_CONDITIONS }, # Shared comparison population share col for all 100ks - MEDICARE_POP_COUNT: (f'{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}_{std_col.PCT_SHARE_SUFFIX}'), + MEDICARE_POP_COUNT: (f"{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}_{std_col.PCT_SHARE_SUFFIX}"), } if demo_breakdown == std_col.RACE_OR_HISPANIC_COL: @@ -179,19 +179,19 @@ def generate_breakdown_df( df, count_to_share_map, cast(PHRMA_BREAKDOWN_TYPE, demo_col), all_val ) - rename_col_map = {MEDICARE_POP_COUNT: 
f'{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}'}
+    rename_col_map = {MEDICARE_POP_COUNT: f"{std_col.MEDICARE_PREFIX}_{std_col.POPULATION_COL}"}
 
     for condition in PHRMA_PCT_CONDITIONS:
-        rename_col_map[f'{condition}_{COUNT_YES}'] = f'{condition}_{ADHERENCE}_{std_col.RAW_SUFFIX}'
-        rename_col_map[f'{condition}_{COUNT_TOTAL}'] = f'{condition}_{BENEFICIARIES}_{std_col.RAW_SUFFIX}'
+        rename_col_map[f"{condition}_{COUNT_YES}"] = f"{condition}_{ADHERENCE}_{std_col.RAW_SUFFIX}"
+        rename_col_map[f"{condition}_{COUNT_TOTAL}"] = f"{condition}_{BENEFICIARIES}_{std_col.RAW_SUFFIX}"
 
     for condition in PHRMA_100K_CONDITIONS:
-        rename_col_map[f'{condition}_{MEDICARE_DISEASE_COUNT}'] = f'{condition}_{std_col.RAW_SUFFIX}'
+        rename_col_map[f"{condition}_{MEDICARE_DISEASE_COUNT}"] = f"{condition}_{std_col.RAW_SUFFIX}"
 
     df = df.rename(columns=rename_col_map)
 
     df = df.drop(
         columns=[
-            *[f'{condition}_{ADHERENCE_RATE}' for condition in PHRMA_PCT_CONDITIONS],
-            *[f'{condition}_{PER_100K}' for condition in PHRMA_100K_CONDITIONS],
+            *[f"{condition}_{ADHERENCE_RATE}" for condition in PHRMA_PCT_CONDITIONS],
+            *[f"{condition}_{PER_100K}" for condition in PHRMA_100K_CONDITIONS],
         ]
     )
diff --git a/python/datasources/phrma_brfss.py b/python/datasources/phrma_brfss.py
index a6e75e08d7..3c706c868e 100644
--- a/python/datasources/phrma_brfss.py
+++ b/python/datasources/phrma_brfss.py
@@ -39,18 +39,18 @@ class PhrmaBrfssData(DataSource):
 
     @staticmethod
     def get_id():
-        return 'PHRMA_BRFSS_DATA'
+        return "PHRMA_BRFSS_DATA"
 
     @staticmethod
     def get_table_name():
-        return 'phrma_brfss_data'
+        return "phrma_brfss_data"
 
     def upload_to_gcs(self, gcs_bucket, **attrs):
-        raise NotImplementedError('upload_to_gcs should not be called for PhrmaBrfssData')
+        raise NotImplementedError("upload_to_gcs should not be called for PhrmaBrfssData")
 
     def write_to_bq(self, dataset, gcs_bucket, **attrs):
-        demo_type = self.get_attr(attrs, 'demographic')
-        geo_level = self.get_attr(attrs, 'geographic')
+        demo_type = self.get_attr(attrs, "demographic")
+        geo_level = self.get_attr(attrs, "geographic")
         table_id = gcs_to_bq_util.make_bq_table_id(demo_type, geo_level, CURRENT)
         df = self.generate_breakdown_df(demo_type, geo_level)
         bq_col_types = build_bq_col_types(df)
@@ -86,8 +86,8 @@ def generate_breakdown_df(
 
         # ADHERENCE rate
         for condition in conditions:
-            source_col_name = f'{condition}_{ADHERENCE_RATE_LOWER}'
-            het_col_name = f'{condition.lower()}_{SCREENED}_{std_col.PCT_RATE_SUFFIX}'
+            source_col_name = f"{condition}_{ADHERENCE_RATE_LOWER}"
+            het_col_name = f"{condition.lower()}_{SCREENED}_{std_col.PCT_RATE_SUFFIX}"
             df[het_col_name] = df[source_col_name].round()
             df = df.drop(source_col_name, axis=1)
 
@@ -102,15 +102,15 @@ def generate_breakdown_df(
 
         for condition in conditions:
             # source cols
-            source_rate_numerator = f'{condition}_{COUNT_YES_LOWER}'
-            source_rate_denominator = f'{condition}_{COUNT_TOTAL_LOWER}'
+            source_rate_numerator = f"{condition}_{COUNT_YES_LOWER}"
+            source_rate_denominator = f"{condition}_{COUNT_TOTAL_LOWER}"
 
             # het cols to make
             cancer_type = condition.lower()
-            het_rate_numerator = f'{cancer_type}_{SCREENED}_{std_col.RAW_SUFFIX}'
-            het_rate_denominator = f'{cancer_type}_{SCREENING_ELIGIBLE}_{std_col.RAW_SUFFIX}'
-            het_pct_share = f'{cancer_type}_{SCREENED}_{std_col.PCT_SHARE_SUFFIX}'
-            het_pop_pct_share = f'{cancer_type}_{SCREENING_ELIGIBLE}_{std_col.POP_PCT_SUFFIX}'
+            het_rate_numerator = f"{cancer_type}_{SCREENED}_{std_col.RAW_SUFFIX}"
+            het_rate_denominator = f"{cancer_type}_{SCREENING_ELIGIBLE}_{std_col.RAW_SUFFIX}"
+            het_pct_share = f"{cancer_type}_{SCREENED}_{std_col.PCT_SHARE_SUFFIX}"
+            het_pop_pct_share = f"{cancer_type}_{SCREENING_ELIGIBLE}_{std_col.POP_PCT_SUFFIX}"
 
             # prepare rename mappings
             rename_col_map[source_rate_numerator] = het_rate_numerator
diff --git a/python/datasources/setup.py b/python/datasources/setup.py
index 348208b78a..7f1a49dca5 100644
--- a/python/datasources/setup.py
+++ b/python/datasources/setup.py
@@ -1,6 +1,7 @@
 from setuptools import setup
 
-setup(name='datasources',
-      package_dir={'datasources': ''},
-      packages=['datasources'],
-      )
+setup(
+    name="datasources",
+    package_dir={"datasources": ""},
+    packages=["datasources"],
+)
diff --git a/python/datasources/vera_incarceration_county.py b/python/datasources/vera_incarceration_county.py
index 1f00bee27d..9e1d487f96 100644
--- a/python/datasources/vera_incarceration_county.py
+++ b/python/datasources/vera_incarceration_county.py
@@ -16,7 +16,7 @@
 from ingestion.het_types import SEX_RACE_AGE_TYPE, SEX_RACE_ETH_AGE_TYPE, DEMOGRAPHIC_TYPE, GEO_TYPE
 
 COUNTY: GEO_TYPE = "county"
-BASE_VERA_URL = 'https://github.com/vera-institute/incarceration_trends/blob/master/incarceration_trends.csv?raw=true'
+BASE_VERA_URL = "https://github.com/vera-institute/incarceration_trends/blob/master/incarceration_trends.csv?raw=true"
 
 VERA_YEAR = "year"
 VERA_FIPS = "fips"
@@ -173,22 +173,22 @@ def get_vera_col_types(demo_type: str):
 class VeraIncarcerationCounty(DataSource):
     @staticmethod
     def get_id():
-        return 'VERA_INCARCERATION_COUNTY'
+        return "VERA_INCARCERATION_COUNTY"
 
     @staticmethod
     def get_table_name():
-        return 'vera_incarceration_county'
+        return "vera_incarceration_county"
 
     def upload_to_gcs(self, _, **attrs):
-        raise NotImplementedError('upload_to_gcs should not be called for VeraIncarcerationCounty')
+        raise NotImplementedError("upload_to_gcs should not be called for VeraIncarcerationCounty")
 
     def write_to_bq(self, dataset, gcs_bucket, **attrs):
-        demo_type = self.get_attr(attrs, 'demographic')
+        demo_type = self.get_attr(attrs, "demographic")
         vera_col_types = get_vera_col_types(demo_type)
 
         df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
-            "vera", 'incarceration_trends.csv', usecols=list(vera_col_types.keys()), dtype=vera_col_types
+            "vera", "incarceration_trends.csv", usecols=list(vera_col_types.keys()), dtype=vera_col_types
         )
         df = df.rename(columns={VERA_FIPS: std_col.COUNTY_FIPS_COL, VERA_YEAR: std_col.TIME_PERIOD_COL})
         df = ensure_leading_zeros(df, std_col.COUNTY_FIPS_COL, 5)
diff --git a/python/ingestion/bjs_utils.py b/python/ingestion/bjs_utils.py
index faba5832f0..6d4b3fa506 100644
--- a/python/ingestion/bjs_utils.py
+++ b/python/ingestion/bjs_utils.py
@@ -26,17 +26,17 @@
 
 # maps BJS labels to our race CODES
 BJS_RACE_GROUPS_TO_STANDARD = {
-    'White': Race.WHITE_NH,
-    'Black': Race.BLACK_NH,
-    'Hispanic': Race.HISP,
-    'American Indian/Alaska Native': Race.AIAN_NH,
-    'Asian': Race.ASIAN_NH,
-    'Native Hawaiian/Other Pacific Islander': Race.NHPI_NH,
-    'Two or more races': Race.MULTI_NH,
-    'Other': Race.OTHER_STANDARD_NH,
-    'Unknown': Race.UNKNOWN,
+    "White": Race.WHITE_NH,
+    "Black": Race.BLACK_NH,
+    "Hispanic": Race.HISP,
+    "American Indian/Alaska Native": Race.AIAN_NH,
+    "Asian": Race.ASIAN_NH,
+    "Native Hawaiian/Other Pacific Islander": Race.NHPI_NH,
+    "Two or more races": Race.MULTI_NH,
+    "Other": Race.OTHER_STANDARD_NH,
+    "Unknown": Race.UNKNOWN,
     # 'Unknown' + 'Did not report' -> "Unknown"
-    'Total': Race.ALL,
+    "Total": Race.ALL,
 }
 
 STANDARD_RACE_CODES = [race_tuple.value for race_tuple in BJS_RACE_GROUPS_TO_STANDARD.values()]
@@ -136,7 +136,7 @@ def load_tables(zip_url: str, table_crops):
             encoding="ISO-8859-1",
             skiprows=table_crops[file]["header_rows"],
             skipfooter=table_crops[file]["footer_rows"],
-            thousands=',',
+            thousands=",",
             engine="python",
         )
 
@@ -164,7 +164,7 @@ def strip_footnote_refs_from_df(df):
     """
 
     def strip_footnote_refs(cell_value):
-        return re.sub(r'/[a-z].*', "", cell_value) if isinstance(cell_value, str) else cell_value
+        return re.sub(r"/[a-z].*", "", cell_value) if isinstance(cell_value, str) else cell_value
 
     df.columns = [strip_footnote_refs(col_name) for col_name in df.columns]
     df = df.map(strip_footnote_refs)
@@ -191,7 +191,7 @@ def missing_data_to_nan(df):
     symbols_to_null = ["/", "~", "^"]
 
     # TODO: remove after updating to pandas 3
-    with pd.option_context('future.no_silent_downcasting', True):
+    with pd.option_context("future.no_silent_downcasting", True):
         df = df.replace(symbols_to_null, np.nan)
 
     return df
@@ -208,16 +208,16 @@ def set_state_col(df):
         df (Pandas Dataframe): the same dataframe with a "state_name" column added, using existing place columns
     """
 
-    if 'U.S. territory/U.S. commonwealth' in list(df.columns):
-        df[std_col.STATE_NAME_COL] = df['U.S. territory/U.S. commonwealth']
+    if "U.S. territory/U.S. commonwealth" in list(df.columns):
+        df[std_col.STATE_NAME_COL] = df["U.S. territory/U.S. commonwealth"]
         return df
 
-    elif 'Jurisdiction' in list(df.columns):
-        df[std_col.STATE_NAME_COL] = df['Jurisdiction'].combine_first(df["Unnamed: 1"])
+    elif "Jurisdiction" in list(df.columns):
+        df[std_col.STATE_NAME_COL] = df["Jurisdiction"].combine_first(df["Unnamed: 1"])
         return df
 
-    elif 'State' in list(df.columns):
-        df[std_col.STATE_NAME_COL] = df['State'].combine_first(df["Unnamed: 1"])
+    elif "State" in list(df.columns):
+        df[std_col.STATE_NAME_COL] = df["State"].combine_first(df["Unnamed: 1"])
         return df
 
     return df
@@ -240,7 +240,7 @@ def filter_cols(df, demo_type):
         std_col.SEX_COL: BJS_SEX_GROUPS,
     }
     if demo_type not in cols_to_keep.keys():
-        raise ValueError(f'{demo_type} is not a demographic option, must be one of: {list(cols_to_keep.keys())} ')
+        raise ValueError(f"{demo_type} is not a demographic option, must be one of: {list(cols_to_keep.keys())} ")
     df = df[df.columns.intersection([std_col.STATE_NAME_COL, *cols_to_keep[demo_type]])]
     df = df.copy()
     df[df.columns.intersection(cols_to_keep[demo_type])] = df[df.columns.intersection(cols_to_keep[demo_type])].astype(
@@ -287,7 +287,7 @@ def standardize_table_2_df(df):
 
     df = df.rename(
         columns={
-            'Total.1': std_col.ALL_VALUE,
+            "Total.1": std_col.ALL_VALUE,
             "Male": "Male-2019",
             "Female": "Female-2019",
             "Male.1": constants.Sex.MALE,
@@ -313,10 +313,10 @@ def standardize_table_10_df(df):
         df (Pandas Dataframe): a "clean" dataframe ready for manipulation
     """
 
-    df[std_col.AGE_COL] = df['Age'].combine_first(df["Unnamed: 1"])
+    df[std_col.AGE_COL] = df["Age"].combine_first(df["Unnamed: 1"])
 
     # replace all weird characters (specifically EN-DASH –) with normal hyphen
-    df[std_col.AGE_COL] = df[std_col.AGE_COL].apply(lambda datum: re.sub('[^0-9a-zA-Z ]+', '-', datum))
+    df[std_col.AGE_COL] = df[std_col.AGE_COL].apply(lambda datum: re.sub("[^0-9a-zA-Z ]+", "-", datum))
 
     df = df[[std_col.AGE_COL, "Total"]]
 
@@ -342,7 +342,7 @@ def standardize_table_13_df(df):
         df (Pandas Dataframe): a "clean" dataframe ready for manipulation
     """
 
-    df = df.rename(columns={'Total': RAW_PRISON_COL})
+    df = df.rename(columns={"Total": RAW_PRISON_COL})
     df = df[[std_col.STATE_NAME_COL, RAW_PRISON_COL]]
     df = df.replace("U.S. total", constants.US_NAME)
     df[std_col.AGE_COL] = "0-17"
@@ -362,7 +362,7 @@ def standardize_table_23_df(df):
         df (Pandas Dataframe): a "clean" dataframe ready for manipulation
     """
 
-    df = df.rename(columns={'Total': Race.ALL.value})
+    df = df.rename(columns={"Total": Race.ALL.value})
 
     # since American Samoa reports numbers differently,
     # we will use their Custody # instead of the null jurisdiction #
     df[Race.ALL.value] = df[Race.ALL.value].combine_first(df["Total custody population"])
@@ -413,7 +413,7 @@ def standardize_jail_6(df):
 
     df = df.rename(
         columns={
-            'Total inmates in custody': RAW_JAIL_COL,
+            "Total inmates in custody": RAW_JAIL_COL,
             "Total": "18+",
             "Male": "Male 18+",
             "Female": "Female 18+",
@@ -462,7 +462,7 @@ def standardize_jail_7(df):
     df = swap_race_col_names_to_codes(df)
     df = df.rename(
         columns={
-            'Total inmates in custody': Race.ALL.value,
+            "Total inmates in custody": Race.ALL.value,
         }
     )
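The footnote-stripping regex in strip_footnote_refs_from_df above is easy to sanity-check in isolation. A minimal sketch, not part of this patch; the sample cell values are invented:

import re

def strip_footnote_refs(cell_value):
    # BJS tables suffix cells with footnote refs like "/a" or "/b,c"
    return re.sub(r"/[a-z].*", "", cell_value) if isinstance(cell_value, str) else cell_value

assert strip_footnote_refs("Alabama/b") == "Alabama"
assert strip_footnote_refs("Total/a,c") == "Total"
assert strip_footnote_refs(42) == 42  # non-strings pass through unchanged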
total", constants.US_NAME) df[std_col.AGE_COL] = "0-17" @@ -362,7 +362,7 @@ def standardize_table_23_df(df): df (Pandas Dataframe): a "clean" dataframe ready for manipulation """ - df = df.rename(columns={'Total': Race.ALL.value}) + df = df.rename(columns={"Total": Race.ALL.value}) # since American Samoa reports numbers differently, # we will use their Custody # instead of the null jurisdiction # df[Race.ALL.value] = df[Race.ALL.value].combine_first(df["Total custody population"]) @@ -413,7 +413,7 @@ def standardize_jail_6(df): df = df.rename( columns={ - 'Total inmates in custody': RAW_JAIL_COL, + "Total inmates in custody": RAW_JAIL_COL, "Total": "18+", "Male": "Male 18+", "Female": "Female 18+", @@ -462,7 +462,7 @@ def standardize_jail_7(df): df = swap_race_col_names_to_codes(df) df = df.rename( columns={ - 'Total inmates in custody': Race.ALL.value, + "Total inmates in custody": Race.ALL.value, } ) diff --git a/python/ingestion/cdc_wisqars_utils.py b/python/ingestion/cdc_wisqars_utils.py index 986c06d176..74d4a5a0b1 100644 --- a/python/ingestion/cdc_wisqars_utils.py +++ b/python/ingestion/cdc_wisqars_utils.py @@ -36,7 +36,7 @@ WISQARS_CRUDE_RATE = "Crude Rate" WISQARS_POP = "Population" -WISQARS_ALL: WISQARS_DEMO_TYPE = 'all' +WISQARS_ALL: WISQARS_DEMO_TYPE = "all" WISQARS_COLS = [ "Age-Adjusted Rate", @@ -64,10 +64,10 @@ def clean_numeric(val): Takes a single parameter 'val' and returns the cleaned value. """ if isinstance(val, str): - if '**' in val: + if "**" in val: return np.nan - if ',' in val: - return val.replace(',', '') + if "," in val: + return val.replace(",", "") return val @@ -82,7 +82,7 @@ def contains_unknown(x): Returns: bool: True if the input contains the word 'unknown', False otherwise. """ - if isinstance(x, str) and 'unknown' in x.lower(): + if isinstance(x, str) and "unknown" in x.lower(): return True return False @@ -101,7 +101,7 @@ def convert_columns_to_numeric(df: pd.DataFrame, columns_to_convert: List[str]): """ for column in columns_to_convert: df[column] = df[column].apply(clean_numeric) - df[column] = pd.to_numeric(df[column], errors='coerce') + df[column] = pd.to_numeric(df[column], errors="coerce") def generate_cols_map(prefixes: List[WISQARS_VAR_TYPE], suffix: str): @@ -140,34 +140,34 @@ def condense_age_groups(df: pd.DataFrame, col_dicts: List[RATE_CALC_COLS_TYPE]) """ bucket_map = { - ('All',): 'All', - ('Unknown',): 'Unknown', + ("All",): "All", + ("Unknown",): "Unknown", ( - '0-4', - '5-9', - '10-14', - ): '0-14', - ('15-19',): '15-19', - ('20-24',): '20-24', - ('25-29',): '25-29', - ('30-34',): '30-34', + "0-4", + "5-9", + "10-14", + ): "0-14", + ("15-19",): "15-19", + ("20-24",): "20-24", + ("25-29",): "25-29", + ("30-34",): "30-34", ( - '35-39', - '40-44', - ): '35-44', + "35-39", + "40-44", + ): "35-44", ( - '45-49', - '50-54', - '55-59', - '60-64', - ): '45-64', + "45-49", + "50-54", + "55-59", + "60-64", + ): "45-64", ( - '65-69', - '70-74', - '75-79', - '80-84', - '85+', - ): '65+', + "65-69", + "70-74", + "75-79", + "80-84", + "85+", + ): "65+", } het_bucket_dfs = [] @@ -180,12 +180,12 @@ def condense_age_groups(df: pd.DataFrame, col_dicts: List[RATE_CALC_COLS_TYPE]) if len(source_bucket) > 1: # create a list of all count cols - numerator_cols = [col_dict['numerator_col'] for col_dict in col_dicts] - denominator_cols = [col_dict['denominator_col'] for col_dict in col_dicts] + numerator_cols = [col_dict["numerator_col"] for col_dict in col_dicts] + denominator_cols = [col_dict["denominator_col"] for col_dict in col_dicts] count_cols = 
list(set(numerator_cols + denominator_cols)) # aggregate by state and year, summing count cols and dropping source rate cols - agg_map = {count_col: 'sum' for count_col in count_cols} + agg_map = {count_col: "sum" for count_col in count_cols} het_bucket_df = ( het_bucket_df.groupby([std_col.TIME_PERIOD_COL, std_col.STATE_NAME_COL]).agg(agg_map).reset_index() ) diff --git a/python/ingestion/cdc_wonder_utils.py b/python/ingestion/cdc_wonder_utils.py index 435753e31d..69cbf386df 100644 --- a/python/ingestion/cdc_wonder_utils.py +++ b/python/ingestion/cdc_wonder_utils.py @@ -18,10 +18,10 @@ # State column names for different demographic types STATE_CODE_RACE = "States Code" -STATE_CODE_DEFAULT = 'States and Puerto Rico Code' +STATE_CODE_DEFAULT = "States and Puerto Rico Code" -TMP_ALL: CANCER_TYPE_OR_ALL = 'all' -CDC_WONDER_DIR = 'cdc_wonder' +TMP_ALL: CANCER_TYPE_OR_ALL = "all" +CDC_WONDER_DIR = "cdc_wonder" # Cancer conditions based on sex demographic requirements CANCERS_WITH_SEX_DEMOGRAPHIC = ["Colorectal", "Lung"] @@ -37,12 +37,12 @@ DEMOGRAPHIC_TO_STANDARD_BY_COL = { # Age source groups already match needed HET groups std_col.RACE_CATEGORY_ID_COL: { - 'American Indian or Alaska Native': std_col.Race.AIAN_NH.value, - 'Asian or Pacific Islander': std_col.Race.API_NH.value, - 'Hispanic': std_col.Race.HISP.value, - 'White': std_col.Race.WHITE_NH.value, - 'Black or African American': std_col.Race.BLACK_NH.value, - 'Other Races and Unknown combined': std_col.Race.OTHER_NONSTANDARD_NH.value, + "American Indian or Alaska Native": std_col.Race.AIAN_NH.value, + "Asian or Pacific Islander": std_col.Race.API_NH.value, + "Hispanic": std_col.Race.HISP.value, + "White": std_col.Race.WHITE_NH.value, + "Black or African American": std_col.Race.BLACK_NH.value, + "Other Races and Unknown combined": std_col.Race.OTHER_NONSTANDARD_NH.value, }, # Sex source groups already match needed HET groups } @@ -122,15 +122,15 @@ def load_cdc_df_from_data_dir( topic_dfs = [] for condition in conditions: - folder_name = f'CDC_Wonder_{condition}_Cancer' - file_name = f'{folder_name}-{source_type}.csv' + folder_name = f"CDC_Wonder_{condition}_Cancer" + file_name = f"{folder_name}-{source_type}.csv" topic_df = gcs_to_bq_util.load_csv_as_df_from_data_dir( CDC_WONDER_DIR, file_name, subdirectory=folder_name, dtype=DTYPE, - na_values=['Not Applicable'], + na_values=["Not Applicable"], usecols=keep_cols, ) @@ -198,9 +198,9 @@ def standardize_columns( pd.DataFrame: DataFrame with standardized column names """ rename_cols_map: Dict[str, str] = { - COUNT_COL: f'{condition.lower()}_count_{std_col.RAW_SUFFIX}', - POP_COL: f'{condition.lower()}_{std_col.RAW_POP_SUFFIX}', - CRUDE_RATE_COL: f'{condition.lower()}_{std_col.PER_100K_SUFFIX}', + COUNT_COL: f"{condition.lower()}_count_{std_col.RAW_SUFFIX}", + POP_COL: f"{condition.lower()}_{std_col.RAW_POP_SUFFIX}", + CRUDE_RATE_COL: f"{condition.lower()}_{std_col.PER_100K_SUFFIX}", } if geo_level in [STATE_LEVEL, NATIONAL_LEVEL]: @@ -240,7 +240,7 @@ def get_float_cols(time_type: str, conditions: List[str]) -> List[str]: cols.extend( [ f"{cancer_type}_count_{std_col.RAW_SUFFIX}", - f'{cancer_type}_{std_col.POP_PCT_SUFFIX}', + f"{cancer_type}_{std_col.POP_PCT_SUFFIX}", f"{cancer_type}_{std_col.RAW_POP_SUFFIX}", f"{cancer_type}_{std_col.PCT_SHARE_SUFFIX}", ] diff --git a/python/ingestion/census.py b/python/ingestion/census.py index bf97adadb3..1c4981a039 100644 --- a/python/ingestion/census.py +++ b/python/ingestion/census.py @@ -1,3 +1,4 @@ +# pylint: disable=missing-timeout import requests 
# type: ignore import json from ingestion.standardized_columns import ( @@ -10,10 +11,10 @@ def rename_age_bracket(bracket): """Converts ACS age bracket label to standardized bracket format of "a-b", - where a is the lower end of the bracket and b is the upper end, - inclusive. + where a is the lower end of the bracket and b is the upper end, + inclusive. - bracket: ACS age bracket.""" + bracket: ACS age bracket.""" parts = bracket.split() if len(parts) == 3 and parts[0] == "Under": return "0-" + str(int(parts[1]) - 1) @@ -52,10 +53,10 @@ def get_census_params(variable_ids, county_level=False): def get_all_params_for_group(group, county_level=False): """Gets census url params to get all variables for a group. - group: String group ID to get variables for. - county_level: Whether to request at the county or state level.""" - geo = 'county' if county_level else 'state' - return {'get': f'group({group})', 'for': geo} + group: String group ID to get variables for. + county_level: Whether to request at the county or state level.""" + geo = "county" if county_level else "state" + return {"get": f"group({group})", "for": geo} def fetch_acs_variables(base_acs_url, variable_ids, county_level): @@ -66,9 +67,7 @@ def fetch_acs_variables(base_acs_url, variable_ids, county_level): variable_ids: The ids of the variables to request. Automatically includes NAME. county_level: Whether to request at the county level, or the state level.""" - resp2 = requests.get( - base_acs_url, params=get_census_params(variable_ids, county_level) - ) + resp2 = requests.get(base_acs_url, params=get_census_params(variable_ids, county_level)) json_result = resp2.json() json_string = json.dumps(json_result) return json_string @@ -94,9 +93,7 @@ def fetch_acs_group(base_acs_url, group_concept, var_map, num_breakdowns, county has one breakdown while "SEX BY AGE" has two. 
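For reference, the "Under" branch of rename_age_bracket that is visible in the hunk above behaves like this; a self-contained sketch, not part of this patch (the other label formats are handled by code outside this diff):

parts = "Under 5 years".split()
assert parts[0] == "Under" and len(parts) == 3
assert "0-" + str(int(parts[1]) - 1) == "0-4"  # i.e. rename_age_bracket("Under 5 years") -> "0-4"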
diff --git a/python/ingestion/constants.py b/python/ingestion/constants.py
index c37981d06e..f4d7a0d99a 100644
--- a/python/ingestion/constants.py
+++ b/python/ingestion/constants.py
@@ -1,8 +1,8 @@
 HISTORICAL = "historical"
 CURRENT = "current"
 
-BQ_STRING = 'STRING'
-BQ_FLOAT = 'FLOAT64'
+BQ_STRING = "STRING"
+BQ_FLOAT = "FLOAT64"
 
 TERRITORY_FIPS_LIST = ["11", "60", "66", "69", "72", "78"]
 
diff --git a/python/ingestion/dataset_utils.py b/python/ingestion/dataset_utils.py
index cd6a6919f2..d2980eba36 100644
--- a/python/ingestion/dataset_utils.py
+++ b/python/ingestion/dataset_utils.py
@@ -22,7 +22,7 @@
 from ingestion.het_types import TIME_VIEW_TYPE  # pylint: disable=no-name-in-module
 
 INGESTION_DIR = os.path.dirname(os.path.abspath(__file__))
-ACS_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, 'acs_population')
+ACS_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, "acs_population")
 
 # shared dataset utility functions
 
@@ -114,7 +114,7 @@ def scaffold_fips_df(geo_level: Literal["national", "state", "county"]) -> pd.Da
             }
         )
 
-    raise ValueError(f'The provided geo_level: {geo_level} in invalid; it must be `national`, `state`, or `county`.')
+    raise ValueError(f"The provided geo_level: {geo_level} is invalid; it must be `national`, `state`, or `county`.")
 
 
 def generate_pct_share_col_without_unknowns(
@@ -134,9 +134,9 @@ def generate_pct_share_col_without_unknowns(
     all_val: The value representing 'ALL'"""
     all_demo_values = set(df[breakdown_col].to_list())
-    if Race.UNKNOWN.value in all_demo_values or 'Unknown' in all_demo_values:
+    if Race.UNKNOWN.value in all_demo_values or "Unknown" in all_demo_values:
         raise ValueError(
-            ('This dataset contains unknowns, use the `generate_pct_share_col_with_unknowns` function instead')
+            ("This dataset contains unknowns, use the `generate_pct_share_col_with_unknowns` function instead")
         )
 
     return _generate_pct_share_col(df, raw_count_to_pct_share, breakdown_col, all_val)
@@ -174,8 +174,8 @@ def generate_pct_share_col_with_unknowns(
     if len(unknown_df) == 0:
         raise ValueError(
             (
-                'This dataset does not contains unknowns, use the '
-                'generate_pct_share_col_without_unknowns function instead'
+                "This dataset does not contain unknowns, use the "
+                "generate_pct_share_col_without_unknowns function instead"
             )
         )
 
@@ -207,11 +207,11 @@ def _generate_pct_share_col(
     df, raw_count_to_pct_share: dict[str, str], breakdown_col: str, all_val: str
 ):  # pylint: disable=unsubscriptable-object
     def calc_pct_share(record, raw_count_col):
-        return percent_avoid_rounding_to_zero(record[raw_count_col], record[f'{raw_count_col}_all'])
+        return percent_avoid_rounding_to_zero(record[raw_count_col], record[f"{raw_count_col}_all"])
 
     rename_cols = {}
     for raw_count_col in raw_count_to_pct_share.keys():
-        rename_cols[raw_count_col] = f'{raw_count_col}_all'
+        rename_cols[raw_count_col] = f"{raw_count_col}_all"
 
     alls = df.loc[df[breakdown_col] == all_val]
     alls = alls.rename(columns=rename_cols).reset_index(drop=True)
@@ -237,9 +237,9 @@ def calc_pct_share(record, raw_count_col):
     for f in all_splits:
         count = value_counts[f]
         if count != 1:
-            raise ValueError(f'Fips {f} has {count} ALL rows, there should be 1')
+            raise ValueError(f"Fips {f} has {count} ALL rows, there should be 1")
 
-    df = pd.merge(df, alls, how='left', on=on_cols)
+    df = pd.merge(df, alls, how="left", on=on_cols)
 
     for raw_count_col, pct_share_col in raw_count_to_pct_share.items():
         df[pct_share_col] = df.apply(calc_pct_share, axis=1, args=(raw_count_col,))
@@ -250,7 +250,7 @@ def calc_pct_share(record, raw_count_col):
 
 # pylint: disable=unsubscriptable-object
 def generate_pct_share_col_of_summed_alls(
-    df: pd.DataFrame, raw_count_to_pct_share: dict[str, str], demo_col: Literal['age', 'sex', 'race_and_ethnicity']
+    df: pd.DataFrame, raw_count_to_pct_share: dict[str, str], demo_col: Literal["age", "sex", "race_and_ethnicity"]
 ) -> pd.DataFrame:
     """
     Adds a `pct_share` column for each raw_count_to_pct_share item. Rather than using the "All" row's
@@ -280,11 +280,11 @@ def generate_pct_share_col_of_summed_alls(
         sums_df = df[df[demo_col] != ALL_VALUE].groupby(group_by_cols)[raw_col].sum().reset_index()
 
         # Rename the column to avoid conflict when merging
-        sum_raw_col = f'sum_{raw_col}'
+        sum_raw_col = f"sum_{raw_col}"
         sums_df.rename(columns={raw_col: sum_raw_col}, inplace=True)
 
         # Merge the sums back into the original DataFrame
-        df = df.merge(sums_df, on=group_by_cols, how='left')
+        df = df.merge(sums_df, on=group_by_cols, how="left")
 
         # Overwrite the "topic_estimated_total" value where demographic group is "All"
         df.loc[df[demo_col] == ALL_VALUE, raw_col] = df[sum_raw_col]
@@ -433,7 +433,7 @@ def ensure_leading_zeros(df: pd.DataFrame, fips_col_name: str, num_digits: int)
     fips_col_name: string column name containing the values to be padded
     num_digits: how many digits should be present after leading zeros are added
     """
-    df[fips_col_name] = df[fips_col_name].apply(lambda code: (str(code).rjust(num_digits, '0')))
+    df[fips_col_name] = df[fips_col_name].apply(lambda code: (str(code).rjust(num_digits, "0")))
     return df
 
 
@@ -453,8 +453,8 @@ def generate_pct_rel_inequity_col(
     """
 
     # Ensure input columns are float
-    df[pct_share_col] = pd.to_numeric(df[pct_share_col], errors='coerce')
-    df[pct_pop_col] = pd.to_numeric(df[pct_pop_col], errors='coerce')
+    df[pct_share_col] = pd.to_numeric(df[pct_share_col], errors="coerce")
+    df[pct_pop_col] = pd.to_numeric(df[pct_pop_col], errors="coerce")
 
     # Create a mask for valid calculations
     valid_mask = (~df[pct_share_col].isna()) & (~df[pct_pop_col].isna()) & (df[pct_pop_col] != 0)
@@ -520,7 +520,7 @@ def zero_out_pct_rel_inequity(
 
     per_100k_col_names = {}
     for rate_col in rate_to_inequity_col_map.keys():
-        per_100k_col_names[rate_col] = f'{rate_col}_grouped'
+        per_100k_col_names[rate_col] = f"{rate_col}_grouped"
 
     demo_col = std_col.RACE_CATEGORY_ID_COL if demographic == RACE else demographic
     unknown_val = Race.UNKNOWN.value if demographic == RACE else UNKNOWN
@@ -539,7 +539,7 @@ def zero_out_pct_rel_inequity(
     df = pd.merge(df_without_all_unknown, grouped_df, on=geo_cols + [std_col.TIME_PERIOD_COL])
     for rate_col, pct_inequity_col in rate_to_inequity_col_map.items():
-        grouped_col = f'{rate_col}_grouped'
+        grouped_col = f"{rate_col}_grouped"
 
         # set pct_inequity to 0 in a place/time_period if the summed rates are zero
         df.loc[df[grouped_col] == 0, pct_inequity_col] = 0
@@ -627,11 +627,11 @@ def get_topic_primary_col(topic_prefix: str, df: pd.DataFrame) -> str:
         std_col.POP_PCT_SUFFIX,
         std_col.RAW_SUFFIX,
     ]:
-        possible_primary_col = f'{topic_prefix}_{primary_col_suffix}'
+        possible_primary_col = f"{topic_prefix}_{primary_col_suffix}"
         if possible_primary_col in df.columns:
             return possible_primary_col
 
-    raise ValueError(f'Could not find primary column (e.g. rate or pop. share) for topic prefix: {topic_prefix}')
+    raise ValueError(f"Could not find primary column (e.g. rate or pop. share) for topic prefix: {topic_prefix}")
 
 
 # TODO: Remove this in favor of preserve_most_recent_year_rows_per_topic above
@@ -645,13 +645,13 @@ def preserve_only_current_time_period_rows(
     and removes (or optionally keeps) the original string time_period col"""
 
     if time_period_col not in df.columns:
-        raise ValueError(f'df does not contain column: {time_period_col}.')
+        raise ValueError(f"df does not contain column: {time_period_col}.")
 
     # Convert time_period to datetime-like object
-    df['time_period_dt'] = pd.to_datetime(df[time_period_col], errors='coerce', format='%Y-%m')
+    df["time_period_dt"] = pd.to_datetime(df[time_period_col], errors="coerce", format="%Y-%m")
     # For rows that failed to convert (NaT), try again assuming just a year is provided
-    df.loc[df['time_period_dt'].isna(), 'time_period_dt'] = pd.to_datetime(
-        df[time_period_col], format='%Y', errors='coerce'
+    df.loc[df["time_period_dt"].isna(), "time_period_dt"] = pd.to_datetime(
+        df[time_period_col], format="%Y", errors="coerce"
     )
 
     # Filter the DataFrame to keep only the most recent rows
@@ -696,10 +696,10 @@ def combine_race_ethnicity(
 
     # Require std_col.RACE_COL and std_col.ETH_COL
     if std_col.RACE_COL not in df.columns or std_col.ETH_COL not in df.columns:
-        raise ValueError('df must contain columns: std_col.RACE_COL and std_col.ETH_COL')
+        raise ValueError("df must contain columns: std_col.RACE_COL and std_col.ETH_COL")
 
     if unknown_values is None:
-        unknown_values = ['NA', 'Missing', 'Unknown']
+        unknown_values = ["NA", "Missing", "Unknown"]
 
     # Create a copy of the DataFrame to avoid SettingWithCopyWarning
     df = df.copy()
@@ -772,14 +772,14 @@ def get_timeview_df_and_cols(
     - A tuple containing the processed DataFrame and a dict mapping column names needed by BigQuery
     """
 
-    if time_view not in ['current', 'historical']:
+    if time_view not in ["current", "historical"]:
         raise ValueError('time_view must be either "current" or "historical"')
 
     df = df.copy()
 
     # remove unneeded columns
     unwanted_suffixes = (
-        std_col.SUFFIXES_CURRENT_TIME_VIEWS if time_view == 'historical' else std_col.SUFFIXES_HISTORICAL_TIME_VIEWS
+        std_col.SUFFIXES_CURRENT_TIME_VIEWS if time_view == "historical" else std_col.SUFFIXES_HISTORICAL_TIME_VIEWS
     )
 
     for col in df.columns:
@@ -787,7 +787,7 @@ def get_timeview_df_and_cols(
             df.drop(columns=[col], inplace=True)
 
     # remove unneeded rows
-    if time_view == 'current':
+    if time_view == "current":
         df = preserve_most_recent_year_rows_per_topic(df, topic_prefixes)
 
     bq_col_types = build_bq_col_types(df)
@@ -807,8 +807,8 @@ def build_bq_col_types(df: pd.DataFrame) -> Dict[str, str]:
 
 def generate_time_df_with_cols_and_types(
     df: pd.DataFrame,
     numerical_cols_to_keep: List[str],
-    table_type: Literal['current', 'historical'],
-    dem_col: Literal['age', 'race', 'race_and_ethnicity', 'sex'],
+    table_type: Literal["current", "historical"],
+    dem_col: Literal["age", "race", "race_and_ethnicity", "sex"],
 ) -> tuple[pd.DataFrame, Dict[str, str]]:  # pylint: disable=unsubscriptable-object
     """
     Accepts a DataFrame along with list of column names for either current or
@@ -893,7 +893,7 @@ def generate_estimated_total_col(
     elif std_col.PER_100K_SUFFIX in rate_col:
         conversion_factor = 100_000
     else:
-        raise ValueError(f'{rate_col} must have a suffix of _pct_rate or _per_100k.')
+        raise ValueError(f"{rate_col} must have a suffix of _pct_rate or _per_100k.")
 
     df[raw_col] = df[rate_col] / conversion_factor * df[intersectional_pop_col]
     df[raw_col] = df[raw_col].round()
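A rough pandas sketch, not part of this patch, of the pct-share arithmetic that _generate_pct_share_col above implements: each group's raw count is divided by the "All" row's count for the same place. The column and group names here are invented samples:

import pandas as pd

df = pd.DataFrame(
    {
        "state_fips": ["01", "01", "01"],
        "sex": ["All", "Female", "Male"],
        "condition_count": [100.0, 60.0, 40.0],
    }
)
# look up each place's "All" count, then express every row as a percent of it
all_counts = df.loc[df["sex"] == "All"].set_index("state_fips")["condition_count"]
df["condition_pct_share"] = round(df["condition_count"] / df["state_fips"].map(all_counts) * 100, 1)
# -> 100.0 (All), 60.0 (Female), 40.0 (Male)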
"current": df = preserve_most_recent_year_rows_per_topic(df, topic_prefixes) bq_col_types = build_bq_col_types(df) @@ -807,8 +807,8 @@ def build_bq_col_types(df: pd.DataFrame) -> Dict[str, str]: def generate_time_df_with_cols_and_types( df: pd.DataFrame, numerical_cols_to_keep: List[str], - table_type: Literal['current', 'historical'], - dem_col: Literal['age', 'race', 'race_and_ethnicity', 'sex'], + table_type: Literal["current", "historical"], + dem_col: Literal["age", "race", "race_and_ethnicity", "sex"], ) -> tuple[pd.DataFrame, Dict[str, str]]: # pylint: disable=unsubscriptable-object """ Accepts a DataFrame along with list of column names for either current or @@ -893,7 +893,7 @@ def generate_estimated_total_col( elif std_col.PER_100K_SUFFIX in rate_col: conversion_factor = 100_000 else: - raise ValueError(f'{rate_col} must have a suffix of _pct_rate or _per_100k.') + raise ValueError(f"{rate_col} must have a suffix of _pct_rate or _per_100k.") df[raw_col] = df[rate_col] / conversion_factor * df[intersectional_pop_col] df[raw_col] = df[raw_col].round() diff --git a/python/ingestion/gcs_to_bq_util.py b/python/ingestion/gcs_to_bq_util.py index a8e5feb4cb..7383ee109d 100644 --- a/python/ingestion/gcs_to_bq_util.py +++ b/python/ingestion/gcs_to_bq_util.py @@ -15,7 +15,7 @@ ) # pylint: disable=no-name-in-module -DATA_DIR = os.path.join(os.sep, 'app', 'data') +DATA_DIR = os.path.join(os.sep, "app", "data") def __convert_frame_to_json(frame): @@ -23,7 +23,7 @@ def __convert_frame_to_json(frame): # Repeated fields are not supported with bigquery.Client.load_table_from_dataframe() # (See https://github.com/googleapis/python-bigquery/issues/19). We have to # use load_table_from_json as a workaround. - result = frame.to_json(orient='records') + result = frame.to_json(orient="records") json_data = json.loads(result) return json_data @@ -117,10 +117,10 @@ def get_schema(frame, column_types, col_modes): input_cols = column_types.keys() if len(input_cols) != len(frame.columns) or set(input_cols) != set(frame.columns): - raise ValueError('Column types did not match frame columns') + raise ValueError("Column types did not match frame columns") def create_field(col): - return bigquery.SchemaField(col, column_types[col], mode=(col_modes[col] if col in col_modes else 'NULLABLE')) + return bigquery.SchemaField(col, column_types[col], mode=(col_modes[col] if col in col_modes else "NULLABLE")) return list(map(create_field, column_types.keys())) @@ -144,12 +144,12 @@ def values_json_to_df(values_json, dtype=None) -> pd.DataFrame: NOTE: To test without needing a real json file, wrap the json string in StringIO() """ - frame = pd.DataFrame(pd.read_json(values_json, orient='values', dtype=dtype)) + frame = pd.DataFrame(pd.read_json(values_json, orient="values", dtype=dtype)) new_column_names = dict(frame.iloc[0]) frame.rename(columns=new_column_names, inplace=True) # pylint: disable=E1101 frame.drop([0], inplace=True) # pylint: disable=E1101 # Fill None values with np.nan TODO: remove after updating to pandas 3 - with pd.option_context('future.no_silent_downcasting', True): + with pd.option_context("future.no_silent_downcasting", True): frame = frame.fillna(np.nan) # pylint: disable=E1101 return frame @@ -162,7 +162,7 @@ def load_values_blob_as_df(blob): blob: google.cloud.storage.blob.Blob object""" json_string = blob.download_as_string() - json_string = json_string.decode('utf-8') + json_string = json_string.decode("utf-8") return values_json_to_df(StringIO(json_string)) @@ -227,7 +227,7 @@ def 
load_csv_as_df_from_web(url, dtype=None, params=None, encoding=None) -> pd.D url: url to download the csv file from""" - url = requests.Request('GET', url, params=params).prepare().url + url = requests.Request("GET", url, params=params).prepare().url return pd.read_csv(url, dtype=dtype, encoding=encoding) @@ -255,7 +255,7 @@ def load_xlsx_as_df_from_data_dir( def load_csv_as_df_from_data_dir( - directory, filename, subdirectory='', dtype=None, skiprows=None, na_values=None, thousands=None, usecols=None + directory, filename, subdirectory="", dtype=None, skiprows=None, na_values=None, thousands=None, usecols=None ) -> pd.DataFrame: """Loads csv data from /data/{directory}/{filename} into a DataFrame. Expects the data to be in csv format, with the first row as the column @@ -281,13 +281,13 @@ def load_csv_as_df_from_data_dir( def load_tsv_as_df_from_data_dir( directory, filename, - subdirectory='', + subdirectory="", dtype=None, skiprows=None, na_values=None, thousands=None, usecols=None, - delimiter='\t', + delimiter="\t", skipinitialspace=True, ) -> pd.DataFrame: """Loads tsv data from /data/{directory}/{filename} into a DataFrame. @@ -357,7 +357,7 @@ def load_json_as_df_from_data_dir_based_on_key_list(directory, filename, key_lis """ file_path = os.path.join(DATA_DIR, directory, filename) - with open(file_path, 'r', encoding='utf-8') as data_file: + with open(file_path, "r", encoding="utf-8") as data_file: data = json.loads(data_file.read()) df = pd.json_normalize(data, key_list) return df @@ -368,7 +368,7 @@ def load_json_as_df_from_web(url, dtype=None, params=None) -> pd.DataFrame: url: url to download the json from """ - url = requests.Request('GET', url, params=params).prepare().url + url = requests.Request("GET", url, params=params).prepare().url return pd.read_json(url, dtype=dtype) @@ -390,7 +390,7 @@ def load_public_dataset_from_bigquery_as_df(dataset, table_name, dtype=None) -> dataset: The BigQuery dataset to write to. 
table_name: The BigQuery table to write to.""" client = bigquery.Client() - table_id = f'bigquery-public-data.{dataset}.{table_name}' + table_id = f"bigquery-public-data.{dataset}.{table_name}" return client.list_rows(table_id).to_dataframe(dtypes=dtype) @@ -417,11 +417,11 @@ def load_values_as_json(gcs_bucket, filename): client = storage.Client() bucket = client.get_bucket(gcs_bucket) blob = bucket.blob(filename) - return json.loads(blob.download_as_bytes().decode('utf-8')) + return json.loads(blob.download_as_bytes().decode("utf-8")) def local_file_path(filename): - return f'/tmp/{filename}' + return f"/tmp/{filename}" def list_bucket_files(bucket_name: str) -> list: diff --git a/python/ingestion/github_util.py b/python/ingestion/github_util.py index d69effab1a..10df352f04 100644 --- a/python/ingestion/github_util.py +++ b/python/ingestion/github_util.py @@ -15,7 +15,7 @@ def decode_json_from_url_into_df(url): url: url to a base64 encoded github file""" r = requests.get(url, timeout=10) jsn = json.loads(r.text) - decoded = base64.b64decode(jsn['content']) + decoded = base64.b64decode(jsn["content"]) return pandas.read_csv(BytesIO(decoded)) @@ -25,5 +25,5 @@ def decode_excel_from_url_into_df(url): url: url to a base64 encoded github file""" r = requests.get(url, timeout=10) jsn = json.loads(r.text) - decoded = base64.b64decode(jsn['content']) + decoded = base64.b64decode(jsn["content"]) return pandas.read_excel(BytesIO(decoded)) diff --git a/python/ingestion/graphql_ahr_utils.py b/python/ingestion/graphql_ahr_utils.py index d38b0e5589..4d7e2856d7 100644 --- a/python/ingestion/graphql_ahr_utils.py +++ b/python/ingestion/graphql_ahr_utils.py @@ -10,33 +10,33 @@ ahr_api_key = os.getenv("AHR_API_KEY") # Constants -AHR_US = 'ALL' -GRAPHQL_URL = 'https://api.americashealthrankings.org/graphql' -GRAPHQL_HEADERS = {'Content-Type': 'application/json', 'x-api-key': ahr_api_key} +AHR_US = "ALL" +GRAPHQL_URL = "https://api.americashealthrankings.org/graphql" +GRAPHQL_HEADERS = {"Content-Type": "application/json", "x-api-key": ahr_api_key} AHR_MEASURES_TO_RATES_MAP_18PLUS = { - 'Asthma': 'asthma_per_100k', - 'Avoided Care Due to Cost': 'avoided_care_pct_rate', - 'Cardiovascular Diseases': 'cardiovascular_diseases_per_100k', - 'Chronic Kidney Disease': 'chronic_kidney_disease_per_100k', - 'Chronic Obstructive Pulmonary Disease': 'copd_per_100k', - 'Depression': 'depression_per_100k', - 'Diabetes': 'diabetes_per_100k', - 'Excessive Drinking': 'excessive_drinking_per_100k', - 'Frequent Mental Distress': 'frequent_mental_distress_per_100k', - 'Non-Medical Drug Use': 'non_medical_drug_use_per_100k', + "Asthma": "asthma_per_100k", + "Avoided Care Due to Cost": "avoided_care_pct_rate", + "Cardiovascular Diseases": "cardiovascular_diseases_per_100k", + "Chronic Kidney Disease": "chronic_kidney_disease_per_100k", + "Chronic Obstructive Pulmonary Disease": "copd_per_100k", + "Depression": "depression_per_100k", + "Diabetes": "diabetes_per_100k", + "Excessive Drinking": "excessive_drinking_per_100k", + "Frequent Mental Distress": "frequent_mental_distress_per_100k", + "Non-Medical Drug Use": "non_medical_drug_use_per_100k", } AHR_MEASURES_TO_RATES_MAP_ALL_AGES = { - 'Suicide': 'suicide_per_100k', + "Suicide": "suicide_per_100k", } AHR_MEASURES_TO_RATES_MAP_CITIZENS_18PLUS = { - 'Voter Participation (Presidential)': 'voter_participation_pct_rate', + "Voter Participation (Presidential)": "voter_participation_pct_rate", } AHR_MEASURES_TO_RATES_MAP_MEDICARE_18PLUS = { - 'Preventable Hospitalizations': 
'preventable_hospitalizations_per_100k', + "Preventable Hospitalizations": "preventable_hospitalizations_per_100k", } AHR_BASE_MEASURES_TO_RATES_MAP = { @@ -63,9 +63,9 @@ # Utility functions def load_ahr_measures_json(category: TOPIC_CATEGORY_TYPE): current_dir = os.path.dirname(os.path.abspath(__file__)) - config_file_path = os.path.join(current_dir, 'ahr_config', f'graphql_ahr_measure_ids_{category}.json') + config_file_path = os.path.join(current_dir, "ahr_config", f"graphql_ahr_measure_ids_{category}.json") - with open(config_file_path, 'r') as file: + with open(config_file_path, "r") as file: return json.load(file) @@ -89,7 +89,7 @@ def get_measure_ids(demographic: str, category: TOPIC_CATEGORY_TYPE, data=None): demographic_measures = data.get(demographic) for measure in demographic_measures: - ids = measure.get('ids') or measure.get('demographics') + ids = measure.get("ids") or measure.get("demographics") if isinstance(ids, dict): # Flatten the dictionary values into a single list ids = [item for sublist in ids.values() for item in sublist] @@ -126,7 +126,7 @@ def fetch_ahr_data_from_graphql(demographic: str, geo_level: str, category: TOPI Returns: a list containing the data retrieved from the API. """ - measure_ids = get_measure_ids('all', category) + get_measure_ids(demographic, category) + measure_ids = get_measure_ids("all", category) + get_measure_ids(demographic, category) results = [] state_filter = '{eq: "ALL"}' if geo_level == NATIONAL_LEVEL else '{neq: "ALL"}' @@ -149,10 +149,10 @@ def fetch_ahr_data_from_graphql(demographic: str, geo_level: str, category: TOPI }} """ - response = requests.post(GRAPHQL_URL, json={'query': graphql_query}, headers=GRAPHQL_HEADERS, timeout=300) + response = requests.post(GRAPHQL_URL, json={"query": graphql_query}, headers=GRAPHQL_HEADERS, timeout=300) if response.status_code == 200: - results.append(response.json().get('data')['measure_A']) + results.append(response.json().get("data")["measure_A"]) else: print(f"Query failed to run with a {response.status_code} for metricId: {measure_id}") @@ -178,18 +178,18 @@ def graphql_response_to_dataframe(response_data): flattened_data = [] for dataset in response_data: - for row in dataset['data']: - time_period = row['endDate'][:4] - measure = row['measure']['name'] - state_postal = row['state'] - value = row['value'] + for row in dataset["data"]: + time_period = row["endDate"][:4] + measure = row["measure"]["name"] + state_postal = row["state"] + value = row["value"] flattened_data.append( { std_col.TIME_PERIOD_COL: time_period, - 'measure': measure, + "measure": measure, std_col.STATE_POSTAL_COL: state_postal, - 'value': value, + "value": value, } ) diff --git a/python/ingestion/het_types.py b/python/ingestion/het_types.py index e2cc01d501..11c4ba1aa9 100644 --- a/python/ingestion/het_types.py +++ b/python/ingestion/het_types.py @@ -2,23 +2,23 @@ from typing_extensions import TypeAlias COMPREHENSIVE_DEMOGRAPHIC_TYPE: TypeAlias = Literal[ - 'sex', - 'age', - 'race', - 'race_and_ethnicity', - 'lis', - 'eligibility', - 'insurance_status', - 'education', - 'income', - 'all', - 'black_women', - 'urbanicity', - 'black_women_by_age', - 'black_men_by_age', - 'black_men_by_urbanicity', - 'youth_by_race_and_ethnicity', - 'alls', + "sex", + "age", + "race", + "race_and_ethnicity", + "lis", + "eligibility", + "insurance_status", + "education", + "income", + "all", + "black_women", + "urbanicity", + "black_women_by_age", + "black_men_by_age", + "black_men_by_urbanicity", + "youth_by_race_and_ethnicity", + "alls", 
] @@ -31,30 +31,30 @@ def create_subset_type(*options): # Define type aliases explicitly -SEX_RACE_AGE_TYPE: TypeAlias = Literal['sex', 'age', 'race'] -SEX_RACE_ETH_AGE_TYPE: TypeAlias = Literal['sex', 'age', 'race_and_ethnicity'] -DEMOGRAPHIC_TYPE: TypeAlias = Literal['sex', 'age', 'race', 'race_and_ethnicity'] +SEX_RACE_AGE_TYPE: TypeAlias = Literal["sex", "age", "race"] +SEX_RACE_ETH_AGE_TYPE: TypeAlias = Literal["sex", "age", "race_and_ethnicity"] +DEMOGRAPHIC_TYPE: TypeAlias = Literal["sex", "age", "race", "race_and_ethnicity"] PHRMA_BREAKDOWN_TYPE: TypeAlias = Literal[ - 'age', 'sex', 'race_and_ethnicity', 'lis', 'eligibility', 'insurance_status', 'education', 'income' + "age", "sex", "race_and_ethnicity", "lis", "eligibility", "insurance_status", "education", "income" ] PHRMA_BREAKDOWN_TYPE_OR_ALL: TypeAlias = Literal[ - 'age', 'sex', 'race_and_ethnicity', 'lis', 'eligibility', 'insurance_status', 'education', 'income', 'all' + "age", "sex", "race_and_ethnicity", "lis", "eligibility", "insurance_status", "education", "income", "all" ] -HIV_BREAKDOWN_TYPE: TypeAlias = Literal['age', 'sex', 'race', 'race_and_ethnicity', 'black_women'] -WISQARS_DEMO_TYPE: TypeAlias = Literal['sex', 'age', 'race_and_ethnicity', 'urbanicity', 'all'] +HIV_BREAKDOWN_TYPE: TypeAlias = Literal["age", "sex", "race", "race_and_ethnicity", "black_women"] +WISQARS_DEMO_TYPE: TypeAlias = Literal["sex", "age", "race_and_ethnicity", "urbanicity", "all"] GEO_TYPE = Literal["county", "state", "national"] -CANCER_TYPE_OR_ALL = Literal['age', 'race', 'race_and_ethnicity', 'sex', 'all'] +CANCER_TYPE_OR_ALL = Literal["age", "race", "race_and_ethnicity", "sex", "all"] -TIME_VIEW_TYPE = Literal['historical', 'current'] +TIME_VIEW_TYPE = Literal["historical", "current"] TOPIC_CATEGORY_TYPE = Literal[ - 'non-behavioral_health', # TODO: delete this once AHR is split across all categories properly - 'all', - 'behavioral_health', + "non-behavioral_health", # TODO: delete this once AHR is split across all categories properly + "all", + "behavioral_health", ] PHRMA_DATASET_TYPE = Literal["brfss", "medicare"] diff --git a/python/ingestion/local_pipeline_utils.py b/python/ingestion/local_pipeline_utils.py index 8ea5172333..d419095961 100644 --- a/python/ingestion/local_pipeline_utils.py +++ b/python/ingestion/local_pipeline_utils.py @@ -1,12 +1,12 @@ import os import pandas as pd -DATA_DIR = os.path.abspath('data') -FRONTEND_TMP_DIR = os.path.abspath('frontend/public/tmp') +DATA_DIR = os.path.abspath("data") +FRONTEND_TMP_DIR = os.path.abspath("frontend/public/tmp") def load_csv_as_df_from_data_dir( - directory, filename, subdirectory='', dtype=None, skiprows=None, na_values=None, thousands=None, usecols=None + directory, filename, subdirectory="", dtype=None, skiprows=None, na_values=None, thousands=None, usecols=None ) -> pd.DataFrame: file_path = os.path.join(DATA_DIR, directory, subdirectory, filename) @@ -17,4 +17,4 @@ def load_csv_as_df_from_data_dir( def write_df_as_json_to_frontend_tmp(df: pd.DataFrame, filename: str): file_path = os.path.join(FRONTEND_TMP_DIR, filename) - df.to_json(f'{file_path}.json', orient='records', date_format='iso', date_unit='s') + df.to_json(f"{file_path}.json", orient="records", date_format="iso", date_unit="s") diff --git a/python/ingestion/merge_utils.py b/python/ingestion/merge_utils.py index 5206758f3b..ca9c03ba1b 100644 --- a/python/ingestion/merge_utils.py +++ b/python/ingestion/merge_utils.py @@ -5,19 +5,19 @@ from typing import Literal, List, Union, Type, Optional, Tuple, Dict import os 
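A minimal sketch, not part of this patch, of how the Literal aliases in het_types.py are consumed; `table_for` is an invented example function, not a helper from this repo:

from typing import Literal
from typing_extensions import TypeAlias

DEMOGRAPHIC_TYPE: TypeAlias = Literal["sex", "age", "race", "race_and_ethnicity"]

def table_for(demo: DEMOGRAPHIC_TYPE) -> str:
    # a static checker such as mypy restricts callers to the listed strings
    return f"by_{demo}_state"

table_for("age")  # ok
# table_for("gender")  # would be flagged by mypy: not a valid DEMOGRAPHIC_TYPE value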
diff --git a/python/ingestion/merge_utils.py b/python/ingestion/merge_utils.py
index 5206758f3b..ca9c03ba1b 100644
--- a/python/ingestion/merge_utils.py
+++ b/python/ingestion/merge_utils.py
@@ -5,19 +5,19 @@
 from typing import Literal, List, Union, Type, Optional, Tuple, Dict
 import os
 
-ACS_EARLIEST_YEAR = '2009'
-ACS_CURRENT_YEAR = '2022'
-DECIA_CUTOFF_YEAR = '2016'
+ACS_EARLIEST_YEAR = "2009"
+ACS_CURRENT_YEAR = "2022"
+DECIA_CUTOFF_YEAR = "2016"
 
 # This works for both local runs and also in a container within the /app directory
 INGESTION_DIR = os.path.dirname(os.path.abspath(__file__))
-ACS_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, 'acs_population')
-DECIA_2010_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, 'decia_2010_territory_population')
-DECIA_2020_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, 'decia_2020_territory_population')
-FIPS_CODES_DIR = os.path.join(INGESTION_DIR, 'fips_codes')
-COUNTY_LEVEL_FIPS_CSV = os.path.join(FIPS_CODES_DIR, 'county_level_fips.csv')
-STATE_LEVEL_FIPS_CSV = os.path.join(FIPS_CODES_DIR, 'state_level_fips.csv')
+ACS_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, "acs_population")
+DECIA_2010_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, "decia_2010_territory_population")
+DECIA_2020_MERGE_DATA_DIR = os.path.join(INGESTION_DIR, "decia_2020_territory_population")
+FIPS_CODES_DIR = os.path.join(INGESTION_DIR, "fips_codes")
+COUNTY_LEVEL_FIPS_CSV = os.path.join(FIPS_CODES_DIR, "county_level_fips.csv")
+STATE_LEVEL_FIPS_CSV = os.path.join(FIPS_CODES_DIR, "state_level_fips.csv")
 
 
 def merge_county_names(df: pd.DataFrame) -> pd.DataFrame:
@@ -33,8 +33,8 @@ def merge_county_names(df: pd.DataFrame) -> pd.DataFrame:
 
     if std_col.COUNTY_FIPS_COL not in df.columns:
         raise ValueError(
-            'df must be county-level with `county_fips` col of 5 digit FIPS strings.'
-            + f'This dataframe only contains these columns: {list(df.columns)}'
+            "df must be county-level with `county_fips` col of 5 digit FIPS strings. "
+            + f"This dataframe only contains these columns: {list(df.columns)}"
         )
 
     county_level_fips_df = pd.read_csv(COUNTY_LEVEL_FIPS_CSV, dtype=str)
@@ -42,7 +42,7 @@ def merge_county_names(df: pd.DataFrame) -> pd.DataFrame:
     if std_col.COUNTY_NAME_COL in df.columns:
         df = df.drop(columns=std_col.COUNTY_NAME_COL)
 
-    df = pd.merge(df, county_level_fips_df, how='left', on=std_col.COUNTY_FIPS_COL).reset_index(drop=True)
+    df = pd.merge(df, county_level_fips_df, how="left", on=std_col.COUNTY_FIPS_COL).reset_index(drop=True)
 
     return df
 
@@ -67,11 +67,11 @@ def merge_state_ids(df, keep_postal=False):
         and std_col.STATE_FIPS_COL not in df.columns
     ):
         raise ValueError(
-            'Dataframe must be a state-level table '
-            + 'with at least one of the following columns: '
-            + '`state_name`, `state_fips` (2 digit FIPS strings), '
-            + ' or `state_postal` containing 2 digit FIPS strings.'
-            + f'This dataframe only contains these columns: {list(df.columns)}'
+            "Dataframe must be a state-level table "
+            + "with at least one of the following columns: "
+            + "`state_name`, `state_fips` (2 digit FIPS strings), "
+            + " or `state_postal` containing 2-letter postal abbreviations. "
+            + f"This dataframe only contains these columns: {list(df.columns)}"
         )
 
     state_level_fips_df = pd.read_csv(STATE_LEVEL_FIPS_CSV, dtype=str)
@@ -79,9 +79,9 @@ def merge_state_ids(df, keep_postal=False):
     united_states_fips = pd.DataFrame(
         [
             {
-                'state_fips_code': US_FIPS,
-                'state_name': US_NAME,
-                'state_postal_abbreviation': US_ABBR,
+                "state_fips_code": US_FIPS,
+                "state_name": US_NAME,
+                "state_postal_abbreviation": US_ABBR,
             }
         ]
     )
 
     unknown_fips = pd.DataFrame(
         [
             {
-                'state_fips_code': 'Unknown',
-                'state_name': 'Unknown',
-                'state_postal_abbreviation': 'Unknown',
+                "state_fips_code": "Unknown",
+                "state_name": "Unknown",
+                "state_postal_abbreviation": "Unknown",
             }
         ]
     )
 
-    state_level_fips_df = state_level_fips_df[['state_fips_code', 'state_name', 'state_postal_abbreviation']]
+    state_level_fips_df = state_level_fips_df[["state_fips_code", "state_name", "state_postal_abbreviation"]]
     state_level_fips_df = pd.concat([state_level_fips_df, united_states_fips, unknown_fips])
 
     state_level_fips_df = state_level_fips_df.rename(
         columns={
-            'state_fips_code': std_col.STATE_FIPS_COL,
-            'state_postal_abbreviation': std_col.STATE_POSTAL_COL,
+            "state_fips_code": std_col.STATE_FIPS_COL,
+            "state_postal_abbreviation": std_col.STATE_POSTAL_COL,
         }
     ).reset_index(drop=True)
 
@@ -112,7 +112,7 @@ def merge_state_ids(df, keep_postal=False):
     if std_col.STATE_FIPS_COL in df.columns:
         merge_col = std_col.STATE_FIPS_COL
 
-    df = pd.merge(df, state_level_fips_df, how='left', on=merge_col).reset_index(drop=True)
+    df = pd.merge(df, state_level_fips_df, how="left", on=merge_col).reset_index(drop=True)
 
     if (not keep_postal) and (std_col.STATE_POSTAL_COL in df.columns):
         df = df.drop(columns=std_col.STATE_POSTAL_COL)
@@ -120,7 +120,7 @@ def merge_state_ids(df, keep_postal=False):
     return df
 
 
-def merge_pop_numbers(df, demo: Literal['age', 'sex', 'race'], loc: Literal['county', 'state', 'national']):
+def merge_pop_numbers(df, demo: Literal["age", "sex", "race"], loc: Literal["county", "state", "national"]):
     """Merges the corresponding `population` and `population_pct` column into the given df
 
     df: a pandas df with demographic column and a `state_fips` column
@@ -132,8 +132,8 @@ def merge_pop_numbers(df, demo: Literal['age', 'sex', 'race'], loc: Literal['cou
 
 def merge_yearly_pop_numbers(
     df: pd.DataFrame,
-    demo: Literal['age', 'race', 'sex'],
-    geo_level: Literal['county', 'state', 'national'],
+    demo: Literal["age", "race", "sex"],
+    geo_level: Literal["county", "state", "national"],
 ) -> pd.DataFrame:
     """Merges multiple years of population data onto incoming df
     that contains a `time_period` col of 4 digit string year values
@@ -200,7 +200,7 @@ def merge_yearly_pop_numbers(
     return df
 
 
-def merge_multiple_pop_cols(df: pd.DataFrame, demo: Literal['age', 'race', 'sex'], condition_cols: List[str]):
+def merge_multiple_pop_cols(df: pd.DataFrame, demo: Literal["age", "race", "sex"], condition_cols: List[str]):
     """Merges the population of each state into a column for each condition in `condition_cols`.
     If a condition is NaN for that state the population gets counted as zero.
@@ -222,9 +222,9 @@ def _merge_pop(df, demo, loc, on_time_period: Optional[bool] = None):
     on_col_map = {
-        'age': std_col.AGE_COL,
-        'race': std_col.RACE_CATEGORY_ID_COL,
-        'sex': std_col.SEX_COL,
+        "age": std_col.AGE_COL,
+        "race": std_col.RACE_CATEGORY_ID_COL,
+        "sex": std_col.SEX_COL,
     }
 
     pop_dtype = {
@@ -237,17 +237,17 @@ def _merge_pop(df, demo, loc, on_time_period: Optional[bool] = None):
         pop_dtype[std_col.COUNTY_FIPS_COL] = str
 
     if demo not in on_col_map:
-        raise ValueError(f'{demo} not a demographic option, must be one of: {list(on_col_map.keys())}')
+        raise ValueError(f"{demo} not a demographic option, must be one of: {list(on_col_map.keys())}")
 
-    pop_table_name = f'by_{demo}_{loc}'
+    pop_table_name = f"by_{demo}_{loc}"
 
-    print(f'\nMerging real ACS population from python/ingestion/acs_population/{pop_table_name}')
+    print(f"\nMerging real ACS population from python/ingestion/acs_population/{pop_table_name}")
 
     if on_time_period:
         pop_table_name += "_time_series"
         pop_dtype[std_col.TIME_PERIOD_COL] = str
 
-    pop_file = os.path.join(ACS_MERGE_DATA_DIR, f'{pop_table_name}.csv')
+    pop_file = os.path.join(ACS_MERGE_DATA_DIR, f"{pop_table_name}.csv")
     pop_df = pd.read_csv(pop_file, dtype=pop_dtype)
 
     needed_cols = [on_col_map[demo], std_col.POPULATION_COL, std_col.POPULATION_PCT_COL]
@@ -266,7 +266,7 @@ def _merge_pop(df, demo, loc, on_time_period: Optional[bool] = None):
     # from DECIA_2020 (VI, GU, AS, MP)
     if loc != NATIONAL_LEVEL:
         verbose_demo = std_col.RACE_OR_HISPANIC_COL if demo == std_col.RACE_COL else demo
-        pop_terr_table_name = f'by_{verbose_demo}_territory_{loc}_level'
+        pop_terr_table_name = f"by_{verbose_demo}_territory_{loc}_level"
 
         terr_pop_dtype = {
             std_col.STATE_FIPS_COL: str,
@@ -277,7 +277,7 @@ def _merge_pop(df, demo, loc, on_time_period: Optional[bool] = None):
         if loc == COUNTY_LEVEL:
             terr_pop_dtype[std_col.COUNTY_FIPS_COL] = str
 
-        pop_terr_2020_file = os.path.join(DECIA_2020_MERGE_DATA_DIR, f'{pop_terr_table_name}.csv')
+        pop_terr_2020_file = os.path.join(DECIA_2020_MERGE_DATA_DIR, f"{pop_terr_table_name}.csv")
         pop_terr_2020_df = pd.read_csv(pop_terr_2020_file, dtype=terr_pop_dtype)
 
         pop_terr_df = pop_terr_2020_df[needed_cols]
@@ -288,7 +288,7 @@ def _merge_pop(df, demo, loc, on_time_period: Optional[bool] = None):
         pop_terr_2010_file = (
             pop_terr_2020_file
             if loc == COUNTY_LEVEL
-            else os.path.join(DECIA_2010_MERGE_DATA_DIR, f'{pop_terr_table_name}.csv')
+            else os.path.join(DECIA_2010_MERGE_DATA_DIR, f"{pop_terr_table_name}.csv")
         )
         pop_terr_2010_df = pd.read_csv(pop_terr_2010_file, dtype=terr_pop_dtype)
 
@@ -319,15 +319,15 @@ def _merge_pop(df, demo, loc, on_time_period: Optional[bool] = None):
     if on_time_period:
         on_cols.append(std_col.TIME_PERIOD_COL)
 
-    df = pd.merge(df, pop_df, how='left', on=on_cols)
+    df = pd.merge(df, pop_df, how="left", on=on_cols)
 
     return df.reset_index(drop=True)
 
 
 def merge_intersectional_pop(
     df: pd.DataFrame,
-    geo_level: Literal['national', 'state', 'county'],
-    primary_demo_col: Literal['age', 'race_and_ethnicity', 'sex', 'race'],
+    geo_level: Literal["national", "state", "county"],
+    primary_demo_col: Literal["age", "race_and_ethnicity", "sex", "race"],
     race_specific_group: Optional[str] = None,
     age_specific_group: Optional[str] = None,
     sex_specific_group: Optional[str] = None,
@@ -353,13 +353,13 @@ def merge_intersectional_pop(
     """
 
     if primary_demo_col == std_col.RACE_COL:
-        primary_demo_col = 'race_and_ethnicity'
+        primary_demo_col = "race_and_ethnicity"
 
     pop_dtype: Dict[str, Union[Type[float], Type[str]]] = {
         std_col.POPULATION_COL: float,
     }
 
-    geo_file = ''
+    geo_file = ""
 
     if geo_level == COUNTY_LEVEL:
         pop_dtype[std_col.COUNTY_FIPS_COL] = str
@@ -368,22 +368,22 @@ def merge_intersectional_pop(
         pop_dtype[std_col.STATE_FIPS_COL] = str
         geo_file = STATE_LEVEL
 
-    pop_file = os.path.join(ACS_MERGE_DATA_DIR, f'by_sex_age_race_{geo_file}.csv')
+    pop_file = os.path.join(ACS_MERGE_DATA_DIR, f"by_sex_age_race_{geo_file}.csv")
     pop_df = pd.read_csv(pop_file, dtype=pop_dtype)
 
     if geo_level == NATIONAL_LEVEL:
         pop_df = sum_states_to_national(pop_df)
 
     # the primary demographic breakdown can't use a specific group
-    if primary_demo_col == 'race_and_ethnicity' and race_specific_group:
-        raise ValueError('race_specific_group kwarg is not applicable when primary_demo_col is race.')
-    if primary_demo_col == 'age' and age_specific_group:
-        raise ValueError('age_specific_group kwarg is not applicable when primary_demo_col is age.')
-    if primary_demo_col == 'sex' and sex_specific_group:
-        raise ValueError('sex_specific_group kwarg is not applicable when primary_demo_col is sex.')
+    if primary_demo_col == "race_and_ethnicity" and race_specific_group:
+        raise ValueError("race_specific_group kwarg is not applicable when primary_demo_col is race.")
+    if primary_demo_col == "age" and age_specific_group:
+        raise ValueError("age_specific_group kwarg is not applicable when primary_demo_col is age.")
+    if primary_demo_col == "sex" and sex_specific_group:
+        raise ValueError("sex_specific_group kwarg is not applicable when primary_demo_col is sex.")
 
-    if age_specific_group == '18+':
-        pop_df = sum_age_groups(pop_df, '18+')
+    if age_specific_group == "18+":
+        pop_df = sum_age_groups(pop_df, "18+")
 
     specific_group_map = {}
     specific_group_map[std_col.RACE_OR_HISPANIC_COL] = ALL_VALUE if race_specific_group is None else race_specific_group
@@ -393,10 +393,10 @@ def merge_intersectional_pop(
     pop_col = std_col.POPULATION_COL
     for group in specific_group_map.values():
         if group != ALL_VALUE:
-            group = group.replace('+', 'plus')
-            group = group.replace("-", '_')
+            group = group.replace("+", "plus")
+            group = group.replace("-", "_")
             group = group.lower()
-            pop_col = f'{group}_{pop_col}'
+            pop_col = f"{group}_{pop_col}"
 
     pop_df = pop_df.rename(columns={std_col.POPULATION_COL: pop_col})
 
@@ -442,17 +442,17 @@ def merge_intersectional_pop(
     if primary_demo_col == std_col.RACE_OR_HISPANIC_COL:
         # swap in the "_NH" versions of the race_category_id values (e.g. "BLACK" -> "BLACK_NH")
         race_id_replace_map = {
-            'AIAN': 'AIAN_NH',
-            'ASIAN': 'ASIAN_NH',
-            'BLACK': 'BLACK_NH',
-            'NHPI': 'NHPI_NH',
-            'MULTI': 'MULTI_NH',
-            'OTHER_STANDARD': 'OTHER_STANDARD_NH',
+            "AIAN": "AIAN_NH",
+            "ASIAN": "ASIAN_NH",
+            "BLACK": "BLACK_NH",
+            "NHPI": "NHPI_NH",
+            "MULTI": "MULTI_NH",
+            "OTHER_STANDARD": "OTHER_STANDARD_NH",
         }
 
         pop_df[std_col.RACE_CATEGORY_ID_COL] = pop_df[std_col.RACE_CATEGORY_ID_COL].replace(race_id_replace_map)
 
-    df = df.merge(pop_df, on=merge_cols, how='left')
+    df = df.merge(pop_df, on=merge_cols, how="left")
 
     if primary_demo_col == std_col.RACE_OR_HISPANIC_COL:
         std_col.add_race_columns_from_category_id(df)
@@ -487,7 +487,7 @@ def sum_states_to_national(pop_df: pd.DataFrame) -> pd.DataFrame:
     return pop_df
 
 
-def sum_age_groups(pop_df: pd.DataFrame, age_group: Literal['18+']) -> pd.DataFrame:
+def sum_age_groups(pop_df: pd.DataFrame, age_group: Literal["18+"]) -> pd.DataFrame:
     """
     Sums rows of smaller age groups together to generate new rows for target age group
 
@@ -500,32 +500,32 @@ def sum_age_groups(pop_df: pd.DataFrame, age_group: Literal['18+']) -> pd.DataFr
     """
 
     summed_age_groups_map = {
-        '18+': [
-            '18-19',
-            '20-20',
-            '21-21',
-            '22-24',
-            '25-29',
-            '30-34',
-            '35-39',
-            '40-44',
-            '45-49',
-            '50-54',
-            '55-59',
-            '60-61',
-            '62-64',
-            '65-66',
-            '67-69',
-            '70-74',
-            '75-79',
-            '80-84',
-            '85+',
+        "18+": [
+            "18-19",
+            "20-20",
+            "21-21",
+            "22-24",
+            "25-29",
+            "30-34",
+            "35-39",
+            "40-44",
+            "45-49",
+            "50-54",
+            "55-59",
+            "60-61",
+            "62-64",
+            "65-66",
+            "67-69",
+            "70-74",
+            "75-79",
+            "80-84",
+            "85+",
         ],
     }
 
     # throw an error if the user supplies an age group that isn't in the summed_age_groups_map
     if age_group not in summed_age_groups_map:
-        raise ValueError(f'age_group kwarg must be one of {summed_age_groups_map.keys()}')
+        raise ValueError(f"age_group kwarg must be one of {summed_age_groups_map.keys()}")
 
     possible_geo_cols = [
         std_col.STATE_FIPS_COL,
@@ -571,6 +571,6 @@ def merge_dfs_list(df_list: List[pd.DataFrame], merge_cols: List[str]) -> pd.Dat
 
     - A single dataframe containing the merged data.
     """
-    merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_cols, how='outer'), df_list)
+    merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_cols, how="outer"), df_list)
 
     return merged_df
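The reduce-based outer merge in merge_dfs_list above can be pictured with a small self-contained sketch (not part of this patch; the frames and column names are invented samples):

from functools import reduce
import pandas as pd

df_list = [
    pd.DataFrame({"state_fips": ["01"], "topic_a": [1.0]}),
    pd.DataFrame({"state_fips": ["01"], "topic_b": [2.0]}),
    pd.DataFrame({"state_fips": ["02"], "topic_c": [3.0]}),
]
# fold the list into one frame, outer-joining on the shared key columns
merged = reduce(lambda left, right: pd.merge(left, right, on=["state_fips"], how="outer"), df_list)
# -> one row per state_fips, with NaN where a frame had no data for that state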
std_col.Race.BLACK_NH.value, + "Other": std_col.Race.OTHER_NONSTANDARD_NH.value, + "Non-Hispanic White": std_col.Race.WHITE_NH.value, }, std_col.INSURANCE_COL: { "Have some form of insurance": "Insured", @@ -195,15 +195,15 @@ def rename_cols( """Renames columns based on the demo/geo breakdown""" rename_cols_map: Dict[str, str] = { - COUNT_YES: f'{condition}_{COUNT_YES}', - COUNT_TOTAL: f'{condition}_{COUNT_TOTAL}', - ADHERENCE_RATE: f'{condition}_{ADHERENCE_RATE}', - MEDICARE_DISEASE_COUNT: f'{condition}_{MEDICARE_DISEASE_COUNT}', - PER_100K: f'{condition}_{PER_100K}', - COUNT_YES_LOWER: f'{condition}_{COUNT_YES_LOWER}', - COUNT_TOTAL_LOWER: f'{condition}_{COUNT_TOTAL_LOWER}', - ADHERENCE_RATE_LOWER: f'{condition}_{ADHERENCE_RATE_LOWER}', - AGE_ADJ_RATE_LOWER: f'{condition}_{AGE_ADJ_RATE_LOWER}', + COUNT_YES: f"{condition}_{COUNT_YES}", + COUNT_TOTAL: f"{condition}_{COUNT_TOTAL}", + ADHERENCE_RATE: f"{condition}_{ADHERENCE_RATE}", + MEDICARE_DISEASE_COUNT: f"{condition}_{MEDICARE_DISEASE_COUNT}", + PER_100K: f"{condition}_{PER_100K}", + COUNT_YES_LOWER: f"{condition}_{COUNT_YES_LOWER}", + COUNT_TOTAL_LOWER: f"{condition}_{COUNT_TOTAL_LOWER}", + ADHERENCE_RATE_LOWER: f"{condition}_{ADHERENCE_RATE_LOWER}", + AGE_ADJ_RATE_LOWER: f"{condition}_{AGE_ADJ_RATE_LOWER}", } if geo_level == COUNTY_LEVEL: @@ -248,7 +248,7 @@ def rename_cols( return df -DTYPE = {'COUNTY_FIPS': str, 'STATE_FIPS': str} +DTYPE = {"COUNTY_FIPS": str, "STATE_FIPS": str} def load_phrma_df_from_data_dir( @@ -333,11 +333,11 @@ def load_phrma_df_from_data_dir( condition_keep_cols.append(AGE_ADJ_RATE_LOWER) if dataset_type == PHRMA_MEDICARE: - file_name = f'{condition}-{sheet_name}.csv' + file_name = f"{condition}-{sheet_name}.csv" subdirectory = condition else: # cancer - condition_folder = f'MSM_BRFSS {condition} Cancer Screening_2024-08-07' - file_name = f'{condition_folder}-{sheet_name}.csv' + condition_folder = f"MSM_BRFSS {condition} Cancer Screening_2024-08-07" + file_name = f"{condition_folder}-{sheet_name}.csv" subdirectory = condition_folder topic_df = gcs_to_bq_util.load_csv_as_df_from_data_dir( @@ -349,7 +349,7 @@ def load_phrma_df_from_data_dir( usecols=condition_keep_cols, ) - topic_df = topic_df.replace(['\n', '¬¥', '‚Äô'], [' ', "'", "'"], regex=True) + topic_df = topic_df.replace(["\n", "¬¥", "‚Äô"], [" ", "'", "'"], regex=True) if geo_level == NATIONAL_LEVEL: topic_df[STATE_FIPS] = US_FIPS @@ -376,12 +376,12 @@ def get_age_adjusted_ratios(df: pd.DataFrame, conditions: List[str]) -> pd.DataF """Adds columns for age adjusted ratios (comparing each race's rate to the rate for White NH) for each type of cancer screening.""" - _tmp_white_rates_col = 'WHITE_NH_AGE_ADJ_RATE' + _tmp_white_rates_col = "WHITE_NH_AGE_ADJ_RATE" for condition in conditions: - source_age_adj_rate_col = f'{condition}_{AGE_ADJ_RATE_LOWER}' + source_age_adj_rate_col = f"{condition}_{AGE_ADJ_RATE_LOWER}" cancer_type = condition.lower() - het_age_adj_ratio_col = f'{cancer_type}_{SCREENED}_{std_col.RATIO_AGE_ADJUSTED_SUFFIX}' + het_age_adj_ratio_col = f"{cancer_type}_{SCREENED}_{std_col.RATIO_AGE_ADJUSTED_SUFFIX}" # Step 1: Filter the DataFrame to get AGE_ADJ_RATE where RACE_ID is 'WHITE_NH' white_nh_rates = df[df[std_col.RACE_CATEGORY_ID_COL] == std_col.Race.WHITE_NH.value].set_index( diff --git a/python/ingestion/pubsub_publisher.py b/python/ingestion/pubsub_publisher.py index 0f363020cd..6fee208c55 100644 --- a/python/ingestion/pubsub_publisher.py +++ b/python/ingestion/pubsub_publisher.py @@ -19,11 +19,11 @@ def notify_topic(project_id, topic, 
**attrs): # Not sure if anything here is necessary since we can add attributes # directly. For now just adding a message to log. - data = 'Notifying data ingested' - data = data.encode('utf-8') + data = "Notifying data ingested" + data = data.encode("utf-8") future = publisher.publish(topic_path, data, **attrs) try: future.result() except Exception as e: - logging.warning('Error publishing message on topic %s: %s', topic, e) + logging.warning("Error publishing message on topic %s: %s", topic, e) diff --git a/python/ingestion/setup.py b/python/ingestion/setup.py index 7230b82672..948ef468fe 100644 --- a/python/ingestion/setup.py +++ b/python/ingestion/setup.py @@ -1,17 +1,17 @@ from setuptools import setup setup( - name='ingestion', - package_dir={'ingestion': ''}, - packages=['ingestion'], + name="ingestion", + package_dir={"ingestion": ""}, + packages=["ingestion"], include_package_data=True, package_data={ - 'ingestion': [ - 'acs_population/*', - 'ahr_config/*', - 'decia_2010_territory_population/*', - 'decia_2020_territory_population/*', - 'fips_codes/*', + "ingestion": [ + "acs_population/*", + "ahr_config/*", + "decia_2010_territory_population/*", + "decia_2020_territory_population/*", + "fips_codes/*", ] }, ) diff --git a/python/ingestion/standardized_columns.py b/python/ingestion/standardized_columns.py index 9be774e9cd..79a8e89b65 100644 --- a/python/ingestion/standardized_columns.py +++ b/python/ingestion/standardized_columns.py @@ -46,7 +46,7 @@ POPULATION_PCT_COL = "population_pct" SVI = "svi" -TIME_PERIOD_COL = 'time_period' +TIME_PERIOD_COL = "time_period" ALL_VALUE = "All" @@ -68,7 +68,7 @@ PCT_REL_INEQUITY_SUFFIX = "pct_relative_inequity" RAW_SUFFIX = "estimated_total" RAW_POP_SUFFIX = "population_estimated_total" -POP_PCT_SUFFIX = 'population_pct' +POP_PCT_SUFFIX = "population_pct" RATIO_AGE_ADJUSTED_SUFFIX = "ratio_age_adjusted" INDEX_SUFFIX = "index" @@ -99,12 +99,12 @@ COVID_DEATH_RATIO_AGE_ADJUSTED = "death_ratio_age_adjusted" COVID_HOSP_RATIO_AGE_ADJUSTED = "hosp_ratio_age_adjusted" -UNINSURED_PER_100K_COL = 'uninsured_per_100k' -UNINSURED_PCT_SHARE_COL = 'uninsured_pct_share' -UNINSURED_POPULATION_PCT = 'uninsured_population_pct' +UNINSURED_PER_100K_COL = "uninsured_per_100k" +UNINSURED_PCT_SHARE_COL = "uninsured_pct_share" +UNINSURED_POPULATION_PCT = "uninsured_population_pct" -UNINSURED_PREFIX = 'uninsured' -POVERTY_PREFIX = 'poverty' +UNINSURED_PREFIX = "uninsured" +POVERTY_PREFIX = "poverty" ABOVE_POVERTY_COL = "above_poverty_line" BELOW_POVERTY_COL = "below_poverty_line" @@ -137,8 +137,8 @@ VACCINATED_RAW = "vaccinated_estimated_total" VACCINATED_PCT_RATE = "vaccinated_pct_rate" VACCINATED_PCT_SHARE = "vaccinated_pct_share" -VACCINATED_POP_PCT = 'vaccinated_pop_pct' -ACS_VACCINATED_POP_PCT = 'acs_vaccinated_pop_pct' +VACCINATED_POP_PCT = "vaccinated_pop_pct" +ACS_VACCINATED_POP_PCT = "acs_vaccinated_pop_pct" # CAWP @@ -181,29 +181,29 @@ PRISON_PCT_INEQUITY = "prison_pct_relative_inequity" # HIV -BLACK_WOMEN = 'black_women' -HIV_BW_POPULATION_PCT = 'black_women_population_pct' +BLACK_WOMEN = "black_women" +HIV_BW_POPULATION_PCT = "black_women_population_pct" -HIV_POPULATION = 'hiv_population' -HIV_POPULATION_PCT = 'hiv_population_pct' +HIV_POPULATION = "hiv_population" +HIV_POPULATION_PCT = "hiv_population_pct" -HIV_CARE_LINKAGE = 'hiv_care_linkage' -HIV_CARE_POPULATION = 'hiv_care_population' -HIV_CARE_POPULATION_PCT = 'hiv_care_population_pct' -HIV_CARE_PREFIX = 'hiv_care' -HIV_DEATHS_PREFIX = 'hiv_deaths' -HIV_DIAGNOSES_PREFIX = 'hiv_diagnoses' 
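Every constant in this stretch of standardized_columns.py follows the same `{prefix}_{suffix}` naming convention that the `generate_column_name` and `extract_prefix` helpers, re-quoted a little further down in this file's diff, encode. A minimal standalone sketch of that convention; the `SUFFIXES` list and the `ValueError` fallback here are illustrative assumptions, not the module's actual definitions:

```python
# Sketch of the {prefix}_{suffix} column-naming convention used throughout
# standardized_columns.py. SUFFIXES below is illustrative; the real module
# defines its own canonical list, and the real extract_prefix's behavior on
# an unmatched column name is not shown in this patch.
SUFFIXES = ["per_100k", "pct_share", "pct_relative_inequity", "estimated_total"]


def generate_column_name(prefix: str, suffix: str) -> str:
    """Joins a condition prefix and a measurement suffix."""
    return f"{prefix}_{suffix}"


def extract_prefix(col_name: str) -> str:
    """Recovers the condition prefix from a standardized column name."""
    for suffix in SUFFIXES:
        underscore_suffix = f"_{suffix}"
        if col_name.endswith(underscore_suffix):
            return col_name[: -len(underscore_suffix)]
    raise ValueError(f"{col_name} does not end with a known suffix")


assert generate_column_name("hiv_prep", "pct_share") == "hiv_prep_pct_share"
assert extract_prefix("hiv_prep_pct_share") == "hiv_prep"
```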
-HIV_PREP_COVERAGE = 'hiv_prep_coverage' +HIV_CARE_LINKAGE = "hiv_care_linkage" +HIV_CARE_POPULATION = "hiv_care_population" +HIV_CARE_POPULATION_PCT = "hiv_care_population_pct" +HIV_CARE_PREFIX = "hiv_care" +HIV_DEATHS_PREFIX = "hiv_deaths" +HIV_DIAGNOSES_PREFIX = "hiv_diagnoses" +HIV_PREP_COVERAGE = "hiv_prep_coverage" # population of individuals with PrEP indicators -HIV_PREP_POPULATION = 'hiv_prep_population' -HIV_PREP_POPULATION_PCT = 'hiv_prep_population_pct' -HIV_PREP_PREFIX = 'hiv_prep' -HIV_PREVALENCE_PREFIX = 'hiv_prevalence' -HIV_STIGMA_INDEX = 'hiv_stigma_index' -TOTAL_TRANS_MEN = 'total_trans_men' -TOTAL_TRANS_WOMEN = 'total_trans_women' -TOTAL_ADDITIONAL_GENDER = 'total_additional_gender' +HIV_PREP_POPULATION = "hiv_prep_population" +HIV_PREP_POPULATION_PCT = "hiv_prep_population_pct" +HIV_PREP_PREFIX = "hiv_prep" +HIV_PREVALENCE_PREFIX = "hiv_prevalence" +HIV_STIGMA_INDEX = "hiv_stigma_index" +TOTAL_TRANS_MEN = "total_trans_men" +TOTAL_TRANS_WOMEN = "total_trans_women" +TOTAL_ADDITIONAL_GENDER = "total_additional_gender" HIV_DEATH_RATIO_AGE_ADJUSTED = "hiv_deaths_ratio_age_adjusted" @@ -450,14 +450,14 @@ def generate_column_name(prefix, suffix): prefix: A condition name suffix: a type of measurement (pct_share, per_100k)""" - return f'{prefix}_{suffix}' + return f"{prefix}_{suffix}" def extract_prefix(col_name: str) -> str: """Extracts the prefix from a column name that contains one of our standard HET suffixes.""" for suffix in SUFFIXES: - underscore_suffix = f'_{suffix}' + underscore_suffix = f"_{suffix}" if col_name.endswith(underscore_suffix): prefix = col_name[: -len(underscore_suffix)] return prefix diff --git a/python/ingestion/url_file_to_gcs.py b/python/ingestion/url_file_to_gcs.py index c89ee3e1c4..503a1e0682 100644 --- a/python/ingestion/url_file_to_gcs.py +++ b/python/ingestion/url_file_to_gcs.py @@ -10,7 +10,7 @@ def local_file_path(filename): - return f'/tmp/{filename}' + return f"/tmp/{filename}" def url_file_to_gcs(url, url_params, gcs_bucket, dest_filename): @@ -76,7 +76,7 @@ def download_first_url_to_gcs(url_list, gcs_bucket, dest_filename, url_params=No # Download the contents of the URL to a local file new_file_local_path = local_file_path(dest_filename) - with file_from_url, open(new_file_local_path, 'wb') as new_file: + with file_from_url, open(new_file_local_path, "wb") as new_file: new_file.write(file_from_url.content) # Downloads the current file in GCS to a local file @@ -94,7 +94,7 @@ def download_first_url_to_gcs(url_list, gcs_bucket, dest_filename, url_params=No if files_are_diff: # Upload the contents to the bucket bucket.blob(dest_filename).upload_from_filename(new_file_local_path) - print(f'Uploading to Gcs_Bucket: {gcs_bucket}, FileName: {dest_filename}') + print(f"Uploading to Gcs_Bucket: {gcs_bucket}, FileName: {dest_filename}") # Remove local files os.remove(new_file_local_path) os.remove(old_file_local_path) diff --git a/python/run_local_pipelines.py b/python/run_local_pipelines.py index f760182940..f80ae99a92 100644 --- a/python/run_local_pipelines.py +++ b/python/run_local_pipelines.py @@ -1,4 +1,4 @@ from datasources.data_sources import DATA_SOURCES_DICT -source = DATA_SOURCES_DICT['CDC_VACCINATION_NATIONAL'] -source.write_to_bq('', '', write_local_instead_of_bq=True) +source = DATA_SOURCES_DICT["CDC_VACCINATION_NATIONAL"] +source.write_to_bq("", "", write_local_instead_of_bq=True) diff --git a/python/tests/data_server/test_dataset_cache.py b/python/tests/data_server/test_dataset_cache.py index 9d14e78d99..679c7022e0 100644 --- 
a/python/tests/data_server/test_dataset_cache.py +++ b/python/tests/data_server/test_dataset_cache.py @@ -1,104 +1,102 @@ +# pylint: disable=unused-argument from unittest import mock from unittest.mock import call - from textwrap import dedent - from data_server.dataset_cache import DatasetCache -test_data = dedent(""" +test_data = dedent( + """ {"label1":"value1","label2":["value2a","value2b","value2c"],"label3":"value3"} {"label1":"value2","label2":["value3a","value2b","value2c"],"label3":"value6"} {"label1":"value3","label2":["value4a","value2b","value2c"],"label3":"value9"} {"label1":"value4","label2":["value5a","value2b","value2c"],"label3":"value12"} {"label1":"value5","label2":["value6a","value2b","value2c"],"label3":"value15"} {"label1":"value6","label2":["value7a","value2b","value2c"],"label3":"value18"} -""").strip() +""" +).strip() -test_data2 = dedent(""" +test_data2 = dedent( + """ {"county_geoid":"78020","neighbor_geoids":["78020","78030"]} {"county_geoid":"78030","neighbor_geoids":["78020","78030"]} {"county_geoid":"78030","neighbor_geoids":["78020","78030"]} {"county_geoid":"78030","neighbor_geoids":["78020","78030"]} {"county_geoid":"78030","neighbor_geoids":["78020","78030"]} {"county_geoid":"78030","neighbor_geoids":["78020","78030"]} -""").strip() +""" +).strip() def get_test_data(gcs_bucket: str, filename: str): """Returns the contents of filename as a bytes object. Meant to be used to patch gcs_utils.download_blob_as_bytes.""" - if filename == 'test_data': + if filename == "test_data": return test_data - elif filename == 'test_data2': + elif filename == "test_data2": return test_data2 - return '' + return "" -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', - side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetDataset(mock_func: mock.MagicMock): cache = DatasetCache() - data = cache.getDataset('test_bucket', 'test_data') - mock_func.assert_called_once_with('test_bucket', 'test_data') + data = cache.getDataset("test_bucket", "test_data") + mock_func.assert_called_once_with("test_bucket", "test_data") assert data == test_data.strip() -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', - side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetDataset_FromCache(mock_func: mock.MagicMock): # Make the first request, which should incur an API call. cache = DatasetCache() - cache.getDataset('test_bucket', 'test_data') - mock_func.assert_called_once_with('test_bucket', 'test_data') + cache.getDataset("test_bucket", "test_data") + mock_func.assert_called_once_with("test_bucket", "test_data") # Make the second request, which should be served from the cache. - cache.getDataset('test_bucket', 'test_data') + cache.getDataset("test_bucket", "test_data") mock_func.assert_called_once() -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', - side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetDataset_CacheEviction(mock_func: mock.MagicMock): cache = DatasetCache(max_cache_size=1) - data = cache.getDataset('test_bucket', 'test_data') + data = cache.getDataset("test_bucket", "test_data") assert data == test_data # Make a second call which doesn't make an API call. - data = cache.getDataset('test_bucket', 'test_data') + data = cache.getDataset("test_bucket", "test_data") assert data == test_data # Now request a file that is not in the cache. 
It should replace the # existing data. - data = cache.getDataset('test_bucket', 'test_data2') + data = cache.getDataset("test_bucket", "test_data2") assert data == test_data2 - data = cache.getDataset('test_bucket', 'test_data2') + data = cache.getDataset("test_bucket", "test_data2") assert data == test_data2 - data = cache.getDataset('test_bucket', 'test_data') + data = cache.getDataset("test_bucket", "test_data") assert data == test_data assert mock_func.call_count == 3 - mock_func.assert_has_calls([call('test_bucket', 'test_data'), - call('test_bucket', 'test_data2'), - call('test_bucket', 'test_data')]) + mock_func.assert_has_calls( + [call("test_bucket", "test_data"), call("test_bucket", "test_data2"), call("test_bucket", "test_data")] + ) -@mock.patch('data_server.gcs_utils.download_blob_as_bytes', - side_effect=get_test_data) +@mock.patch("data_server.gcs_utils.download_blob_as_bytes", side_effect=get_test_data) def testGetDataset_MultipleEntries(mock_func: mock.MagicMock): cache = DatasetCache() - data = cache.getDataset('test_bucket', 'test_data') + data = cache.getDataset("test_bucket", "test_data") assert data == test_data - data = cache.getDataset('test_bucket', 'test_data2') + data = cache.getDataset("test_bucket", "test_data2") assert data == test_data2 - data = cache.getDataset('test_bucket', 'test_data') + data = cache.getDataset("test_bucket", "test_data") assert data == test_data assert mock_func.call_count == 2 - mock_func.assert_has_calls([call('test_bucket', 'test_data'), - call('test_bucket', 'test_data2')]) + mock_func.assert_has_calls([call("test_bucket", "test_data"), call("test_bucket", "test_data2")]) diff --git a/python/tests/datasources/test_acs_condition.py b/python/tests/datasources/test_acs_condition.py index fe35727dce..9ab017a657 100644 --- a/python/tests/datasources/test_acs_condition.py +++ b/python/tests/datasources/test_acs_condition.py @@ -15,12 +15,12 @@ # Current working directory. 
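The `DatasetCache` class exercised above is not itself part of this patch, but the eviction test (`max_cache_size=1`, with `test_data2` displacing `test_data` and a third `getDataset` call re-fetching it) implies least-recently-used behavior. A hypothetical sketch of that contract using `collections.OrderedDict`; the class name and signature here are assumptions, not the project's actual implementation:

```python
from collections import OrderedDict
from typing import Callable, Tuple


class LruDatasetCache:
    """Hypothetical reimplementation of the behavior the tests above assert."""

    def __init__(self, fetch: Callable[[str, str], bytes], max_cache_size: int = 8):
        # fetch stands in for data_server.gcs_utils.download_blob_as_bytes
        self._fetch = fetch
        self._max = max_cache_size
        self._cache: "OrderedDict[Tuple[str, str], bytes]" = OrderedDict()

    def get_dataset(self, gcs_bucket: str, filename: str) -> bytes:
        key = (gcs_bucket, filename)
        if key in self._cache:
            self._cache.move_to_end(key)  # cache hit: mark most recently used
            return self._cache[key]
        data = self._fetch(gcs_bucket, filename)  # cache miss incurs the API call
        self._cache[key] = data
        if len(self._cache) > self._max:
            self._cache.popitem(last=False)  # evict the least recently used entry
        return data
```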
THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data', 'acs_condition') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "acs_condition") -GOLDEN_BASE_TABLE_NATIONAL_SEX = os.path.join(TEST_DIR, 'golden_data', 'sex_national.csv') -GOLDEN_BASE_TABLE_STATE_SEX = os.path.join(TEST_DIR, 'golden_data', 'sex_state.csv') -GOLDEN_BASE_TABLE_COUNTY_SEX = os.path.join(TEST_DIR, 'golden_data', 'sex_county.csv') -GOLDEN_BASE_TABLE_COUNTY_RACE = os.path.join(TEST_DIR, 'golden_data', 'race_county.csv') +GOLDEN_BASE_TABLE_NATIONAL_SEX = os.path.join(TEST_DIR, "golden_data", "sex_national.csv") +GOLDEN_BASE_TABLE_STATE_SEX = os.path.join(TEST_DIR, "golden_data", "sex_state.csv") +GOLDEN_BASE_TABLE_COUNTY_SEX = os.path.join(TEST_DIR, "golden_data", "sex_county.csv") +GOLDEN_BASE_TABLE_COUNTY_RACE = os.path.join(TEST_DIR, "golden_data", "race_county.csv") # NOT USING SHARED POPULATION MOCKS BECAUSE THESE ARE THE CACHED ACS_CONDITION TABLES, @@ -28,28 +28,28 @@ def _get_by_race_as_df(*args): _, filename = args return gcs_to_bq_util.values_json_to_df( - os.path.join(TEST_DIR, filename), dtype={'state_fips': str, 'county_fips': str} + os.path.join(TEST_DIR, filename), dtype={"state_fips": str, "county_fips": str} ).reset_index(drop=True) acsCondition = AcsCondition() -acsCondition.year = '2022' +acsCondition.year = "2022" -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_get_by_race_as_df) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_get_by_race_as_df) def testSexNationalBaseTable(mock_acs: mock.MagicMock): df = acsCondition.get_raw_data( - 'sex', 'national', get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, 'some-bucket' + "sex", "national", get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, "some-bucket" ) df = acsCondition.post_process( df, - 'sex', - 'national', + "sex", + "national", ACS_ITEMS_2022_AND_LATER, HEALTH_INSURANCE_RACE_TO_CONCEPT_TITLE, ) - expected_df = pd.read_csv(GOLDEN_BASE_TABLE_NATIONAL_SEX, dtype={'state_fips': str}) + expected_df = pd.read_csv(GOLDEN_BASE_TABLE_NATIONAL_SEX, dtype={"state_fips": str}) cols = list(expected_df.columns) assert mock_acs.call_count == 2 @@ -61,20 +61,20 @@ def testSexNationalBaseTable(mock_acs: mock.MagicMock): ) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_get_by_race_as_df) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_get_by_race_as_df) def testSexStateBaseTable(mock_acs: mock.MagicMock): df = acsCondition.get_raw_data( - 'sex', 'state', get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, 'some-bucket' + "sex", "state", get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, "some-bucket" ) df = acsCondition.post_process( df, - 'sex', - 'state', + "sex", + "state", ACS_ITEMS_2022_AND_LATER, HEALTH_INSURANCE_RACE_TO_CONCEPT_TITLE, ) - expected_df = pd.read_csv(GOLDEN_BASE_TABLE_STATE_SEX, dtype={'state_fips': str}) + expected_df = pd.read_csv(GOLDEN_BASE_TABLE_STATE_SEX, dtype={"state_fips": str}) cols = list(expected_df.columns) assert mock_acs.call_count == 2 @@ -86,20 +86,20 @@ def testSexStateBaseTable(mock_acs: mock.MagicMock): ) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_get_by_race_as_df) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_get_by_race_as_df) def testSexCountyBaseTable(mock_acs: mock.MagicMock): df = acsCondition.get_raw_data( - 'sex', 'county', get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, 
'some-bucket' + "sex", "county", get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, "some-bucket" ) df = acsCondition.post_process( df, - 'sex', - 'county', + "sex", + "county", ACS_ITEMS_2022_AND_LATER, HEALTH_INSURANCE_RACE_TO_CONCEPT_TITLE, ) - expected_df = pd.read_csv(GOLDEN_BASE_TABLE_COUNTY_SEX, dtype={'state_fips': str, 'county_fips': str}) + expected_df = pd.read_csv(GOLDEN_BASE_TABLE_COUNTY_SEX, dtype={"state_fips": str, "county_fips": str}) cols = list(expected_df.columns) assert mock_acs.call_count == 2 @@ -111,20 +111,20 @@ def testSexCountyBaseTable(mock_acs: mock.MagicMock): ) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_get_by_race_as_df) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_get_by_race_as_df) def testRaceCountyBaseTable(mock_acs: mock.MagicMock): df = acsCondition.get_raw_data( - 'race', 'county', get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, 'some-bucket' + "race", "county", get_acs_metadata_as_json(2022), ACS_ITEMS_2022_AND_LATER, "some-bucket" ) df = acsCondition.post_process( df, - 'race', - 'county', + "race", + "county", ACS_ITEMS_2022_AND_LATER, HEALTH_INSURANCE_RACE_TO_CONCEPT_TITLE, ) - expected_df = pd.read_csv(GOLDEN_BASE_TABLE_COUNTY_RACE, dtype={'state_fips': str, 'county_fips': str}) + expected_df = pd.read_csv(GOLDEN_BASE_TABLE_COUNTY_RACE, dtype={"state_fips": str, "county_fips": str}) cols = list(expected_df.columns) assert mock_acs.call_count == 16 @@ -136,59 +136,59 @@ def testRaceCountyBaseTable(mock_acs: mock.MagicMock): ) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2012)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_get_by_race_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2012)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_get_by_race_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqOverwriteEarliestYear( mock_bq: mock.MagicMock, mock_acs: mock.MagicMock, mock_json: mock.MagicMock, ): acsCondition2012 = AcsCondition() - acsCondition2012.write_to_bq('dataset', 'gcs_bucket', year='2012') + acsCondition2012.write_to_bq("dataset", "gcs_bucket", year="2012") assert mock_acs.call_count == 60 assert mock_json.call_count == 1 for call in mock_bq.call_args_list: # This earliest year should OVERWRITE and create brand new BQ tables - assert call[1]['overwrite'] is True + assert call[1]["overwrite"] is True # Column names should match between the shipped df and the BQ types object df_cols = sorted(call[0][0].columns) bq_types_cols = sorted(call[1]["column_types"].keys()) assert df_cols == bq_types_cols -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_get_by_race_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_get_by_race_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAppend2022( mock_bq: mock.MagicMock, mock_acs: mock.MagicMock, mock_json: mock.MagicMock, ): acsCondition2022 = AcsCondition() - acsCondition2022.write_to_bq('dataset', 'gcs_bucket', 
year='2022') + acsCondition2022.write_to_bq("dataset", "gcs_bucket", year="2022") # Non-earliest year like this should APPEND its TIME_SERIES yearly data onto the existing BQ tables # This most current year should also generate a CURRENT table with an undefined overwrite arg - assert mock_bq.call_args_list[0][1]['overwrite'] is False + assert mock_bq.call_args_list[0][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[1][1] - assert mock_bq.call_args_list[2][1]['overwrite'] is False + assert mock_bq.call_args_list[2][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[3][1] - assert mock_bq.call_args_list[4][1]['overwrite'] is False + assert mock_bq.call_args_list[4][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[5][1] - assert mock_bq.call_args_list[6][1]['overwrite'] is False + assert mock_bq.call_args_list[6][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[7][1] - assert mock_bq.call_args_list[8][1]['overwrite'] is False + assert mock_bq.call_args_list[8][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[9][1] - assert mock_bq.call_args_list[10][1]['overwrite'] is False + assert mock_bq.call_args_list[10][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[11][1] - assert mock_bq.call_args_list[12][1]['overwrite'] is False + assert mock_bq.call_args_list[12][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[13][1] - assert mock_bq.call_args_list[14][1]['overwrite'] is False + assert mock_bq.call_args_list[14][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[15][1] - assert mock_bq.call_args_list[16][1]['overwrite'] is False + assert mock_bq.call_args_list[16][1]["overwrite"] is False assert "overwrite" not in mock_bq.call_args_list[17][1] assert mock_json.call_count == 1 @@ -196,101 +196,101 @@ def testWriteToBqAppend2022( # One call per race per geo, and then one call for sex at each geo # and one for age at each geo assert mock_acs.call_count == ((8 * 3) + 3 + 3) * 2 - assert mock_acs.call_args_list[0].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_AIAN.json' - assert mock_acs.call_args_list[1].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_ASIAN.json' - assert mock_acs.call_args_list[2].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_HISP.json' - assert mock_acs.call_args_list[3].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_BLACK.json' - assert mock_acs.call_args_list[4].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_NHPI.json' - assert mock_acs.call_args_list[5].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_WHITE.json' - assert mock_acs.call_args_list[6].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_OTHER_STANDARD.json' - assert mock_acs.call_args_list[7].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_MULTI.json' - - assert mock_acs.call_args_list[8].args[1] == '2022-POVERTY_BY_RACE_STATE_AIAN.json' - assert mock_acs.call_args_list[9].args[1] == '2022-POVERTY_BY_RACE_STATE_ASIAN.json' - assert mock_acs.call_args_list[10].args[1] == '2022-POVERTY_BY_RACE_STATE_HISP.json' - assert mock_acs.call_args_list[11].args[1] == '2022-POVERTY_BY_RACE_STATE_BLACK.json' - assert mock_acs.call_args_list[12].args[1] == '2022-POVERTY_BY_RACE_STATE_NHPI.json' - assert mock_acs.call_args_list[13].args[1] == '2022-POVERTY_BY_RACE_STATE_WHITE.json' - assert mock_acs.call_args_list[14].args[1] == '2022-POVERTY_BY_RACE_STATE_OTHER_STANDARD.json' - assert mock_acs.call_args_list[15].args[1] == 
'2022-POVERTY_BY_RACE_STATE_MULTI.json' - - assert mock_acs.call_args_list[16].args[1] == '2022-HEALTH_INSURANCE_BY_SEX_STATE.json' - assert mock_acs.call_args_list[17].args[1] == '2022-POVERTY_BY_SEX_STATE.json' - assert mock_acs.call_args_list[18].args[1] == '2022-HEALTH_INSURANCE_BY_SEX_STATE.json' - assert mock_acs.call_args_list[19].args[1] == '2022-POVERTY_BY_SEX_STATE.json' - - assert mock_acs.call_args_list[20].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_AIAN.json' - assert mock_acs.call_args_list[21].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_ASIAN.json' - assert mock_acs.call_args_list[22].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_HISP.json' - assert mock_acs.call_args_list[23].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_BLACK.json' - assert mock_acs.call_args_list[24].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_NHPI.json' - assert mock_acs.call_args_list[25].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_WHITE.json' - assert mock_acs.call_args_list[26].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_OTHER_STANDARD.json' - assert mock_acs.call_args_list[27].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_STATE_MULTI.json' - - assert mock_acs.call_args_list[28].args[1] == '2022-POVERTY_BY_RACE_STATE_AIAN.json' - assert mock_acs.call_args_list[29].args[1] == '2022-POVERTY_BY_RACE_STATE_ASIAN.json' - assert mock_acs.call_args_list[30].args[1] == '2022-POVERTY_BY_RACE_STATE_HISP.json' - assert mock_acs.call_args_list[31].args[1] == '2022-POVERTY_BY_RACE_STATE_BLACK.json' - assert mock_acs.call_args_list[32].args[1] == '2022-POVERTY_BY_RACE_STATE_NHPI.json' - assert mock_acs.call_args_list[33].args[1] == '2022-POVERTY_BY_RACE_STATE_WHITE.json' - assert mock_acs.call_args_list[34].args[1] == '2022-POVERTY_BY_RACE_STATE_OTHER_STANDARD.json' - assert mock_acs.call_args_list[35].args[1] == '2022-POVERTY_BY_RACE_STATE_MULTI.json' - - assert mock_acs.call_args_list[36].args[1] == '2022-HEALTH_INSURANCE_BY_SEX_STATE.json' - assert mock_acs.call_args_list[37].args[1] == '2022-POVERTY_BY_SEX_STATE.json' - assert mock_acs.call_args_list[38].args[1] == '2022-HEALTH_INSURANCE_BY_SEX_STATE.json' - assert mock_acs.call_args_list[39].args[1] == '2022-POVERTY_BY_SEX_STATE.json' - - assert mock_acs.call_args_list[40].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_AIAN.json' - assert mock_acs.call_args_list[41].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_ASIAN.json' - assert mock_acs.call_args_list[42].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_HISP.json' - assert mock_acs.call_args_list[43].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_BLACK.json' - assert mock_acs.call_args_list[44].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_NHPI.json' - assert mock_acs.call_args_list[45].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_WHITE.json' - assert mock_acs.call_args_list[46].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_OTHER_STANDARD.json' - assert mock_acs.call_args_list[47].args[1] == '2022-HEALTH_INSURANCE_BY_RACE_COUNTY_MULTI.json' - - assert mock_acs.call_args_list[48].args[1] == '2022-POVERTY_BY_RACE_COUNTY_AIAN.json' - assert mock_acs.call_args_list[49].args[1] == '2022-POVERTY_BY_RACE_COUNTY_ASIAN.json' - assert mock_acs.call_args_list[50].args[1] == '2022-POVERTY_BY_RACE_COUNTY_HISP.json' - assert mock_acs.call_args_list[51].args[1] == '2022-POVERTY_BY_RACE_COUNTY_BLACK.json' - assert mock_acs.call_args_list[52].args[1] == '2022-POVERTY_BY_RACE_COUNTY_NHPI.json' - assert mock_acs.call_args_list[53].args[1] == '2022-POVERTY_BY_RACE_COUNTY_WHITE.json' - 
assert mock_acs.call_args_list[54].args[1] == '2022-POVERTY_BY_RACE_COUNTY_OTHER_STANDARD.json' - assert mock_acs.call_args_list[55].args[1] == '2022-POVERTY_BY_RACE_COUNTY_MULTI.json' - - assert mock_acs.call_args_list[56].args[1] == '2022-HEALTH_INSURANCE_BY_SEX_COUNTY.json' - assert mock_acs.call_args_list[57].args[1] == '2022-POVERTY_BY_SEX_COUNTY.json' - assert mock_acs.call_args_list[58].args[1] == '2022-HEALTH_INSURANCE_BY_SEX_COUNTY.json' - assert mock_acs.call_args_list[59].args[1] == '2022-POVERTY_BY_SEX_COUNTY.json' + assert mock_acs.call_args_list[0].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_AIAN.json" + assert mock_acs.call_args_list[1].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_ASIAN.json" + assert mock_acs.call_args_list[2].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_HISP.json" + assert mock_acs.call_args_list[3].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_BLACK.json" + assert mock_acs.call_args_list[4].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_NHPI.json" + assert mock_acs.call_args_list[5].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_WHITE.json" + assert mock_acs.call_args_list[6].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_OTHER_STANDARD.json" + assert mock_acs.call_args_list[7].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_MULTI.json" + + assert mock_acs.call_args_list[8].args[1] == "2022-POVERTY_BY_RACE_STATE_AIAN.json" + assert mock_acs.call_args_list[9].args[1] == "2022-POVERTY_BY_RACE_STATE_ASIAN.json" + assert mock_acs.call_args_list[10].args[1] == "2022-POVERTY_BY_RACE_STATE_HISP.json" + assert mock_acs.call_args_list[11].args[1] == "2022-POVERTY_BY_RACE_STATE_BLACK.json" + assert mock_acs.call_args_list[12].args[1] == "2022-POVERTY_BY_RACE_STATE_NHPI.json" + assert mock_acs.call_args_list[13].args[1] == "2022-POVERTY_BY_RACE_STATE_WHITE.json" + assert mock_acs.call_args_list[14].args[1] == "2022-POVERTY_BY_RACE_STATE_OTHER_STANDARD.json" + assert mock_acs.call_args_list[15].args[1] == "2022-POVERTY_BY_RACE_STATE_MULTI.json" + + assert mock_acs.call_args_list[16].args[1] == "2022-HEALTH_INSURANCE_BY_SEX_STATE.json" + assert mock_acs.call_args_list[17].args[1] == "2022-POVERTY_BY_SEX_STATE.json" + assert mock_acs.call_args_list[18].args[1] == "2022-HEALTH_INSURANCE_BY_SEX_STATE.json" + assert mock_acs.call_args_list[19].args[1] == "2022-POVERTY_BY_SEX_STATE.json" + + assert mock_acs.call_args_list[20].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_AIAN.json" + assert mock_acs.call_args_list[21].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_ASIAN.json" + assert mock_acs.call_args_list[22].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_HISP.json" + assert mock_acs.call_args_list[23].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_BLACK.json" + assert mock_acs.call_args_list[24].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_NHPI.json" + assert mock_acs.call_args_list[25].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_WHITE.json" + assert mock_acs.call_args_list[26].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_OTHER_STANDARD.json" + assert mock_acs.call_args_list[27].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_STATE_MULTI.json" + + assert mock_acs.call_args_list[28].args[1] == "2022-POVERTY_BY_RACE_STATE_AIAN.json" + assert mock_acs.call_args_list[29].args[1] == "2022-POVERTY_BY_RACE_STATE_ASIAN.json" + assert mock_acs.call_args_list[30].args[1] == "2022-POVERTY_BY_RACE_STATE_HISP.json" + assert mock_acs.call_args_list[31].args[1] == "2022-POVERTY_BY_RACE_STATE_BLACK.json" + assert mock_acs.call_args_list[32].args[1] == 
"2022-POVERTY_BY_RACE_STATE_NHPI.json" + assert mock_acs.call_args_list[33].args[1] == "2022-POVERTY_BY_RACE_STATE_WHITE.json" + assert mock_acs.call_args_list[34].args[1] == "2022-POVERTY_BY_RACE_STATE_OTHER_STANDARD.json" + assert mock_acs.call_args_list[35].args[1] == "2022-POVERTY_BY_RACE_STATE_MULTI.json" + + assert mock_acs.call_args_list[36].args[1] == "2022-HEALTH_INSURANCE_BY_SEX_STATE.json" + assert mock_acs.call_args_list[37].args[1] == "2022-POVERTY_BY_SEX_STATE.json" + assert mock_acs.call_args_list[38].args[1] == "2022-HEALTH_INSURANCE_BY_SEX_STATE.json" + assert mock_acs.call_args_list[39].args[1] == "2022-POVERTY_BY_SEX_STATE.json" + + assert mock_acs.call_args_list[40].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_AIAN.json" + assert mock_acs.call_args_list[41].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_ASIAN.json" + assert mock_acs.call_args_list[42].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_HISP.json" + assert mock_acs.call_args_list[43].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_BLACK.json" + assert mock_acs.call_args_list[44].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_NHPI.json" + assert mock_acs.call_args_list[45].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_WHITE.json" + assert mock_acs.call_args_list[46].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_OTHER_STANDARD.json" + assert mock_acs.call_args_list[47].args[1] == "2022-HEALTH_INSURANCE_BY_RACE_COUNTY_MULTI.json" + + assert mock_acs.call_args_list[48].args[1] == "2022-POVERTY_BY_RACE_COUNTY_AIAN.json" + assert mock_acs.call_args_list[49].args[1] == "2022-POVERTY_BY_RACE_COUNTY_ASIAN.json" + assert mock_acs.call_args_list[50].args[1] == "2022-POVERTY_BY_RACE_COUNTY_HISP.json" + assert mock_acs.call_args_list[51].args[1] == "2022-POVERTY_BY_RACE_COUNTY_BLACK.json" + assert mock_acs.call_args_list[52].args[1] == "2022-POVERTY_BY_RACE_COUNTY_NHPI.json" + assert mock_acs.call_args_list[53].args[1] == "2022-POVERTY_BY_RACE_COUNTY_WHITE.json" + assert mock_acs.call_args_list[54].args[1] == "2022-POVERTY_BY_RACE_COUNTY_OTHER_STANDARD.json" + assert mock_acs.call_args_list[55].args[1] == "2022-POVERTY_BY_RACE_COUNTY_MULTI.json" + + assert mock_acs.call_args_list[56].args[1] == "2022-HEALTH_INSURANCE_BY_SEX_COUNTY.json" + assert mock_acs.call_args_list[57].args[1] == "2022-POVERTY_BY_SEX_COUNTY.json" + assert mock_acs.call_args_list[58].args[1] == "2022-HEALTH_INSURANCE_BY_SEX_COUNTY.json" + assert mock_acs.call_args_list[59].args[1] == "2022-POVERTY_BY_SEX_COUNTY.json" # One call for each table write to BQ assert mock_bq.call_count == 18 - assert mock_bq.call_args_list[0].args[2] == 'race_national_historical' - assert mock_bq.call_args_list[1].args[2] == 'race_national_current' + assert mock_bq.call_args_list[0].args[2] == "race_national_historical" + assert mock_bq.call_args_list[1].args[2] == "race_national_current" - assert mock_bq.call_args_list[2].args[2] == 'age_national_historical' - assert mock_bq.call_args_list[3].args[2] == 'age_national_current' + assert mock_bq.call_args_list[2].args[2] == "age_national_historical" + assert mock_bq.call_args_list[3].args[2] == "age_national_current" - assert mock_bq.call_args_list[4].args[2] == 'sex_national_historical' - assert mock_bq.call_args_list[5].args[2] == 'sex_national_current' + assert mock_bq.call_args_list[4].args[2] == "sex_national_historical" + assert mock_bq.call_args_list[5].args[2] == "sex_national_current" - assert mock_bq.call_args_list[6].args[2] == 'race_state_historical' - assert mock_bq.call_args_list[7].args[2] == 
'race_state_current' + assert mock_bq.call_args_list[6].args[2] == "race_state_historical" + assert mock_bq.call_args_list[7].args[2] == "race_state_current" - assert mock_bq.call_args_list[8].args[2] == 'age_state_historical' - assert mock_bq.call_args_list[9].args[2] == 'age_state_current' + assert mock_bq.call_args_list[8].args[2] == "age_state_historical" + assert mock_bq.call_args_list[9].args[2] == "age_state_current" - assert mock_bq.call_args_list[10].args[2] == 'sex_state_historical' - assert mock_bq.call_args_list[11].args[2] == 'sex_state_current' + assert mock_bq.call_args_list[10].args[2] == "sex_state_historical" + assert mock_bq.call_args_list[11].args[2] == "sex_state_current" - assert mock_bq.call_args_list[12].args[2] == 'race_county_historical' - assert mock_bq.call_args_list[13].args[2] == 'race_county_current' + assert mock_bq.call_args_list[12].args[2] == "race_county_historical" + assert mock_bq.call_args_list[13].args[2] == "race_county_current" - assert mock_bq.call_args_list[14].args[2] == 'age_county_historical' - assert mock_bq.call_args_list[15].args[2] == 'age_county_current' + assert mock_bq.call_args_list[14].args[2] == "age_county_historical" + assert mock_bq.call_args_list[15].args[2] == "age_county_current" - assert mock_bq.call_args_list[16].args[2] == 'sex_county_historical' - assert mock_bq.call_args_list[17].args[2] == 'sex_county_current' + assert mock_bq.call_args_list[16].args[2] == "sex_county_historical" + assert mock_bq.call_args_list[17].args[2] == "sex_county_current" diff --git a/python/tests/datasources/test_acs_population.py b/python/tests/datasources/test_acs_population.py index bb9fe38199..f46fb7a4e0 100644 --- a/python/tests/datasources/test_acs_population.py +++ b/python/tests/datasources/test_acs_population.py @@ -1,3 +1,4 @@ +# pylint: disable=unused-argument import os import pandas as pd from unittest import mock @@ -12,51 +13,51 @@ # Current working directory. 
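Both this ACS condition suite and the ACS population suite that follows assert the same write policy: the earliest year of a time series OVERWRITEs its BigQuery table (rebuilding it from scratch), every later year APPENDs, and only the most recent year additionally ships a single-year table with no `overwrite` kwarg at all. A schematic sketch of that dispatch, where the function and constant names are illustrative stand-ins rather than the pipelines' actual code:

```python
# Illustrative sketch of the overwrite/append policy the surrounding tests
# assert; EARLIEST_YEAR, MOST_RECENT_YEAR, and write_table are stand-ins.
EARLIEST_YEAR = "2009"
MOST_RECENT_YEAR = "2022"


def ship_tables(df, dataset, base_name, year, write_table):
    # Time-series table: the first year rebuilds it, later years append to it.
    write_table(df, dataset, f"{base_name}_time_series", overwrite=(year == EARLIEST_YEAR))

    # Single-year table: only produced for the most recent year; the tests
    # check that no overwrite kwarg is passed for these calls at all.
    if year == MOST_RECENT_YEAR:
        write_table(df, dataset, base_name)
```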
THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data', 'acs_population') -GOLDEN_DIR = os.path.join(THIS_DIR, os.pardir, 'data', 'acs_population', 'golden_data') -MOCK_CACHE_DIR = os.path.join(THIS_DIR, os.pardir, 'data', 'acs_population', 'mock_cache') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "acs_population") +GOLDEN_DIR = os.path.join(THIS_DIR, os.pardir, "data", "acs_population", "golden_data") +MOCK_CACHE_DIR = os.path.join(THIS_DIR, os.pardir, "data", "acs_population", "mock_cache") # single year golden data -GOLDEN_DATA_AGE_NATIONAL_2009 = os.path.join(GOLDEN_DIR, 'age_national.csv') -GOLDEN_DATA_RACE = os.path.join(GOLDEN_DIR, 'table_by_race_state.csv') -GOLDEN_DATA_SEX_AGE = os.path.join(GOLDEN_DIR, 'table_by_sex_age.csv') -GOLDEN_DATA_SEX = os.path.join(GOLDEN_DIR, 'table_by_sex.csv') -GOLDEN_DATA_SEX_NATIONAL = os.path.join(GOLDEN_DIR, 'table_by_sex_national.csv') -GOLDEN_DATA_RACE_NATIONAL = os.path.join(GOLDEN_DIR, 'table_by_race_national.csv') -GOLDEN_DATA_AGE_COUNTY = os.path.join(GOLDEN_DIR, 'table_by_age_county.csv') +GOLDEN_DATA_AGE_NATIONAL_2009 = os.path.join(GOLDEN_DIR, "age_national.csv") +GOLDEN_DATA_RACE = os.path.join(GOLDEN_DIR, "table_by_race_state.csv") +GOLDEN_DATA_SEX_AGE = os.path.join(GOLDEN_DIR, "table_by_sex_age.csv") +GOLDEN_DATA_SEX = os.path.join(GOLDEN_DIR, "table_by_sex.csv") +GOLDEN_DATA_SEX_NATIONAL = os.path.join(GOLDEN_DIR, "table_by_sex_national.csv") +GOLDEN_DATA_RACE_NATIONAL = os.path.join(GOLDEN_DIR, "table_by_race_national.csv") +GOLDEN_DATA_AGE_COUNTY = os.path.join(GOLDEN_DIR, "table_by_age_county.csv") # time series golden data initial year OVERWRITES GOLDEN_DATA_SEX_AGE_RACE_TIME_SERIES_OVERWRITES = os.path.join( - GOLDEN_DIR, 'time_series_overwrites', 'table_by_sex_age_race_state_time_series.csv' + GOLDEN_DIR, "time_series_overwrites", "table_by_sex_age_race_state_time_series.csv" ) # time series golden data subsequent year APPENDS GOLDEN_DATA_RACE_TIME_SERIES_APPEND = os.path.join( - GOLDEN_DIR, 'time_series_appends', 'table_by_race_state_time_series.csv' + GOLDEN_DIR, "time_series_appends", "table_by_race_state_time_series.csv" ) GOLDEN_DATA_SEX_AGE_TIME_SERIES_APPEND = os.path.join( - GOLDEN_DIR, 'time_series_appends', 'table_by_sex_age_time_series.csv' + GOLDEN_DIR, "time_series_appends", "table_by_sex_age_time_series.csv" ) -GOLDEN_DATA_SEX_TIME_SERIES_APPEND = os.path.join(GOLDEN_DIR, 'time_series_appends', 'table_by_sex_time_series.csv') +GOLDEN_DATA_SEX_TIME_SERIES_APPEND = os.path.join(GOLDEN_DIR, "time_series_appends", "table_by_sex_time_series.csv") GOLDEN_DATA_SEX_NATIONAL_TIME_SERIES_APPEND = os.path.join( - GOLDEN_DIR, 'time_series_appends', 'table_by_sex_national_time_series.csv' + GOLDEN_DIR, "time_series_appends", "table_by_sex_national_time_series.csv" ) GOLDEN_DATA_RACE_NATIONAL_TIME_SERIES_APPEND = os.path.join( - GOLDEN_DIR, 'time_series_appends', 'table_by_race_national_time_series.csv' + GOLDEN_DIR, "time_series_appends", "table_by_race_national_time_series.csv" ) GOLDEN_DATA_AGE_COUNTY_TIME_SERIES_APPEND = os.path.join( - GOLDEN_DIR, 'time_series_appends', 'table_by_age_county_time_series.csv' + GOLDEN_DIR, "time_series_appends", "table_by_age_county_time_series.csv" ) def _load_values_as_df(*args, **kwargs): """mock out the retrieval of cached ACS tables from our GCS landing bucket, and instead return the equivalent test csv""" - dataset, filename = args - dtype = {'county_fips': str} if "county" in filename else {'state_fips': str} + _, filename 
= args + dtype = {"county_fips": str} if "county" in filename else {"state_fips": str} print("mock GCS cache:", filename) df = gcs_to_bq_util.values_json_to_df(os.path.join(MOCK_CACHE_DIR, filename), dtype=dtype).reset_index(drop=True) return df @@ -65,59 +66,59 @@ def _load_values_as_df(*args, **kwargs): # We export this function for use in other packages so it needs its own tests def testGenerateNationalDatasetRace(): state_df = pd.read_csv( - os.path.join(TEST_DIR, 'national', 'state_by_race.csv'), - dtype={'state_fips': str}, + os.path.join(TEST_DIR, "national", "state_by_race.csv"), + dtype={"state_fips": str}, ) expected_df = pd.read_csv( - os.path.join(TEST_DIR, 'national', 'national_by_race.csv'), - dtype={'state_fips': str}, + os.path.join(TEST_DIR, "national", "national_by_race.csv"), + dtype={"state_fips": str}, ) - states_to_include = {'01', '06'} + states_to_include = {"01", "06"} - national_df = GENERATE_NATIONAL_DATASET(state_df, states_to_include, 'race') + national_df = GENERATE_NATIONAL_DATASET(state_df, states_to_include, "race") assert_frame_equal(national_df, expected_df, check_like=True) def testGenerateNationalDatasetSex(): state_df = pd.read_csv( - os.path.join(TEST_DIR, 'national', 'state_by_sex.csv'), - dtype={'state_fips': str}, + os.path.join(TEST_DIR, "national", "state_by_sex.csv"), + dtype={"state_fips": str}, ) expected_df = pd.read_csv( - os.path.join(TEST_DIR, 'national', 'national_by_sex.csv'), - dtype={'state_fips': str}, + os.path.join(TEST_DIR, "national", "national_by_sex.csv"), + dtype={"state_fips": str}, ) - states_to_include = {'01', '06'} + states_to_include = {"01", "06"} - national_df = GENERATE_NATIONAL_DATASET(state_df, states_to_include, 'sex') + national_df = GENERATE_NATIONAL_DATASET(state_df, states_to_include, "sex") assert_frame_equal(national_df, expected_df, check_like=True) def testGenerateNationalDatasetAge(): state_df = pd.read_csv( - os.path.join(TEST_DIR, 'national', 'state_by_age.csv'), - dtype={'state_fips': str}, + os.path.join(TEST_DIR, "national", "state_by_age.csv"), + dtype={"state_fips": str}, ) expected_df = pd.read_csv( - os.path.join(TEST_DIR, 'national', 'national_by_age.csv'), - dtype={'state_fips': str}, + os.path.join(TEST_DIR, "national", "national_by_age.csv"), + dtype={"state_fips": str}, ) - states_to_include = {'01', '06'} + states_to_include = {"01", "06"} - national_df = GENERATE_NATIONAL_DATASET(state_df, states_to_include, 'age') + national_df = GENERATE_NATIONAL_DATASET(state_df, states_to_include, "age") assert_frame_equal(national_df, expected_df, check_like=True) DTYPE = { - 'county_fips': str, - 'state_fips': str, - 'time_period': str, + "county_fips": str, + "state_fips": str, + "time_period": str, } -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2009)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2009)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testOverWriteToBqStateNationalCalls2009( mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock ): @@ -125,41 +126,41 @@ def testOverWriteToBqStateNationalCalls2009( based on the order and structure of the mocked calls to ACS, our cache of ACS, and our BQ """ - 
acsPopulationIngester = ACSPopulationIngester(False, '2009') + acsPopulationIngester = ACSPopulationIngester(False, "2009") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # meta data - assert mock_json.call_args_list[0][0][0] == 'https://api.census.gov/data/2009/acs/acs5' + assert mock_json.call_args_list[0][0][0] == "https://api.census.gov/data/2009/acs/acs5" # our GCS caching of ACS raw tables assert mock_cache.call_count == 11 called_cached_gcs_names_in_order_ALL_CAPS = [call[0][1] for call in mock_cache.call_args_list] assert called_cached_gcs_names_in_order_ALL_CAPS == [ - '2009-HISPANIC_OR_LATINO_ORIGIN_BY_RACE_state.json', - '2009-SEX_BY_AGE_state.json', - '2009-SEX_BY_AGE_(WHITE_ALONE)_state.json', - '2009-SEX_BY_AGE_(BLACK_OR_AFRICAN_AMERICAN_ALONE)_state.json', - '2009-SEX_BY_AGE_(AMERICAN_INDIAN_AND_ALASKA_NATIVE_ALONE)_state.json', - '2009-SEX_BY_AGE_(ASIAN_ALONE)_state.json', - '2009-SEX_BY_AGE_(NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER_ALONE)_state.json', - '2009-SEX_BY_AGE_(SOME_OTHER_RACE_ALONE)_state.json', - '2009-SEX_BY_AGE_(TWO_OR_MORE_RACES)_state.json', - '2009-SEX_BY_AGE_(HISPANIC_OR_LATINO)_state.json', - '2009-SEX_BY_AGE_(WHITE_ALONE,_NOT_HISPANIC_OR_LATINO)_state.json', + "2009-HISPANIC_OR_LATINO_ORIGIN_BY_RACE_state.json", + "2009-SEX_BY_AGE_state.json", + "2009-SEX_BY_AGE_(WHITE_ALONE)_state.json", + "2009-SEX_BY_AGE_(BLACK_OR_AFRICAN_AMERICAN_ALONE)_state.json", + "2009-SEX_BY_AGE_(AMERICAN_INDIAN_AND_ALASKA_NATIVE_ALONE)_state.json", + "2009-SEX_BY_AGE_(ASIAN_ALONE)_state.json", + "2009-SEX_BY_AGE_(NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER_ALONE)_state.json", + "2009-SEX_BY_AGE_(SOME_OTHER_RACE_ALONE)_state.json", + "2009-SEX_BY_AGE_(TWO_OR_MORE_RACES)_state.json", + "2009-SEX_BY_AGE_(HISPANIC_OR_LATINO)_state.json", + "2009-SEX_BY_AGE_(WHITE_ALONE,_NOT_HISPANIC_OR_LATINO)_state.json", ] table_names_for_bq = [call[0][2] for call in mock_bq.call_args_list] assert table_names_for_bq == [ # 2021 should only write to the time_series tables - 'by_race_state_time_series', - 'by_sex_age_race_state_time_series', - 'by_sex_age_state_time_series', - 'by_age_state_time_series', - 'by_sex_state_time_series', - 'by_age_national_time_series', - 'by_race_national_time_series', - 'by_sex_national_time_series', + "by_race_state_time_series", + "by_sex_age_race_state_time_series", + "by_sex_age_state_time_series", + "by_age_state_time_series", + "by_sex_state_time_series", + "by_age_national_time_series", + "by_race_national_time_series", + "by_sex_national_time_series", ] df_age_national_2009_overwrite = mock_bq.call_args_list[5][0][0] @@ -168,9 +169,9 @@ def testOverWriteToBqStateNationalCalls2009( assert_frame_equal(df_age_national_2009_overwrite, expected_df_age_national_2009_overwrite, check_like=True) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqCountyCallsAppend2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): """Test the overall function structure for a county level 
ingester, based on the order and structure of the mocked calls to ACS, our cache of ACS, and our BQ @@ -179,7 +180,7 @@ def testWriteToBqCountyCallsAppend2022(mock_bq: mock.MagicMock, mock_cache: mock # instantiate with only 2 years to test acsPopulationIngester = ACSPopulationIngester(True, "2022") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # meta data assert mock_json.call_args_list[0][0][0] == "https://api.census.gov/data/2022/acs/acs5" @@ -188,43 +189,43 @@ def testWriteToBqCountyCallsAppend2022(mock_bq: mock.MagicMock, mock_cache: mock assert mock_cache.call_count == 11 called_cached_gcs_names_in_order_title_cases = [call[0][1] for call in mock_cache.call_args_list] assert called_cached_gcs_names_in_order_title_cases == [ - '2022-Hispanic_or_Latino_Origin_by_Race_county.json', - '2022-Sex_by_Age_county.json', - '2022-Sex_by_Age_(White_Alone)_county.json', - '2022-Sex_by_Age_(Black_or_African_American_Alone)_county.json', - '2022-Sex_by_Age_(American_Indian_and_Alaska_Native_Alone)_county.json', - '2022-Sex_by_Age_(Asian_Alone)_county.json', - '2022-Sex_by_Age_(Native_Hawaiian_and_Other_Pacific_Islander_Alone)_county.json', - '2022-Sex_by_Age_(Some_Other_Race_Alone)_county.json', - '2022-Sex_by_Age_(Two_or_More_Races)_county.json', - '2022-Sex_by_Age_(Hispanic_or_Latino)_county.json', - '2022-Sex_by_Age_(White_Alone,_Not_Hispanic_or_Latino)_county.json', + "2022-Hispanic_or_Latino_Origin_by_Race_county.json", + "2022-Sex_by_Age_county.json", + "2022-Sex_by_Age_(White_Alone)_county.json", + "2022-Sex_by_Age_(Black_or_African_American_Alone)_county.json", + "2022-Sex_by_Age_(American_Indian_and_Alaska_Native_Alone)_county.json", + "2022-Sex_by_Age_(Asian_Alone)_county.json", + "2022-Sex_by_Age_(Native_Hawaiian_and_Other_Pacific_Islander_Alone)_county.json", + "2022-Sex_by_Age_(Some_Other_Race_Alone)_county.json", + "2022-Sex_by_Age_(Two_or_More_Races)_county.json", + "2022-Sex_by_Age_(Hispanic_or_Latino)_county.json", + "2022-Sex_by_Age_(White_Alone,_Not_Hispanic_or_Latino)_county.json", ] table_names_for_bq = [call[0][2] for call in mock_bq.call_args_list] assert table_names_for_bq == [ # 2022 should write to both SINGLE YEAR and TIME SERIES tables - 'by_race_county', - 'by_race_county_time_series', - 'by_sex_age_race_county', - 'by_sex_age_race_county_time_series', - 'by_sex_age_county', - 'by_sex_age_county_time_series', - 'by_age_county', - 'by_age_county_time_series', - 'by_sex_county', - 'by_sex_county_time_series', + "by_race_county", + "by_race_county_time_series", + "by_sex_age_race_county", + "by_sex_age_race_county_time_series", + "by_sex_age_county", + "by_sex_age_county_time_series", + "by_age_county", + "by_age_county_time_series", + "by_sex_county", + "by_sex_county_time_series", ] -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqRaceAppend2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): acsPopulationIngester = ACSPopulationIngester(False, "2022") - 
acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # 2022 should send a SINGLE YEAR table single_year_df = mock_bq.call_args_list[0][0][0] @@ -232,38 +233,38 @@ def testWriteToBqRaceAppend2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicM assert_frame_equal(single_year_df, expected_single_year_df, check_like=True) # 2022 should only APPEND to an existing time_series table - assert mock_bq.call_args_list[1][1]['overwrite'] is False + assert mock_bq.call_args_list[1][1]["overwrite"] is False time_series_append_df = mock_bq.call_args_list[1][0][0] expected_time_series_append_df = pd.read_csv(GOLDEN_DATA_RACE_TIME_SERIES_APPEND, dtype=DTYPE) assert_frame_equal(time_series_append_df, expected_time_series_append_df, check_like=True) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2009)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2009)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSexAgeRaceOverwrite2009( mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock ): - acsPopulationIngester = ACSPopulationIngester(False, '2009') - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester = ACSPopulationIngester(False, "2009") + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # 2009 should NOT send a SINGLE YEAR table # 2009 should only OVERWRITE an existing time_series table (starting it fresh) - assert mock_bq.call_args_list[1][1]['overwrite'] is True + assert mock_bq.call_args_list[1][1]["overwrite"] is True time_series_overwrite_df = mock_bq.call_args_list[1][0][0] expected_time_series_overwrite_df = pd.read_csv(GOLDEN_DATA_SEX_AGE_RACE_TIME_SERIES_OVERWRITES, dtype=DTYPE) assert_frame_equal(time_series_overwrite_df, expected_time_series_overwrite_df, check_like=True) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSexAgeAppend2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): acsPopulationIngester = ACSPopulationIngester(False, "2022") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # 2022 should send a SINGLE YEAR table single_year_df = mock_bq.call_args_list[4][0][0] @@ -272,20 +273,20 @@ def testWriteToBqSexAgeAppend2022(mock_bq: mock.MagicMock, mock_cache: mock.Magi assert_frame_equal(single_year_df, expected_single_year_df, check_like=True) # 2022 should only APPEND to an existing time_series table - assert mock_bq.call_args_list[5][1]['overwrite'] is False + assert mock_bq.call_args_list[5][1]["overwrite"] is False time_series_append_df = mock_bq.call_args_list[5][0][0] 
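The dense indexing in these assertions is `unittest.mock`'s call-record API: each element of `call_args_list` is a `call` object whose index `[0]` is the tuple of positional arguments and `[1]` is the dict of keyword arguments, so `mock_bq.call_args_list[5][0][0]` is the DataFrame passed to the sixth `add_df_to_bq` call and `[5][1]["overwrite"]` is its keyword flag. A self-contained illustration:

```python
from unittest import mock

writer = mock.MagicMock()
fake_df = {"stand_in": "for a DataFrame"}
writer(fake_df, "dataset", "by_sex_age_state_time_series", overwrite=False)

# Each entry of call_args_list is a call object: [0] holds the positional
# args tuple and [1] holds the kwargs dict.
assert writer.call_args_list[0][0][0] is fake_df
assert writer.call_args_list[0][0][2] == "by_sex_age_state_time_series"
assert writer.call_args_list[0][1]["overwrite"] is False
```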
expected_time_series_append_df = pd.read_csv(GOLDEN_DATA_SEX_AGE_TIME_SERIES_APPEND, dtype=DTYPE) assert_frame_equal(time_series_append_df, expected_time_series_append_df, check_like=True) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSex2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): acsPopulationIngester = ACSPopulationIngester(False, "2022") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # 2022 should send a SINGLE YEAR table single_year_df = mock_bq.call_args_list[8][0][0] @@ -293,20 +294,20 @@ def testWriteToBqSex2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mo assert_frame_equal(single_year_df, expected_single_year_df, check_like=True) # 2022 should only APPEND to an existing time_series table - assert mock_bq.call_args_list[9][1]['overwrite'] is False + assert mock_bq.call_args_list[9][1]["overwrite"] is False time_series_append_df = mock_bq.call_args_list[9][0][0] expected_time_series_append_df = pd.read_csv(GOLDEN_DATA_SEX_TIME_SERIES_APPEND, dtype=DTYPE) assert_frame_equal(time_series_append_df, expected_time_series_append_df, check_like=True) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqRaceNational2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): acsPopulationIngester = ACSPopulationIngester(False, "2022") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # 2022 should send a SINGLE YEAR table single_year_df = mock_bq.call_args_list[12][0][0] @@ -314,20 +315,20 @@ def testWriteToBqRaceNational2022(mock_bq: mock.MagicMock, mock_cache: mock.Magi assert_frame_equal(single_year_df, expected_single_year_df, check_like=True) # 2022 should only APPEND to an existing time_series table - assert mock_bq.call_args_list[13][1]['overwrite'] is False + assert mock_bq.call_args_list[13][1]["overwrite"] is False time_series_append_df = mock_bq.call_args_list[13][0][0] expected_time_series_append_df = pd.read_csv(GOLDEN_DATA_RACE_NATIONAL_TIME_SERIES_APPEND, dtype=DTYPE) assert_frame_equal(time_series_append_df, expected_time_series_append_df, check_like=True) -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", 
return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSexNational2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): acsPopulationIngester = ACSPopulationIngester(False, "2022") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") # 2022 should send a SINGLE YEAR table single_year_df = mock_bq.call_args_list[14][0][0] @@ -335,7 +336,7 @@ def testWriteToBqSexNational2022(mock_bq: mock.MagicMock, mock_cache: mock.Magic assert_frame_equal(single_year_df, expected_single_year_df, check_like=True) # 2022 should only APPEND to an existing time_series table - assert mock_bq.call_args_list[15][1]['overwrite'] is False + assert mock_bq.call_args_list[15][1]["overwrite"] is False time_series_append_df = mock_bq.call_args_list[15][0][0] expected_time_series_append_df = pd.read_csv(GOLDEN_DATA_SEX_NATIONAL_TIME_SERIES_APPEND, dtype=DTYPE) @@ -343,14 +344,14 @@ def testWriteToBqSexNational2022(mock_bq: mock.MagicMock, mock_cache: mock.Magic # # Do one County level test to make sure our logic there is correct -@mock.patch('ingestion.census.fetch_acs_metadata', return_value=get_acs_metadata_as_json(2022)) -@mock.patch('ingestion.gcs_to_bq_util.load_values_as_df', side_effect=_load_values_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.census.fetch_acs_metadata", return_value=get_acs_metadata_as_json(2022)) +@mock.patch("ingestion.gcs_to_bq_util.load_values_as_df", side_effect=_load_values_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAgeCounty2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMock, mock_json: mock.MagicMock): acsPopulationIngester = ACSPopulationIngester(True, "2022") - acsPopulationIngester.write_to_bq('dataset', 'gcs_bucket') + acsPopulationIngester.write_to_bq("dataset", "gcs_bucket") single_year_df = mock_bq.call_args_list[6][0][0] expected_single_year_df = pd.read_csv(GOLDEN_DATA_AGE_COUNTY, dtype=DTYPE) @@ -360,4 +361,4 @@ def testWriteToBqAgeCounty2022(mock_bq: mock.MagicMock, mock_cache: mock.MagicMo expected_time_series_append_df = pd.read_csv(GOLDEN_DATA_AGE_COUNTY_TIME_SERIES_APPEND, dtype=DTYPE) assert_frame_equal(time_series_append_df, expected_time_series_append_df, check_like=True) - assert mock_bq.call_args_list[7][1]['overwrite'] is False + assert mock_bq.call_args_list[7][1]["overwrite"] is False diff --git a/python/tests/datasources/test_age_adjustment.py b/python/tests/datasources/test_age_adjustment.py index ef7187d4b5..344e840c70 100644 --- a/python/tests/datasources/test_age_adjustment.py +++ b/python/tests/datasources/test_age_adjustment.py @@ -1,3 +1,4 @@ +# pylint: disable=unused-argument from unittest import mock import os import pandas as pd @@ -9,203 +10,205 @@ # Current working directory. 
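# Hedged aside on the stacked @mock.patch decorators used throughout these
# tests: patches apply bottom-up, so the decorator closest to the function
# supplies the FIRST mock parameter and the topmost supplies the LAST, which
# is why mock_bq (add_df_to_bq, the bottom patch) precedes mock_cache and
# mock_json in the signatures above. The os.path targets below are stand-ins
# chosen only so this sketch runs anywhere.
import os
from unittest import mock

@mock.patch("os.path.exists", return_value=True)  # topmost: last parameter
@mock.patch("os.path.getsize", return_value=0)  # closest: first parameter
def check_param_order(mock_getsize, mock_exists):
    assert os.path.getsize("anything") == 0
    assert os.path.exists("anything") is True
    assert mock_getsize.call_count == 1 and mock_exists.call_count == 1

check_param_order()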
THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data', 'age_adjustment') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "age_adjustment") -COVID_DATA_SIMPLE = os.path.join(TEST_DIR, 'unit_tests', 'race_age_state_simple.json') -COVID_DATA_SIMPLE_TIME_SERIES = os.path.join(TEST_DIR, 'unit_tests', 'race_age_state_time_series_simple.json') +COVID_DATA_SIMPLE = os.path.join(TEST_DIR, "unit_tests", "race_age_state_simple.json") +COVID_DATA_SIMPLE_TIME_SERIES = os.path.join(TEST_DIR, "unit_tests", "race_age_state_time_series_simple.json") -EXPECTED_DEATHS_JSON = os.path.join(TEST_DIR, 'unit_tests', 'expected_deaths.json') -EXPECTED_DEATHS_TIME_SERIES_JSON = os.path.join(TEST_DIR, 'unit_tests', 'expected_deaths_time_series.json') +EXPECTED_DEATHS_JSON = os.path.join(TEST_DIR, "unit_tests", "expected_deaths.json") +EXPECTED_DEATHS_TIME_SERIES_JSON = os.path.join(TEST_DIR, "unit_tests", "expected_deaths_time_series.json") -AGE_ADJUST_JSON = os.path.join(TEST_DIR, 'unit_tests', 'age_adjusted.json') -AGE_ADJUST_TIME_SERIES_JSON = os.path.join(TEST_DIR, 'unit_tests', 'age_adjusted_time_series.json') +AGE_ADJUST_JSON = os.path.join(TEST_DIR, "unit_tests", "age_adjusted.json") +AGE_ADJUST_TIME_SERIES_JSON = os.path.join(TEST_DIR, "unit_tests", "age_adjusted_time_series.json") -GOLDEN_INTEGRATION_DATA_STATE = os.path.join( - TEST_DIR, 'cdc_restricted-by_race_state_processed-with_age_adjust.json') +GOLDEN_INTEGRATION_DATA_STATE = os.path.join(TEST_DIR, "cdc_restricted-by_race_state_processed-with_age_adjust.json") GOLDEN_INTEGRATION_DATA_NATIONAL = os.path.join( - TEST_DIR, 'cdc_restricted-by_race_national_processed-with_age_adjust.json') + TEST_DIR, "cdc_restricted-by_race_national_processed-with_age_adjust.json" +) GOLDEN_INTEGRATION_DATA_STATE_TIME_SERIES = os.path.join( - TEST_DIR, 'cdc_restricted-by_race_state_processed-with_age_adjust_time_series.json') + TEST_DIR, "cdc_restricted-by_race_state_processed-with_age_adjust_time_series.json" +) GOLDEN_INTEGRATION_DATA_NATIONAL_TIME_SERIES = os.path.join( - TEST_DIR, 'cdc_restricted-by_race_national_processed-with_age_adjust_time_series.json') + TEST_DIR, "cdc_restricted-by_race_national_processed-with_age_adjust_time_series.json" +) def get_census_pop_estimates_as_df(): - return pd.read_csv(os.path.join(TEST_DIR, 'census_pop_estimates.csv'), dtype={'state_fips': str}) + return pd.read_csv(os.path.join(TEST_DIR, "census_pop_estimates.csv"), dtype={"state_fips": str}) def get_mock_df_from_bq_as_df(*args, **kwargs): - if args[0] == 'census_pop_estimates': - return pd.read_csv(os.path.join(TEST_DIR, 'census_pop_estimates.csv'), dtype={'state_fips': str}) - elif args[1] == 'by_race_state_processed': + if args[0] == "census_pop_estimates": + return pd.read_csv(os.path.join(TEST_DIR, "census_pop_estimates.csv"), dtype={"state_fips": str}) + elif args[1] == "by_race_state_processed": return pd.read_json( - os.path.join(TEST_DIR, 'cdc_restricted_race_state_processed.json'), dtype={'state_fips': str}) - elif args[1] == 'by_race_national_processed': + os.path.join(TEST_DIR, "cdc_restricted_race_state_processed.json"), dtype={"state_fips": str} + ) + elif args[1] == "by_race_national_processed": return pd.read_json( - os.path.join(TEST_DIR, 'cdc_restricted_race_national_processed.json'), dtype={'state_fips': str}) - elif args[1] == 'by_race_age_state': - return pd.read_json(os.path.join(TEST_DIR, 'cdc_restricted-race_age_state.json'), dtype={'state_fips': str}) - elif args[1] == 
'by_race_state_processed_time_series': + os.path.join(TEST_DIR, "cdc_restricted_race_national_processed.json"), dtype={"state_fips": str} + ) + elif args[1] == "by_race_age_state": + return pd.read_json(os.path.join(TEST_DIR, "cdc_restricted-race_age_state.json"), dtype={"state_fips": str}) + elif args[1] == "by_race_state_processed_time_series": return pd.read_json( - os.path.join(TEST_DIR, - 'cdc_restricted_race_state_processed_time_series.json'), - dtype={'state_fips': str}) - elif args[1] == 'by_race_national_processed_time_series': + os.path.join(TEST_DIR, "cdc_restricted_race_state_processed_time_series.json"), dtype={"state_fips": str} + ) + elif args[1] == "by_race_national_processed_time_series": return pd.read_json( - os.path.join(TEST_DIR, - 'cdc_restricted_race_national_processed_time_series.json'), - dtype={'state_fips': str}) - raise ValueError('No dataset for these args') + os.path.join(TEST_DIR, "cdc_restricted_race_national_processed_time_series.json"), dtype={"state_fips": str} + ) + raise ValueError("No dataset for these args") # "Unit" tests def testExpectedDeathsAndHospitalizations(): - covid_data = pd.read_json(COVID_DATA_SIMPLE, dtype={'state_fips': str}) + covid_data = pd.read_json(COVID_DATA_SIMPLE, dtype={"state_fips": str}) pop_data = get_census_pop_estimates_as_df() - df = age_adjust.get_expected_col(covid_data, pop_data, 'expected_deaths', 'death_y') - df = age_adjust.get_expected_col(df, pop_data, 'expected_hosps', 'hosp_y') - expected_df = pd.read_json(EXPECTED_DEATHS_JSON, dtype={'state_fips': str}) + df = age_adjust.get_expected_col(covid_data, pop_data, "expected_deaths", "death_y") + df = age_adjust.get_expected_col(df, pop_data, "expected_hosps", "hosp_y") + expected_df = pd.read_json(EXPECTED_DEATHS_JSON, dtype={"state_fips": str}) sortby_cols = list(df.columns) - assert_frame_equal(df.sort_values(sortby_cols).reset_index(drop=True), - expected_df.sort_values(sortby_cols).reset_index(drop=True), - check_like=True) + assert_frame_equal( + df.sort_values(sortby_cols).reset_index(drop=True), + expected_df.sort_values(sortby_cols).reset_index(drop=True), + check_like=True, + ) def testExpectedDeathsAndHospitalizationsTimeSeries(): - covid_data = pd.read_json(COVID_DATA_SIMPLE_TIME_SERIES, dtype={'state_fips': str}) + covid_data = pd.read_json(COVID_DATA_SIMPLE_TIME_SERIES, dtype={"state_fips": str}) pop_data = get_census_pop_estimates_as_df() - df = age_adjust.get_expected_col(covid_data, pop_data, 'expected_deaths', 'death_y') - df = age_adjust.get_expected_col(df, pop_data, 'expected_hosps', 'hosp_y') - expected_df = pd.read_json(EXPECTED_DEATHS_TIME_SERIES_JSON, dtype={'state_fips': str}) + df = age_adjust.get_expected_col(covid_data, pop_data, "expected_deaths", "death_y") + df = age_adjust.get_expected_col(df, pop_data, "expected_hosps", "hosp_y") + expected_df = pd.read_json(EXPECTED_DEATHS_TIME_SERIES_JSON, dtype={"state_fips": str}) sortby_cols = list(df.columns) - assert_frame_equal(df.sort_values(sortby_cols).reset_index(drop=True), - expected_df.sort_values(sortby_cols).reset_index(drop=True), - check_like=True) + assert_frame_equal( + df.sort_values(sortby_cols).reset_index(drop=True), + expected_df.sort_values(sortby_cols).reset_index(drop=True), + check_like=True, + ) def testAgeAdjust(): - expected_deaths_df = pd.read_json( - EXPECTED_DEATHS_JSON, dtype={'state_fips': str}) + expected_deaths_df = pd.read_json(EXPECTED_DEATHS_JSON, dtype={"state_fips": str}) df = age_adjust.age_adjust_from_expected(expected_deaths_df, False) - expected_df = 
pd.read_json(AGE_ADJUST_JSON, dtype={'state_fips': str}) + expected_df = pd.read_json(AGE_ADJUST_JSON, dtype={"state_fips": str}) assert_frame_equal(df, expected_df, check_like=True) def testAgeAdjustTimeSeries(): - expected_deaths_df = pd.read_json( - EXPECTED_DEATHS_TIME_SERIES_JSON, dtype={'state_fips': str}) + expected_deaths_df = pd.read_json(EXPECTED_DEATHS_TIME_SERIES_JSON, dtype={"state_fips": str}) df = age_adjust.age_adjust_from_expected(expected_deaths_df, True) - expected_df = pd.read_json(AGE_ADJUST_TIME_SERIES_JSON, dtype={'state_fips': str}) + expected_df = pd.read_json(AGE_ADJUST_TIME_SERIES_JSON, dtype={"state_fips": str}) sortby_cols = list(df.columns) - assert_frame_equal(df.sort_values(by=sortby_cols).reset_index(drop=True), - expected_df.sort_values(by=sortby_cols).reset_index(drop=True), - check_like=True) + assert_frame_equal( + df.sort_values(by=sortby_cols).reset_index(drop=True), + expected_df.sort_values(by=sortby_cols).reset_index(drop=True), + check_like=True, + ) # Integration tests -@mock.patch('ingestion.gcs_to_bq_util.load_df_from_bigquery', - side_effect=get_mock_df_from_bq_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', - return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_df_from_bigquery", side_effect=get_mock_df_from_bq_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqState(mock_bq: mock.MagicMock, mock_df: mock.MagicMock): adjust = AgeAdjustCDCRestricted() - kwargs = {'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table'} + kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} - adjust.write_to_bq('dataset', 'gcs_bucket', **kwargs) + adjust.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_bq.call_count == 4 - expected_df = pd.read_json(GOLDEN_INTEGRATION_DATA_STATE, dtype={ - 'state_fips': str, - 'death_ratio_age_adjusted': float, - }) + expected_df = pd.read_json( + GOLDEN_INTEGRATION_DATA_STATE, + dtype={ + "state_fips": str, + "death_ratio_age_adjusted": float, + }, + ) - assert_frame_equal( - mock_bq.call_args_list[0].args[0], expected_df, check_like=True) + assert_frame_equal(mock_bq.call_args_list[0].args[0], expected_df, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.load_df_from_bigquery', - side_effect=get_mock_df_from_bq_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', - return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_df_from_bigquery", side_effect=get_mock_df_from_bq_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqStateTimeSeries(mock_bq: mock.MagicMock, mock_df: mock.MagicMock): adjust = AgeAdjustCDCRestricted() - kwargs = {'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table'} + kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} - adjust.write_to_bq('dataset', 'gcs_bucket', **kwargs) + adjust.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_bq.call_count == 4 - expected_df = pd.read_json(GOLDEN_INTEGRATION_DATA_NATIONAL, dtype={ - 'state_fips': str, - 'death_ratio_age_adjusted': float, - }) + expected_df = pd.read_json( + GOLDEN_INTEGRATION_DATA_NATIONAL, + dtype={ + "state_fips": str, + "death_ratio_age_adjusted": float, + }, + ) - assert_frame_equal( - mock_bq.call_args_list[1].args[0], expected_df, check_like=True) + 
assert_frame_equal(mock_bq.call_args_list[1].args[0], expected_df, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.load_df_from_bigquery', - side_effect=get_mock_df_from_bq_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', - return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_df_from_bigquery", side_effect=get_mock_df_from_bq_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqNational(mock_bq: mock.MagicMock, mock_df: mock.MagicMock): adjust = AgeAdjustCDCRestricted() - kwargs = {'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table'} + kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} - adjust.write_to_bq('dataset', 'gcs_bucket', **kwargs) + adjust.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_bq.call_count == 4 - expected_df = pd.read_json(GOLDEN_INTEGRATION_DATA_STATE_TIME_SERIES, dtype={ - 'state_fips': str, - }) + expected_df = pd.read_json( + GOLDEN_INTEGRATION_DATA_STATE_TIME_SERIES, + dtype={ + "state_fips": str, + }, + ) sortby_cols = list(expected_df.columns) assert_frame_equal( mock_bq.call_args_list[2].args[0].sort_values(sortby_cols).reset_index(drop=True), expected_df.sort_values(sortby_cols).reset_index(drop=True), - check_like=True) + check_like=True, + ) -@mock.patch('ingestion.gcs_to_bq_util.load_df_from_bigquery', - side_effect=get_mock_df_from_bq_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', - return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_df_from_bigquery", side_effect=get_mock_df_from_bq_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqNationalTimeSeries(mock_bq: mock.MagicMock, mock_df: mock.MagicMock): adjust = AgeAdjustCDCRestricted() - kwargs = {'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table'} + kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} - adjust.write_to_bq('dataset', 'gcs_bucket', **kwargs) + adjust.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_bq.call_count == 4 - expected_df = pd.read_json(GOLDEN_INTEGRATION_DATA_NATIONAL_TIME_SERIES, dtype={ - 'state_fips': str, - }) + expected_df = pd.read_json( + GOLDEN_INTEGRATION_DATA_NATIONAL_TIME_SERIES, + dtype={ + "state_fips": str, + }, + ) sortby_cols = list(expected_df.columns) assert_frame_equal( mock_bq.call_args_list[3].args[0].sort_values(sortby_cols).reset_index(drop=True), expected_df.sort_values(sortby_cols).reset_index(drop=True), - check_like=True) + check_like=True, + ) diff --git a/python/tests/datasources/test_age_adjustment_cdc_hiv.py b/python/tests/datasources/test_age_adjustment_cdc_hiv.py index fa52a6240a..22ea99bc5e 100644 --- a/python/tests/datasources/test_age_adjustment_cdc_hiv.py +++ b/python/tests/datasources/test_age_adjustment_cdc_hiv.py @@ -6,57 +6,57 @@ # Current working directory. 
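# A compact sketch of the side_effect fixture-routing idiom used by
# get_mock_df_from_bq_as_df above (and _load_df_from_bigquery below): instead
# of a single canned return_value, the mock delegates to a function that
# inspects its arguments and returns a different in-memory fixture per
# requested table. Table names and columns here are invented for the example.
import pandas as pd
from unittest import mock

def _fake_load_df_from_bigquery(dataset, table_name, **kwargs):
    fixtures = {
        "census_pop_estimates": pd.DataFrame({"state_fips": ["01"], "population": [100]}),
        "by_race_state_processed": pd.DataFrame({"state_fips": ["02"], "covid_deaths": [5]}),
    }
    if table_name not in fixtures:
        raise ValueError("No dataset for these args")
    return fixtures[table_name]

loader = mock.MagicMock(side_effect=_fake_load_df_from_bigquery)
assert loader("dataset", "census_pop_estimates")["population"].iloc[0] == 100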
THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data', 'cdc_hiv_age_adjustment') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "cdc_hiv_age_adjustment") GOLDEN_INTEGRATION_DATA_NATIONAL = os.path.join( - TEST_DIR, 'golden_data', 'race_and_ethnicity_national_current-with_age_adjust.csv' + TEST_DIR, "golden_data", "race_and_ethnicity_national_current-with_age_adjust.csv" ) GOLDEN_INTEGRATION_DATA_STATE = os.path.join( - TEST_DIR, 'golden_data', 'race_and_ethnicity_state_current-with_age_adjust.csv' + TEST_DIR, "golden_data", "race_and_ethnicity_state_current-with_age_adjust.csv" ) def _load_df_from_bigquery(*args, **kwargs): dataset, table_name = args - print("mocking read of HET COVID tables (pre age-adjusted):", f'{dataset}-{table_name}') + print("mocking read of HET COVID tables (pre age-adjusted):", f"{dataset}-{table_name}") dtype = kwargs["dtype"] - race_age_df = pd.read_csv(os.path.join(TEST_DIR, f'{table_name}.csv'), dtype=dtype) + race_age_df = pd.read_csv(os.path.join(TEST_DIR, f"{table_name}.csv"), dtype=dtype) return race_age_df # Integration tests -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_df_from_bigquery', side_effect=_load_df_from_bigquery) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_df_from_bigquery", side_effect=_load_df_from_bigquery) def testWriteToBq( mock_race_age: mock.MagicMock, mock_bq: mock.MagicMock, ): adjust = AgeAdjustCDCHiv() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - adjust.write_to_bq('dataset', 'gcs_bucket', **kwargs) + adjust.write_to_bq("dataset", "gcs_bucket", **kwargs) # (RACE/AGE + RACE) X (STATE + NATIONAL) assert mock_race_age.call_count == 4 called_bq_tables = [call[0][1] for call in mock_race_age.call_args_list] assert called_bq_tables == [ - 'by_race_age_national', - 'race_and_ethnicity_national_current', - 'by_race_age_state', - 'race_and_ethnicity_state_current', + "by_race_age_national", + "race_and_ethnicity_national_current", + "by_race_age_state", + "race_and_ethnicity_state_current", ] # NATIONAL + STATE assert mock_bq.call_count == 2 dtype = { - 'state_fips': str, - 'death_ratio_age_adjusted': float, + "state_fips": str, + "death_ratio_age_adjusted": float, } national_df, _national_dataset, national_table_name = mock_bq.call_args_list[0][0] diff --git a/python/tests/datasources/test_bjs_incarceration.py b/python/tests/datasources/test_bjs_incarceration.py index 092a12ed15..889c3d58ed 100644 --- a/python/tests/datasources/test_bjs_incarceration.py +++ b/python/tests/datasources/test_bjs_incarceration.py @@ -25,9 +25,9 @@ def _get_test_table_files(*args): for file in table_crops.keys(): if file in table_crops: source_df = pd.read_csv( - os.path.join(TEST_DIR, f'bjs_test_input_{file}'), + os.path.join(TEST_DIR, f"bjs_test_input_{file}"), encoding="ISO-8859-1", - thousands=',', + thousands=",", engine="python", ) @@ -140,17 +140,17 @@ def _get_jail_7(): TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "bjs_incarceration") GOLDEN_DATA = { - 'race_national': os.path.join(TEST_DIR, 'bjs_test_output_race_and_ethnicity_national.json'), - 'age_national': os.path.join(TEST_DIR, 'bjs_test_output_age_national.json'), - 'sex_national': os.path.join(TEST_DIR, 
'bjs_test_output_sex_national.json'), - 'race_state': os.path.join(TEST_DIR, 'bjs_test_output_race_and_ethnicity_state.json'), - 'age_state': os.path.join(TEST_DIR, 'bjs_test_output_age_state.json'), - 'sex_state': os.path.join(TEST_DIR, 'bjs_test_output_sex_state.json'), + "race_national": os.path.join(TEST_DIR, "bjs_test_output_race_and_ethnicity_national.json"), + "age_national": os.path.join(TEST_DIR, "bjs_test_output_age_national.json"), + "sex_national": os.path.join(TEST_DIR, "bjs_test_output_sex_national.json"), + "race_state": os.path.join(TEST_DIR, "bjs_test_output_race_and_ethnicity_state.json"), + "age_state": os.path.join(TEST_DIR, "bjs_test_output_age_state.json"), + "sex_state": os.path.join(TEST_DIR, "bjs_test_output_sex_state.json"), } expected_dtype = { - 'state_name': str, - 'state_fips': str, + "state_name": str, + "state_fips": str, "prison_per_100k": float, "prison_pct_share": float, "jail_per_100k": float, @@ -161,24 +161,24 @@ def _get_jail_7(): } expected_dtype_age = { **expected_dtype, - 'age': str, - 'incarceration_population_estimated_total': float, + "age": str, + "incarceration_population_estimated_total": float, } expected_dtype_race = { **expected_dtype, - 'race_and_ethnicity': str, - 'race_category_id': str, - 'incarceration_population_estimated_total': float, - 'prison_estimated_total': float, - 'jail_estimated_total': float, + "race_and_ethnicity": str, + "race_category_id": str, + "incarceration_population_estimated_total": float, + "prison_estimated_total": float, + "jail_estimated_total": float, } expected_dtype_sex = { **expected_dtype, - 'sex': str, - 'incarceration_population_estimated_total': float, - 'prison_estimated_total': float, - 'jail_estimated_total': float, + "sex": str, + "incarceration_population_estimated_total": float, + "prison_estimated_total": float, + "jail_estimated_total": float, } # --- INTEGRATION TESTS NATIONAL LEVEL @@ -192,10 +192,10 @@ def testGenerateBreakdownAgeNational(): datasource = BJSIncarcerationData() df = datasource.generate_breakdown_df("age", "national", [df_prison_10, df_jail_6], [df_prison_13, df_jail_6]) - expected_df_age_national = pd.read_json(GOLDEN_DATA['age_national'], dtype=expected_dtype_age) + expected_df_age_national = pd.read_json(GOLDEN_DATA["age_national"], dtype=expected_dtype_age) - df = df.sort_values(by=['state_name', 'age']).reset_index(drop=True) - expected_df_age_national = expected_df_age_national.sort_values(by=['state_name', 'age']).reset_index(drop=True) + df = df.sort_values(by=["state_name", "age"]).reset_index(drop=True) + expected_df_age_national = expected_df_age_national.sort_values(by=["state_name", "age"]).reset_index(drop=True) assert_frame_equal(df, expected_df_age_national, check_like=True) @@ -216,10 +216,10 @@ def testGenerateBreakdownRaceNational(): [prison_13, jail_6], ) - expected_df_race_national = pd.read_json(GOLDEN_DATA['race_national'], dtype=expected_dtype_race) - df = df.sort_values(by=['state_name', 'race_and_ethnicity']).reset_index(drop=True) + expected_df_race_national = pd.read_json(GOLDEN_DATA["race_national"], dtype=expected_dtype_race) + df = df.sort_values(by=["state_name", "race_and_ethnicity"]).reset_index(drop=True) expected_df_race_national = expected_df_race_national.sort_values( - by=['state_name', 'race_and_ethnicity'] + by=["state_name", "race_and_ethnicity"] ).reset_index(drop=True) assert_frame_equal(df, expected_df_race_national, check_like=True) @@ -237,10 +237,10 @@ def testGenerateBreakdownSexNational(): datasource = BJSIncarcerationData() 
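# The BJS breakdown tests above and below repeat a sort-then-compare idiom;
# here it is pulled out as a hypothetical helper (no such helper exists in
# this repo). Sorting on the key columns and resetting the index lets
# assert_frame_equal(check_like=True) ignore both row order and column order,
# so only the values are compared.
import pandas as pd
from pandas.testing import assert_frame_equal

def assert_frames_match_unordered(df, expected_df, sortby_cols):
    assert_frame_equal(
        df.sort_values(by=sortby_cols).reset_index(drop=True),
        expected_df.sort_values(by=sortby_cols).reset_index(drop=True),
        check_like=True,
    )

left = pd.DataFrame({"state_name": ["Alabama", "Alaska"], "sex": ["male", "female"], "jail_per_100k": [1.0, 2.0]})
right = pd.DataFrame({"jail_per_100k": [2.0, 1.0], "state_name": ["Alaska", "Alabama"], "sex": ["female", "male"]})
assert_frames_match_unordered(left, right, ["state_name", "sex"])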
df = datasource.generate_breakdown_df("sex", "national", [prison_2, prison_23, jail_6], [prison_13, jail_6]) - expected_df_sex_national = pd.read_json(GOLDEN_DATA['sex_national'], dtype=expected_dtype_sex) + expected_df_sex_national = pd.read_json(GOLDEN_DATA["sex_national"], dtype=expected_dtype_sex) - df = df.sort_values(by=['state_name', 'sex']).reset_index(drop=True) - expected_df_sex_national = expected_df_sex_national.sort_values(by=['state_name', 'sex']).reset_index(drop=True) + df = df.sort_values(by=["state_name", "sex"]).reset_index(drop=True) + expected_df_sex_national = expected_df_sex_national.sort_values(by=["state_name", "sex"]).reset_index(drop=True) assert_frame_equal(df, expected_df_sex_national, check_like=True) @@ -260,10 +260,10 @@ def testGenerateBreakdownSexState(): datasource = BJSIncarcerationData() df = datasource.generate_breakdown_df("sex", "state", [prison_2, prison_23, jail_6], [prison_13, jail_6]) - expected_df_sex_state = pd.read_json(GOLDEN_DATA['sex_state'], dtype=expected_dtype_sex) + expected_df_sex_state = pd.read_json(GOLDEN_DATA["sex_state"], dtype=expected_dtype_sex) - df = df.sort_values(by=['state_name', 'sex']).reset_index(drop=True) - expected_df_sex_state = expected_df_sex_state.sort_values(by=['state_name', 'sex']).reset_index(drop=True) + df = df.sort_values(by=["state_name", "sex"]).reset_index(drop=True) + expected_df_sex_state = expected_df_sex_state.sort_values(by=["state_name", "sex"]).reset_index(drop=True) assert_frame_equal(df, expected_df_sex_state, check_like=True) @@ -278,10 +278,10 @@ def testGenerateBreakdownAgeState(): datasource = BJSIncarcerationData() df = datasource.generate_breakdown_df("age", "state", [prison_2, prison_23, jail_6], [prison_13, jail_6]) - expected_df_age_state = pd.read_json(GOLDEN_DATA['age_state'], dtype=expected_dtype_age) + expected_df_age_state = pd.read_json(GOLDEN_DATA["age_state"], dtype=expected_dtype_age) - df = df.sort_values(by=['state_name', 'age']).reset_index(drop=True) - expected_df_age_state = expected_df_age_state.sort_values(by=['state_name', 'age']).reset_index(drop=True) + df = df.sort_values(by=["state_name", "age"]).reset_index(drop=True) + expected_df_age_state = expected_df_age_state.sort_values(by=["state_name", "age"]).reset_index(drop=True) assert_frame_equal(df, expected_df_age_state, check_like=True) @@ -302,10 +302,10 @@ def testGenerateBreakdownRaceState(): [prison_13, jail_6], ) - expected_df_race_state = pd.read_json(GOLDEN_DATA['race_state'], dtype=expected_dtype_race) + expected_df_race_state = pd.read_json(GOLDEN_DATA["race_state"], dtype=expected_dtype_race) - df = df.sort_values(by=['state_name', 'race_and_ethnicity']).reset_index(drop=True) - expected_df_race_state = expected_df_race_state.sort_values(by=['state_name', 'race_and_ethnicity']).reset_index( + df = df.sort_values(by=["state_name", "race_and_ethnicity"]).reset_index(drop=True) + expected_df_race_state = expected_df_race_state.sort_values(by=["state_name", "race_and_ethnicity"]).reset_index( drop=True ) @@ -314,8 +314,8 @@ def testGenerateBreakdownRaceState(): # INTEGRATION TEST - CORRECT NETWORK CALLS # comment out all mocks expect BQ to see real results (not just test sample results) -@mock.patch('datasources.bjs_incarceration.load_tables', side_effect=_get_test_table_files) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("datasources.bjs_incarceration.load_tables", side_effect=_get_test_table_files) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", 
return_value=None) def testWriteToBqNetworkCalls( mock_bq: mock.MagicMock, mock_zip: mock.MagicMock, @@ -324,12 +324,12 @@ def testWriteToBqNetworkCalls( # required by bigQuery kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - datasource.write_to_bq('dataset', 'gcs_bucket', **kwargs) + datasource.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_bq.call_count == 6 assert mock_zip.call_count == 2 diff --git a/python/tests/datasources/test_cawp_time.py b/python/tests/datasources/test_cawp_time.py index 9339460bb5..faa362dc70 100644 --- a/python/tests/datasources/test_cawp_time.py +++ b/python/tests/datasources/test_cawp_time.py @@ -52,7 +52,7 @@ def _fetch_json_from_web(*args): file_name = "test_legislators-historical.json" elif url == US_CONGRESS_CURRENT_URL: file_name = "test_legislators-current.json" - print(f'reading mock US CONGRESS: {file_name}') + print(f"reading mock US CONGRESS: {file_name}") with open(os.path.join(TEST_DIR, file_name)) as file: return json.load(file) @@ -80,7 +80,7 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs): "race_ethnicity": str, } return pd.read_csv( - os.path.join(TEST_DIR, f'test_input_{filename}'), + os.path.join(TEST_DIR, f"test_input_{filename}"), dtype=test_input_data_types, index_col=False, ) @@ -111,26 +111,26 @@ def _load_csv_as_df_from_web(*args, **kwargs): fips = "XX" return pd.read_csv( - os.path.join(TEST_DIR, "mock_cawp_state_leg_tables", f'cawp_state_leg_{fips}.csv'), + os.path.join(TEST_DIR, "mock_cawp_state_leg_tables", f"cawp_state_leg_{fips}.csv"), dtype=dtype, ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.fetch_json_from_web', side_effect=_fetch_json_from_web) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.fetch_json_from_web", side_effect=_fetch_json_from_web) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_web', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_web", side_effect=_load_csv_as_df_from_web, ) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) @mock.patch( - 'datasources.cawp_time.get_consecutive_time_periods', + "datasources.cawp_time.get_consecutive_time_periods", side_effect=_get_consecutive_time_periods, ) -@mock.patch('datasources.cawp_time.get_state_level_fips', return_value=FIPS_TO_TEST) +@mock.patch("datasources.cawp_time.get_state_level_fips", return_value=FIPS_TO_TEST) def testWriteToBq( mock_test_fips: mock.MagicMock, # only use a restricted set of FIPS codes in test mock_test_time_periods: mock.MagicMock, # only use a restricted number of years in test @@ -145,12 +145,12 @@ def testWriteToBq( print("testWriteToBq()") kwargs_for_bq = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } cawp_data = CAWPTimeData() - cawp_data.write_to_bq('dataset', 'gcs_bucket', **kwargs_for_bq) + cawp_data.write_to_bq("dataset", "gcs_bucket", **kwargs_for_bq) # (CONGRESS + STATE LEG) * (BY RACES + BY ALL) assert mock_test_fips.call_count == 4 diff --git a/python/tests/datasources/test_cdc_hiv.py 
b/python/tests/datasources/test_cdc_hiv.py index 05ab0682cb..e82eacdbd9 100644 --- a/python/tests/datasources/test_cdc_hiv.py +++ b/python/tests/datasources/test_cdc_hiv.py @@ -79,11 +79,11 @@ def test_write_to_bq_race_national( assert race_age_table_name == "by_race_age_national" expected_race_age_national_df = pd.read_csv(GOLDEN_DATA["race_age_national"], dtype=EXP_DTYPE) - race_age_national_df = race_age_national_df.sort_values(by=['time_period', 'race_and_ethnicity']).reset_index( + race_age_national_df = race_age_national_df.sort_values(by=["time_period", "race_and_ethnicity"]).reset_index( drop=True ) expected_race_age_national_df = expected_race_age_national_df.sort_values( - by=['time_period', 'race_and_ethnicity'] + by=["time_period", "race_and_ethnicity"] ).reset_index(drop=True) assert_frame_equal(race_age_national_df, expected_race_age_national_df, check_like=True) @@ -97,9 +97,9 @@ def test_write_to_bq_race_national( assert race_current_table_name == "race_and_ethnicity_national_current" expected_race_national_current_df = pd.read_csv(GOLDEN_DATA["race_national_current"], dtype=EXP_DTYPE) - race_national_current_df = race_national_current_df.sort_values(by=['race_and_ethnicity']).reset_index(drop=True) + race_national_current_df = race_national_current_df.sort_values(by=["race_and_ethnicity"]).reset_index(drop=True) expected_race_national_current_df = expected_race_national_current_df.sort_values( - by=['race_and_ethnicity'] + by=["race_and_ethnicity"] ).reset_index(drop=True) assert_frame_equal( @@ -119,8 +119,8 @@ def test_write_to_bq_race_national( expected_race_national_historical_df = pd.read_csv(GOLDEN_DATA["race_national_historical"], dtype=EXP_DTYPE) assert_frame_equal( - race_national_historical_df.sort_values(by=['time_period', 'race_and_ethnicity']).reset_index(drop=True), - expected_race_national_historical_df.sort_values(by=['time_period', 'race_and_ethnicity']).reset_index( + race_national_historical_df.sort_values(by=["time_period", "race_and_ethnicity"]).reset_index(drop=True), + expected_race_national_historical_df.sort_values(by=["time_period", "race_and_ethnicity"]).reset_index( drop=True ), check_like=True, @@ -155,8 +155,8 @@ def test_write_to_bq_age_national( assert table_name == "age_national_current" expected_age_national_current_df = pd.read_csv(GOLDEN_DATA["age_national_current"], dtype=EXP_DTYPE) - age_national_current_df = age_national_current_df.sort_values(by=['age']).reset_index(drop=True) - expected_age_national_current_df = expected_age_national_current_df.sort_values(by=['age']).reset_index(drop=True) + age_national_current_df = age_national_current_df.sort_values(by=["age"]).reset_index(drop=True) + expected_age_national_current_df = expected_age_national_current_df.sort_values(by=["age"]).reset_index(drop=True) assert_frame_equal( age_national_current_df, @@ -172,11 +172,11 @@ def test_write_to_bq_age_national( assert table_name == "age_national_historical" expected_age_national_historical_df = pd.read_csv(GOLDEN_DATA["age_national_historical"], dtype=EXP_DTYPE) - age_national_historical_df = age_national_historical_df.sort_values(by=['time_period', 'age']).reset_index( + age_national_historical_df = age_national_historical_df.sort_values(by=["time_period", "age"]).reset_index( drop=True ) expected_age_national_historical_df = expected_age_national_historical_df.sort_values( - by=['time_period', 'age'] + by=["time_period", "age"] ).reset_index(drop=True) assert_frame_equal( @@ -207,8 +207,8 @@ def test_write_to_bq_sex_state( assert 
table_name == "sex_state_current" expected_sex_state_current_df = pd.read_csv(GOLDEN_DATA["sex_state_current"], dtype=EXP_DTYPE) - sex_state_current_df = sex_state_current_df.sort_values(by=['state_name', 'sex']).reset_index(drop=True) - expected_sex_state_current_df = expected_sex_state_current_df.sort_values(by=['state_name', 'sex']).reset_index( + sex_state_current_df = sex_state_current_df.sort_values(by=["state_name", "sex"]).reset_index(drop=True) + expected_sex_state_current_df = expected_sex_state_current_df.sort_values(by=["state_name", "sex"]).reset_index( drop=True ) @@ -226,11 +226,11 @@ def test_write_to_bq_sex_state( assert table_name == "sex_state_historical" expected_sex_state_historical_df = pd.read_csv(GOLDEN_DATA["sex_state_historical"], dtype=EXP_DTYPE) - sex_state_historical_df = sex_state_historical_df.sort_values(by=['time_period', 'sex', 'state_name']).reset_index( + sex_state_historical_df = sex_state_historical_df.sort_values(by=["time_period", "sex", "state_name"]).reset_index( drop=True ) expected_sex_state_historical_df = expected_sex_state_historical_df.sort_values( - by=['time_period', 'sex', 'state_name'] + by=["time_period", "sex", "state_name"] ).reset_index(drop=True) assert_frame_equal( @@ -275,8 +275,8 @@ def test_write_to_bq_sex_county( assert table_name == "sex_county_historical" expected_sex_county_historical_df = pd.read_csv(GOLDEN_DATA["sex_county_historical"], dtype=EXP_DTYPE) assert_frame_equal( - sex_county_historical_df.sort_values(by=['time_period', 'sex']).reset_index(drop=True), - expected_sex_county_historical_df.sort_values(by=['time_period', 'sex']).reset_index(drop=True), + sex_county_historical_df.sort_values(by=["time_period", "sex"]).reset_index(drop=True), + expected_sex_county_historical_df.sort_values(by=["time_period", "sex"]).reset_index(drop=True), check_like=True, ) @@ -309,8 +309,8 @@ def test_write_to_bq_black_women_national( assert table_name == "black_women_by_age_national_current" expected_black_women_national_current_df = pd.read_csv(GOLDEN_DATA["black_women_national_current"], dtype=EXP_DTYPE) assert_frame_equal( - black_women_national_current_df.sort_values(by=['age']).reset_index(drop=True), - expected_black_women_national_current_df.sort_values(by=['age']).reset_index(drop=True), + black_women_national_current_df.sort_values(by=["age"]).reset_index(drop=True), + expected_black_women_national_current_df.sort_values(by=["age"]).reset_index(drop=True), check_like=True, ) @@ -324,7 +324,7 @@ def test_write_to_bq_black_women_national( GOLDEN_DATA["black_women_national_historical"], dtype=EXP_DTYPE ) assert_frame_equal( - black_women_national_historical_df.sort_values(by=['time_period', 'age']).reset_index(drop=True), - expected_black_women_national_historical_df.sort_values(by=['time_period', 'age']).reset_index(drop=True), + black_women_national_historical_df.sort_values(by=["time_period", "age"]).reset_index(drop=True), + expected_black_women_national_historical_df.sort_values(by=["time_period", "age"]).reset_index(drop=True), check_like=True, ) diff --git a/python/tests/datasources/test_cdc_restricted.py b/python/tests/datasources/test_cdc_restricted.py index 0355ae6ac6..d289fbc030 100644 --- a/python/tests/datasources/test_cdc_restricted.py +++ b/python/tests/datasources/test_cdc_restricted.py @@ -8,51 +8,51 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "cdc_restricted") -GOLDEN_DATA_BY_SEX_STATE_TIME_SERIES = os.path.join(TEST_DIR, 'golden_data', 
'by_sex_state_time_series.json')
-GOLDEN_DATA_BY_SEX_COUNTY_TIME_SERIES = os.path.join(TEST_DIR, 'golden_data', 'by_sex_county_time_series.json')
-GOLDEN_DATA_BY_SEX_NATIONAL_TIME_SERIES = os.path.join(TEST_DIR, 'golden_data', 'by_sex_national_time_series.json')
-GOLDEN_DATA_BY_SEX_STATE_CUMULATIVE = os.path.join(TEST_DIR, 'golden_data', 'by_sex_state_cumulative.json')
-GOLDEN_DATA_BY_SEX_COUNTY_CUMULATIVE = os.path.join(TEST_DIR, 'golden_data', 'by_sex_county_cumulative.json')
-GOLDEN_DATA_BY_SEX_NATIONAL_CUMULATIVE = os.path.join(TEST_DIR, 'golden_data', 'by_sex_national_cumulative.json')
+GOLDEN_DATA_BY_SEX_STATE_TIME_SERIES = os.path.join(TEST_DIR, "golden_data", "by_sex_state_time_series.json")
+GOLDEN_DATA_BY_SEX_COUNTY_TIME_SERIES = os.path.join(TEST_DIR, "golden_data", "by_sex_county_time_series.json")
+GOLDEN_DATA_BY_SEX_NATIONAL_TIME_SERIES = os.path.join(TEST_DIR, "golden_data", "by_sex_national_time_series.json")
+GOLDEN_DATA_BY_SEX_STATE_CUMULATIVE = os.path.join(TEST_DIR, "golden_data", "by_sex_state_cumulative.json")
+GOLDEN_DATA_BY_SEX_COUNTY_CUMULATIVE = os.path.join(TEST_DIR, "golden_data", "by_sex_county_cumulative.json")
+GOLDEN_DATA_BY_SEX_NATIONAL_CUMULATIVE = os.path.join(TEST_DIR, "golden_data", "by_sex_national_cumulative.json")
 def get_cdc_numbers_as_df(*args, **kwargs):
 print("KWARGS: ", kwargs)
- if args[1] == 'cdc_restricted_by_race_and_age_state.csv':
+ if args[1] == "cdc_restricted_by_race_and_age_state.csv":
 # We don't test this, just need to return something here
 return pd.read_csv(
- os.path.join(TEST_DIR, 'cdc_restricted_by_sex_state.csv'),
+ os.path.join(TEST_DIR, "cdc_restricted_by_sex_state.csv"),
 dtype={
- 'state_fips': str,
+ "state_fips": str,
 },
 )
 return pd.read_csv(
 os.path.join(TEST_DIR, args[1]),
 dtype={
- 'state_fips': str,
- 'county_fips': str,
+ "state_fips": str,
+ "county_fips": str,
 },
 )
 def get_cdc_restricted_by_sex_state_as_df():
 return pd.read_csv(
- os.path.join(TEST_DIR, 'cdc_restricted_by_sex_state.csv'),
+ os.path.join(TEST_DIR, "cdc_restricted_by_sex_state.csv"),
 dtype={
- 'state_fips': str,
+ "state_fips": str,
 },
 )
 def get_cdc_restricted_by_sex_county_as_df():
 return pd.read_csv(
- os.path.join(TEST_DIR, 'cdc_restricted_by_sex_county.csv'),
+ os.path.join(TEST_DIR, "cdc_restricted_by_sex_county.csv"),
 dtype={
- 'state_fips': str,
- 'county_fips': str,
+ "state_fips": str,
+ "county_fips": str,
 },
 )
@@ -60,15 +60,15 @@ def get_cdc_restricted_by_sex_county_as_df(): def testGenerateBreakdownSexStateTimeSeries(): cdc_restricted = CDCRestrictedData()
- df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), 'sex', 'state', True)
+ df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), "sex", "state", True)
 # pylint: disable=no-member
 expected_df = pd.read_json(
 GOLDEN_DATA_BY_SEX_STATE_TIME_SERIES,
 dtype={
- 'state_fips': str,
- 'covid_cases_share': float,
- 'covid_hosp_share': float,
- 'covid_deaths_share': float,
+ "state_fips": str,
+ "covid_cases_share": float,
+ "covid_hosp_share": float,
+ "covid_deaths_share": float,
 },
 )
@@ -84,17 +84,17 @@ def testGenerateBreakdownSexStateTimeSeries(): def testGenerateBreakdownSexCountyTimeSeries(): cdc_restricted = CDCRestrictedData()
- df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_county_as_df(), 'sex', 'county', True)
+ df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_county_as_df(), "sex", "county", True)
 # pylint: disable=no-member
 expected_df = pd.read_json(
 GOLDEN_DATA_BY_SEX_COUNTY_TIME_SERIES,
 dtype={ 
- 'state_fips': str, - 'county_fips': str, - 'covid_cases_share': float, - 'covid_hosp_share': float, - 'covid_deaths_share': float, + "state_fips": str, + "county_fips": str, + "covid_cases_share": float, + "covid_hosp_share": float, + "covid_deaths_share": float, }, ) @@ -110,16 +110,16 @@ def testGenerateBreakdownSexCountyTimeSeries(): def testGenerateBreakdownSexNationalTimeSeries(): cdc_restricted = CDCRestrictedData() - df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), 'sex', 'national', True) + df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), "sex", "national", True) # pylint: disable=no-member expected_df = pd.read_json( GOLDEN_DATA_BY_SEX_NATIONAL_TIME_SERIES, dtype={ - 'state_fips': str, - 'covid_cases_share': float, - 'covid_hosp_share': float, - 'covid_deaths_share': float, + "state_fips": str, + "covid_cases_share": float, + "covid_hosp_share": float, + "covid_deaths_share": float, }, ) @@ -135,16 +135,16 @@ def testGenerateBreakdownSexNationalTimeSeries(): def testGenerateBreakdownSexStateCumulative(): cdc_restricted = CDCRestrictedData() - df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), 'sex', 'state', False) + df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), "sex", "state", False) # pylint: disable=no-member expected_df = pd.read_json( GOLDEN_DATA_BY_SEX_STATE_CUMULATIVE, dtype={ - 'state_fips': str, - 'covid_cases_share': float, - 'covid_hosp_share': float, - 'covid_deaths_share': float, + "state_fips": str, + "covid_cases_share": float, + "covid_hosp_share": float, + "covid_deaths_share": float, }, ) @@ -160,16 +160,16 @@ def testGenerateBreakdownSexStateCumulative(): def testGenerateBreakdownSexNationalCumulative(): cdc_restricted = CDCRestrictedData() - df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), 'sex', 'national', False) + df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_state_as_df(), "sex", "national", False) # pylint: disable=no-member expected_df = pd.read_json( GOLDEN_DATA_BY_SEX_NATIONAL_CUMULATIVE, dtype={ - 'state_fips': str, - 'covid_cases_share': float, - 'covid_hosp_share': float, - 'covid_deaths_share': float, + "state_fips": str, + "covid_cases_share": float, + "covid_hosp_share": float, + "covid_deaths_share": float, }, ) @@ -185,17 +185,17 @@ def testGenerateBreakdownSexNationalCumulative(): def testGenerateBreakdownSexCountyCumulative(): cdc_restricted = CDCRestrictedData() - df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_county_as_df(), 'sex', 'county', False) + df = cdc_restricted.generate_breakdown(get_cdc_restricted_by_sex_county_as_df(), "sex", "county", False) # pylint: disable=no-member expected_df = pd.read_json( GOLDEN_DATA_BY_SEX_COUNTY_CUMULATIVE, dtype={ - 'state_fips': str, - 'county_fips': str, - 'covid_cases_share': float, - 'covid_hosp_share': float, - 'covid_deaths_share': float, + "state_fips": str, + "county_fips": str, + "covid_cases_share": float, + "covid_hosp_share": float, + "covid_deaths_share": float, }, ) @@ -208,113 +208,113 @@ def testGenerateBreakdownSexCountyCumulative(): ) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df', side_effect=get_cdc_numbers_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df", side_effect=get_cdc_numbers_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAgeNational(mock_bq: 
mock.MagicMock, mock_csv: mock.MagicMock): cdc_restricted = CDCRestrictedData() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', - 'demographic': 'age', - 'geographic': 'national', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", + "demographic": "age", + "geographic": "national", } - cdc_restricted.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdc_restricted.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 - assert mock_csv.call_args_list[0].args[1] == 'cdc_restricted_by_age_state.csv' + assert mock_csv.call_args_list[0].args[1] == "cdc_restricted_by_age_state.csv" assert mock_bq.call_count == 2 - assert mock_bq.call_args_list[0].args[2] == 'by_age_national_processed' - assert mock_bq.call_args_list[1].args[2] == 'by_age_national_processed_time_series' + assert mock_bq.call_args_list[0].args[2] == "by_age_national_processed" + assert mock_bq.call_args_list[1].args[2] == "by_age_national_processed_time_series" -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df', side_effect=get_cdc_numbers_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df", side_effect=get_cdc_numbers_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAgeState(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdc_restricted = CDCRestrictedData() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', - 'demographic': 'age', - 'geographic': 'state', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", + "demographic": "age", + "geographic": "state", } - cdc_restricted.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdc_restricted.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 - assert mock_csv.call_args_list[0].args[1] == 'cdc_restricted_by_age_state.csv' + assert mock_csv.call_args_list[0].args[1] == "cdc_restricted_by_age_state.csv" assert mock_bq.call_count == 2 - assert mock_bq.call_args_list[0].args[2] == 'by_age_state_processed' - assert mock_bq.call_args_list[1].args[2] == 'by_age_state_processed_time_series' + assert mock_bq.call_args_list[0].args[2] == "by_age_state_processed" + assert mock_bq.call_args_list[1].args[2] == "by_age_state_processed_time_series" -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df', side_effect=get_cdc_numbers_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df", side_effect=get_cdc_numbers_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAgeCounty(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdc_restricted = CDCRestrictedData() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', - 'demographic': 'age', - 'geographic': 'county', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", + "demographic": "age", + "geographic": "county", } - cdc_restricted.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdc_restricted.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 - assert mock_csv.call_args_list[0].args[1] == 'cdc_restricted_by_age_county.csv' + assert mock_csv.call_args_list[0].args[1] == 
"cdc_restricted_by_age_county.csv" assert mock_bq.call_count == 2 - assert mock_bq.call_args_list[0].args[2] == 'by_age_county_processed' - assert mock_bq.call_args_list[1].args[2] == 'by_age_county_processed_time_series' + assert mock_bq.call_args_list[0].args[2] == "by_age_county_processed" + assert mock_bq.call_args_list[1].args[2] == "by_age_county_processed_time_series" -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df', side_effect=get_cdc_numbers_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df", side_effect=get_cdc_numbers_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSexCounty(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdc_restricted = CDCRestrictedData() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', - 'demographic': 'sex', - 'geographic': 'county', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", + "demographic": "sex", + "geographic": "county", } - cdc_restricted.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdc_restricted.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 - assert mock_csv.call_args_list[0].args[1] == 'cdc_restricted_by_sex_county.csv' + assert mock_csv.call_args_list[0].args[1] == "cdc_restricted_by_sex_county.csv" assert mock_bq.call_count == 2 - assert mock_bq.call_args_list[0].args[2] == 'by_sex_county_processed' - assert mock_bq.call_args_list[1].args[2] == 'by_sex_county_processed_time_series' + assert mock_bq.call_args_list[0].args[2] == "by_sex_county_processed" + assert mock_bq.call_args_list[1].args[2] == "by_sex_county_processed_time_series" -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df', side_effect=get_cdc_numbers_as_df) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df", side_effect=get_cdc_numbers_as_df) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqRaceNational(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdc_restricted = CDCRestrictedData() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', - 'demographic': 'race', - 'geographic': 'national', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", + "demographic": "race", + "geographic": "national", } - cdc_restricted.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdc_restricted.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 2 - assert mock_csv.call_args_list[0].args[1] == 'cdc_restricted_by_race_state.csv' - assert mock_csv.call_args_list[1].args[1] == 'cdc_restricted_by_race_and_age_state.csv' + assert mock_csv.call_args_list[0].args[1] == "cdc_restricted_by_race_state.csv" + assert mock_csv.call_args_list[1].args[1] == "cdc_restricted_by_race_and_age_state.csv" assert mock_bq.call_count == 3 - assert mock_bq.call_args_list[0].args[2] == 'by_race_national_processed' - assert mock_bq.call_args_list[1].args[2] == 'by_race_national_processed_time_series' - assert mock_bq.call_args_list[2].args[2] == 'by_race_age_state' + assert mock_bq.call_args_list[0].args[2] == "by_race_national_processed" + assert mock_bq.call_args_list[1].args[2] == "by_race_national_processed_time_series" + assert mock_bq.call_args_list[2].args[2] == 
"by_race_age_state" diff --git a/python/tests/datasources/test_cdc_restricted_local.py b/python/tests/datasources/test_cdc_restricted_local.py index e2e32ed813..63cd2496ab 100644 --- a/python/tests/datasources/test_cdc_restricted_local.py +++ b/python/tests/datasources/test_cdc_restricted_local.py @@ -25,9 +25,7 @@ GOLDEN_DATA = { ("state", "race"): os.path.join(TEST_DIR, "cdc_restricted_by_race_state.csv"), ("county", "race"): os.path.join(TEST_DIR, "cdc_restricted_by_race_county.csv"), - ("state", "race_and_age"): os.path.join( - TEST_DIR, "cdc_restricted_by_race_and_age_state.csv" - ), + ("state", "race_and_age"): os.path.join(TEST_DIR, "cdc_restricted_by_race_and_age_state.csv"), ("state", "age"): os.path.join(TEST_DIR, "cdc_restricted_by_age_state.csv"), ("county", "age"): os.path.join(TEST_DIR, "cdc_restricted_by_age_county.csv"), ("state", "sex"): os.path.join(TEST_DIR, "cdc_restricted_by_sex_state.csv"), @@ -35,9 +33,7 @@ } -GOLDEN_DATA_NATIONAL = os.path.join( - TEST_DIR, "cdc_restricted_by_race_and_age_national.csv" -) +GOLDEN_DATA_NATIONAL = os.path.join(TEST_DIR, "cdc_restricted_by_race_and_age_national.csv") def testKeyMap(): @@ -57,7 +53,7 @@ def run_test(key): dfs = cdc.process_data(TEST_DIR, TEST_DATA) expected_df = pd.read_csv(GOLDEN_DATA[key], dtype=str, keep_default_na=False) - expected_df = expected_df.replace({'nan': ''}) + expected_df = expected_df.replace({"nan": ""}) assert set(dfs[key].columns) == set(expected_df.columns) sortby_cols = list(dfs[key].columns) @@ -69,42 +65,42 @@ def run_test(key): def testStateRace(): - key = ('state', 'race') + key = ("state", "race") run_test(key) def testCountyRace(): - key = ('county', 'race') + key = ("county", "race") run_test(key) def testStateRaceAndAge(): - key = ('state', 'race_and_age') + key = ("state", "race_and_age") run_test(key) def testStateAge(): - key = ('state', 'age') + key = ("state", "age") run_test(key) def testCountyAge(): - key = ('county', 'age') + key = ("county", "age") run_test(key) def testStateSex(): - key = ('state', 'sex') + key = ("state", "sex") run_test(key) def testCountySex(): - key = ('county', 'sex') + key = ("county", "sex") run_test(key) def testGenerateNationalDataset(): - race_age_state = GOLDEN_DATA[('state', 'race_and_age')] + race_age_state = GOLDEN_DATA[("state", "race_and_age")] race_age_state_df = pd.read_csv(race_age_state, keep_default_na=False) groupby_cols = [std_col.RACE_CATEGORY_ID_COL, std_col.AGE_COL] @@ -118,6 +114,6 @@ def testGenerateNationalDataset(): keep_default_na=False, ) - national_df = national_df.replace({'nan': ''}) + national_df = national_df.replace({"nan": ""}) assert_frame_equal(expected_df, national_df, check_like=True) diff --git a/python/tests/datasources/test_cdc_vaccination_county.py b/python/tests/datasources/test_cdc_vaccination_county.py index d42564689c..9d91fd0df5 100644 --- a/python/tests/datasources/test_cdc_vaccination_county.py +++ b/python/tests/datasources/test_cdc_vaccination_county.py @@ -9,21 +9,21 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "cdc_vaccination_county") -GOLDEN_DATA = os.path.join(TEST_DIR, 'cdc_vaccination_county-alls_county.csv') +GOLDEN_DATA = os.path.join(TEST_DIR, "cdc_vaccination_county-alls_county.csv") def get_total_vaccinations_as_df(): return pd.read_csv( - os.path.join(TEST_DIR, 'cdc_vaccination_county_test.csv'), - dtype={'fips': str, 'recip_county': str}, + os.path.join(TEST_DIR, "cdc_vaccination_county_test.csv"), + dtype={"fips": str, 
"recip_county": str}, ) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_web', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_web", return_value=get_total_vaccinations_as_df(), ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBq( mock_bq: mock.MagicMock, mock_csv: mock.MagicMock, @@ -31,21 +31,21 @@ def testWriteToBq( cdcVaccinationCounty = CDCVaccinationCounty() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - cdcVaccinationCounty.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdcVaccinationCounty.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 assert mock_bq.call_count == 1 - assert mock_bq.call_args_list[0].args[2] == 'alls_county_current' + assert mock_bq.call_args_list[0].args[2] == "alls_county_current" expected_df = pd.read_csv( GOLDEN_DATA, dtype={ - 'county_fips': str, - 'vaccinated_pct_rate': float, + "county_fips": str, + "vaccinated_pct_rate": float, }, ) diff --git a/python/tests/datasources/test_cdc_vaccination_national.py b/python/tests/datasources/test_cdc_vaccination_national.py index 5357ac25e6..ed8eaa2a75 100644 --- a/python/tests/datasources/test_cdc_vaccination_national.py +++ b/python/tests/datasources/test_cdc_vaccination_national.py @@ -9,39 +9,39 @@ TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "cdc_vaccination_national") GOLDEN_DATA = { - 'race': os.path.join(TEST_DIR, 'cdc_vaccination_national_by_race_and_ethnicity.csv'), - 'sex': os.path.join(TEST_DIR, 'cdc_vaccination_national_by_sex.csv'), - 'age': os.path.join(TEST_DIR, 'cdc_vaccination_national_by_age.csv'), + "race": os.path.join(TEST_DIR, "cdc_vaccination_national_by_race_and_ethnicity.csv"), + "sex": os.path.join(TEST_DIR, "cdc_vaccination_national_by_sex.csv"), + "age": os.path.join(TEST_DIR, "cdc_vaccination_national_by_age.csv"), } def get_state_test_data_as_df(): return pd.read_json( - os.path.join(TEST_DIR, 'cdc_vaccination_national_test.json'), - dtype={'state_fips': str, 'administered_dose1_pct': float, 'administered_dose1': float}, + os.path.join(TEST_DIR, "cdc_vaccination_national_test.json"), + dtype={"state_fips": str, "administered_dose1_pct": float, "administered_dose1": float}, ) @mock.patch( - 'ingestion.gcs_to_bq_util.load_json_as_df_from_web', + "ingestion.gcs_to_bq_util.load_json_as_df_from_web", return_value=get_state_test_data_as_df(), ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqRace(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdcVaccination = CDCVaccinationNational() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - cdcVaccination.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdcVaccination.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 assert mock_bq.call_count == 3 - expected_df = pd.read_csv(GOLDEN_DATA['race'], dtype={'population_pct': str, 'state_fips': str}) + expected_df = pd.read_csv(GOLDEN_DATA["race"], dtype={"population_pct": str, "state_fips": str}) df = mock_bq.call_args_list[0].args[0] @@ -53,24 +53,24 @@ def 
testWriteToBqRace(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): @mock.patch( - 'ingestion.gcs_to_bq_util.load_json_as_df_from_web', + "ingestion.gcs_to_bq_util.load_json_as_df_from_web", return_value=get_state_test_data_as_df(), ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSex(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdcVaccination = CDCVaccinationNational() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - cdcVaccination.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdcVaccination.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 assert mock_bq.call_count == 3 - expected_df = pd.read_csv(GOLDEN_DATA['sex'], dtype={'population_pct': str, 'state_fips': str}) + expected_df = pd.read_csv(GOLDEN_DATA["sex"], dtype={"population_pct": str, "state_fips": str}) df = mock_bq.call_args_list[1].args[0] @@ -82,24 +82,24 @@ def testWriteToBqSex(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): @mock.patch( - 'ingestion.gcs_to_bq_util.load_json_as_df_from_web', + "ingestion.gcs_to_bq_util.load_json_as_df_from_web", return_value=get_state_test_data_as_df(), ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAge(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): cdcVaccination = CDCVaccinationNational() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - cdcVaccination.write_to_bq('dataset', 'gcs_bucket', **kwargs) + cdcVaccination.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 assert mock_bq.call_count == 3 - expected_df = pd.read_csv(GOLDEN_DATA['age'], dtype={'vaccinated_pop_pct': str, 'state_fips': str}) + expected_df = pd.read_csv(GOLDEN_DATA["age"], dtype={"vaccinated_pop_pct": str, "state_fips": str}) df = mock_bq.call_args_list[2].args[0] diff --git a/python/tests/datasources/test_cdc_wisqars_black_men.py b/python/tests/datasources/test_cdc_wisqars_black_men.py index 9afc8367fd..2ee47c4489 100644 --- a/python/tests/datasources/test_cdc_wisqars_black_men.py +++ b/python/tests/datasources/test_cdc_wisqars_black_men.py @@ -76,8 +76,8 @@ def test_write_to_bq_black_men_by_urbanicity_national( expected_historical_df = pd.read_csv(GOLDEN_DATA[table_name], dtype=DTYPE) assert table_name == "black_men_by_urbanicity_national_historical" - actual_historical_df = actual_historical_df.sort_values(by=['time_period', 'urbanicity']).reset_index(drop=True) - expected_historical_df = expected_historical_df.sort_values(by=['time_period', 'urbanicity']).reset_index(drop=True) + actual_historical_df = actual_historical_df.sort_values(by=["time_period", "urbanicity"]).reset_index(drop=True) + expected_historical_df = expected_historical_df.sort_values(by=["time_period", "urbanicity"]).reset_index(drop=True) # calls writing NATIONAL + STATE to bq assert mock_bq.call_count == 2 @@ -121,11 +121,11 @@ def test_write_to_bq_black_men_by_urbanicity_state( # calls writing NATIONAL + STATE to bq assert mock_bq.call_count == 2 - actual_historical_df = 
actual_historical_df.sort_values(by=['time_period', 'state_name', 'urbanicity']).reset_index( + actual_historical_df = actual_historical_df.sort_values(by=["time_period", "state_name", "urbanicity"]).reset_index( drop=True ) expected_historical_df = expected_historical_df.sort_values( - by=['time_period', 'state_name', 'urbanicity'] + by=["time_period", "state_name", "urbanicity"] ).reset_index(drop=True) assert_frame_equal(actual_current_df, expected_current_df, check_like=True) @@ -167,8 +167,8 @@ def test_write_to_bq_black_men_by_age_national( # calls writing NATIONAL + STATE to bq assert mock_bq.call_count == 2 - actual_historical_df = actual_historical_df.sort_values(by=['time_period', 'age']).reset_index(drop=True) - expected_historical_df = expected_historical_df.sort_values(by=['time_period', 'age']).reset_index(drop=True) + actual_historical_df = actual_historical_df.sort_values(by=["time_period", "age"]).reset_index(drop=True) + expected_historical_df = expected_historical_df.sort_values(by=["time_period", "age"]).reset_index(drop=True) assert_frame_equal(actual_current_df, expected_current_df, check_like=True) assert_frame_equal(actual_historical_df, expected_historical_df, check_like=True) @@ -209,10 +209,10 @@ def test_write_to_bq_black_men_by_age_state( # calls writing NATIONAL + STATE to bq assert mock_bq.call_count == 2 - actual_historical_df = actual_historical_df.sort_values(by=['time_period', 'state_name', 'age']).reset_index( + actual_historical_df = actual_historical_df.sort_values(by=["time_period", "state_name", "age"]).reset_index( drop=True ) - expected_historical_df = expected_historical_df.sort_values(by=['time_period', 'state_name', 'age']).reset_index( + expected_historical_df = expected_historical_df.sort_values(by=["time_period", "state_name", "age"]).reset_index( drop=True ) diff --git a/python/tests/datasources/test_cdc_wisqars_youth.py b/python/tests/datasources/test_cdc_wisqars_youth.py index 03202e4606..669298b1a9 100644 --- a/python/tests/datasources/test_cdc_wisqars_youth.py +++ b/python/tests/datasources/test_cdc_wisqars_youth.py @@ -76,10 +76,10 @@ def test_write_to_bq_youth_by_race_national( assert mock_bq.call_count == 2 - actual_historical_df = actual_historical_df.sort_values(by=['time_period', 'race_and_ethnicity']).reset_index( + actual_historical_df = actual_historical_df.sort_values(by=["time_period", "race_and_ethnicity"]).reset_index( drop=True ) - expected_historical_df = expected_historical_df.sort_values(by=['time_period', 'race_and_ethnicity']).reset_index( + expected_historical_df = expected_historical_df.sort_values(by=["time_period", "race_and_ethnicity"]).reset_index( drop=True ) @@ -118,18 +118,18 @@ def test_write_to_bq_youth_by_race_state( assert mock_bq.call_count == 2 - actual_current_df = actual_current_df.sort_values(by=['race_and_ethnicity', 'state_name']).reset_index(drop=True) - expected_current_df = expected_current_df.sort_values(by=['race_and_ethnicity', 'state_name']).reset_index( + actual_current_df = actual_current_df.sort_values(by=["race_and_ethnicity", "state_name"]).reset_index(drop=True) + expected_current_df = expected_current_df.sort_values(by=["race_and_ethnicity", "state_name"]).reset_index( drop=True ) assert_frame_equal(actual_current_df, expected_current_df, check_like=True) actual_historical_df = actual_historical_df.sort_values( - by=['time_period', 'race_and_ethnicity', 'state_name'] + by=["time_period", "race_and_ethnicity", "state_name"] ).reset_index(drop=True) expected_historical_df = 
expected_historical_df.sort_values( - by=['time_period', 'race_and_ethnicity', 'state_name'] + by=["time_period", "race_and_ethnicity", "state_name"] ).reset_index(drop=True) assert_frame_equal(actual_historical_df, expected_historical_df, check_like=True) diff --git a/python/tests/datasources/test_cdc_wonder.py b/python/tests/datasources/test_cdc_wonder.py index 45c3b2e0d8..d27189f655 100644 --- a/python/tests/datasources/test_cdc_wonder.py +++ b/python/tests/datasources/test_cdc_wonder.py @@ -9,27 +9,27 @@ import ingestion.standardized_columns as std_col THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data') -GOLDEN_DIR = os.path.join(TEST_DIR, CDC_WONDER_DIR, 'golden_data') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data") +GOLDEN_DIR = os.path.join(TEST_DIR, CDC_WONDER_DIR, "golden_data") CSV_DTYPES = {std_col.TIME_PERIOD_COL: str, std_col.STATE_FIPS_COL: str} GOLDEN_DATA = { - 'age_national_current': os.path.join(GOLDEN_DIR, 'expected_age_national.csv'), - 'age_national_historical': os.path.join(GOLDEN_DIR, 'expected_age_national_historical.csv'), - 'race_and_ethnicity_state_current': os.path.join(GOLDEN_DIR, 'expected_race_and_ethnicity_state.csv'), - 'race_and_ethnicity_state_historical': os.path.join(GOLDEN_DIR, 'expected_race_and_ethnicity_state_historical.csv'), - 'sex_national_current': os.path.join(GOLDEN_DIR, 'expected_sex_national.csv'), - 'sex_national_historical': os.path.join(GOLDEN_DIR, 'expected_sex_national_historical.csv'), + "age_national_current": os.path.join(GOLDEN_DIR, "expected_age_national.csv"), + "age_national_historical": os.path.join(GOLDEN_DIR, "expected_age_national_historical.csv"), + "race_and_ethnicity_state_current": os.path.join(GOLDEN_DIR, "expected_race_and_ethnicity_state.csv"), + "race_and_ethnicity_state_historical": os.path.join(GOLDEN_DIR, "expected_race_and_ethnicity_state_historical.csv"), + "sex_national_current": os.path.join(GOLDEN_DIR, "expected_sex_national.csv"), + "sex_national_historical": os.path.join(GOLDEN_DIR, "expected_sex_national_historical.csv"), } # Breakdown Tests -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', side_effect=_load_csv_as_df_from_real_data_dir) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir) def testBreakdownAgeNational(mock_data_dir: mock.MagicMock, mock_bq_write: mock.MagicMock): datasource = CdcWonderData() - datasource.write_to_bq('dataset', 'gcs_bucket', demographic='age', geographic='national') + datasource.write_to_bq("dataset", "gcs_bucket", demographic="age", geographic="national") assert mock_data_dir.called @@ -46,11 +46,11 @@ def testBreakdownAgeNational(mock_data_dir: mock.MagicMock, mock_bq_write: mock. 
assert_frame_equal(actual_historical_df, expected_historical_df, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', side_effect=_load_csv_as_df_from_real_data_dir) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir) def testBreakdownSexNational(mock_data_dir: mock.MagicMock, mock_bq_write: mock.MagicMock): datasource = CdcWonderData() - datasource.write_to_bq('dataset', 'gcs_bucket', demographic='sex', geographic='national') + datasource.write_to_bq("dataset", "gcs_bucket", demographic="sex", geographic="national") assert mock_data_dir.called @@ -67,11 +67,11 @@ def testBreakdownSexNational(mock_data_dir: mock.MagicMock, mock_bq_write: mock. assert_frame_equal(actual_historical_df, expected_historical_df, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', side_effect=_load_csv_as_df_from_real_data_dir) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir) def testBreakdownRaceState(mock_data_dir: mock.MagicMock, mock_bq_write: mock.MagicMock): datasource = CdcWonderData() - datasource.write_to_bq('dataset', 'gcs_bucket', demographic='race_and_ethnicity', geographic='state') + datasource.write_to_bq("dataset", "gcs_bucket", demographic="race_and_ethnicity", geographic="state") assert mock_data_dir.called diff --git a/python/tests/datasources/test_census_pop_estimates.py b/python/tests/datasources/test_census_pop_estimates.py index eac702c450..d89e34181f 100644 --- a/python/tests/datasources/test_census_pop_estimates.py +++ b/python/tests/datasources/test_census_pop_estimates.py @@ -12,28 +12,28 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "census_pop_estimates") -STATE_POP_DATA = os.path.join(TEST_DIR, 'census_pop_estimates-race_ethnicity_age_state.csv') -NATIONAL_POP_DATA = os.path.join(TEST_DIR, 'census_pop_estimates-race_ethnicity_age_national.csv') +STATE_POP_DATA = os.path.join(TEST_DIR, "census_pop_estimates-race_ethnicity_age_state.csv") +NATIONAL_POP_DATA = os.path.join(TEST_DIR, "census_pop_estimates-race_ethnicity_age_national.csv") def get_pop_estimates_as_df(): return pd.read_csv( - os.path.join(TEST_DIR, 'census_pop_estimates.csv'), + os.path.join(TEST_DIR, "census_pop_estimates.csv"), dtype={ - 'STATE': str, - 'STNAME': str, + "STATE": str, + "STNAME": str, }, ) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_web', return_value=get_pop_estimates_as_df()) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_web", return_value=get_pop_estimates_as_df()) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBq(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): censusPopEstimates = CensusPopEstimates() - kwargs = {'filename': 'test_file.csv', 'metadata_table_id': 'test_metadata', 'table_name': 'output_table'} + kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} - censusPopEstimates.write_to_bq('dataset', 'gcs_bucket', **kwargs) + 
censusPopEstimates.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 1 assert mock_bq.call_count == 1 @@ -41,7 +41,7 @@ def testWriteToBq(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): expected_df = pd.read_csv( STATE_POP_DATA, dtype={ - 'state_fips': str, + "state_fips": str, }, ) @@ -52,14 +52,14 @@ def testGenerateNationalPopData(): state_df = pd.read_csv( STATE_POP_DATA, dtype={ - 'state_fips': str, + "state_fips": str, }, ) national_df = pd.read_csv( NATIONAL_POP_DATA, dtype={ - 'state_fips': str, + "state_fips": str, }, ) diff --git a/python/tests/datasources/test_census_pop_estimates_sc.py b/python/tests/datasources/test_census_pop_estimates_sc.py index f7abbb7581..3e711d7b98 100644 --- a/python/tests/datasources/test_census_pop_estimates_sc.py +++ b/python/tests/datasources/test_census_pop_estimates_sc.py @@ -9,53 +9,41 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "census_pop_estimates_sc") -STATE_POP_RACE_DATA = os.path.join( - TEST_DIR, 'census_pop_estimates_sc-race_ethnicity_age_state.csv') -NATIONAL_POP_RACE_DATA = os.path.join( - TEST_DIR, 'census_pop_estimates_sc-race_ethnicity_age_national.csv') +STATE_POP_RACE_DATA = os.path.join(TEST_DIR, "census_pop_estimates_sc-race_ethnicity_age_state.csv") +NATIONAL_POP_RACE_DATA = os.path.join(TEST_DIR, "census_pop_estimates_sc-race_ethnicity_age_national.csv") -STATE_POP_SEX_DATA = os.path.join( - TEST_DIR, 'census_pop_estimates_sc-sex_age_state.csv') -NATIONAL_POP_SEX_DATA = os.path.join( - TEST_DIR, 'census_pop_estimates_sc-sex_age_national.csv') +STATE_POP_SEX_DATA = os.path.join(TEST_DIR, "census_pop_estimates_sc-sex_age_state.csv") +NATIONAL_POP_SEX_DATA = os.path.join(TEST_DIR, "census_pop_estimates_sc-sex_age_national.csv") def get_pop_estimates_as_df(): print("MOCK FILE READ OF sc-est2021-alldata6.csv") - return pd.read_csv(os.path.join(TEST_DIR, 'sc-est2021-alldata6.csv'), dtype={ - 'STATE': str, - 'STNAME': str, - }) + return pd.read_csv( + os.path.join(TEST_DIR, "sc-est2021-alldata6.csv"), + dtype={ + "STATE": str, + "STNAME": str, + }, + ) def get_breakdown_df(): - return pd.DataFrame({ - "col1": [0, 1, 2], - "col2": ["a", "b", "c"] - }) + return pd.DataFrame({"col1": [0, 1, 2], "col2": ["a", "b", "c"]}) # TEST OVERALL WRITE TO BQ -@mock.patch('datasources.census_pop_estimates_sc.generate_pop_data_18plus', - return_value=get_breakdown_df()) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_web', - return_value=get_pop_estimates_as_df()) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', - return_value=None) -def testWriteToBq( - mock_bq: mock.MagicMock, - mock_csv: mock.MagicMock, - mock_gen: mock.MagicMock -): + +@mock.patch("datasources.census_pop_estimates_sc.generate_pop_data_18plus", return_value=get_breakdown_df()) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_web", return_value=get_pop_estimates_as_df()) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +def testWriteToBq(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock, mock_gen: mock.MagicMock): censusPopEstimatesSC = CensusPopEstimatesSC() - kwargs = {'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table'} + kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} - censusPopEstimatesSC.write_to_bq('dataset', 'gcs_bucket', **kwargs) + censusPopEstimatesSC.write_to_bq("dataset", "gcs_bucket", **kwargs) assert 
mock_csv.call_count == 1 # 4 = 2 demographic breakdowns X 2 geographic breakdowns @@ -66,58 +54,38 @@ def testWriteToBq( def test18PlusByRaceState(): mock_csv_as_df = get_pop_estimates_as_df() - df = generate_pop_data_18plus( - mock_csv_as_df, "race_category_id", "state") + df = generate_pop_data_18plus(mock_csv_as_df, "race_category_id", "state") - expected_race_df = pd.read_csv(STATE_POP_RACE_DATA, dtype={ - 'state_fips': str, - 'time_period': str - }) + expected_race_df = pd.read_csv(STATE_POP_RACE_DATA, dtype={"state_fips": str, "time_period": str}) - assert_frame_equal( - df, expected_race_df, check_like=True) + assert_frame_equal(df, expected_race_df, check_like=True) def test18PlusBySexState(): mock_csv_as_df = get_pop_estimates_as_df() - df = generate_pop_data_18plus( - mock_csv_as_df, "sex", "state") + df = generate_pop_data_18plus(mock_csv_as_df, "sex", "state") - expected_sex_df = pd.read_csv(STATE_POP_SEX_DATA, dtype={ - 'state_fips': str, - 'time_period': str - }) + expected_sex_df = pd.read_csv(STATE_POP_SEX_DATA, dtype={"state_fips": str, "time_period": str}) - assert_frame_equal( - df, expected_sex_df, check_like=True) + assert_frame_equal(df, expected_sex_df, check_like=True) def test18PlusByRaceNational(): mock_csv_as_df = get_pop_estimates_as_df() - df = generate_pop_data_18plus( - mock_csv_as_df, "race_category_id", "national") + df = generate_pop_data_18plus(mock_csv_as_df, "race_category_id", "national") - expected_race_df = pd.read_csv(NATIONAL_POP_RACE_DATA, dtype={ - 'state_fips': str, - 'time_period': str - }) + expected_race_df = pd.read_csv(NATIONAL_POP_RACE_DATA, dtype={"state_fips": str, "time_period": str}) - assert_frame_equal( - df, expected_race_df, check_like=True) + assert_frame_equal(df, expected_race_df, check_like=True) def test18PlusBySexNational(): mock_csv_as_df = get_pop_estimates_as_df() - df = generate_pop_data_18plus( - mock_csv_as_df, "sex", "national") + df = generate_pop_data_18plus(mock_csv_as_df, "sex", "national") - expected_sex_df = pd.read_csv(NATIONAL_POP_SEX_DATA, dtype={ - 'state_fips': str, - 'time_period': str - }) + expected_sex_df = pd.read_csv(NATIONAL_POP_SEX_DATA, dtype={"state_fips": str, "time_period": str}) - assert_frame_equal( - df, expected_sex_df, check_like=True) + assert_frame_equal(df, expected_sex_df, check_like=True) diff --git a/python/tests/datasources/test_data_source.py b/python/tests/datasources/test_data_source.py index afcbf43ba1..566f57b018 100644 --- a/python/tests/datasources/test_data_source.py +++ b/python/tests/datasources/test_data_source.py @@ -15,7 +15,7 @@ def testCleanFrameColumnNames(): ds = TestDataSource() df = pd.DataFrame( # /* cSpell:disable */ - {'Upp3rcase': [], 'Special!char': [], 'this=that': [], '%count': [], 'with SPACES': []} + {"Upp3rcase": [], "Special!char": [], "this=that": [], "%count": [], "with SPACES": []} ) ds.clean_frame_column_names(df) - assert set(df.columns) == set(['upp3rcase', 'special_char', 'thiseqthat', 'pctcount', 'with_spaces']) + assert set(df.columns) == set(["upp3rcase", "special_char", "thiseqthat", "pctcount", "with_spaces"]) diff --git a/python/tests/datasources/test_decia_2020_territory_population.py b/python/tests/datasources/test_decia_2020_territory_population.py index a3aab1712c..ee7ff173eb 100644 --- a/python/tests/datasources/test_decia_2020_territory_population.py +++ b/python/tests/datasources/test_decia_2020_territory_population.py @@ -6,8 +6,8 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 
'data') -GOLDEN_DIR = os.path.join(TEST_DIR, 'decia_2020_territory_population', 'golden_data') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data") +GOLDEN_DIR = os.path.join(TEST_DIR, "decia_2020_territory_population", "golden_data") def _load_csv_as_df_from_data_dir(*args, **kwargs): @@ -20,7 +20,7 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs): # INTEGRATION TESTS datasource = Decia2020TerritoryPopulationData() dtypes = {"state_fips": str, "county_fips": str} -kwargs = {'filename': 'test_file.csv', 'metadata_table_id': 'test_metadata', 'table_name': 'output_table'} +kwargs = {"filename": "test_file.csv", "metadata_table_id": "test_metadata", "table_name": "output_table"} # @@ -28,63 +28,63 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs): # -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', side_effect=_load_csv_as_df_from_data_dir) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir) def testGenerateAgeTerritory( mock_data_dir: mock.MagicMock, mock_bq: mock.MagicMock, ): kwargs["demographic"] = "age" kwargs["geographic"] = "state" - datasource.write_to_bq('dataset', 'gcs_bucket', **kwargs) + datasource.write_to_bq("dataset", "gcs_bucket", **kwargs) # loads in 4 files, 1 per Island Area assert mock_data_dir.call_count == 4 df, _dataset, table_name = mock_bq.call_args_list[0][0] assert table_name == "age_state_current" - expected_df = pd.read_csv(os.path.join(GOLDEN_DIR, f'{table_name}.csv'), index_col=False, dtype=dtypes) + expected_df = pd.read_csv(os.path.join(GOLDEN_DIR, f"{table_name}.csv"), index_col=False, dtype=dtypes) assert_frame_equal(df, expected_df, check_dtype=False) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', side_effect=_load_csv_as_df_from_data_dir) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir) def testGenerateRaceTerritory( mock_data_dir: mock.MagicMock, mock_bq: mock.MagicMock, ): kwargs["demographic"] = "race_and_ethnicity" kwargs["geographic"] = "state" - datasource.write_to_bq('dataset', 'gcs_bucket', **kwargs) + datasource.write_to_bq("dataset", "gcs_bucket", **kwargs) # loads in 4 files, 1 per Island Area assert mock_data_dir.call_count == 4 df, _dataset, table_name = mock_bq.call_args_list[0][0] assert table_name == "race_and_ethnicity_state_current" - expected_df = pd.read_csv(os.path.join(GOLDEN_DIR, f'{table_name}.csv'), index_col=False, dtype=dtypes) + expected_df = pd.read_csv(os.path.join(GOLDEN_DIR, f"{table_name}.csv"), index_col=False, dtype=dtypes) - df = df.sort_values(by=['state_fips', 'race_category_id']).reset_index(drop=True) - expected_df = expected_df.sort_values(by=['state_fips', 'race_category_id']).reset_index(drop=True) + df = df.sort_values(by=["state_fips", "race_category_id"]).reset_index(drop=True) + expected_df = expected_df.sort_values(by=["state_fips", "race_category_id"]).reset_index(drop=True) assert_frame_equal(df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', side_effect=_load_csv_as_df_from_data_dir) 
+@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir) def testGenerateSexTerritoryCountyEquivalent( mock_data_dir: mock.MagicMock, mock_bq: mock.MagicMock, ): kwargs["demographic"] = "sex" kwargs["geographic"] = "county" - datasource.write_to_bq('dataset', 'gcs_bucket', **kwargs) + datasource.write_to_bq("dataset", "gcs_bucket", **kwargs) # loads in 4 files, 1 per Island Area assert mock_data_dir.call_count == 4 df, _dataset, table_name = mock_bq.call_args_list[0][0] assert table_name == "sex_county_current" - expected_df = pd.read_csv(os.path.join(GOLDEN_DIR, f'{table_name}.csv'), index_col=False, dtype=dtypes) + expected_df = pd.read_csv(os.path.join(GOLDEN_DIR, f"{table_name}.csv"), index_col=False, dtype=dtypes) assert_frame_equal(df, expected_df, check_dtype=False) diff --git a/python/tests/datasources/test_geo_context.py b/python/tests/datasources/test_geo_context.py index 799835506e..3131e205b1 100644 --- a/python/tests/datasources/test_geo_context.py +++ b/python/tests/datasources/test_geo_context.py @@ -24,11 +24,11 @@ def test_format_svi(): # Current working directory. THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "geo_context") -REAL_SVI_DIR = os.path.abspath('data/cdc_svi_county') +REAL_SVI_DIR = os.path.abspath("data/cdc_svi_county") -GOLDEN_DATA_NATIONAL = os.path.join(TEST_DIR, 'test_output_geo_context_national.csv') -GOLDEN_DATA_STATE = os.path.join(TEST_DIR, 'test_output_geo_context_state.csv') -GOLDEN_DATA_COUNTY = os.path.join(TEST_DIR, 'test_output_geo_context_county.csv') +GOLDEN_DATA_NATIONAL = os.path.join(TEST_DIR, "test_output_geo_context_national.csv") +GOLDEN_DATA_STATE = os.path.join(TEST_DIR, "test_output_geo_context_state.csv") +GOLDEN_DATA_COUNTY = os.path.join(TEST_DIR, "test_output_geo_context_county.csv") def _scaffold_fips_df(*args): @@ -43,7 +43,7 @@ def _scaffold_fips_df(*args): def _get_svi_as_df(): - return pd.read_csv(os.path.join(TEST_DIR, 'cdc_svi_county_test.csv'), dtype={"FIPS": str}) + return pd.read_csv(os.path.join(TEST_DIR, "cdc_svi_county_test.csv"), dtype={"FIPS": str}) def _generate_breakdown(*args): @@ -55,20 +55,20 @@ def _generate_breakdown(*args): @mock.patch( - 'datasources.geo_context.GeoContext.generate_breakdown', + "datasources.geo_context.GeoContext.generate_breakdown", side_effect=_generate_breakdown, ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBq(mock_bq: mock.MagicMock, mock_generate_breakdown: mock.MagicMock): """Ensures the correct structure and arguments were generated to be written to BigQuery""" geoContext = GeoContext() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - geoContext.write_to_bq('dataset', 'gcs_bucket', **kwargs) + geoContext.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_generate_breakdown.call_count == 3 assert mock_bq.call_count == 3 @@ -78,16 +78,16 @@ def testWriteToBq(mock_bq: mock.MagicMock, mock_generate_breakdown: mock.MagicMo national_call[1]["column_types"] == state_call[1]["column_types"] == { - 'fake_col1': BQ_STRING, - 'fake_col2': BQ_STRING, - 'population': BQ_FLOAT, + "fake_col1": BQ_STRING, + "fake_col2": 
BQ_STRING, + "population": BQ_FLOAT, } ) assert county_call[1]["column_types"] == { - 'fake_col1': BQ_STRING, - 'fake_col2': BQ_STRING, - 'svi': BQ_FLOAT, - 'population': BQ_FLOAT, + "fake_col1": BQ_STRING, + "fake_col2": BQ_STRING, + "svi": BQ_FLOAT, + "population": BQ_FLOAT, } @@ -99,13 +99,13 @@ def testGenerateNationalBreakdown(): expected_national_df = pd.read_csv( GOLDEN_DATA_NATIONAL, dtype={ - 'state_fips': str, + "state_fips": str, }, ) assert_frame_equal(national_df, expected_national_df, check_like=True) -@mock.patch('ingestion.dataset_utils.scaffold_fips_df', side_effect=_scaffold_fips_df) +@mock.patch("ingestion.dataset_utils.scaffold_fips_df", side_effect=_scaffold_fips_df) def testGenerateStateLevelBreakdown( mock_scaffold: mock.MagicMock, ): @@ -117,7 +117,7 @@ def testGenerateStateLevelBreakdown( expected_state_level_df = pd.read_csv( GOLDEN_DATA_STATE, dtype={ - 'state_fips': str, + "state_fips": str, }, ) @@ -126,9 +126,9 @@ def testGenerateStateLevelBreakdown( assert_frame_equal(state_level_df, expected_state_level_df, check_like=True) -@mock.patch('ingestion.dataset_utils.scaffold_fips_df', side_effect=_scaffold_fips_df) +@mock.patch("ingestion.dataset_utils.scaffold_fips_df", side_effect=_scaffold_fips_df) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", return_value=_get_svi_as_df(), ) def testGenerateCountyBreakdown( @@ -144,6 +144,6 @@ def testGenerateCountyBreakdown( assert mock_svi_data.call_count == 1 assert mock_scaffold.call_count == 1 - expected_county_df = pd.read_csv(GOLDEN_DATA_COUNTY, dtype={'county_fips': str}) + expected_county_df = pd.read_csv(GOLDEN_DATA_COUNTY, dtype={"county_fips": str}) assert_frame_equal(county_df, expected_county_df, check_like=True) diff --git a/python/tests/datasources/test_graphql_ahr.py b/python/tests/datasources/test_graphql_ahr.py index c5c01d567f..8e0e92b4bd 100644 --- a/python/tests/datasources/test_graphql_ahr.py +++ b/python/tests/datasources/test_graphql_ahr.py @@ -11,37 +11,37 @@ GOLDEN_DIR = os.path.join(TEST_DIR, "golden_data") GOLDEN_DATA = { - 'behavioral_health_age_national_current': os.path.join(GOLDEN_DIR, 'behavioral_health_age_national_current.csv'), - 'behavioral_health_sex_national_current': os.path.join(GOLDEN_DIR, 'behavioral_health_sex_national_current.csv'), - 'behavioral_health_race_and_ethnicity_state_current': os.path.join( - GOLDEN_DIR, 'behavioral_health_race_and_ethnicity_state_current.csv' + "behavioral_health_age_national_current": os.path.join(GOLDEN_DIR, "behavioral_health_age_national_current.csv"), + "behavioral_health_sex_national_current": os.path.join(GOLDEN_DIR, "behavioral_health_sex_national_current.csv"), + "behavioral_health_race_and_ethnicity_state_current": os.path.join( + GOLDEN_DIR, "behavioral_health_race_and_ethnicity_state_current.csv" ), - 'non-behavioral_health_age_national_current': os.path.join( - GOLDEN_DIR, 'non-behavioral_health_age_national_current.csv' + "non-behavioral_health_age_national_current": os.path.join( + GOLDEN_DIR, "non-behavioral_health_age_national_current.csv" ), - 'non-behavioral_health_sex_national_current': os.path.join( - GOLDEN_DIR, 'non-behavioral_health_sex_national_current.csv' + "non-behavioral_health_sex_national_current": os.path.join( + GOLDEN_DIR, "non-behavioral_health_sex_national_current.csv" ), - 'non-behavioral_health_race_and_ethnicity_state_current': os.path.join( - GOLDEN_DIR, 'non-behavioral_health_race_and_ethnicity_state_current.csv' + 
"non-behavioral_health_race_and_ethnicity_state_current": os.path.join( + GOLDEN_DIR, "non-behavioral_health_race_and_ethnicity_state_current.csv" ), - 'behavioral_health_age_national_historical': os.path.join( - GOLDEN_DIR, 'behavioral_health_age_national_historical.csv' + "behavioral_health_age_national_historical": os.path.join( + GOLDEN_DIR, "behavioral_health_age_national_historical.csv" ), - 'behavioral_health_sex_national_historical': os.path.join( - GOLDEN_DIR, 'behavioral_health_sex_national_historical.csv' + "behavioral_health_sex_national_historical": os.path.join( + GOLDEN_DIR, "behavioral_health_sex_national_historical.csv" ), - 'behavioral_health_race_and_ethnicity_state_historical': os.path.join( - GOLDEN_DIR, 'behavioral_health_race_and_ethnicity_state_historical.csv' + "behavioral_health_race_and_ethnicity_state_historical": os.path.join( + GOLDEN_DIR, "behavioral_health_race_and_ethnicity_state_historical.csv" ), - 'non-behavioral_health_age_national_historical': os.path.join( - GOLDEN_DIR, 'non-behavioral_health_age_national_historical.csv' + "non-behavioral_health_age_national_historical": os.path.join( + GOLDEN_DIR, "non-behavioral_health_age_national_historical.csv" ), - 'non-behavioral_health_sex_national_historical': os.path.join( - GOLDEN_DIR, 'non-behavioral_health_sex_national_historical.csv' + "non-behavioral_health_sex_national_historical": os.path.join( + GOLDEN_DIR, "non-behavioral_health_sex_national_historical.csv" ), - 'non-behavioral_health_race_and_ethnicity_state_historical': os.path.join( - GOLDEN_DIR, 'non-behavioral_health_race_and_ethnicity_state_historical.csv' + "non-behavioral_health_race_and_ethnicity_state_historical": os.path.join( + GOLDEN_DIR, "non-behavioral_health_race_and_ethnicity_state_historical.csv" ), } @@ -49,19 +49,19 @@ def _fetch_ahr_data_from_graphql(demographic: str, geo_level: str, category: str): print(f"MOCK - AHR GraphQL API response for {category}_{demographic}_{geo_level}") with open( - os.path.join(TEST_DIR, f'{category}_{demographic}_{geo_level}_response.json'), 'r', encoding='utf-8' + os.path.join(TEST_DIR, f"{category}_{demographic}_{geo_level}_response.json"), "r", encoding="utf-8" ) as file: data = json.load(file) return data -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('datasources.graphql_ahr.fetch_ahr_data_from_graphql', side_effect=_fetch_ahr_data_from_graphql) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("datasources.graphql_ahr.fetch_ahr_data_from_graphql", side_effect=_fetch_ahr_data_from_graphql) def testWriteToBqBehavioralHealthAgeNational(_mock_fetch: mock.MagicMock, mock_add_df_to_bq: mock.MagicMock): datasource = GraphQlAHRData() datasource.write_to_bq( - 'dataset', 'gcs_bucket', demographic='age', geographic='national', category='behavioral_health' + "dataset", "gcs_bucket", demographic="age", geographic="national", category="behavioral_health" ) assert mock_add_df_to_bq.call_count == 2 @@ -74,19 +74,19 @@ def testWriteToBqBehavioralHealthAgeNational(_mock_fetch: mock.MagicMock, mock_a assert_frame_equal(actual_df_current, expected_df_current, check_like=True) actual_df_historical, _, table_name = mock_add_df_to_bq.call_args_list[1][0] - expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, 'time_period': str}) + expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, "time_period": str}) assert table_name == "behavioral_health_age_national_historical" # 
actual_df_historical.to_csv(table_name, index=False) # print(actual_df_historical.to_string()) assert_frame_equal(actual_df_historical, expected_df_historical, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('datasources.graphql_ahr.fetch_ahr_data_from_graphql', side_effect=_fetch_ahr_data_from_graphql) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("datasources.graphql_ahr.fetch_ahr_data_from_graphql", side_effect=_fetch_ahr_data_from_graphql) def testWriteToBqNonBehavioralHealthAgeNational(_mock_fetch: mock.MagicMock, mock_add_df_to_bq: mock.MagicMock): datasource = GraphQlAHRData() datasource.write_to_bq( - 'dataset', 'gcs_bucket', demographic='age', geographic='national', category='non-behavioral_health' + "dataset", "gcs_bucket", demographic="age", geographic="national", category="non-behavioral_health" ) assert mock_add_df_to_bq.call_count == 2 @@ -99,19 +99,19 @@ def testWriteToBqNonBehavioralHealthAgeNational(_mock_fetch: mock.MagicMock, moc assert_frame_equal(actual_df_current, expected_df_current, check_like=True) actual_df_historical, _, table_name = mock_add_df_to_bq.call_args_list[1][0] - expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, 'time_period': str}) + expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, "time_period": str}) assert table_name == "non-behavioral_health_age_national_historical" # actual_df_historical.to_csv(table_name, index=False) # print(actual_df_historical.to_string()) assert_frame_equal(actual_df_historical, expected_df_historical, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('datasources.graphql_ahr.fetch_ahr_data_from_graphql', side_effect=_fetch_ahr_data_from_graphql) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("datasources.graphql_ahr.fetch_ahr_data_from_graphql", side_effect=_fetch_ahr_data_from_graphql) def testWriteToBqBehavioralHealthRaceState(_mock_fetch: mock.MagicMock, mock_add_df_to_bq: mock.MagicMock): datasource = GraphQlAHRData() datasource.write_to_bq( - 'dataset', 'gcs_bucket', demographic='race_and_ethnicity', geographic='state', category='behavioral_health' + "dataset", "gcs_bucket", demographic="race_and_ethnicity", geographic="state", category="behavioral_health" ) assert mock_add_df_to_bq.call_count == 2 @@ -124,23 +124,23 @@ def testWriteToBqBehavioralHealthRaceState(_mock_fetch: mock.MagicMock, mock_add assert_frame_equal(actual_df_current, expected_df_current, check_like=True) actual_df_historical, _, table_name = mock_add_df_to_bq.call_args_list[1][0] - expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, 'time_period': str}) + expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, "time_period": str}) assert table_name == "behavioral_health_race_and_ethnicity_state_historical" # actual_df_historical.to_csv(table_name, index=False) # print(actual_df_historical.to_string()) assert_frame_equal(actual_df_historical, expected_df_historical, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('datasources.graphql_ahr.fetch_ahr_data_from_graphql', side_effect=_fetch_ahr_data_from_graphql) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("datasources.graphql_ahr.fetch_ahr_data_from_graphql", 
side_effect=_fetch_ahr_data_from_graphql) def testWriteToBqNonBehavioralHealthRaceState(_mock_fetch: mock.MagicMock, mock_add_df_to_bq: mock.MagicMock): datasource = GraphQlAHRData() datasource.write_to_bq( - 'dataset', - 'gcs_bucket', - demographic='race_and_ethnicity', - geographic='state', - category='non-behavioral_health', + "dataset", + "gcs_bucket", + demographic="race_and_ethnicity", + geographic="state", + category="non-behavioral_health", ) assert mock_add_df_to_bq.call_count == 2 @@ -153,19 +153,19 @@ def testWriteToBqNonBehavioralHealthRaceState(_mock_fetch: mock.MagicMock, mock_ assert_frame_equal(actual_df_current, expected_df_current, check_like=True) actual_df_historical, _, table_name = mock_add_df_to_bq.call_args_list[1][0] - expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, 'time_period': str}) + expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, "time_period": str}) assert table_name == "non-behavioral_health_race_and_ethnicity_state_historical" # actual_df_historical.to_csv(table_name, index=False) # print(actual_df_historical.to_string()) assert_frame_equal(actual_df_historical, expected_df_historical, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('datasources.graphql_ahr.fetch_ahr_data_from_graphql', side_effect=_fetch_ahr_data_from_graphql) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("datasources.graphql_ahr.fetch_ahr_data_from_graphql", side_effect=_fetch_ahr_data_from_graphql) def testWriteToBqBehavioralHealthSexNational(_mock_fetch: mock.MagicMock, mock_add_df_to_bq: mock.MagicMock): datasource = GraphQlAHRData() datasource.write_to_bq( - 'dataset', 'gcs_bucket', demographic='sex', geographic='national', category='behavioral_health' + "dataset", "gcs_bucket", demographic="sex", geographic="national", category="behavioral_health" ) assert mock_add_df_to_bq.call_count == 2 @@ -178,19 +178,19 @@ def testWriteToBqBehavioralHealthSexNational(_mock_fetch: mock.MagicMock, mock_a assert_frame_equal(actual_df_current, expected_df_current, check_like=True) actual_df_historical, _, table_name = mock_add_df_to_bq.call_args_list[1][0] - expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, 'time_period': str}) + expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, "time_period": str}) assert table_name == "behavioral_health_sex_national_historical" # actual_df_historical.to_csv(table_name, index=False) # print(actual_df_historical.to_string()) assert_frame_equal(actual_df_historical, expected_df_historical, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) -@mock.patch('datasources.graphql_ahr.fetch_ahr_data_from_graphql', side_effect=_fetch_ahr_data_from_graphql) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) +@mock.patch("datasources.graphql_ahr.fetch_ahr_data_from_graphql", side_effect=_fetch_ahr_data_from_graphql) def testWriteToBqNonBehavioralHealthSexNational(_mock_fetch: mock.MagicMock, mock_add_df_to_bq: mock.MagicMock): datasource = GraphQlAHRData() datasource.write_to_bq( - 'dataset', 'gcs_bucket', demographic='sex', geographic='national', category='non-behavioral_health' + "dataset", "gcs_bucket", demographic="sex", geographic="national", category="non-behavioral_health" ) assert mock_add_df_to_bq.call_count == 2 @@ -203,7 +203,7 @@ def 
testWriteToBqNonBehavioralHealthSexNational(_mock_fetch: mock.MagicMock, moc assert_frame_equal(actual_df_current, expected_df_current, check_like=True) actual_df_historical, _, table_name = mock_add_df_to_bq.call_args_list[1][0] - expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, 'time_period': str}) + expected_df_historical = pd.read_csv(GOLDEN_DATA[table_name], dtype={STATE_FIPS_COL: str, "time_period": str}) assert table_name == "non-behavioral_health_sex_national_historical" # actual_df_historical.to_csv(table_name, index=False) # print(actual_df_historical.to_string()) diff --git a/python/tests/datasources/test_kff_vaccination.py b/python/tests/datasources/test_kff_vaccination.py index fac10a7a05..c0b54f4307 100644 --- a/python/tests/datasources/test_kff_vaccination.py +++ b/python/tests/datasources/test_kff_vaccination.py @@ -10,62 +10,62 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "kff_vaccination") -GOLDEN_DATA = os.path.join(TEST_DIR, 'kff_vaccination_by_race_and_ethnicity.csv') +GOLDEN_DATA = os.path.join(TEST_DIR, "kff_vaccination_by_race_and_ethnicity.csv") def get_github_file_list_as_df(): - return pd.read_json(os.path.join(TEST_DIR, 'github_file_list.json')) + return pd.read_json(os.path.join(TEST_DIR, "github_file_list.json")) def get_percentage_of_race_test_data_as_df(): - return pd.read_csv(os.path.join(TEST_DIR, 'kff_vaccination_percentage_of_race_test.csv')) + return pd.read_csv(os.path.join(TEST_DIR, "kff_vaccination_percentage_of_race_test.csv")) def get_pct_share_race_test_data_as_df(): - return pd.read_csv(os.path.join(TEST_DIR, 'kff_vaccination_pct_share_race_test.csv')) + return pd.read_csv(os.path.join(TEST_DIR, "kff_vaccination_pct_share_race_test.csv")) def get_state_totals_test_data_as_df(): return pd.read_csv( - os.path.join(TEST_DIR, 'kff_vaccination_state_totals_test.csv'), - dtype={'one_dose': str}, + os.path.join(TEST_DIR, "kff_vaccination_state_totals_test.csv"), + dtype={"one_dose": str}, ) def get_kff_population_numbers_as_df(): - return pd.read_csv(os.path.join(TEST_DIR, 'kff_vaccination_population.csv'), dtype=str) + return pd.read_csv(os.path.join(TEST_DIR, "kff_vaccination_population.csv"), dtype=str) @mock.patch( - 'ingestion.gcs_to_bq_util.load_json_as_df_from_web_based_on_key', + "ingestion.gcs_to_bq_util.load_json_as_df_from_web_based_on_key", return_value=get_github_file_list_as_df(), ) def testGetDataUrlPctTotal(mock_json: mock.MagicMock): assert mock_json.call_count == 0 - assert get_data_url('pct_total') == "some-up-to-date-url" + assert get_data_url("pct_total") == "some-up-to-date-url" @mock.patch( - 'ingestion.gcs_to_bq_util.load_json_as_df_from_web_based_on_key', + "ingestion.gcs_to_bq_util.load_json_as_df_from_web_based_on_key", return_value=get_github_file_list_as_df(), ) def testGetDataUrlPctShare(mock_json: mock.MagicMock): assert mock_json.call_count == 0 - assert get_data_url('pct_share') == "some-other-up-to-date-url" + assert get_data_url("pct_share") == "some-other-up-to-date-url" @mock.patch( - 'ingestion.gcs_to_bq_util.load_json_as_df_from_web_based_on_key', + "ingestion.gcs_to_bq_util.load_json_as_df_from_web_based_on_key", return_value=get_github_file_list_as_df(), ) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_web', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_web", return_value=get_state_totals_test_data_as_df(), ) -@mock.patch('ingestion.github_util.decode_json_from_url_into_df') 
-@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.github_util.decode_json_from_url_into_df") +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBq( mock_bq: mock.MagicMock, mock_csv: mock.MagicMock, @@ -80,27 +80,27 @@ def testWriteToBq( kffVaccination = KFFVaccination() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - kffVaccination.write_to_bq('dataset', 'gcs_bucket', **kwargs) + kffVaccination.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_json.call_count == 3 assert mock_csv_web.call_count == 1 assert mock_bq.call_count == 2 - assert mock_bq.call_args_list[0].args[2] == 'race_and_ethnicity_state_current' - assert mock_bq.call_args_list[1].args[2] == 'alls_state_current' + assert mock_bq.call_args_list[0].args[2] == "race_and_ethnicity_state_current" + assert mock_bq.call_args_list[1].args[2] == "alls_state_current" df = mock_bq.call_args_list[0].args[0] expected_df = pd.read_csv( GOLDEN_DATA, dtype={ - 'state_fips': str, - 'vaccinated_pct_share': float, - 'vaccinated_population_pct': float, - 'acs_vaccinated_pop_pct': float, + "state_fips": str, + "vaccinated_pct_share": float, + "vaccinated_population_pct": float, + "acs_vaccinated_pop_pct": float, }, ) diff --git a/python/tests/datasources/test_maternal_mortality.py b/python/tests/datasources/test_maternal_mortality.py index c58e42f7c0..b861fdd2ee 100644 --- a/python/tests/datasources/test_maternal_mortality.py +++ b/python/tests/datasources/test_maternal_mortality.py @@ -8,75 +8,75 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data", "maternal_mortality") -GOLDEN_DIR = os.path.join(TEST_DIR, 'golden_data') +GOLDEN_DIR = os.path.join(TEST_DIR, "golden_data") GOLDEN_DATA_RACE_STATE_HISTORICAL = os.path.join( - GOLDEN_DIR, 'maternal_mortality_output_race_and_ethnicity_state_historical.csv' + GOLDEN_DIR, "maternal_mortality_output_race_and_ethnicity_state_historical.csv" ) GOLDEN_DATA_RACE_NATIONAL_HISTORICAL = os.path.join( - GOLDEN_DIR, 'maternal_mortality_output_race_and_ethnicity_national_historical.csv' + GOLDEN_DIR, "maternal_mortality_output_race_and_ethnicity_national_historical.csv" ) GOLDEN_DATA_RACE_STATE_CURRENT = os.path.join( - GOLDEN_DIR, 'maternal_mortality_output_race_and_ethnicity_state_current.csv' + GOLDEN_DIR, "maternal_mortality_output_race_and_ethnicity_state_current.csv" ) GOLDEN_DATA_RACE_NATIONAL_CURRENT = os.path.join( - GOLDEN_DIR, 'maternal_mortality_output_race_and_ethnicity_national_current.csv' + GOLDEN_DIR, "maternal_mortality_output_race_and_ethnicity_national_current.csv" ) @mock.patch( - 'ingestion.gcs_to_bq_util.load_tsv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_tsv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBq(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock, mock_tsv: mock.MagicMock): datasource = MaternalMortalityData() kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', 
- 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } - datasource.write_to_bq('dataset', 'gcs_bucket', **kwargs) + datasource.write_to_bq("dataset", "gcs_bucket", **kwargs) assert mock_csv.call_count == 2 # calls to JAMA main table and manual counts assert mock_tsv.call_count == 1 # call to CDC Natality counts at state level df_state_historical, _, table_name = mock_bq.call_args_list[0][0] # df_state_historical.to_csv(table_name, index=False) - assert table_name == 'race_state_historical' + assert table_name == "race_state_historical" expected_state_historical_df = pd.read_csv( - GOLDEN_DATA_RACE_STATE_HISTORICAL, dtype={'state_fips': str, 'time_period': str} + GOLDEN_DATA_RACE_STATE_HISTORICAL, dtype={"state_fips": str, "time_period": str} ) assert_frame_equal(df_state_historical, expected_state_historical_df, check_like=True, check_dtype=False) df_state_current, _, table_name = mock_bq.call_args_list[1][0] # df_state_current.to_csv(table_name, index=False) - assert table_name == 'race_state_current' + assert table_name == "race_state_current" - expected_state_current_df = pd.read_csv(GOLDEN_DATA_RACE_STATE_CURRENT, dtype={'state_fips': str}) + expected_state_current_df = pd.read_csv(GOLDEN_DATA_RACE_STATE_CURRENT, dtype={"state_fips": str}) assert_frame_equal(df_state_current, expected_state_current_df, check_like=True, check_dtype=False) df_national_historical, _, table_name = mock_bq.call_args_list[2][0] # df_national_historical.to_csv(table_name, index=False) - assert table_name == 'race_national_historical' + assert table_name == "race_national_historical" expected_national_historical_df = pd.read_csv( - GOLDEN_DATA_RACE_NATIONAL_HISTORICAL, dtype={'state_fips': str, 'time_period': str} + GOLDEN_DATA_RACE_NATIONAL_HISTORICAL, dtype={"state_fips": str, "time_period": str} ) assert_frame_equal(df_national_historical, expected_national_historical_df, check_like=True, check_dtype=False) df_national_current, _, table_name = mock_bq.call_args_list[3][0] # df_national_current.to_csv(table_name, index=False) - assert table_name == 'race_national_current' + assert table_name == "race_national_current" - expected_national_current_df = pd.read_csv(GOLDEN_DATA_RACE_NATIONAL_CURRENT, dtype={'state_fips': str}) + expected_national_current_df = pd.read_csv(GOLDEN_DATA_RACE_NATIONAL_CURRENT, dtype={"state_fips": str}) assert_frame_equal(df_national_current, expected_national_current_df, check_like=True, check_dtype=False) diff --git a/python/tests/datasources/test_phrma.py b/python/tests/datasources/test_phrma.py index 233969602c..5031e4acab 100644 --- a/python/tests/datasources/test_phrma.py +++ b/python/tests/datasources/test_phrma.py @@ -6,33 +6,33 @@ import os THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data') -GOLDEN_DIR = os.path.join(TEST_DIR, PHRMA_DIR, 'golden_data') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data") +GOLDEN_DIR = os.path.join(TEST_DIR, PHRMA_DIR, "golden_data") ALLS_DATA = { - "national": os.path.join(TEST_DIR, PHRMA_DIR, "mocked_alls", 'national-alls.csv'), - "state": os.path.join(TEST_DIR, PHRMA_DIR, "mocked_alls", 'state-alls.csv'), - "county": os.path.join(TEST_DIR, PHRMA_DIR, "mocked_alls", 'county-alls.csv'), + "national": os.path.join(TEST_DIR, PHRMA_DIR, "mocked_alls", "national-alls.csv"), + "state": os.path.join(TEST_DIR, PHRMA_DIR, "mocked_alls", "state-alls.csv"), + "county": os.path.join(TEST_DIR, PHRMA_DIR, 
"mocked_alls", "county-alls.csv"), } GOLDEN_DATA = { - 'lis_national_current': os.path.join(GOLDEN_DIR, 'expected_lis_national.csv'), - 'eligibility_national_current': os.path.join(GOLDEN_DIR, 'expected_eligibility_national.csv'), - 'sex_national_current': os.path.join(GOLDEN_DIR, 'expected_sex_national.csv'), - 'sex_state_current': os.path.join(GOLDEN_DIR, 'expected_sex_state.csv'), - 'race_and_ethnicity_state_current': os.path.join(GOLDEN_DIR, 'expected_race_and_ethnicity_state.csv'), - 'age_county_current': os.path.join(GOLDEN_DIR, 'expected_age_county.csv'), + "lis_national_current": os.path.join(GOLDEN_DIR, "expected_lis_national.csv"), + "eligibility_national_current": os.path.join(GOLDEN_DIR, "expected_eligibility_national.csv"), + "sex_national_current": os.path.join(GOLDEN_DIR, "expected_sex_national.csv"), + "sex_state_current": os.path.join(GOLDEN_DIR, "expected_sex_state.csv"), + "race_and_ethnicity_state_current": os.path.join(GOLDEN_DIR, "expected_race_and_ethnicity_state.csv"), + "age_county_current": os.path.join(GOLDEN_DIR, "expected_age_county.csv"), } def _load_csv_as_df_from_data_dir(*args, **kwargs): directory, filename = args print("MOCKING FILE READ FROM /data", directory, filename) - dtype = kwargs['dtype'] - na_values = kwargs['na_values'] - subdirectory = kwargs['subdirectory'] - usecols = kwargs['usecols'] - file_path = os.path.join(TEST_DIR, directory, 'test_input_data', subdirectory, filename) + dtype = kwargs["dtype"] + na_values = kwargs["na_values"] + subdirectory = kwargs["subdirectory"] + usecols = kwargs["usecols"] + file_path = os.path.join(TEST_DIR, directory, "test_input_data", subdirectory, filename) df = pd.read_csv(file_path, na_values=na_values, dtype=dtype, usecols=usecols) @@ -57,9 +57,9 @@ def _generate_breakdown_df(*args): # BREAKDOWN TESTS -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) def testBreakdownLisNational( @@ -73,16 +73,16 @@ def testBreakdownLisNational( assert mock_data_dir.call_count == 11 * 2 (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'lis_national_current' + assert table_name == "lis_national_current" - expected_df = pd.read_csv(GOLDEN_DATA['lis_national_current'], dtype={"state_fips": str}) + expected_df = pd.read_csv(GOLDEN_DATA["lis_national_current"], dtype={"state_fips": str}) # breakdown_df.to_csv(table_name, index=False) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) def testBreakdownEligibilityNational( @@ -96,16 +96,16 @@ def testBreakdownEligibilityNational( assert mock_data_dir.call_count == 11 * 2 (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'eligibility_national_current' + assert table_name == "eligibility_national_current" - expected_df = pd.read_csv(GOLDEN_DATA['eligibility_national_current'], dtype={"state_fips": str}) + expected_df = pd.read_csv(GOLDEN_DATA["eligibility_national_current"], 
dtype={"state_fips": str}) # breakdown_df.to_csv(table_name, index=False) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) def testBreakdownSexNational( @@ -119,16 +119,16 @@ def testBreakdownSexNational( assert mock_data_dir.call_count == 11 * 2 (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'sex_national_current' + assert table_name == "sex_national_current" - expected_df = pd.read_csv(GOLDEN_DATA['sex_national_current'], dtype={"state_fips": str}) + expected_df = pd.read_csv(GOLDEN_DATA["sex_national_current"], dtype={"state_fips": str}) # breakdown_df.to_csv(table_name, index=False) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) def testBreakdownSexState( @@ -142,16 +142,16 @@ def testBreakdownSexState( assert mock_data_dir.call_count == 11 * 2 (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'sex_state_current' + assert table_name == "sex_state_current" - expected_df = pd.read_csv(GOLDEN_DATA['sex_state_current'], dtype={"state_fips": str}) + expected_df = pd.read_csv(GOLDEN_DATA["sex_state_current"], dtype={"state_fips": str}) # breakdown_df.to_csv(table_name, index=False) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) def testBreakdownRaceState( @@ -165,16 +165,16 @@ def testBreakdownRaceState( assert mock_data_dir.call_count == 11 * 2 (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'race_and_ethnicity_state_current' + assert table_name == "race_and_ethnicity_state_current" - expected_df = pd.read_csv(GOLDEN_DATA['race_and_ethnicity_state_current'], dtype={"state_fips": str}) + expected_df = pd.read_csv(GOLDEN_DATA["race_and_ethnicity_state_current"], dtype={"state_fips": str}) # breakdown_df.to_csv(table_name, index=False) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_data_dir, ) def testBreakdownAgeCounty( @@ -188,9 +188,9 @@ def testBreakdownAgeCounty( assert mock_data_dir.call_count == 11 * 2 (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'age_county_current' + assert table_name == "age_county_current" - 
expected_df = pd.read_csv(GOLDEN_DATA['age_county_current'], dtype={"county_fips": str, "state_fips": str}) + expected_df = pd.read_csv(GOLDEN_DATA["age_county_current"], dtype={"county_fips": str, "state_fips": str}) # breakdown_df.to_csv(table_name, index=False) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) diff --git a/python/tests/datasources/test_phrma_brfss.py b/python/tests/datasources/test_phrma_brfss.py index 672ffa55e6..8a41ab5f76 100644 --- a/python/tests/datasources/test_phrma_brfss.py +++ b/python/tests/datasources/test_phrma_brfss.py @@ -6,32 +6,32 @@ from pandas._testing import assert_frame_equal THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DIR = os.path.join(THIS_DIR, os.pardir, 'data') -GOLDEN_DIR = os.path.join(TEST_DIR, 'phrma_brfss', 'golden_data') +TEST_DIR = os.path.join(THIS_DIR, os.pardir, "data") +GOLDEN_DIR = os.path.join(TEST_DIR, "phrma_brfss", "golden_data") GOLDEN_DATA = { - 'race_and_ethnicity_national_current': os.path.join(GOLDEN_DIR, 'expected_race_and_ethnicity_national.csv'), - 'race_and_ethnicity_state_current': os.path.join(GOLDEN_DIR, 'expected_race_and_ethnicity_state.csv'), - 'age_national_current': os.path.join(GOLDEN_DIR, 'expected_age_national.csv'), - 'age_state_current': os.path.join(GOLDEN_DIR, 'expected_age_state.csv'), - 'sex_national_current': os.path.join(GOLDEN_DIR, 'expected_sex_national.csv'), - 'sex_state_current': os.path.join(GOLDEN_DIR, 'expected_sex_state.csv'), - 'insurance_status_national_current': os.path.join(GOLDEN_DIR, 'expected_insurance_status_national.csv'), - 'insurance_status_state_current': os.path.join(GOLDEN_DIR, 'expected_insurance_status_state.csv'), - 'income_national_current': os.path.join(GOLDEN_DIR, 'expected_income_national.csv'), - 'income_state_current': os.path.join(GOLDEN_DIR, 'expected_income_state.csv'), - 'education_national_current': os.path.join(GOLDEN_DIR, 'expected_education_national.csv'), - 'education_state_current': os.path.join(GOLDEN_DIR, 'expected_education_state.csv'), + "race_and_ethnicity_national_current": os.path.join(GOLDEN_DIR, "expected_race_and_ethnicity_national.csv"), + "race_and_ethnicity_state_current": os.path.join(GOLDEN_DIR, "expected_race_and_ethnicity_state.csv"), + "age_national_current": os.path.join(GOLDEN_DIR, "expected_age_national.csv"), + "age_state_current": os.path.join(GOLDEN_DIR, "expected_age_state.csv"), + "sex_national_current": os.path.join(GOLDEN_DIR, "expected_sex_national.csv"), + "sex_state_current": os.path.join(GOLDEN_DIR, "expected_sex_state.csv"), + "insurance_status_national_current": os.path.join(GOLDEN_DIR, "expected_insurance_status_national.csv"), + "insurance_status_state_current": os.path.join(GOLDEN_DIR, "expected_insurance_status_state.csv"), + "income_national_current": os.path.join(GOLDEN_DIR, "expected_income_national.csv"), + "income_state_current": os.path.join(GOLDEN_DIR, "expected_income_state.csv"), + "education_national_current": os.path.join(GOLDEN_DIR, "expected_education_national.csv"), + "education_state_current": os.path.join(GOLDEN_DIR, "expected_education_state.csv"), } # # BREAKDOWN TESTS -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownRaceNational( @@ -44,16 +44,16 @@ def 
testBreakdownRaceNational( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'race_and_ethnicity_national_current' + assert table_name == "race_and_ethnicity_national_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownRaceState( @@ -66,16 +66,16 @@ def testBreakdownRaceState( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'race_and_ethnicity_state_current' + assert table_name == "race_and_ethnicity_state_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownAgeNational( @@ -88,16 +88,16 @@ def testBreakdownAgeNational( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'age_national_current' + assert table_name == "age_national_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownAgeState( @@ -110,16 +110,16 @@ def testBreakdownAgeState( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'age_state_current' + assert table_name == "age_state_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownSexNational( @@ -132,16 +132,16 @@ def testBreakdownSexNational( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'sex_national_current' + assert table_name == "sex_national_current" # breakdown_df.to_csv(table_name, index=False) expected_df = 
pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownSexState( @@ -154,16 +154,16 @@ def testBreakdownSexState( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'sex_state_current' + assert table_name == "sex_state_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownInsuranceNational( @@ -176,16 +176,16 @@ def testBreakdownInsuranceNational( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'insurance_status_national_current' + assert table_name == "insurance_status_national_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownInsuranceState( @@ -198,16 +198,16 @@ def testBreakdownInsuranceState( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'insurance_status_state_current' + assert table_name == "insurance_status_state_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownEducationNational( @@ -220,16 +220,16 @@ def testBreakdownEducationNational( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'education_national_current' + assert table_name == "education_national_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) 
+@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownEducationState( @@ -242,16 +242,16 @@ def testBreakdownEducationState( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'education_state_current' + assert table_name == "education_state_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownIncomeNational( @@ -264,16 +264,16 @@ def testBreakdownIncomeNational( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'income_national_current' + assert table_name == "income_national_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) assert_frame_equal(breakdown_df, expected_df, check_dtype=False, check_like=True) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) def testBreakdownIncomeState( @@ -286,7 +286,7 @@ def testBreakdownIncomeState( assert mock_data_dir.called (breakdown_df, _dataset, table_name), _dtypes = mock_bq_write.call_args - assert table_name == 'income_state_current' + assert table_name == "income_state_current" # breakdown_df.to_csv(table_name, index=False) expected_df = pd.read_csv(GOLDEN_DATA[table_name], dtype={"state_fips": str}) diff --git a/python/tests/datasources/test_utils.py b/python/tests/datasources/test_utils.py index 392a28cfaa..80929b0fd2 100644 --- a/python/tests/datasources/test_utils.py +++ b/python/tests/datasources/test_utils.py @@ -10,7 +10,7 @@ def get_acs_metadata_as_json(year: int): - metadata_file = 'acs_metadata_2021_and_earlier.json' if year < 2022 else 'acs_metadata_2022_and_later.json' + metadata_file = "acs_metadata_2021_and_earlier.json" if year < 2022 else "acs_metadata_2022_and_later.json" with open(os.path.join(TEST_DIR, metadata_file)) as f: return json.load(f) @@ -21,12 +21,12 @@ def _load_csv_as_df_from_real_data_dir(*args, **kwargs) -> pd.DataFrame: """ directory, filename = args print("ACTUALLY LOADING FROM /data", filename) - dtype = kwargs.get('dtype', None) - na_values = kwargs.get('na_values', None) - subdirectory = kwargs.get('subdirectory', '') - usecols = kwargs.get('usecols', None) - delimiter = kwargs.get('delimiter', None) - skipinitialspace = kwargs.get('skipinitialspace', None) + dtype = kwargs.get("dtype", None) + na_values = kwargs.get("na_values", None) + subdirectory = kwargs.get("subdirectory", "") + usecols = kwargs.get("usecols", None) + delimiter = kwargs.get("delimiter", None) + skipinitialspace = 
kwargs.get("skipinitialspace", None) file_path = os.path.join(REAL_DATA_DIR, directory, subdirectory, filename) diff --git a/python/tests/datasources/test_vera_incarceration_county.py b/python/tests/datasources/test_vera_incarceration_county.py index 4b3995406d..b00cb8ffce 100644 --- a/python/tests/datasources/test_vera_incarceration_county.py +++ b/python/tests/datasources/test_vera_incarceration_county.py @@ -14,14 +14,14 @@ GOLDEN_DATA = { - 'race_and_ethnicity_county_current': os.path.join(TEST_DIR, "golden_data", 'race_and_ethnicity_county_current.csv'), - 'race_and_ethnicity_county_historical': os.path.join( - TEST_DIR, "golden_data", 'race_and_ethnicity_county_historical.csv' + "race_and_ethnicity_county_current": os.path.join(TEST_DIR, "golden_data", "race_and_ethnicity_county_current.csv"), + "race_and_ethnicity_county_historical": os.path.join( + TEST_DIR, "golden_data", "race_and_ethnicity_county_historical.csv" ), - 'age_county_current': os.path.join(TEST_DIR, "golden_data", 'age_county_current.csv'), - 'age_county_historical': os.path.join(TEST_DIR, "golden_data", 'age_county_historical.csv'), - 'sex_county_current': os.path.join(TEST_DIR, "golden_data", 'sex_county_current.csv'), - 'sex_county_historical': os.path.join(TEST_DIR, "golden_data", 'sex_county_historical.csv'), + "age_county_current": os.path.join(TEST_DIR, "golden_data", "age_county_current.csv"), + "age_county_historical": os.path.join(TEST_DIR, "golden_data", "age_county_historical.csv"), + "sex_county_current": os.path.join(TEST_DIR, "golden_data", "sex_county_current.csv"), + "sex_county_historical": os.path.join(TEST_DIR, "golden_data", "sex_county_historical.csv"), } @@ -43,22 +43,22 @@ } kwargs = { - 'filename': 'test_file.csv', - 'metadata_table_id': 'test_metadata', - 'table_name': 'output_table', + "filename": "test_file.csv", + "metadata_table_id": "test_metadata", + "table_name": "output_table", } veraIncarcerationCounty = VeraIncarcerationCounty() @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqSex(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): kwargs["demographic"] = "sex" - veraIncarcerationCounty.write_to_bq('dataset', 'gcs_bucket', **kwargs) + veraIncarcerationCounty.write_to_bq("dataset", "gcs_bucket", **kwargs) # writes 1 current and 1 historical table per demo breakdown assert mock_bq.call_count == 2 @@ -78,13 +78,13 @@ def testWriteToBqSex(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): @mock.patch( - 'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqAge(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): kwargs["demographic"] = "age" - veraIncarcerationCounty.write_to_bq('dataset', 'gcs_bucket', **kwargs) + veraIncarcerationCounty.write_to_bq("dataset", "gcs_bucket", **kwargs) # writes 1 current and 1 historical table per demo breakdown assert mock_bq.call_count == 2 @@ -104,13 +104,13 @@ def testWriteToBqAge(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): @mock.patch( - 
'ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir', + "ingestion.gcs_to_bq_util.load_csv_as_df_from_data_dir", side_effect=_load_csv_as_df_from_real_data_dir, ) -@mock.patch('ingestion.gcs_to_bq_util.add_df_to_bq', return_value=None) +@mock.patch("ingestion.gcs_to_bq_util.add_df_to_bq", return_value=None) def testWriteToBqRace(mock_bq: mock.MagicMock, mock_csv: mock.MagicMock): kwargs["demographic"] = "race_and_ethnicity" - veraIncarcerationCounty.write_to_bq('dataset', 'gcs_bucket', **kwargs) + veraIncarcerationCounty.write_to_bq("dataset", "gcs_bucket", **kwargs) # writes 1 current and 1 historical table per demo breakdown assert mock_bq.call_count == 2 diff --git a/python/tests/ingestion/test_bjs_utils.py b/python/tests/ingestion/test_bjs_utils.py index 4aeba6b0ae..c63b7dd936 100644 --- a/python/tests/ingestion/test_bjs_utils.py +++ b/python/tests/ingestion/test_bjs_utils.py @@ -21,17 +21,17 @@ def test_set_state_col(): _fake_df = pd.DataFrame( { - 'Jurisdiction': ["Federal", None, None], - 'Unnamed: 1': [None, "Georgia", "Alaska"], - 'ignored_values': [1.0, 1.0, 1.0], + "Jurisdiction": ["Federal", None, None], + "Unnamed: 1": [None, "Georgia", "Alaska"], + "ignored_values": [1.0, 1.0, 1.0], } ) _expected_df_set_state_cols = pd.DataFrame( { std_col.STATE_NAME_COL: ["Federal", "Georgia", "Alaska"], - 'Jurisdiction': ["Federal", None, None], - 'Unnamed: 1': [None, "Georgia", "Alaska"], - 'ignored_values': [1.0, 1.0, 1.0], + "Jurisdiction": ["Federal", None, None], + "Unnamed: 1": [None, "Georgia", "Alaska"], + "ignored_values": [1.0, 1.0, 1.0], } ) @@ -47,9 +47,9 @@ def test_filter_cols(): "Maine", "Florida", ], - 'Male': [2.0, 4.0, 6.0], - 'Female': [1.0, 3.0, 5.0], - 'ignored_values': [1.0, 1.0, 1.0], + "Male": [2.0, 4.0, 6.0], + "Female": [1.0, 3.0, 5.0], + "ignored_values": [1.0, 1.0, 1.0], } ) @@ -60,8 +60,8 @@ def test_filter_cols(): "Maine", "Florida", ], - 'Male': [2.0, 4.0, 6.0], - 'Female': [1.0, 3.0, 5.0], + "Male": [2.0, 4.0, 6.0], + "Female": [1.0, 3.0, 5.0], } ) @@ -75,21 +75,21 @@ def test_filter_cols(): "Maine", "Florida", ], - 'Asian': [1_000_000, "~", 1000], - 'Black': [1_000_000, 100, "/"], + "Asian": [1_000_000, "~", 1000], + "Black": [1_000_000, 100, "/"], } ) _expected_by_race_df_missing_to_nan = pd.DataFrame( { std_col.STATE_NAME_COL: ["U.S. 
total", "Maine", "Florida"], - 'Asian': [1_000_000, np.nan, 1000], - 'Black': [1_000_000, 100, np.nan], + "Asian": [1_000_000, np.nan, 1000], + "Black": [1_000_000, 100, np.nan], } ) _expected_by_race_df_only_states = pd.DataFrame( - {std_col.STATE_NAME_COL: ["Maine", "Florida"], 'Asian': ["~", 1000], 'Black': [100, "/"]} + {std_col.STATE_NAME_COL: ["Maine", "Florida"], "Asian": ["~", 1000], "Black": [100, "/"]} ) @@ -115,8 +115,8 @@ def test_keep_only_national(): "Maine", "Florida", ], - 'Female': [1000, 100, 10], - 'Male': [1_000_000, 100_000, 10_000], + "Female": [1000, 100, 10], + "Male": [1_000_000, 100_000, 10_000], } ) @@ -127,8 +127,8 @@ def test_keep_only_national(): "Maine", "Florida", ], - 'Female': [1110, 100, 10], - 'Male': [1_110_000, 100_000, 10_000], + "Female": [1110, 100, 10], + "Male": [1_110_000, 100_000, 10_000], } ) @@ -137,8 +137,8 @@ def test_keep_only_national(): std_col.STATE_NAME_COL: [ "United States", ], - 'Female': [1110], - 'Male': [1_110_000], + "Female": [1110], + "Male": [1_110_000], } ) @@ -163,8 +163,8 @@ def test_cols_to_rows(): "Maine", "Florida", ], - 'Asian': [100, 200], - 'Black': [1000, 2000], + "Asian": [100, 200], + "Black": [1000, 2000], } ) @@ -176,8 +176,8 @@ def test_cols_to_rows(): "Maine", "Florida", ], - 'race': ["Asian", "Asian", "Black", "Black"], - 'some_value': [100, 200, 1000, 2000], + "race": ["Asian", "Asian", "Black", "Black"], + "some_value": [100, 200, 1000, 2000], } ) @@ -196,8 +196,8 @@ def test_strip_footnote_refs(): "Maine/b,c", "Florida", ], - 'Asian/e': [1, 2, 3], - 'Black': [4, 5, 6], + "Asian/e": [1, 2, 3], + "Black": [4, 5, 6], } ) @@ -208,8 +208,8 @@ def test_strip_footnote_refs(): "Maine", "Florida", ], - 'Asian': [1, 2, 3], - 'Black': [4, 5, 6], + "Asian": [1, 2, 3], + "Black": [4, 5, 6], } ) @@ -229,8 +229,8 @@ def test_swap_race_col_names_to_codes(): "Maine", "Florida", ], - 'American Indian/Alaska Native': [1, 2, 3], - 'Total': [4, 5, 6], + "American Indian/Alaska Native": [1, 2, 3], + "Total": [4, 5, 6], } ) @@ -241,8 +241,8 @@ def test_swap_race_col_names_to_codes(): "Maine", "Florida", ], - 'AIAN_NH': [1, 2, 3], - 'ALL': [4, 5, 6], + "AIAN_NH": [1, 2, 3], + "ALL": [4, 5, 6], } ) diff --git a/python/tests/ingestion/test_cdc_wisqars_utils.py b/python/tests/ingestion/test_cdc_wisqars_utils.py index dec576daa3..926e7cf8a0 100644 --- a/python/tests/ingestion/test_cdc_wisqars_utils.py +++ b/python/tests/ingestion/test_cdc_wisqars_utils.py @@ -22,64 +22,64 @@ def test_clean_numeric(): def test_contains_unknown(): - assert contains_unknown('unknown') is True - assert contains_unknown('Unknown') is True - assert contains_unknown('') is False - assert contains_unknown('known') is False + assert contains_unknown("unknown") is True + assert contains_unknown("Unknown") is True + assert contains_unknown("") is False + assert contains_unknown("known") is False def test_convert_columns_to_numeric(): fake_data_with_string_numbers = [ { - 'year': '2018', - 'some_topic_estimated_total': '94.0', + "year": "2018", + "some_topic_estimated_total": "94.0", }, { - 'year': '2018', - 'some_topic_estimated_total': '99', + "year": "2018", + "some_topic_estimated_total": "99", }, { - 'year': '2018', - 'some_topic_estimated_total': None, + "year": "2018", + "some_topic_estimated_total": None, }, ] expected_data_with_float_numbers = [ { - 'year': '2018', - 'some_topic_estimated_total': 94.0, + "year": "2018", + "some_topic_estimated_total": 94.0, }, { - 'year': '2018', - 'some_topic_estimated_total': 99.0, + "year": "2018", + 
"some_topic_estimated_total": 99.0, }, { - 'year': '2018', - 'some_topic_estimated_total': None, + "year": "2018", + "some_topic_estimated_total": None, }, ] df = pd.DataFrame(fake_data_with_string_numbers) expected_df = pd.DataFrame(expected_data_with_float_numbers) - cols_to_convert = ['some_topic_estimated_total'] + cols_to_convert = ["some_topic_estimated_total"] convert_columns_to_numeric(df, cols_to_convert) for column in cols_to_convert: - assert df[column].dtype == 'float64' + assert df[column].dtype == "float64" assert_frame_equal(df, expected_df) def test_generate_cols_map(): - count_cols = ['cat_estimated_total', 'dog_estimated_total'] - suffix = 'per_100k' + count_cols = ["cat_estimated_total", "dog_estimated_total"] + suffix = "per_100k" generated_map = generate_cols_map(count_cols, suffix) - expected_map = {'cat_estimated_total': 'cat_per_100k', 'dog_estimated_total': 'dog_per_100k'} + expected_map = {"cat_estimated_total": "cat_per_100k", "dog_estimated_total": "dog_per_100k"} assert generated_map == expected_map @@ -87,18 +87,18 @@ def test_generate_cols_map(): def test_generate_cols_map_empty(): count_cols = [] - suffix = 'per_100k' + suffix = "per_100k" expected_map = {} assert generate_cols_map(count_cols, suffix) == expected_map def test_generate_cols_map_bad_count_cols(): - count_cols = ['cat_estimated_total', 'dog_estimated_total', 'bird'] - suffix = 'per_100k' + count_cols = ["cat_estimated_total", "dog_estimated_total", "bird"] + suffix = "per_100k" assert generate_cols_map(count_cols, suffix) == { - 'cat_estimated_total': 'cat_per_100k', - 'dog_estimated_total': 'dog_per_100k', - 'bird': 'bird_per_100k', + "cat_estimated_total": "cat_per_100k", + "dog_estimated_total": "dog_per_100k", + "bird": "bird_per_100k", } diff --git a/python/tests/ingestion/test_census.py b/python/tests/ingestion/test_census.py index 357c6424bd..3ec78686ad 100644 --- a/python/tests/ingestion/test_census.py +++ b/python/tests/ingestion/test_census.py @@ -159,5 +159,5 @@ def testStandardizeFrameTwoDims(self): assert_frame_equal(expected_df, df) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/tests/ingestion/test_dataset_utils.py b/python/tests/ingestion/test_dataset_utils.py index 0fda684a0b..d0892d6eea 100644 --- a/python/tests/ingestion/test_dataset_utils.py +++ b/python/tests/ingestion/test_dataset_utils.py @@ -20,210 +20,210 @@ from io import StringIO _fake_race_data = [ - ['state_fips', 'state_name', 'race', 'population'], - ['01', 'Alabama', 'Asian alone', '660'], - ['01', 'Alabama', 'Some other race alone', '700'], - ['01', 'Alabama', 'Two or more races', '919'], - ['01', 'Alabama', 'An underrepresented race', '1'], - ['01', 'Alabama', 'ALL', '2280'], - ['01', 'Alabama', 'UNKNOWN', '30'], - ['02', 'Alaska', 'Asian alone', '45'], - ['02', 'Alaska', 'Some other race alone', '11'], - ['02', 'Alaska', 'Two or more races', '60'], - ['02', 'Alaska', 'ALL', '116'], - ['02', 'Alaska', 'UNKNOWN', '20'], - ['04', 'Arizona', 'Asian alone', '23'], - ['04', 'Arizona', 'Some other race alone', '46'], - ['04', 'Arizona', 'Two or more races', '26'], - ['04', 'Arizona', 'ALL', '95'], - ['04', 'Arizona', 'UNKNOWN', '10'], + ["state_fips", "state_name", "race", "population"], + ["01", "Alabama", "Asian alone", "660"], + ["01", "Alabama", "Some other race alone", "700"], + ["01", "Alabama", "Two or more races", "919"], + ["01", "Alabama", "An underrepresented race", "1"], + ["01", "Alabama", "ALL", "2280"], + ["01", "Alabama", "UNKNOWN", "30"], + ["02", 
"Alaska", "Asian alone", "45"], + ["02", "Alaska", "Some other race alone", "11"], + ["02", "Alaska", "Two or more races", "60"], + ["02", "Alaska", "ALL", "116"], + ["02", "Alaska", "UNKNOWN", "20"], + ["04", "Arizona", "Asian alone", "23"], + ["04", "Arizona", "Some other race alone", "46"], + ["04", "Arizona", "Two or more races", "26"], + ["04", "Arizona", "ALL", "95"], + ["04", "Arizona", "UNKNOWN", "10"], ] _expected_pct_share_data_without_unknowns = [ - ['state_fips', 'state_name', 'race', 'population', 'pct_share'], - ['01', 'Alabama', 'Asian alone', '660', '28.9'], - ['01', 'Alabama', 'Some other race alone', '700', '30.7'], - ['01', 'Alabama', 'Two or more races', '919', '40.3'], - ['01', 'Alabama', 'An underrepresented race', '1', '.04'], - ['01', 'Alabama', 'ALL', '2280', '100'], - ['02', 'Alaska', 'Asian alone', '45', '38.8'], - ['02', 'Alaska', 'Some other race alone', '11', '9.5'], - ['02', 'Alaska', 'Two or more races', '60', '51.7'], - ['02', 'Alaska', 'ALL', '116', '100'], - ['04', 'Arizona', 'Asian alone', '23', '24.2'], - ['04', 'Arizona', 'Some other race alone', '46', '48.4'], - ['04', 'Arizona', 'Two or more races', '26', '27.4'], - ['04', 'Arizona', 'ALL', '95', '100'], + ["state_fips", "state_name", "race", "population", "pct_share"], + ["01", "Alabama", "Asian alone", "660", "28.9"], + ["01", "Alabama", "Some other race alone", "700", "30.7"], + ["01", "Alabama", "Two or more races", "919", "40.3"], + ["01", "Alabama", "An underrepresented race", "1", ".04"], + ["01", "Alabama", "ALL", "2280", "100"], + ["02", "Alaska", "Asian alone", "45", "38.8"], + ["02", "Alaska", "Some other race alone", "11", "9.5"], + ["02", "Alaska", "Two or more races", "60", "51.7"], + ["02", "Alaska", "ALL", "116", "100"], + ["04", "Arizona", "Asian alone", "23", "24.2"], + ["04", "Arizona", "Some other race alone", "46", "48.4"], + ["04", "Arizona", "Two or more races", "26", "27.4"], + ["04", "Arizona", "ALL", "95", "100"], ] _expected_pct_share_data_with_unknowns = [ - ['state_fips', 'state_name', 'race', 'population', 'pct_share'], - ['01', 'Alabama', 'Asian alone', '660', '28.9'], - ['01', 'Alabama', 'Some other race alone', '700', '30.7'], - ['01', 'Alabama', 'Two or more races', '919', '40.3'], - ['01', 'Alabama', 'An underrepresented race', '1', '.04'], - ['01', 'Alabama', 'ALL', '2280', '100'], - ['01', 'Alabama', 'UNKNOWN', '30', '1.3'], - ['02', 'Alaska', 'Asian alone', '45', '38.8'], - ['02', 'Alaska', 'Some other race alone', '11', '9.5'], - ['02', 'Alaska', 'Two or more races', '60', '51.7'], - ['02', 'Alaska', 'ALL', '116', '100'], - ['02', 'Alaska', 'UNKNOWN', '20', '17.2'], - ['04', 'Arizona', 'Asian alone', '23', '24.2'], - ['04', 'Arizona', 'Some other race alone', '46', '48.4'], - ['04', 'Arizona', 'Two or more races', '26', '27.4'], - ['04', 'Arizona', 'ALL', '95', '100'], - ['04', 'Arizona', 'UNKNOWN', '10', '10.5'], + ["state_fips", "state_name", "race", "population", "pct_share"], + ["01", "Alabama", "Asian alone", "660", "28.9"], + ["01", "Alabama", "Some other race alone", "700", "30.7"], + ["01", "Alabama", "Two or more races", "919", "40.3"], + ["01", "Alabama", "An underrepresented race", "1", ".04"], + ["01", "Alabama", "ALL", "2280", "100"], + ["01", "Alabama", "UNKNOWN", "30", "1.3"], + ["02", "Alaska", "Asian alone", "45", "38.8"], + ["02", "Alaska", "Some other race alone", "11", "9.5"], + ["02", "Alaska", "Two or more races", "60", "51.7"], + ["02", "Alaska", "ALL", "116", "100"], + ["02", "Alaska", "UNKNOWN", "20", "17.2"], + ["04", "Arizona", "Asian 
alone", "23", "24.2"], + ["04", "Arizona", "Some other race alone", "46", "48.4"], + ["04", "Arizona", "Two or more races", "26", "27.4"], + ["04", "Arizona", "ALL", "95", "100"], + ["04", "Arizona", "UNKNOWN", "10", "10.5"], ] _fake_data_without_pct_relative_inequity_col = [ - ['state_fips', 'state_name', 'race', 'pct_share', 'pct_pop'], - ['01', 'Alabama', 'Race 1', 0, 0.0], - ['01', 'Alabama', 'Race 2', 10.0, 10.0], - ['01', 'Alabama', 'Race 3', 45.0, 80.0], - ['01', 'Alabama', 'Race 4', 45.0, 10.0], - ['01', 'Alabama', 'Race 5', None, None], + ["state_fips", "state_name", "race", "pct_share", "pct_pop"], + ["01", "Alabama", "Race 1", 0, 0.0], + ["01", "Alabama", "Race 2", 10.0, 10.0], + ["01", "Alabama", "Race 3", 45.0, 80.0], + ["01", "Alabama", "Race 4", 45.0, 10.0], + ["01", "Alabama", "Race 5", None, None], ] _expected_data_with_pct_relative_inequity_col = [ [ - 'state_fips', - 'state_name', - 'race', - 'pct_share', - 'pct_pop', - 'pct_relative_inequity', + "state_fips", + "state_name", + "race", + "pct_share", + "pct_pop", + "pct_relative_inequity", ], - ['01', 'Alabama', 'Race 1', 0, 0.0, np.nan], - ['01', 'Alabama', 'Race 2', 10.0, 10.0, 0.0], - ['01', 'Alabama', 'Race 3', 45.0, 80.0, -43.8], - ['01', 'Alabama', 'Race 4', 45.0, 10.0, 350.0], - ['01', 'Alabama', 'Race 5', None, None, np.nan], + ["01", "Alabama", "Race 1", 0, 0.0, np.nan], + ["01", "Alabama", "Race 2", 10.0, 10.0, 0.0], + ["01", "Alabama", "Race 3", 45.0, 80.0, -43.8], + ["01", "Alabama", "Race 4", 45.0, 10.0, 350.0], + ["01", "Alabama", "Race 5", None, None, np.nan], ] _fake_data_with_pct_rel_inequity_with_zero_rates = [ [ - 'time_period', - 'state_fips', - 'state_name', - 'race_category_id', - 'something_per_100k', - 'something_pct_relative_inequity', - 'something_pop_pct', + "time_period", + "state_fips", + "state_name", + "race_category_id", + "something_per_100k", + "something_pct_relative_inequity", + "something_pop_pct", ], - ['2018', '99', 'StateWithRates', 'RaceNoPop', 90_000, None, None], - ['2019', '01', 'Alabama', 'Race1', 0, -100.0, 10.0], - ['2019', '01', 'Alabama', 'Race2', 10.001, 0.0, 10.0], - ['2019', '01', 'Alabama', 'Race3', 60.0, 500.0, 10.0], - ['2019', '01', 'Alabama', 'Race4', 60.0, None, 10.0], - ['2019', '01', 'Alabama', 'RaceNoPop', 1, None, None], - ['2019', '01', 'Alabama', 'Race6', 100.0, None, 10.0], - ['2020', '01', 'Alabama', 'Race1', 0, -100.0, 10.0], - ['2020', '01', 'Alabama', 'Race2', 0, 0.0, 10.0], - ['2020', '01', 'Alabama', 'Race3', 0, 500.0, 10.0], - ['2020', '01', 'Alabama', 'Race4', 0, None, 10.0], - ['2020', '01', 'Alabama', 'RaceNoPop', 0, None, None], - ['2020', '01', 'Alabama', 'Race6', 0, None, 10.0], - ['2020', '99', 'StateWithRates', 'Race6', 100_000, 50.0, 10.0], + ["2018", "99", "StateWithRates", "RaceNoPop", 90_000, None, None], + ["2019", "01", "Alabama", "Race1", 0, -100.0, 10.0], + ["2019", "01", "Alabama", "Race2", 10.001, 0.0, 10.0], + ["2019", "01", "Alabama", "Race3", 60.0, 500.0, 10.0], + ["2019", "01", "Alabama", "Race4", 60.0, None, 10.0], + ["2019", "01", "Alabama", "RaceNoPop", 1, None, None], + ["2019", "01", "Alabama", "Race6", 100.0, None, 10.0], + ["2020", "01", "Alabama", "Race1", 0, -100.0, 10.0], + ["2020", "01", "Alabama", "Race2", 0, 0.0, 10.0], + ["2020", "01", "Alabama", "Race3", 0, 500.0, 10.0], + ["2020", "01", "Alabama", "Race4", 0, None, 10.0], + ["2020", "01", "Alabama", "RaceNoPop", 0, None, None], + ["2020", "01", "Alabama", "Race6", 0, None, 10.0], + ["2020", "99", "StateWithRates", "Race6", 100_000, 50.0, 10.0], ] 
_expected_data_with_properly_zeroed_pct_rel_inequity = [ [ - 'time_period', - 'state_fips', - 'state_name', - 'race_category_id', - 'something_per_100k', - 'something_pct_relative_inequity', - 'something_pop_pct', + "time_period", + "state_fips", + "state_name", + "race_category_id", + "something_per_100k", + "something_pct_relative_inequity", + "something_pop_pct", ], - ['2018', '99', 'StateWithRates', 'RaceNoPop', 90_000, np.nan, np.nan], - ['2019', '01', 'Alabama', 'Race1', 0, -100.0, 10.0], - ['2019', '01', 'Alabama', 'Race2', 10.001, 0.0, 10.0], - ['2019', '01', 'Alabama', 'Race3', 60.0, 500.0, 10.0], - ['2019', '01', 'Alabama', 'Race4', 60.0, np.nan, 10.0], - ['2019', '01', 'Alabama', 'RaceNoPop', 1, np.nan, np.nan], - ['2019', '01', 'Alabama', 'Race6', 100.0, np.nan, 10.0], + ["2018", "99", "StateWithRates", "RaceNoPop", 90_000, np.nan, np.nan], + ["2019", "01", "Alabama", "Race1", 0, -100.0, 10.0], + ["2019", "01", "Alabama", "Race2", 10.001, 0.0, 10.0], + ["2019", "01", "Alabama", "Race3", 60.0, 500.0, 10.0], + ["2019", "01", "Alabama", "Race4", 60.0, np.nan, 10.0], + ["2019", "01", "Alabama", "RaceNoPop", 1, np.nan, np.nan], + ["2019", "01", "Alabama", "Race6", 100.0, np.nan, 10.0], # all rates in Alabama in 2020 are zero, so all pct_rel_inequity are ZEROED # expect for races where the population_pct_share is null - ['2020', '01', 'Alabama', 'Race1', 0, 0, 10.0], - ['2020', '01', 'Alabama', 'Race2', 0, 0, 10.0], - ['2020', '01', 'Alabama', 'Race3', 0, 0, 10.0], - ['2020', '01', 'Alabama', 'Race4', 0, 0, 10.0], - ['2020', '01', 'Alabama', 'RaceNoPop', 0, np.nan, np.nan], - ['2020', '01', 'Alabama', 'Race6', 0, 0, 10.0], + ["2020", "01", "Alabama", "Race1", 0, 0, 10.0], + ["2020", "01", "Alabama", "Race2", 0, 0, 10.0], + ["2020", "01", "Alabama", "Race3", 0, 0, 10.0], + ["2020", "01", "Alabama", "Race4", 0, 0, 10.0], + ["2020", "01", "Alabama", "RaceNoPop", 0, np.nan, np.nan], + ["2020", "01", "Alabama", "Race6", 0, 0, 10.0], # each PLACE/YEAR is considered independently so the fact Race6 # has a rate in StateWithRates doesn't prevent the zeroing above - ['2020', '99', 'StateWithRates', 'Race6', 100_000, 50.0, 10.0], + ["2020", "99", "StateWithRates", "Race6", 100_000, 50.0, 10.0], ] _fake_condition_data = [ - ['state_fips', 'state_name', 'race', 'some_condition_total', 'population'], - ['01', 'Alabama', 'Asian alone', 100, 1000], - ['01', 'Alabama', 'Some other race alone', 200, 5000], - ['02', 'Alaska', 'Two or more races', 10, 2000], - ['02', 'Alaska', 'TOTAL', 100, 4000], - ['04', 'Arizona', 'Two or more races', 20, 4000], - ['04', 'Arizona', 'TOTAL', 10, 2000], + ["state_fips", "state_name", "race", "some_condition_total", "population"], + ["01", "Alabama", "Asian alone", 100, 1000], + ["01", "Alabama", "Some other race alone", 200, 5000], + ["02", "Alaska", "Two or more races", 10, 2000], + ["02", "Alaska", "TOTAL", 100, 4000], + ["04", "Arizona", "Two or more races", 20, 4000], + ["04", "Arizona", "TOTAL", 10, 2000], ] _fake_condition_data_with_per_100k = [ [ - 'state_fips', - 'state_name', - 'race', - 'some_condition_total', - 'population', - 'condition_per_100k', + "state_fips", + "state_name", + "race", + "some_condition_total", + "population", + "condition_per_100k", ], - ['01', 'Alabama', 'Asian alone', 100, 1000, 10000], - ['01', 'Alabama', 'Some other race alone', 200, 5000, 4000], - ['02', 'Alaska', 'Two or more races', 10, 2000, 500], - ['02', 'Alaska', 'TOTAL', 100, 4000, 2500], - ['04', 'Arizona', 'Two or more races', 20, 4000, 500], - ['04', 'Arizona', 'TOTAL', 
10, 2000, 500], + ["01", "Alabama", "Asian alone", 100, 1000, 10000], + ["01", "Alabama", "Some other race alone", 200, 5000, 4000], + ["02", "Alaska", "Two or more races", 10, 2000, 500], + ["02", "Alaska", "TOTAL", 100, 4000, 2500], + ["04", "Arizona", "Two or more races", 20, 4000, 500], + ["04", "Arizona", "TOTAL", 10, 2000, 500], ] _fake_race_data_without_totals = [ - ['state_fips', 'state_name', 'race', 'population'], - ['01', 'Alabama', 'Asian alone', '66'], - ['01', 'Alabama', 'Some other race alone', '70'], - ['01', 'Alabama', 'Two or more races', '92'], - ['02', 'Alaska', 'Asian alone', '45'], - ['02', 'Alaska', 'Some other race alone', '11'], - ['02', 'Alaska', 'Two or more races', '60'], - ['04', 'Arizona', 'Asian alone', '23'], - ['04', 'Arizona', 'Some other race alone', '46'], - ['04', 'Arizona', 'Two or more races', '26'], + ["state_fips", "state_name", "race", "population"], + ["01", "Alabama", "Asian alone", "66"], + ["01", "Alabama", "Some other race alone", "70"], + ["01", "Alabama", "Two or more races", "92"], + ["02", "Alaska", "Asian alone", "45"], + ["02", "Alaska", "Some other race alone", "11"], + ["02", "Alaska", "Two or more races", "60"], + ["04", "Arizona", "Asian alone", "23"], + ["04", "Arizona", "Some other race alone", "46"], + ["04", "Arizona", "Two or more races", "26"], ] _expected_race_data_with_totals = [ - ['state_fips', 'state_name', 'race', 'population'], - ['01', 'Alabama', 'Asian alone', '66'], - ['01', 'Alabama', 'Some other race alone', '70'], - ['01', 'Alabama', 'Two or more races', '92'], - ['02', 'Alaska', 'Asian alone', '45'], - ['02', 'Alaska', 'Some other race alone', '11'], - ['02', 'Alaska', 'Two or more races', '60'], - ['04', 'Arizona', 'Asian alone', '23'], - ['04', 'Arizona', 'Some other race alone', '46'], - ['04', 'Arizona', 'Two or more races', '26'], - ['01', 'Alabama', 'ALL', '228'], - ['02', 'Alaska', 'ALL', '116'], - ['04', 'Arizona', 'ALL', '95'], + ["state_fips", "state_name", "race", "population"], + ["01", "Alabama", "Asian alone", "66"], + ["01", "Alabama", "Some other race alone", "70"], + ["01", "Alabama", "Two or more races", "92"], + ["02", "Alaska", "Asian alone", "45"], + ["02", "Alaska", "Some other race alone", "11"], + ["02", "Alaska", "Two or more races", "60"], + ["04", "Arizona", "Asian alone", "23"], + ["04", "Arizona", "Some other race alone", "46"], + ["04", "Arizona", "Two or more races", "26"], + ["01", "Alabama", "ALL", "228"], + ["02", "Alaska", "ALL", "116"], + ["04", "Arizona", "ALL", "95"], ] _fake_data_missing_zeros = [ - ['state_fips', 'state_name', 'race', 'population'], - ['1', 'Alabama', 'Asian alone', '66'], - ['1', 'Alabama', 'Some other race alone', '70'], - ['1', 'Alabama', 'Two or more races', '92'], - ['2', 'Alaska', 'Asian alone', '45'], - ['2', 'Alaska', 'Some other race alone', '11'], - ['2', 'Alaska', 'Two or more races', '60'], - ['4', 'Arizona', 'Asian alone', '23'], - ['4', 'Arizona', 'Some other race alone', '46'], - ['4', 'Arizona', 'Two or more races', '26'], + ["state_fips", "state_name", "race", "population"], + ["1", "Alabama", "Asian alone", "66"], + ["1", "Alabama", "Some other race alone", "70"], + ["1", "Alabama", "Two or more races", "92"], + ["2", "Alaska", "Asian alone", "45"], + ["2", "Alaska", "Some other race alone", "11"], + ["2", "Alaska", "Two or more races", "60"], + ["4", "Arizona", "Asian alone", "23"], + ["4", "Arizona", "Some other race alone", "46"], + ["4", "Arizona", "Two or more races", "26"], ] @@ -241,14 +241,14 @@ def testPercentAvoidRoundingToZero(): 
def testAddSumOfRows(): df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_fake_race_data_without_totals))).reset_index(drop=True) - df['population'] = df['population'].astype(int) - df = dataset_utils.add_sum_of_rows(df, 'race', 'population', 'ALL') + df["population"] = df["population"].astype(int) + df = dataset_utils.add_sum_of_rows(df, "race", "population", "ALL") expected_df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_expected_race_data_with_totals))).reset_index( drop=True ) - expected_df['population'] = expected_df['population'].astype(int) + expected_df["population"] = expected_df["population"].astype(int) assert_frame_equal(expected_df, df) @@ -256,18 +256,18 @@ def testAddSumOfRows(): def testGeneratePctShareColWithoutUnknowns(): df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_fake_race_data))).reset_index(drop=True) - df = df.loc[df['race'] != 'UNKNOWN'] - df['population'] = df['population'].astype(float) + df = df.loc[df["race"] != "UNKNOWN"] + df["population"] = df["population"].astype(float) expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_pct_share_data_without_unknowns)) ).reset_index(drop=True) - expected_df['population'] = expected_df['population'].astype(float) + expected_df["population"] = expected_df["population"].astype(float) - expected_df['pct_share'] = expected_df['pct_share'].astype(float) + expected_df["pct_share"] = expected_df["pct_share"].astype(float) - df = dataset_utils.generate_pct_share_col_without_unknowns(df, {'population': 'pct_share'}, 'race', 'ALL') + df = dataset_utils.generate_pct_share_col_without_unknowns(df, {"population": "pct_share"}, "race", "ALL") assert_frame_equal(expected_df, df) @@ -275,19 +275,19 @@ def testGeneratePctShareColWithoutUnknowns(): def testGeneratePctShareColWithUnknowns(): df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_fake_race_data))).reset_index(drop=True) - df['population'] = df['population'].astype(float) + df["population"] = df["population"].astype(float) expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_pct_share_data_with_unknowns)) ).reset_index(drop=True) - expected_df['population'] = expected_df['population'].astype(float) + expected_df["population"] = expected_df["population"].astype(float) - expected_df['pct_share'] = expected_df['pct_share'].astype(float) + expected_df["pct_share"] = expected_df["pct_share"].astype(float) - df = dataset_utils.generate_pct_share_col_with_unknowns(df, {'population': 'pct_share'}, 'race', 'ALL', 'UNKNOWN') + df = dataset_utils.generate_pct_share_col_with_unknowns(df, {"population": "pct_share"}, "race", "ALL", "UNKNOWN") - df = df.sort_values(by=['state_fips']).reset_index(drop=True) + df = df.sort_values(by=["state_fips"]).reset_index(drop=True) assert_frame_equal(expected_df, df) @@ -297,61 +297,61 @@ def testGeneratePctShareColExtraTotalError(): extra_row = pd.DataFrame( [ { - 'state_fips': '01', - 'state_name': 'Alabama', - 'race': 'ALL', - 'population': '66', + "state_fips": "01", + "state_name": "Alabama", + "race": "ALL", + "population": "66", } ] ) df = pd.concat([df, extra_row]) - df = df.loc[df['race'] != 'UNKNOWN'] - df['population'] = df['population'].astype(float) + df = df.loc[df["race"] != "UNKNOWN"] + df["population"] = df["population"].astype(float) expected_error = re.escape("Fips ('01',) has 2 ALL rows, there should be 1") with pytest.raises(ValueError, match=expected_error): - df = dataset_utils.generate_pct_share_col_without_unknowns(df, {'population': 
'pct_share'}, 'race', 'ALL') + df = dataset_utils.generate_pct_share_col_without_unknowns(df, {"population": "pct_share"}, "race", "ALL") def testGeneratePer100kCol(): df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_fake_condition_data))).reset_index(drop=True) - df = dataset_utils.generate_per_100k_col(df, 'some_condition_total', 'population', 'condition_per_100k') + df = dataset_utils.generate_per_100k_col(df, "some_condition_total", "population", "condition_per_100k") expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_fake_condition_data_with_per_100k)) ).reset_index(drop=True) - expected_df['condition_per_100k'] = df['condition_per_100k'].astype(float) + expected_df["condition_per_100k"] = df["condition_per_100k"].astype(float) assert_frame_equal(expected_df, df, check_like=True) def test_generate_pct_rate_col(): data = [ - {'some_condition_total': 1, 'population': 2}, - {'some_condition_total': 11, 'population': 1000}, - {'some_condition_total': 0, 'population': 1000}, - {'some_condition_total': 1, 'population': 0}, - {'some_condition_total': None, 'population': 1000}, - {'some_condition_total': 1, 'population': 1000}, + {"some_condition_total": 1, "population": 2}, + {"some_condition_total": 11, "population": 1000}, + {"some_condition_total": 0, "population": 1000}, + {"some_condition_total": 1, "population": 0}, + {"some_condition_total": None, "population": 1000}, + {"some_condition_total": 1, "population": 1000}, ] df = pd.DataFrame(data) - df = dataset_utils.generate_pct_rate_col(df, 'some_condition_total', 'population', 'condition_pct_rate') + df = dataset_utils.generate_pct_rate_col(df, "some_condition_total", "population", "condition_pct_rate") expected_data = [ - {'some_condition_total': 1, 'population': 2, 'condition_pct_rate': 50}, - {'some_condition_total': 11, 'population': 1000, 'condition_pct_rate': 1.0}, - {'some_condition_total': 0, 'population': 1000, 'condition_pct_rate': 0.0}, - {'some_condition_total': 1, 'population': 0, 'condition_pct_rate': None}, - {'some_condition_total': None, 'population': 1000, 'condition_pct_rate': None}, - {'some_condition_total': 1, 'population': 1000, 'condition_pct_rate': 0.0}, + {"some_condition_total": 1, "population": 2, "condition_pct_rate": 50}, + {"some_condition_total": 11, "population": 1000, "condition_pct_rate": 1.0}, + {"some_condition_total": 0, "population": 1000, "condition_pct_rate": 0.0}, + {"some_condition_total": 1, "population": 0, "condition_pct_rate": None}, + {"some_condition_total": None, "population": 1000, "condition_pct_rate": None}, + {"some_condition_total": 1, "population": 1000, "condition_pct_rate": 0.0}, ] expected_df = pd.DataFrame(expected_data) - expected_df['condition_pct_rate'] = expected_df['condition_pct_rate'].astype(float) + expected_df["condition_pct_rate"] = expected_df["condition_pct_rate"].astype(float) assert_frame_equal(df, expected_df, check_like=True) @@ -372,13 +372,13 @@ def testGeneratePctRelInequityCol(): df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_fake_data_without_pct_relative_inequity_col)) ).reset_index(drop=True) - df = dataset_utils.generate_pct_rel_inequity_col(df, 'pct_share', 'pct_pop', 'pct_relative_inequity') + df = dataset_utils.generate_pct_rel_inequity_col(df, "pct_share", "pct_pop", "pct_relative_inequity") expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_data_with_pct_relative_inequity_col)) ).reset_index(drop=True) - expected_df[['pct_relative_inequity', 'pct_share', 'pct_pop']] = expected_df[ 
- ['pct_relative_inequity', 'pct_share', 'pct_pop'] + expected_df[["pct_relative_inequity", "pct_share", "pct_pop"]] = expected_df[ + ["pct_relative_inequity", "pct_share", "pct_pop"] ].astype(float) assert_frame_equal(df, expected_df, check_like=True) @@ -390,7 +390,7 @@ def testZeroOutPctRelInequity(): ).reset_index(drop=True) rate_to_inequity_cols_map = {"something_per_100k": "something_pct_relative_inequity"} df = dataset_utils.zero_out_pct_rel_inequity( - df, 'state', 'race', rate_to_inequity_cols_map, pop_pct_col="something_pop_pct" + df, "state", "race", rate_to_inequity_cols_map, pop_pct_col="something_pop_pct" ) expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_data_with_properly_zeroed_pct_rel_inequity)) @@ -401,30 +401,30 @@ def testZeroOutPctRelInequity(): _fake_wide_short_source_data = [ [ - 'time_period', - 'state_fips', - 'state_name', - 'black_A_100k', - 'white_A_100k', - 'black_B_100k', - 'white_B_100k', + "time_period", + "state_fips", + "state_name", + "black_A_100k", + "white_A_100k", + "black_B_100k", + "white_B_100k", ], - ['1999', '88', 'North Somestate', 100, 50, 999, 2222], - ['1999', '99', 'South Somestate', 101, 51, 998, 2221], - ['2000', '88', 'North Somestate', 100, 50, 999, 2222], - ['2000', '99', 'South Somestate', 101, 51, 998, 2221], + ["1999", "88", "North Somestate", 100, 50, 999, 2222], + ["1999", "99", "South Somestate", 101, 51, 998, 2221], + ["2000", "88", "North Somestate", 100, 50, 999, 2222], + ["2000", "99", "South Somestate", 101, 51, 998, 2221], ] _expected_HET_style_data = [ - ['time_period', 'state_fips', 'state_name', 'race', 'A_100k', 'B_100k'], - ['1999', '88', 'North Somestate', 'black', 100, 999], - ['1999', '88', 'North Somestate', 'white', 50, 2222], - ['1999', '99', 'South Somestate', 'black', 101, 998], - ['1999', '99', 'South Somestate', 'white', 51, 2221], - ['2000', '88', 'North Somestate', 'black', 100, 999], - ['2000', '88', 'North Somestate', 'white', 50, 2222], - ['2000', '99', 'South Somestate', 'black', 101, 998], - ['2000', '99', 'South Somestate', 'white', 51, 2221], + ["time_period", "state_fips", "state_name", "race", "A_100k", "B_100k"], + ["1999", "88", "North Somestate", "black", 100, 999], + ["1999", "88", "North Somestate", "white", 50, 2222], + ["1999", "99", "South Somestate", "black", 101, 998], + ["1999", "99", "South Somestate", "white", 51, 2221], + ["2000", "88", "North Somestate", "black", 100, 999], + ["2000", "88", "North Somestate", "white", 50, 2222], + ["2000", "99", "South Somestate", "black", 101, 998], + ["2000", "99", "South Somestate", "white", 51, 2221], ] @@ -454,26 +454,26 @@ def test_melt_to_het_style_df(): def test_preserve_only_current_time_period_rows(): _time_data = [ - ['time_period', 'state_fips', 'state_name', 'race', 'A_100k', 'B_100k'], - ['1999-01', '88', 'North Somestate', 'black', 100, 999], - ['1999', '88', 'North Somestate', 'white', 50, 2222], - ['1999', '99', 'South Somestate', 'black', 101, 998], - ['1999', '99', 'South Somestate', 'white', 51, 2221], - ['2000', '88', 'North Somestate', 'black', 100, 999], - ['2000', '88', 'North Somestate', 'white', 50, 2222], - ['2000', '99', 'South Somestate', 'black', 101, 998], - ['2000', '99', 'South Somestate', 'white', 51, 2221], + ["time_period", "state_fips", "state_name", "race", "A_100k", "B_100k"], + ["1999-01", "88", "North Somestate", "black", 100, 999], + ["1999", "88", "North Somestate", "white", 50, 2222], + ["1999", "99", "South Somestate", "black", 101, 998], + ["1999", "99", "South Somestate", 
"white", 51, 2221], + ["2000", "88", "North Somestate", "black", 100, 999], + ["2000", "88", "North Somestate", "white", 50, 2222], + ["2000", "99", "South Somestate", "black", 101, 998], + ["2000", "99", "South Somestate", "white", 51, 2221], ] time_df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_time_data))).reset_index(drop=True) # normal mode: drop time_period current_df = dataset_utils.preserve_only_current_time_period_rows(time_df) _expected_current_data = [ - ['state_fips', 'state_name', 'race', 'A_100k', 'B_100k'], - ['88', 'North Somestate', 'black', 100, 999], - ['88', 'North Somestate', 'white', 50, 2222], - ['99', 'South Somestate', 'black', 101, 998], - ['99', 'South Somestate', 'white', 51, 2221], + ["state_fips", "state_name", "race", "A_100k", "B_100k"], + ["88", "North Somestate", "black", 100, 999], + ["88", "North Somestate", "white", 50, 2222], + ["99", "South Somestate", "black", 101, 998], + ["99", "South Somestate", "white", 51, 2221], ] expected_current_df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_expected_current_data))).reset_index( drop=True @@ -484,11 +484,11 @@ def test_preserve_only_current_time_period_rows(): # optional mode: keep time_period current_df_with_time = dataset_utils.preserve_only_current_time_period_rows(time_df, keep_time_period_col=True) _expected_current_data = [ - ['time_period', 'state_fips', 'state_name', 'race', 'A_100k', 'B_100k'], - ['2000', '88', 'North Somestate', 'black', 100, 999], - ['2000', '88', 'North Somestate', 'white', 50, 2222], - ['2000', '99', 'South Somestate', 'black', 101, 998], - ['2000', '99', 'South Somestate', 'white', 51, 2221], + ["time_period", "state_fips", "state_name", "race", "A_100k", "B_100k"], + ["2000", "88", "North Somestate", "black", 100, 999], + ["2000", "88", "North Somestate", "white", 50, 2222], + ["2000", "99", "South Somestate", "black", 101, 998], + ["2000", "99", "South Somestate", "white", 51, 2221], ] expected_current_df_with_time = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_current_data)), dtype={"time_period": str} @@ -499,21 +499,21 @@ def test_preserve_only_current_time_period_rows(): # optional alt name for time_period column _time_alt_col_data = [ [ - 'some_other_datetime_col', - 'state_fips', - 'state_name', - 'race', - 'A_100k', - 'B_100k', + "some_other_datetime_col", + "state_fips", + "state_name", + "race", + "A_100k", + "B_100k", ], - ['1999-01', '88', 'North Somestate', 'black', 100, 999], - ['1999', '88', 'North Somestate', 'white', 50, 2222], - ['1999', '99', 'South Somestate', 'black', 101, 998], - ['1999', '99', 'South Somestate', 'white', 51, 2221], - ['2000', '88', 'North Somestate', 'black', 100, 999], - ['2000', '88', 'North Somestate', 'white', 50, 2222], - ['2000', '99', 'South Somestate', 'black', 101, 998], - ['2000', '99', 'South Somestate', 'white', 51, 2221], + ["1999-01", "88", "North Somestate", "black", 100, 999], + ["1999", "88", "North Somestate", "white", 50, 2222], + ["1999", "99", "South Somestate", "black", 101, 998], + ["1999", "99", "South Somestate", "white", 51, 2221], + ["2000", "88", "North Somestate", "black", 100, 999], + ["2000", "88", "North Somestate", "white", 50, 2222], + ["2000", "99", "South Somestate", "black", 101, 998], + ["2000", "99", "South Somestate", "white", 51, 2221], ] time_alt_col_df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_time_alt_col_data))).reset_index(drop=True) @@ -521,11 +521,11 @@ def test_preserve_only_current_time_period_rows(): time_alt_col_df, 
time_period_col="some_other_datetime_col" ) _expected_alt_col_current_data = [ - ['state_fips', 'state_name', 'race', 'A_100k', 'B_100k'], - ['88', 'North Somestate', 'black', 100, 999], - ['88', 'North Somestate', 'white', 50, 2222], - ['99', 'South Somestate', 'black', 101, 998], - ['99', 'South Somestate', 'white', 51, 2221], + ["state_fips", "state_name", "race", "A_100k", "B_100k"], + ["88", "North Somestate", "black", 100, 999], + ["88", "North Somestate", "white", 50, 2222], + ["99", "South Somestate", "black", 101, 998], + ["99", "South Somestate", "white", 51, 2221], ] expected_current_df_with_alt_col = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_alt_col_current_data)) @@ -540,15 +540,15 @@ def test_preserve_only_current_time_period_rows(): def test_combine_race_ethnicity_hispanic_default_behavior_with_count(): df = pd.DataFrame( { - 'ethnicity': ['Hispanic or Latino', 'Hispanic or Latino'], - 'race': ['White', 'Black'], - 'state_fips': ['99', '99'], - 'condition_count': [100, 200], + "ethnicity": ["Hispanic or Latino", "Hispanic or Latino"], + "race": ["White", "Black"], + "state_fips": ["99", "99"], + "condition_count": [100, 200], } ) - result_df = combine_race_ethnicity(df, ['condition_count'], {'White': 'WHITE_NH', 'Black': 'BLACK_NH'}) + result_df = combine_race_ethnicity(df, ["condition_count"], {"White": "WHITE_NH", "Black": "BLACK_NH"}) expected_df = pd.DataFrame( - {'race_category_id': ["HISP"], 'state_fips': ['99'], 'condition_count': [300]} # Sum of Hispanic counts + {"race_category_id": ["HISP"], "state_fips": ["99"], "condition_count": [300]} # Sum of Hispanic counts ) assert_frame_equal(result_df, expected_df, check_like=True) @@ -556,23 +556,23 @@ def test_combine_race_ethnicity_hispanic_default_behavior_with_count(): def test_combine_race_ethnicity_hispanic_latino_specific_hisp_value_with_count(): df = pd.DataFrame( { - 'ethnicity': ['Hispanic/Latino', 'Hispanic/Latino', 'Not Hispanic/Latino'], - 'race': ['Black', 'Asian', 'White'], - 'state_fips': ['99', '99', '99'], - 'condition_count': [150, 250, 300], + "ethnicity": ["Hispanic/Latino", "Hispanic/Latino", "Not Hispanic/Latino"], + "race": ["Black", "Asian", "White"], + "state_fips": ["99", "99", "99"], + "condition_count": [150, 250, 300], } ) result_df = combine_race_ethnicity( df, ["condition_count"], - {'Black': 'BLACK_NH', 'Asian': 'ASIAN_NH', 'White': 'WHITE_NH'}, - ethnicity_value='Hispanic/Latino', + {"Black": "BLACK_NH", "Asian": "ASIAN_NH", "White": "WHITE_NH"}, + ethnicity_value="Hispanic/Latino", ) expected_df = pd.DataFrame( { - 'race_category_id': ["HISP", "WHITE_NH"], - 'state_fips': ['99', '99'], - 'condition_count': [400, 300], # Sum of Hispanic counts, Non-Hispanic count + "race_category_id": ["HISP", "WHITE_NH"], + "state_fips": ["99", "99"], + "condition_count": [400, 300], # Sum of Hispanic counts, Non-Hispanic count } ) assert_frame_equal(result_df, expected_df, check_like=True) @@ -581,20 +581,20 @@ def test_combine_race_ethnicity_hispanic_latino_specific_hisp_value_with_count() def test_combine_race_ethnicity_non_hispanic_default_behavior_with_count(): df = pd.DataFrame( { - 'ethnicity': ['Non-Hispanic/Latino', 'Non-Hispanic/Latino', 'Hispanic or Latino'], - 'race': ['White', 'Black', 'Asian'], - 'state_fips': ['99', '99', '99'], - 'condition_count': [200, 100, 300], + "ethnicity": ["Non-Hispanic/Latino", "Non-Hispanic/Latino", "Hispanic or Latino"], + "race": ["White", "Black", "Asian"], + "state_fips": ["99", "99", "99"], + "condition_count": [200, 100, 300], } ) 
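# A hedged sketch of the collapsing rule these combine_race_ethnicity tests
# exercise, inferred from the expected frames: the helper name below is
# invented (the real implementation lives in ingestion/dataset_utils.py),
# the grouping keys are hard-coded for brevity, and the UNKNOWN/missing
# ethnicity bucketing covered by the tests that follow is omitted.
import pandas as pd

def combine_race_ethnicity_sketch(df, count_cols, race_map, ethnicity_value="Hispanic or Latino"):
    df = df.copy()
    # Non-Hispanic rows map their reported race through the caller-provided *_NH lookup.
    df["race_category_id"] = df["race"].map(race_map)
    # Rows matching the Hispanic marker collapse into a single HISP bucket, whatever their race.
    df.loc[df["ethnicity"] == ethnicity_value, "race_category_id"] = "HISP"
    # Summing within groups totals the count columns and drops the raw race/ethnicity columns.
    return df.groupby(["race_category_id", "state_fips"], as_index=False)[count_cols].sum()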
result_df = combine_race_ethnicity( - df, ['condition_count'], {'White': 'WHITE_NH', 'Black': 'BLACK_NH', 'Asian': 'ASIAN_NH'} + df, ["condition_count"], {"White": "WHITE_NH", "Black": "BLACK_NH", "Asian": "ASIAN_NH"} ) expected_df = pd.DataFrame( { - 'race_category_id': ["BLACK_NH", "HISP", "WHITE_NH"], - 'state_fips': ['99', '99', '99'], - 'condition_count': [100, 300, 200], + "race_category_id": ["BLACK_NH", "HISP", "WHITE_NH"], + "state_fips": ["99", "99", "99"], + "condition_count": [100, 300, 200], } ) assert_frame_equal(result_df, expected_df, check_like=True) @@ -603,20 +603,20 @@ def test_combine_race_ethnicity_non_hispanic_default_behavior_with_count(): def test_combine_race_ethnicity_unknown_and_missing_default_values_with_count(): df = pd.DataFrame( { - 'ethnicity': ['Unknown', 'Missing', 'Hispanic or Latino', 'Hispanic or Latino'], - 'race': ['Asian', 'Missing', 'White', 'Black'], - 'state_fips': ['99', '99', '99', '99'], - 'condition_count': [50, 75, 100, 200], + "ethnicity": ["Unknown", "Missing", "Hispanic or Latino", "Hispanic or Latino"], + "race": ["Asian", "Missing", "White", "Black"], + "state_fips": ["99", "99", "99", "99"], + "condition_count": [50, 75, 100, 200], } ) result_df = combine_race_ethnicity( - df, ['condition_count'], {'Asian': 'ASIAN_NH', 'White': 'WHITE_NH', 'Black': 'BLACK_NH'} + df, ["condition_count"], {"Asian": "ASIAN_NH", "White": "WHITE_NH", "Black": "BLACK_NH"} ) expected_df = pd.DataFrame( { - 'race_category_id': ["HISP", "UNKNOWN"], - 'state_fips': ['99', '99'], - 'condition_count': [300, 125], # sum Unknown counts, sum Hispanic counts + "race_category_id": ["HISP", "UNKNOWN"], + "state_fips": ["99", "99"], + "condition_count": [300, 125], # sum Unknown counts, sum Hispanic counts } ) assert_frame_equal(result_df, expected_df, check_like=True) @@ -625,23 +625,23 @@ def test_combine_race_ethnicity_unknown_and_missing_default_values_with_count(): def test_combine_race_ethnicity_unknown_and_missing_specific_unknown_value_with_count(): df = pd.DataFrame( { - 'ethnicity': ['Nothing', 'Hispanic or Latino', 'Hispanic or Latino', 'Not Hispanic/Latino'], - 'race': ['Asian', 'White', 'Black', 'Asian'], - 'state_fips': ['99', '99', '99', '99'], - 'condition_count': [25, 150, 250, 100], + "ethnicity": ["Nothing", "Hispanic or Latino", "Hispanic or Latino", "Not Hispanic/Latino"], + "race": ["Asian", "White", "Black", "Asian"], + "state_fips": ["99", "99", "99", "99"], + "condition_count": [25, 150, 250, 100], } ) result_df = combine_race_ethnicity( df, - ['condition_count'], - {'Asian': 'ASIAN_NH', 'White': 'WHITE_NH', 'Black': 'BLACK_NH'}, - unknown_values=['Nothing'], + ["condition_count"], + {"Asian": "ASIAN_NH", "White": "WHITE_NH", "Black": "BLACK_NH"}, + unknown_values=["Nothing"], ) expected_df = pd.DataFrame( { - 'race_category_id': ["ASIAN_NH", "HISP", "UNKNOWN"], - 'state_fips': ['99', '99', '99'], - 'condition_count': [100, 400, 25], # Unknown count, sum of Hispanic counts, Non-Hispanic count + "race_category_id": ["ASIAN_NH", "HISP", "UNKNOWN"], + "state_fips": ["99", "99", "99"], + "condition_count": [100, 400, 25], # Unknown count, sum of Hispanic counts, Non-Hispanic count } ) assert_frame_equal(result_df, expected_df, check_like=True) @@ -651,76 +651,76 @@ def test_generate_time_df_with_cols_and_types(): test_data = pd.DataFrame( { - 'time_period': ['2020', '2021'], - 'state_name': ['Alabama', 'California'], - 'state_fips': ['01', '02'], - 'age': ['25-30', '31-36'], - 'estimated_total': [100, 200], - 'per_100k': [50, 75], - 'pct_share': 
[0.5, 0.7], - 'pct_relative_inequity': [0.1, 0.2], - 'population': [1351583, 5168831], - 'population_pct': [0.2, 0.7], + "time_period": ["2020", "2021"], + "state_name": ["Alabama", "California"], + "state_fips": ["01", "02"], + "age": ["25-30", "31-36"], + "estimated_total": [100, 200], + "per_100k": [50, 75], + "pct_share": [0.5, 0.7], + "pct_relative_inequity": [0.1, 0.2], + "population": [1351583, 5168831], + "population_pct": [0.2, 0.7], } ) expected_current_df = pd.DataFrame( { - 'state_name': ['California'], - 'state_fips': ['02'], - 'age': ['31-36'], - 'estimated_total': [200.0], - 'per_100k': [75.0], - 'pct_share': [0.7], - 'population_pct': [0.7], + "state_name": ["California"], + "state_fips": ["02"], + "age": ["31-36"], + "estimated_total": [200.0], + "per_100k": [75.0], + "pct_share": [0.7], + "population_pct": [0.7], } ) expected_current_df.reset_index(drop=True) expected_historical_df = pd.DataFrame( { - 'time_period': ['2020', '2021'], - 'state_name': ['Alabama', 'California'], - 'state_fips': ['01', '02'], - 'age': ['25-30', '31-36'], - 'per_100k': [50.0, 75.0], - 'pct_relative_inequity': [0.1, 0.2], - 'pct_share': [0.5, 0.7], + "time_period": ["2020", "2021"], + "state_name": ["Alabama", "California"], + "state_fips": ["01", "02"], + "age": ["25-30", "31-36"], + "per_100k": [50.0, 75.0], + "pct_relative_inequity": [0.1, 0.2], + "pct_share": [0.5, 0.7], } ) expected_current_col_types = { - 'state_name': 'STRING', - 'state_fips': 'STRING', - 'age': 'STRING', - 'estimated_total': 'FLOAT64', - 'per_100k': 'FLOAT64', - 'pct_share': 'FLOAT64', - 'population_pct': 'FLOAT64', + "state_name": "STRING", + "state_fips": "STRING", + "age": "STRING", + "estimated_total": "FLOAT64", + "per_100k": "FLOAT64", + "pct_share": "FLOAT64", + "population_pct": "FLOAT64", } expected_historical_col_types = { - 'time_period': 'STRING', - 'state_name': 'STRING', - 'state_fips': 'STRING', - 'age': 'STRING', - 'per_100k': 'FLOAT64', - 'pct_relative_inequity': 'FLOAT64', - 'pct_share': 'FLOAT64', + "time_period": "STRING", + "state_name": "STRING", + "state_fips": "STRING", + "age": "STRING", + "per_100k": "FLOAT64", + "pct_relative_inequity": "FLOAT64", + "pct_share": "FLOAT64", } current_df, current_col_types = generate_time_df_with_cols_and_types( test_data, - ['estimated_total', 'per_100k', 'pct_share', 'population_pct'], # numerical_cols_to_keep - 'current', # table_type - 'age', # dem_col + ["estimated_total", "per_100k", "pct_share", "population_pct"], # numerical_cols_to_keep + "current", # table_type + "age", # dem_col ) historical_df, historical_col_types = generate_time_df_with_cols_and_types( test_data, - ['per_100k', 'pct_relative_inequity', 'pct_share'], # numerical_cols_to_keep - 'historical', # table_type - 'age', # dem_col + ["per_100k", "pct_relative_inequity", "pct_share"], # numerical_cols_to_keep + "historical", # table_type + "age", # dem_col ) current_df.reset_index(drop=True) @@ -735,31 +735,31 @@ def test_generate_time_df_with_cols_and_types(): # # STATE BY SEX DATA fake_state_by_sex_data_with_rates_pop_18plus = { - 'topic_per_100k': [20, 60, 40, 50, 50, 50], - 'sex': ['Male', 'Female', 'All', 'Male', 'Female', 'All'], - 'state_fips': ['01', '01', '01', '02', '02', '02'], - 'state_name': ['Alabama', 'Alabama', 'Alabama', 'Alaska', 'Alaska', 'Alaska'], - 'population_18+': [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], - 'topic_estimated_total': [376.0, 1223.0, 1567.0, 147.0, 131.0, 278.0], + "topic_per_100k": [20, 60, 40, 50, 50, 50], + "sex": ["Male", 
"Female", "All", "Male", "Female", "All"], + "state_fips": ["01", "01", "01", "02", "02", "02"], + "state_name": ["Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"], + "population_18+": [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], + "topic_estimated_total": [376.0, 1223.0, 1567.0, 147.0, 131.0, 278.0], } fake_state_by_sex_data_with_rates_pop_18plus_and_counts = { - 'topic_per_100k': [20, 60, 40, 50, 50, 50], - 'sex': ['Male', 'Female', 'All', 'Male', 'Female', 'All'], - 'state_fips': ['01', '01', '01', '02', '02', '02'], - 'state_name': ['Alabama', 'Alabama', 'Alabama', 'Alaska', 'Alaska', 'Alaska'], - 'population_18+': [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], - 'topic_estimated_total': [376.0, 1223.0, 1567.0, 147.0, 131.0, 278.0], + "topic_per_100k": [20, 60, 40, 50, 50, 50], + "sex": ["Male", "Female", "All", "Male", "Female", "All"], + "state_fips": ["01", "01", "01", "02", "02", "02"], + "state_name": ["Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"], + "population_18+": [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], + "topic_estimated_total": [376.0, 1223.0, 1567.0, 147.0, 131.0, 278.0], } fake_state_by_sex_data_with_rates_pop_18plus_adjusted_all_counts_and_pct_share = { - 'topic_per_100k': [20, 60, 40, 50, 50, 50], - 'sex': ['Male', 'Female', 'All', 'Male', 'Female', 'All'], - 'state_fips': ['01', '01', '01', '02', '02', '02'], - 'state_name': ['Alabama', 'Alabama', 'Alabama', 'Alaska', 'Alaska', 'Alaska'], - 'population_18+': [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], - 'topic_estimated_total': [376.0, 1223.0, 1599.0, 147.0, 131.0, 278.0], # note the new summed Alls - 'topic_pct_share': [23.5, 76.5, 100.0, 52.9, 47.1, 100.0], + "topic_per_100k": [20, 60, 40, 50, 50, 50], + "sex": ["Male", "Female", "All", "Male", "Female", "All"], + "state_fips": ["01", "01", "01", "02", "02", "02"], + "state_name": ["Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"], + "population_18+": [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], + "topic_estimated_total": [376.0, 1223.0, 1599.0, 147.0, 131.0, 278.0], # note the new summed Alls + "topic_pct_share": [23.5, 76.5, 100.0, 52.9, 47.1, 100.0], } # # STATE BY SEX TESTS @@ -767,13 +767,13 @@ def test_generate_time_df_with_cols_and_types(): def test_state_sex_generate_estimated_total_col(): df = pd.DataFrame(fake_state_by_sex_data_with_rates_pop_18plus) - df = generate_estimated_total_col(df, 'population_18+', {'topic_per_100k': 'topic_estimated_total'}) + df = generate_estimated_total_col(df, "population_18+", {"topic_per_100k": "topic_estimated_total"}) assert_frame_equal(df, pd.DataFrame(fake_state_by_sex_data_with_rates_pop_18plus_and_counts), check_like=True) def test_state_sex_generate_pct_share_col_of_summed_alls(): df = pd.DataFrame(fake_state_by_sex_data_with_rates_pop_18plus_and_counts) - df = generate_pct_share_col_of_summed_alls(df, {'topic_estimated_total': 'topic_pct_share'}, 'sex') + df = generate_pct_share_col_of_summed_alls(df, {"topic_estimated_total": "topic_pct_share"}, "sex") assert_frame_equal( df, pd.DataFrame(fake_state_by_sex_data_with_rates_pop_18plus_adjusted_all_counts_and_pct_share), @@ -784,78 +784,78 @@ def test_state_sex_generate_pct_share_col_of_summed_alls(): # COUNTY BY RACE DATA fake_county_by_race_data_with_rates_and_pop = { - 'topic_per_100k': [100, 10, 20, 50, 50, 50], - 'race_category_id': ['BLACK_NH', 'WHITE_NH', 'ALL', 'BLACK_NH', 'WHITE_NH', 'ALL'], - 'race_and_ethnicity': [ 
- 'Black or African American (NH)', - 'White (NH)', - 'All', - 'Black or African American (NH)', - 'White (NH)', - 'All', + "topic_per_100k": [100, 10, 20, 50, 50, 50], + "race_category_id": ["BLACK_NH", "WHITE_NH", "ALL", "BLACK_NH", "WHITE_NH", "ALL"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + "All", + "Black or African American (NH)", + "White (NH)", + "All", ], - 'county_fips': ['01001', '01001', '01001', '01003', '01003', '01003'], - 'state_fips': ['01', '01', '01', '01', '01', '01'], - 'county_name': [ - 'Autuga County', - 'Autuga County', - 'Autuga County', - 'Baldwin County', - 'Baldwin County', - 'Baldwin County', + "county_fips": ["01001", "01001", "01001", "01003", "01003", "01003"], + "state_fips": ["01", "01", "01", "01", "01", "01"], + "county_name": [ + "Autuga County", + "Autuga County", + "Autuga County", + "Baldwin County", + "Baldwin County", + "Baldwin County", ], - 'population': [11496.0, 42635.0, 58761.0, 19445.0, 192161.0, 233420.0], + "population": [11496.0, 42635.0, 58761.0, 19445.0, 192161.0, 233420.0], } fake_county_by_race_data_with_rates_pop_and_counts = { - 'topic_per_100k': [100, 10, 20, 50, 50, 50], - 'race_category_id': ['BLACK_NH', 'WHITE_NH', 'ALL', 'BLACK_NH', 'WHITE_NH', 'ALL'], - 'race_and_ethnicity': [ - 'Black or African American (NH)', - 'White (NH)', - 'All', - 'Black or African American (NH)', - 'White (NH)', - 'All', + "topic_per_100k": [100, 10, 20, 50, 50, 50], + "race_category_id": ["BLACK_NH", "WHITE_NH", "ALL", "BLACK_NH", "WHITE_NH", "ALL"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + "All", + "Black or African American (NH)", + "White (NH)", + "All", ], - 'county_fips': ['01001', '01001', '01001', '01003', '01003', '01003'], - 'state_fips': ['01', '01', '01', '01', '01', '01'], - 'county_name': [ - 'Autuga County', - 'Autuga County', - 'Autuga County', - 'Baldwin County', - 'Baldwin County', - 'Baldwin County', + "county_fips": ["01001", "01001", "01001", "01003", "01003", "01003"], + "state_fips": ["01", "01", "01", "01", "01", "01"], + "county_name": [ + "Autuga County", + "Autuga County", + "Autuga County", + "Baldwin County", + "Baldwin County", + "Baldwin County", ], - 'population': [11496.0, 42635.0, 58761.0, 19445.0, 192161.0, 233420.0], - 'topic_estimated_total': [11.0, 4.0, 12.0, 10.0, 96.0, 117.0], + "population": [11496.0, 42635.0, 58761.0, 19445.0, 192161.0, 233420.0], + "topic_estimated_total": [11.0, 4.0, 12.0, 10.0, 96.0, 117.0], } fake_county_by_race_data_with_rates_pop_adjusted_all_counts_and_pct_share = { - 'topic_per_100k': [100, 10, 20, 50, 50, 50], - 'race_category_id': ['BLACK_NH', 'WHITE_NH', 'ALL', 'BLACK_NH', 'WHITE_NH', 'ALL'], - 'race_and_ethnicity': [ - 'Black or African American (NH)', - 'White (NH)', - 'All', - 'Black or African American (NH)', - 'White (NH)', - 'All', + "topic_per_100k": [100, 10, 20, 50, 50, 50], + "race_category_id": ["BLACK_NH", "WHITE_NH", "ALL", "BLACK_NH", "WHITE_NH", "ALL"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + "All", + "Black or African American (NH)", + "White (NH)", + "All", ], - 'county_fips': ['01001', '01001', '01001', '01003', '01003', '01003'], - 'state_fips': ['01', '01', '01', '01', '01', '01'], - 'county_name': [ - 'Autuga County', - 'Autuga County', - 'Autuga County', - 'Baldwin County', - 'Baldwin County', - 'Baldwin County', + "county_fips": ["01001", "01001", "01001", "01003", "01003", "01003"], + "state_fips": ["01", "01", "01", "01", "01", "01"], + 
"county_name": [ + "Autuga County", + "Autuga County", + "Autuga County", + "Baldwin County", + "Baldwin County", + "Baldwin County", ], - 'population': [11496.0, 42635.0, 58761.0, 19445.0, 192161.0, 233420.0], - 'topic_estimated_total': [11.0, 4.0, 15.0, 10.0, 96.0, 106.0], # note the new summed Alls - 'topic_pct_share': [73.3, 26.7, 100.0, 9.4, 90.6, 100.0], + "population": [11496.0, 42635.0, 58761.0, 19445.0, 192161.0, 233420.0], + "topic_estimated_total": [11.0, 4.0, 15.0, 10.0, 96.0, 106.0], # note the new summed Alls + "topic_pct_share": [73.3, 26.7, 100.0, 9.4, 90.6, 100.0], } @@ -864,13 +864,13 @@ def test_state_sex_generate_pct_share_col_of_summed_alls(): def test_county_race_generate_estimated_total_col(): df = pd.DataFrame(fake_county_by_race_data_with_rates_and_pop) - df = generate_estimated_total_col(df, 'population', {'topic_per_100k': 'topic_estimated_total'}) + df = generate_estimated_total_col(df, "population", {"topic_per_100k": "topic_estimated_total"}) assert_frame_equal(df, pd.DataFrame(fake_county_by_race_data_with_rates_pop_and_counts), check_like=True) def test_county_race_generate_pct_share_col_of_summed_alls(): df = pd.DataFrame(fake_county_by_race_data_with_rates_pop_and_counts) - df = generate_pct_share_col_of_summed_alls(df, {'topic_estimated_total': 'topic_pct_share'}, 'race_and_ethnicity') + df = generate_pct_share_col_of_summed_alls(df, {"topic_estimated_total": "topic_pct_share"}, "race_and_ethnicity") assert_frame_equal( df, pd.DataFrame(fake_county_by_race_data_with_rates_pop_adjusted_all_counts_and_pct_share), @@ -880,23 +880,23 @@ def test_county_race_generate_pct_share_col_of_summed_alls(): def test_preserve_most_recent_year_rows_per_topic_normal_case(): test_data = { - 'race_and_ethnicity': ['Black', 'Black', 'Black', 'White', 'White', 'White'], - 'time_period': ['2021', '2022', '2023', '2021', '2022', '2023'], - 'topic1_per_100k': [10.0, 15.0, 20.0, None, 25.0, 30.0], - 'topic2_pct_rate': [5.0, None, 10.0, 20.0, 25.0, None], - 'topic3_index': [None, 1.0, 2.0, 3.0, None, 4.0], + "race_and_ethnicity": ["Black", "Black", "Black", "White", "White", "White"], + "time_period": ["2021", "2022", "2023", "2021", "2022", "2023"], + "topic1_per_100k": [10.0, 15.0, 20.0, None, 25.0, 30.0], + "topic2_pct_rate": [5.0, None, 10.0, 20.0, 25.0, None], + "topic3_index": [None, 1.0, 2.0, 3.0, None, 4.0], } expected_data = { - 'race_and_ethnicity': ['Black', 'White'], - 'topic1_per_100k': [20.0, 30.0], - 'topic2_pct_rate': [10.0, None], - 'topic3_index': [2.0, 4.0], + "race_and_ethnicity": ["Black", "White"], + "topic1_per_100k": [20.0, 30.0], + "topic2_pct_rate": [10.0, None], + "topic3_index": [2.0, 4.0], } test_df = pd.DataFrame(test_data) expected_df = pd.DataFrame(expected_data) - topic_prefixes = ['topic1', 'topic2', 'topic3'] + topic_prefixes = ["topic1", "topic2", "topic3"] test_df = preserve_most_recent_year_rows_per_topic(test_df, topic_prefixes) pd.testing.assert_frame_equal(test_df, expected_df) @@ -906,42 +906,42 @@ def test_preserve_most_recent_year_rows_per_topic_normal_case(): # SHARED TEST DATA df = pd.DataFrame( { - 'time_period': ['2020', '2021', '2022'], - 'state_fips': ['01', '02', '03'], - 'example_per_100k': [10, 20, 30], - 'example_estimated_total': [100, 200, 300], - 'example_pct_relative_inequity': [0.1, 0.2, 0.3], - 'example_pct_share': [0.5, 0.6, 0.7], - 'other_pct_rate': [1, 2, 3], - 'some_population_pct': [99, 100, 100], + "time_period": ["2020", "2021", "2022"], + "state_fips": ["01", "02", "03"], + "example_per_100k": [10, 20, 30], + 
"example_estimated_total": [100, 200, 300], + "example_pct_relative_inequity": [0.1, 0.2, 0.3], + "example_pct_share": [0.5, 0.6, 0.7], + "other_pct_rate": [1, 2, 3], + "some_population_pct": [99, 100, 100], } ) -topic_prefixes = ['example', 'other', 'some'] +topic_prefixes = ["example", "other", "some"] def test_current_time_view(): expected_current_df = pd.DataFrame( { - 'state_fips': ['03'], - 'example_per_100k': [30], - 'example_estimated_total': [300], - 'example_pct_share': [0.7], - 'other_pct_rate': [3], - 'some_population_pct': [100], + "state_fips": ["03"], + "example_per_100k": [30], + "example_estimated_total": [300], + "example_pct_share": [0.7], + "other_pct_rate": [3], + "some_population_pct": [100], } ) expected_current_bq_col_types = { - 'state_fips': BQ_STRING, - 'example_per_100k': BQ_FLOAT, - 'example_estimated_total': BQ_FLOAT, - 'example_pct_share': BQ_FLOAT, - 'other_pct_rate': BQ_FLOAT, - 'some_population_pct': BQ_FLOAT, + "state_fips": BQ_STRING, + "example_per_100k": BQ_FLOAT, + "example_estimated_total": BQ_FLOAT, + "example_pct_share": BQ_FLOAT, + "other_pct_rate": BQ_FLOAT, + "some_population_pct": BQ_FLOAT, } - result_current_df, result_bq_col_types = get_timeview_df_and_cols(df, 'current', topic_prefixes) + result_current_df, result_bq_col_types = get_timeview_df_and_cols(df, "current", topic_prefixes) pd.testing.assert_frame_equal(result_current_df, expected_current_df) assert result_bq_col_types == expected_current_bq_col_types @@ -951,23 +951,23 @@ def test_historical_time_view(): expected_historical_df = pd.DataFrame( { - 'time_period': ['2020', '2021', '2022'], - 'state_fips': ['01', '02', '03'], - 'example_per_100k': [10, 20, 30], - 'example_pct_relative_inequity': [0.1, 0.2, 0.3], - 'other_pct_rate': [1, 2, 3], + "time_period": ["2020", "2021", "2022"], + "state_fips": ["01", "02", "03"], + "example_per_100k": [10, 20, 30], + "example_pct_relative_inequity": [0.1, 0.2, 0.3], + "other_pct_rate": [1, 2, 3], } ) expected_bq_col_types = { - 'time_period': BQ_STRING, - 'state_fips': BQ_STRING, - 'example_per_100k': BQ_FLOAT, - 'example_pct_relative_inequity': BQ_FLOAT, - 'other_pct_rate': BQ_FLOAT, + "time_period": BQ_STRING, + "state_fips": BQ_STRING, + "example_per_100k": BQ_FLOAT, + "example_pct_relative_inequity": BQ_FLOAT, + "other_pct_rate": BQ_FLOAT, } - result_df, result_bq_col_types = get_timeview_df_and_cols(df, 'historical', topic_prefixes) + result_df, result_bq_col_types = get_timeview_df_and_cols(df, "historical", topic_prefixes) pd.testing.assert_frame_equal(result_df, expected_historical_df) assert result_bq_col_types == expected_bq_col_types @@ -975,4 +975,4 @@ def test_historical_time_view(): def test_invalid_time_view(): with pytest.raises(ValueError): - get_timeview_df_and_cols(df, 'some_invalid_time_viewπ', topic_prefixes) + get_timeview_df_and_cols(df, "some_invalid_time_viewπ", topic_prefixes) diff --git a/python/tests/ingestion/test_gcs_to_bq.py b/python/tests/ingestion/test_gcs_to_bq.py index cb4529dcb1..f580dac892 100644 --- a/python/tests/ingestion/test_gcs_to_bq.py +++ b/python/tests/ingestion/test_gcs_to_bq.py @@ -17,12 +17,12 @@ def test_get_bq_column_types(): fake_df = pd.DataFrame( { - 'state_fips': ["01", "02", "03"], - 'some_condition_per_100k': [None, 1, 2], + "state_fips": ["01", "02", "03"], + "some_condition_per_100k": [None, 1, 2], } ) - column_types = gcs_to_bq_util.get_bq_column_types(fake_df, ['some_condition_per_100k']) - expected_column_types = {'state_fips': BQ_STRING, 'some_condition_per_100k': BQ_FLOAT} + 
column_types = gcs_to_bq_util.get_bq_column_types(fake_df, ["some_condition_per_100k"]) + expected_column_types = {"state_fips": BQ_STRING, "some_condition_per_100k": BQ_FLOAT} assert column_types == expected_column_types @@ -33,7 +33,7 @@ class GcsToBqTest(TestCase): def testLoadValuesBlobAsDataframe(self): """Tests that data in json list format is loaded into a pandas.DataFrame object using the first row as a header.""" - mock_attrs = {'download_as_string.return_value': json.dumps(self._test_data).encode('utf-8')} + mock_attrs = {"download_as_string.return_value": json.dumps(self._test_data).encode("utf-8")} mock_blob = Mock(**mock_attrs) frame = gcs_to_bq_util.load_values_blob_as_df(mock_blob) @@ -48,19 +48,19 @@ def testAddDataframeToBq_AutoSchema(self): add_df_to_bq.""" test_frame = DataFrame(data=self._test_data[1:], columns=self._test_data[0], index=[1, 2]) - with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client: + with patch("ingestion.gcs_to_bq_util.bigquery.Client") as mock_client: # Set up mock calls mock_instance = mock_client.return_value mock_table = Mock() mock_instance.dataset.return_value = mock_table - mock_table.table.return_value = 'test-project.test-dataset.table' + mock_table.table.return_value = "test-project.test-dataset.table" gcs_to_bq_util.add_df_to_bq(test_frame.copy(deep=True), "test-dataset", "table") mock_instance.load_table_from_json.assert_called() call_args = mock_instance.load_table_from_json.call_args - self.assertEqual(call_args.args[0], json.loads(test_frame.to_json(orient='records'))) - job_config = call_args.kwargs['job_config'] + self.assertEqual(call_args.args[0], json.loads(test_frame.to_json(orient="records"))) + job_config = call_args.kwargs["job_config"] self.assertTrue(job_config.autodetect) @freeze_time("2020-01-01") @@ -69,24 +69,24 @@ def testAddDataframeToBq_IgnoreColModes(self): to add_df_to_bq.""" test_frame = DataFrame(data=self._test_data[1:], columns=self._test_data[0], index=[1, 2]) - with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client: + with patch("ingestion.gcs_to_bq_util.bigquery.Client") as mock_client: # Set up mock calls mock_instance = mock_client.return_value mock_table = Mock() mock_instance.dataset.return_value = mock_table - mock_table.table.return_value = 'test-project.test-dataset.table' + mock_table.table.return_value = "test-project.test-dataset.table" gcs_to_bq_util.add_df_to_bq( test_frame.copy(deep=True), "test-dataset", "table", - col_modes={'label1': 'REPEATED', 'label2': 'REQUIRED'}, + col_modes={"label1": "REPEATED", "label2": "REQUIRED"}, ) mock_instance.load_table_from_json.assert_called() call_args = mock_instance.load_table_from_json.call_args - self.assertEqual(call_args.args[0], json.loads(test_frame.to_json(orient='records'))) - job_config = call_args.kwargs['job_config'] + self.assertEqual(call_args.args[0], json.loads(test_frame.to_json(orient="records"))) + job_config = call_args.kwargs["job_config"] self.assertTrue(job_config.autodetect) @freeze_time("2020-01-01") @@ -95,37 +95,37 @@ def testAddDataframeToBq_SpecifySchema(self): are provided to add_df_to_bq.""" test_frame = DataFrame(data=self._test_data[1:], columns=self._test_data[0], index=[1, 2]) - with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client: + with patch("ingestion.gcs_to_bq_util.bigquery.Client") as mock_client: # Set up mock calls mock_instance = mock_client.return_value mock_table = Mock() mock_instance.dataset.return_value = mock_table - mock_table.table.return_value = 
'test-project.test-dataset.table' + mock_table.table.return_value = "test-project.test-dataset.table" column_types = {label: BQ_STRING for label in test_frame.columns} - col_modes = {'label1': 'REPEATED', 'label2': 'REQUIRED'} + col_modes = {"label1": "REPEATED", "label2": "REQUIRED"} gcs_to_bq_util.add_df_to_bq( - test_frame.copy(deep=True), 'test-dataset', 'table', column_types=column_types, col_modes=col_modes + test_frame.copy(deep=True), "test-dataset", "table", column_types=column_types, col_modes=col_modes ) mock_instance.load_table_from_json.assert_called() call_args = mock_instance.load_table_from_json.call_args - self.assertEqual(call_args.args[0], json.loads(test_frame.to_json(orient='records'))) - job_config = call_args.kwargs['job_config'] + self.assertEqual(call_args.args[0], json.loads(test_frame.to_json(orient="records"))) + job_config = call_args.kwargs["job_config"] self.assertFalse(job_config.autodetect) - expected_cols = ['label1', 'label2', 'label3'] + expected_cols = ["label1", "label2", "label3"] expected_types = [BQ_STRING, BQ_STRING, BQ_STRING] - expected_modes = ['REPEATED', 'REQUIRED', 'NULLABLE'] + expected_modes = ["REPEATED", "REQUIRED", "NULLABLE"] self.assertListEqual([field.name for field in job_config.schema], expected_cols) self.assertListEqual([field.field_type for field in job_config.schema], expected_types) self.assertListEqual([field.mode for field in job_config.schema], expected_modes) - @patch('ingestion.gcs_to_bq_util.storage.Client') + @patch("ingestion.gcs_to_bq_util.storage.Client") def testLoadCsvAsDataFrame_ParseTypes(self, mock_bq: MagicMock): # Write data to a temporary file - test_file_path = '/tmp/test_file.csv' + test_file_path = "/tmp/test_file.csv" test_data = dedent( """ col1,col2,col3,col4 @@ -133,25 +133,25 @@ def testLoadCsvAsDataFrame_ParseTypes(self, mock_bq: MagicMock): 20210105,"1,400",string, """ ) - with open(test_file_path, 'w') as f: + with open(test_file_path, "w") as f: f.write(test_data) - df = gcs_to_bq_util.load_csv_as_df('gcs_bucket', 'test_file.csv', parse_dates=['col1'], thousands=',') + df = gcs_to_bq_util.load_csv_as_df("gcs_bucket", "test_file.csv", parse_dates=["col1"], thousands=",") # With parse_dates, col1 should be interpreted as numpy datetime. With # thousands=',', numeric columns should be interpreted correctly even if # they are written as strings with commas. Numeric cols with null values # are inferred as floats. - expected_types = {'col1': np.dtype('datetime64[ns]'), 'col2': np.int64, 'col3': object, 'col4': np.float64} + expected_types = {"col1": np.dtype("datetime64[ns]"), "col2": np.int64, "col3": object, "col4": np.float64} for col in df.columns: self.assertEqual(df[col].dtype, expected_types[col]) # Re-write the test data since load_csv_as_df removes the file. - with open(test_file_path, 'w') as f: + with open(test_file_path, "w") as f: f.write(test_data) - df = gcs_to_bq_util.load_csv_as_df('gcs_bucket', 'test_file.csv') + df = gcs_to_bq_util.load_csv_as_df("gcs_bucket", "test_file.csv") # Without the additional read_csv args, the data are inferred to the # default object type.
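# The dtype expectations in this test are plain pandas.read_csv behavior,
# reproducible without any GCS mocking -- a minimal sketch, assuming only
# that load_csv_as_df forwards these keyword args to read_csv as the
# comments above describe (buf stands in for the bucket-backed file):
from io import StringIO
import pandas as pd

buf = StringIO('col1,col2\n20210105,"1,400"\n')
print(pd.read_csv(buf).dtypes)  # col1: int64, col2: object ("1,400" stays text)
buf.seek(0)
print(pd.read_csv(buf, parse_dates=["col1"], thousands=",").dtypes)
# col1: datetime64[ns], col2: int64 (the comma-grouped "1,400" parses to 1400)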
- expected_types = {'col1': np.int64, 'col2': object, 'col3': object, 'col4': object} + expected_types = {"col1": np.int64, "col2": object, "col3": object, "col4": object} for col in df.columns: self.assertEqual(df[col].dtype, expected_types[col]) @@ -159,8 +159,8 @@ def testLoadCsvAsDataFrame_ParseTypes(self, mock_bq: MagicMock): def test_make_bq_table_id(): - assert gcs_to_bq_util.make_bq_table_id('race', 'national', 'current') == 'race_national_current' + assert gcs_to_bq_util.make_bq_table_id("race", "national", "current") == "race_national_current" assert ( - gcs_to_bq_util.make_bq_table_id('race', 'state', 'current', category_prefix='behavioral_health') - == 'behavioral_health_race_state_current' + gcs_to_bq_util.make_bq_table_id("race", "state", "current", category_prefix="behavioral_health") + == "behavioral_health_race_state_current" ) diff --git a/python/tests/ingestion/test_graphql_ahr_measure_ids.py b/python/tests/ingestion/test_graphql_ahr_measure_ids.py index 5fe135bb7e..f7898e9ec8 100644 --- a/python/tests/ingestion/test_graphql_ahr_measure_ids.py +++ b/python/tests/ingestion/test_graphql_ahr_measure_ids.py @@ -29,24 +29,24 @@ def test_all_demographic(): - result = get_measure_ids("all", 'all', data=test_data) + result = get_measure_ids("all", "all", data=test_data) expected_ids = ["16388", "16348", "18353", "115", "176"] assert result == expected_ids def test_age_demographic(): - result = get_measure_ids("age", 'all', data=test_data) + result = get_measure_ids("age", "all", data=test_data) expected_ids = ["16374", "16372", "16373", "15969", "16369", "18358", "18389", "16367", "18356", "16368", "18357"] assert result == expected_ids def test_race_and_ethnicity_demographic(): - result = get_measure_ids("race_and_ethnicity", 'all', data=test_data) + result = get_measure_ids("race_and_ethnicity", "all", data=test_data) expected_ids = ["16376", "19988", "16375", "19987", "410", "432", "19743", "409", "431", "19742"] assert result == expected_ids def test_sex_demographic(): - result = get_measure_ids("sex", 'all', data=test_data) + result = get_measure_ids("sex", "all", data=test_data) expected_ids = ["16371", "16370", "405", "427", "404", "426"] assert result == expected_ids diff --git a/python/tests/ingestion/test_merge_utils.py b/python/tests/ingestion/test_merge_utils.py index 150c687d43..0b763f24ea 100644 --- a/python/tests/ingestion/test_merge_utils.py +++ b/python/tests/ingestion/test_merge_utils.py @@ -11,124 +11,124 @@ _data_with_bad_county_names = [ - ['state_postal', 'county_fips', 'county_name'], - ['CA', '06059', 'drop-me'], - ['GA', '13133', 'also-drop-me'], - ['VI', '78010', 'bad-county-equivalent-name'], + ["state_postal", "county_fips", "county_name"], + ["CA", "06059", "drop-me"], + ["GA", "13133", "also-drop-me"], + ["VI", "78010", "bad-county-equivalent-name"], ] _data_with_good_county_names = [ - ['state_postal', 'county_fips', 'county_name'], - ['CA', '06059', 'Orange County'], - ['GA', '13133', 'Greene County'], - ['VI', '78010', 'St. Croix'], + ["state_postal", "county_fips", "county_name"], + ["CA", "06059", "Orange County"], + ["GA", "13133", "Greene County"], + ["VI", "78010", "St. Croix"], ] _expected_merged_fips_county = [ - ['state_name', 'state_fips', 'county_fips', 'county_name'], - ['California', '06', '06059', 'Orange County'], - ['Georgia', '13', '13133', 'Greene County'], - ['U.S. Virgin Islands', '78', '78010', 'St. 
Croix'], + ["state_name", "state_fips", "county_fips", "county_name"], + ["California", "06", "06059", "Orange County"], + ["Georgia", "13", "13133", "Greene County"], + ["U.S. Virgin Islands", "78", "78010", "St. Croix"], ] _data_without_fips_codes = [ - ['state_name', 'state_postal', 'other_col'], - ['United States', 'US', 'something_cool'], - ['California', 'CA', 'something'], - ['Georgia', 'GA', 'something_else'], - ['U.S. Virgin Islands', 'VI', 'something_else_entirely'], - ['Unknown', 'Unknown', 'who_am_i'], + ["state_name", "state_postal", "other_col"], + ["United States", "US", "something_cool"], + ["California", "CA", "something"], + ["Georgia", "GA", "something_else"], + ["U.S. Virgin Islands", "VI", "something_else_entirely"], + ["Unknown", "Unknown", "who_am_i"], ] _expected_merged_fips = [ - ['state_name', 'other_col', 'state_fips'], - ['United States', 'something_cool', '00'], - ['California', 'something', '06'], - ['Georgia', 'something_else', '13'], - ['U.S. Virgin Islands', 'something_else_entirely', '78'], - ['Unknown', 'who_am_i', 'Unknown'], + ["state_name", "other_col", "state_fips"], + ["United States", "something_cool", "00"], + ["California", "something", "06"], + ["Georgia", "something_else", "13"], + ["U.S. Virgin Islands", "something_else_entirely", "78"], + ["Unknown", "who_am_i", "Unknown"], ] _data_with_only_fips_codes = [ - ['state_fips', 'other_col'], - ['00', 'something_cool'], - ['06', 'something'], - ['13', 'something_else'], - ['78', 'something_else_entirely'], + ["state_fips", "other_col"], + ["00", "something_cool"], + ["06", "something"], + ["13", "something_else"], + ["78", "something_else_entirely"], ] _expected_merged_names_from_fips = [ - ['state_name', 'other_col', 'state_fips'], - ['United States', 'something_cool', '00'], - ['California', 'something', '06'], - ['Georgia', 'something_else', '13'], - ['U.S. Virgin Islands', 'something_else_entirely', '78'], + ["state_name", "other_col", "state_fips"], + ["United States", "something_cool", "00"], + ["California", "something", "06"], + ["Georgia", "something_else", "13"], + ["U.S. 
Virgin Islands", "something_else_entirely", "78"], ] _data_without_pop_numbers = [ - ['state_fips', 'race_category_id', 'other_col'], - ['01', 'BLACK_NH', 'something_cool'], - ['01', 'WHITE_NH', 'something_else_cool'], - ['02', 'BLACK_NH', 'something_cooler'], - ['78', 'WHITE_NH', 'something_else_entirely'], - ['78', 'BLACK_NH', 'something_else_entirely'], + ["state_fips", "race_category_id", "other_col"], + ["01", "BLACK_NH", "something_cool"], + ["01", "WHITE_NH", "something_else_cool"], + ["02", "BLACK_NH", "something_cooler"], + ["78", "WHITE_NH", "something_else_entirely"], + ["78", "BLACK_NH", "something_else_entirely"], ] _expected_merged_with_pop_numbers = [ - ['state_fips', 'race_category_id', 'population', 'population_pct', 'other_col'], - ['01', 'BLACK_NH', 1318388, 26.2, 'something_cool'], - ['01', 'WHITE_NH', 3247262, 64.6, 'something_else_cool'], - ['02', 'BLACK_NH', 22400, 3.0, 'something_cooler'], - ['78', 'WHITE_NH', 11036, 12.7, 'something_else_entirely'], - ['78', 'BLACK_NH', 55936, 64.2, 'something_else_entirely'], + ["state_fips", "race_category_id", "population", "population_pct", "other_col"], + ["01", "BLACK_NH", 1318388, 26.2, "something_cool"], + ["01", "WHITE_NH", 3247262, 64.6, "something_else_cool"], + ["02", "BLACK_NH", 22400, 3.0, "something_cooler"], + ["78", "WHITE_NH", 11036, 12.7, "something_else_entirely"], + ["78", "BLACK_NH", 55936, 64.2, "something_else_entirely"], ] _data_without_pop_numbers_county = [ - ['state_fips', 'county_fips', 'race_category_id', 'other_col'], - ['01', '01001', 'BLACK_NH', 'something_cool'], - ['01', '01003', 'WHITE_NH', 'something_else_cool'], - ['01', '01005', 'BLACK_NH', 'something_cooler'], - ['78', '78010', 'BLACK_NH', 'something_territory'], + ["state_fips", "county_fips", "race_category_id", "other_col"], + ["01", "01001", "BLACK_NH", "something_cool"], + ["01", "01003", "WHITE_NH", "something_else_cool"], + ["01", "01005", "BLACK_NH", "something_cooler"], + ["78", "78010", "BLACK_NH", "something_territory"], ] _expected_merged_with_pop_numbers_county = [ [ - 'state_fips', - 'county_fips', - 'race_category_id', - 'population', - 'population_pct', - 'other_col', + "state_fips", + "county_fips", + "race_category_id", + "population", + "population_pct", + "other_col", ], - ['01', '01001', 'BLACK_NH', 11496, 19.6, 'something_cool'], - ['01', '01003', 'WHITE_NH', 192161, 82.3, 'something_else_cool'], - ['01', '01005', 'BLACK_NH', 11662, 46.9, 'something_cooler'], - ['78', '78010', 'BLACK_NH', 24995, 61.0, 'something_territory'], + ["01", "01001", "BLACK_NH", 11496, 19.6, "something_cool"], + ["01", "01003", "WHITE_NH", 192161, 82.3, "something_else_cool"], + ["01", "01005", "BLACK_NH", 11662, 46.9, "something_cooler"], + ["78", "78010", "BLACK_NH", 24995, 61.0, "something_territory"], ] _data_time_series_without_pop_numbers = [ - ['time_period', 'state_fips', 'race_category_id', 'other_col'], - ['2008', '01', 'BLACK_NH', 'something_cool'], - ['2008', '01', 'WHITE_NH', 'something_else_cool'], - ['2008', '02', 'BLACK_NH', 'something_cooler'], - ['2008', '78', 'WHITE_NH', 'something_else_entirely'], - ['2008', '78', 'BLACK_NH', 'something_else_entirely'], - ["2010", '78', 'WHITE_NH', 'something_something'], - ["2010", '78', 'BLACK_NH', 'something_something'], - ['2019', '01', 'BLACK_NH', 'something_cool'], - ['2019', '01', 'WHITE_NH', 'something_else_cool'], - ['2019', '02', 'BLACK_NH', 'something_cooler'], - ['2019', '78', 'WHITE_NH', 'something_else_entirely'], - ['2019', '78', 'BLACK_NH', 'something_else_entirely'], - 
["2021", '01', 'BLACK_NH', 'something_cool'], - ["2021", '01', 'WHITE_NH', 'something_else_cool'], - ["2021", '02', 'BLACK_NH', 'something_cooler'], - ["2021", '78', 'WHITE_NH', 'something_else_entirely'], - ["2021", '78', 'BLACK_NH', 'something_else_entirely'], - ['9999', '01', 'BLACK_NH', 'something_cool'], - ['9999', '01', 'WHITE_NH', 'something_else_cool'], - ['9999', '02', 'BLACK_NH', 'something_cooler'], - ['9999', '78', 'WHITE_NH', 'something_else_entirely'], - ['9999', '78', 'BLACK_NH', 'something_else_entirely'], + ["time_period", "state_fips", "race_category_id", "other_col"], + ["2008", "01", "BLACK_NH", "something_cool"], + ["2008", "01", "WHITE_NH", "something_else_cool"], + ["2008", "02", "BLACK_NH", "something_cooler"], + ["2008", "78", "WHITE_NH", "something_else_entirely"], + ["2008", "78", "BLACK_NH", "something_else_entirely"], + ["2010", "78", "WHITE_NH", "something_something"], + ["2010", "78", "BLACK_NH", "something_something"], + ["2019", "01", "BLACK_NH", "something_cool"], + ["2019", "01", "WHITE_NH", "something_else_cool"], + ["2019", "02", "BLACK_NH", "something_cooler"], + ["2019", "78", "WHITE_NH", "something_else_entirely"], + ["2019", "78", "BLACK_NH", "something_else_entirely"], + ["2021", "01", "BLACK_NH", "something_cool"], + ["2021", "01", "WHITE_NH", "something_else_cool"], + ["2021", "02", "BLACK_NH", "something_cooler"], + ["2021", "78", "WHITE_NH", "something_else_entirely"], + ["2021", "78", "BLACK_NH", "something_else_entirely"], + ["9999", "01", "BLACK_NH", "something_cool"], + ["9999", "01", "WHITE_NH", "something_else_cool"], + ["9999", "02", "BLACK_NH", "something_cooler"], + ["9999", "78", "WHITE_NH", "something_else_entirely"], + ["9999", "78", "BLACK_NH", "something_else_entirely"], ] # 2008 should not get pop data because it's too early for the ACS range @@ -136,55 +136,55 @@ # After RECENT_YEAR should get the same pop data as RECENT_YEAR _expected_time_series_merged_with_pop_numbers = [ [ - 'time_period', - 'state_fips', - 'race_category_id', - 'population', - 'population_pct', - 'other_col', + "time_period", + "state_fips", + "race_category_id", + "population", + "population_pct", + "other_col", ], # Pre-2009 rows should not get population data - ['2008', '01', 'BLACK_NH', np.nan, np.nan, 'something_cool'], - ['2008', '01', 'WHITE_NH', np.nan, np.nan, 'something_else_cool'], - ['2008', '02', 'BLACK_NH', np.nan, np.nan, 'something_cooler'], - ['2008', '78', 'WHITE_NH', np.nan, np.nan, 'something_else_entirely'], - ['2008', '78', 'BLACK_NH', np.nan, np.nan, 'something_else_entirely'], + ["2008", "01", "BLACK_NH", np.nan, np.nan, "something_cool"], + ["2008", "01", "WHITE_NH", np.nan, np.nan, "something_else_cool"], + ["2008", "02", "BLACK_NH", np.nan, np.nan, "something_cooler"], + ["2008", "78", "WHITE_NH", np.nan, np.nan, "something_else_entirely"], + ["2008", "78", "BLACK_NH", np.nan, np.nan, "something_else_entirely"], # Territories / Years 2009-2015 should merge against 2010 Decennial (decia_2010) - ["2010", '78', 'WHITE_NH', 14352, 13.5, 'something_something'], - ["2010", '78', 'BLACK_NH', 70379, 66.1, 'something_something'], + ["2010", "78", "WHITE_NH", 14352, 13.5, "something_something"], + ["2010", "78", "BLACK_NH", 70379, 66.1, "something_something"], # States / Years within ACS range should merge directly onto ACS years - ['2019', '01', 'BLACK_NH', 1291524, 26.5, 'something_cool'], - ['2019', '01', 'WHITE_NH', 3194929, 65.5, 'something_else_cool'], - ['2019', '02', 'BLACK_NH', 22857, 3.1, 'something_cooler'], + ["2019", "01", 
"BLACK_NH", 1291524, 26.5, "something_cool"], + ["2019", "01", "WHITE_NH", 3194929, 65.5, "something_else_cool"], + ["2019", "02", "BLACK_NH", 22857, 3.1, "something_cooler"], # Territories / Years 2016-current should merge against 2020 Decennial (decia_2020) - ['2019', '78', 'WHITE_NH', 11036, 12.7, 'something_else_entirely'], - ['2019', '78', 'BLACK_NH', 55936, 64.2, 'something_else_entirely'], + ["2019", "78", "WHITE_NH", 11036, 12.7, "something_else_entirely"], + ["2019", "78", "BLACK_NH", 55936, 64.2, "something_else_entirely"], # States / Years within ACS range should merge directly onto ACS years - ["2021", '01', 'BLACK_NH', 1316314, 26.3, 'something_cool'], - ["2021", '01', 'WHITE_NH', 3241003, 64.9, 'something_else_cool'], - ["2021", '02', 'BLACK_NH', 22787, 3.1, 'something_cooler'], + ["2021", "01", "BLACK_NH", 1316314, 26.3, "something_cool"], + ["2021", "01", "WHITE_NH", 3241003, 64.9, "something_else_cool"], + ["2021", "02", "BLACK_NH", 22787, 3.1, "something_cooler"], # Territories / Years 2016-current should merge against 2020 Decennial (decia_2020) - ["2021", '78', 'WHITE_NH', 11036, 12.7, 'something_else_entirely'], - ["2021", '78', 'BLACK_NH', 55936, 64.2, 'something_else_entirely'], + ["2021", "78", "WHITE_NH", 11036, 12.7, "something_else_entirely"], + ["2021", "78", "BLACK_NH", 55936, 64.2, "something_else_entirely"], # Years AFTER ACS range should merge against the most recent ACS year - ['9999', '01', 'BLACK_NH', 1318388, 26.2, 'something_cool'], - ['9999', '01', 'WHITE_NH', 3247262, 64.6, 'something_else_cool'], - ['9999', '02', 'BLACK_NH', 22400, 3.0, 'something_cooler'], - ['9999', '78', 'WHITE_NH', 11036, 12.7, 'something_else_entirely'], - ['9999', '78', 'BLACK_NH', 55936, 64.2, 'something_else_entirely'], + ["9999", "01", "BLACK_NH", 1318388, 26.2, "something_cool"], + ["9999", "01", "WHITE_NH", 3247262, 64.6, "something_else_cool"], + ["9999", "02", "BLACK_NH", 22400, 3.0, "something_cooler"], + ["9999", "78", "WHITE_NH", 11036, 12.7, "something_else_entirely"], + ["9999", "78", "BLACK_NH", 55936, 64.2, "something_else_entirely"], ] _data_county_time_series_without_pop_numbers = [ - ['time_period', 'state_fips', 'county_fips', 'race_category_id', 'other_col'], - ['2008', '01', '01001', 'ALL', 'something_cool'], - ['2008', '78', '78030', 'ALL', 'something_else_entirely'], - ["2010", '78', '78030', 'ALL', 'something_something'], - ['2019', '01', '01001', 'ALL', 'something_cool'], - ['2019', '78', '78030', 'ALL', 'something_else_entirely'], - ["2021", '01', '01001', 'ALL', 'something_cool'], - ["2021", '78', '78030', 'ALL', 'something_else_entirely'], - ['9999', '01', '01001', 'ALL', 'something_cool'], - ['9999', '78', '78030', 'ALL', 'something_else_entirely'], + ["time_period", "state_fips", "county_fips", "race_category_id", "other_col"], + ["2008", "01", "01001", "ALL", "something_cool"], + ["2008", "78", "78030", "ALL", "something_else_entirely"], + ["2010", "78", "78030", "ALL", "something_something"], + ["2019", "01", "01001", "ALL", "something_cool"], + ["2019", "78", "78030", "ALL", "something_else_entirely"], + ["2021", "01", "01001", "ALL", "something_cool"], + ["2021", "78", "78030", "ALL", "something_else_entirely"], + ["9999", "01", "01001", "ALL", "something_cool"], + ["9999", "78", "78030", "ALL", "something_else_entirely"], ] # 2008 should not get pop data because it's too early for the ACS range @@ -192,55 +192,55 @@ # After RECENT_YEAR should get the same pop data as RECENT_YEAR _expected_county_time_series_merged_with_pop_numbers = [ [ - 
'time_period', - 'state_fips', - 'county_fips', - 'race_category_id', - 'population', - 'population_pct', - 'other_col', + "time_period", + "state_fips", + "county_fips", + "race_category_id", + "population", + "population_pct", + "other_col", ], # Pre-2009 rows should not get population data - ['2008', '01', '01001', 'ALL', np.nan, np.nan, 'something_cool'], - ['2008', '78', '78030', 'ALL', np.nan, np.nan, 'something_else_entirely'], + ["2008", "01", "01001", "ALL", np.nan, np.nan, "something_cool"], + ["2008", "78", "78030", "ALL", np.nan, np.nan, "something_else_entirely"], # Territory Counties / Years 2009-2015 should merge against 2020 Decennial (decia_2020) since 2010 has no counties - ["2010", '78', '78030', 'ALL', 42261, 100.0, 'something_something'], + ["2010", "78", "78030", "ALL", 42261, 100.0, "something_something"], # States / Years within ACS range should merge directly onto ACS years - ['2019', '01', '01001', 'ALL', 55380, 100.0, 'something_cool'], + ["2019", "01", "01001", "ALL", 55380, 100.0, "something_cool"], # Territory Counties / Years 2016-current should merge against 2020 Decennial (decia_2020) - ['2019', '78', '78030', 'ALL', 42261, 100.0, 'something_else_entirely'], + ["2019", "78", "78030", "ALL", 42261, 100.0, "something_else_entirely"], # Counties / Years within ACS range should merge directly onto ACS years - ["2021", '01', '01001', 'ALL', 58239, 100.0, 'something_cool'], + ["2021", "01", "01001", "ALL", 58239, 100.0, "something_cool"], # Territories / Years 2016-current should merge against 2020 Decennial (decia_2020) - ["2021", '78', '78030', 'ALL', 42261, 100.0, 'something_else_entirely'], + ["2021", "78", "78030", "ALL", 42261, 100.0, "something_else_entirely"], # Years AFTER ACS range should merge against the most recent ACS year - ['9999', '01', '01001', 'ALL', 58761, 100.0, 'something_cool'], - ['9999', '78', '78030', 'ALL', 42261, 100.0, 'something_else_entirely'], + ["9999", "01", "01001", "ALL", 58761, 100.0, "something_cool"], + ["9999", "78", "78030", "ALL", 42261, 100.0, "something_else_entirely"], ] _data_without_pop_numbers_multiple_rows = [ - ['state_fips', 'race_category_id', 'cases', 'deaths'], - ['01', 'BLACK_NH', 10, 1], - ['01', 'WHITE_NH', 100, np.nan], - ['02', 'BLACK_NH', 20, np.nan], - ['78', 'WHITE_NH', 10, 2], - ['78', 'BLACK_NH', 5, 0], + ["state_fips", "race_category_id", "cases", "deaths"], + ["01", "BLACK_NH", 10, 1], + ["01", "WHITE_NH", 100, np.nan], + ["02", "BLACK_NH", 20, np.nan], + ["78", "WHITE_NH", 10, 2], + ["78", "BLACK_NH", 5, 0], ] _expected_merge_with_pop_numbers_multiple_rows = [ [ - 'state_fips', - 'race_category_id', - 'cases', - 'deaths', - 'cases_population', - 'deaths_population', + "state_fips", + "race_category_id", + "cases", + "deaths", + "cases_population", + "deaths_population", ], - ['01', 'BLACK_NH', 10, 1, 1318388, 1318388], - ['01', 'WHITE_NH', 100, np.nan, 3247262, 3247262], - ['02', 'BLACK_NH', 20, np.nan, 22400, 22400], - ['78', 'WHITE_NH', 10, 2, 11036, 11036], - ['78', 'BLACK_NH', 5, 0, 55936, 55936], + ["01", "BLACK_NH", 10, 1, 1318388, 1318388], + ["01", "WHITE_NH", 100, np.nan, 3247262, 3247262], + ["02", "BLACK_NH", 20, np.nan, 22400, 22400], + ["78", "WHITE_NH", 10, 2, 11036, 11036], + ["78", "BLACK_NH", 5, 0, 55936, 55936], ] @@ -278,7 +278,7 @@ def testMergeStateInfoByName(): drop=True ) - df = df[['state_name', 'other_col']] + df = df[["state_name", "other_col"]] expected_df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_expected_merged_fips)), dtype=str).reset_index( drop=True @@ 
-294,7 +294,7 @@ def testMergeStateInfoByPostal(): drop=True ) - df = df[['state_postal', 'other_col']] + df = df[["state_postal", "other_col"]] expected_df = gcs_to_bq_util.values_json_to_df(StringIO(json.dumps(_expected_merged_fips)), dtype=str).reset_index( drop=True @@ -310,7 +310,7 @@ def testMergeStateInfoByFips(): drop=True ) - df = df[['state_fips', 'other_col']] + df = df[["state_fips", "other_col"]] expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_merged_names_from_fips)), dtype=str @@ -331,7 +331,7 @@ def testMergePopNumbersState(): dtype={std_col.STATE_FIPS_COL: str}, ).reset_index(drop=True) - df = merge_utils.merge_pop_numbers(df, 'race', 'state') + df = merge_utils.merge_pop_numbers(df, "race", "state") assert_frame_equal(df, expected_df, check_like=True, check_dtype=False) @@ -347,7 +347,7 @@ def testMergePopNumbersCounty(): dtype={std_col.STATE_FIPS_COL: str, std_col.COUNTY_FIPS_COL: str}, ).reset_index(drop=True) - df = merge_utils.merge_pop_numbers(df, 'race', 'county') + df = merge_utils.merge_pop_numbers(df, "race", "county") assert_frame_equal(df, expected_df, check_like=True, check_dtype=False) @@ -358,7 +358,7 @@ def testMergeYearlyPopNumbers(): dtype={std_col.STATE_FIPS_COL: str, std_col.TIME_PERIOD_COL: str}, ).reset_index(drop=True) - df = merge_utils.merge_yearly_pop_numbers(df_no_pop, 'race', 'state') + df = merge_utils.merge_yearly_pop_numbers(df_no_pop, "race", "state") expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_time_series_merged_with_pop_numbers)), @@ -374,7 +374,7 @@ def testMergeYearlyCountyPopNumbers(): dtype={std_col.STATE_FIPS_COL: str, std_col.COUNTY_FIPS_COL: str, std_col.TIME_PERIOD_COL: str}, ).reset_index(drop=True) - df = merge_utils.merge_yearly_pop_numbers(df_no_pop, 'race', 'county') + df = merge_utils.merge_yearly_pop_numbers(df_no_pop, "race", "county") expected_df = gcs_to_bq_util.values_json_to_df( StringIO(json.dumps(_expected_county_time_series_merged_with_pop_numbers)), @@ -395,7 +395,7 @@ def testMergeMultiplePopCols(): dtype={std_col.STATE_FIPS_COL: str}, ).reset_index(drop=True) - df = merge_utils.merge_multiple_pop_cols(df, 'race', ['cases_population', 'deaths_population']) + df = merge_utils.merge_multiple_pop_cols(df, "race", ["cases_population", "deaths_population"]) assert_frame_equal(df, expected_df, check_like=True, check_dtype=False) @@ -406,70 +406,70 @@ def testMergeMultiplePopCols(): def test_state_sex_merge_intersectional_pop(): fake_state_by_sex_data_with_only_rates = { - 'topic_per_100k': [20, 60, 40, 50, 50, 50], - 'sex': ['Male', 'Female', 'All', 'Male', 'Female', 'All'], - 'state_fips': ['01', '01', '01', '02', '02', '02'], - 'state_name': ['Alabama', 'Alabama', 'Alabama', 'Alaska', 'Alaska', 'Alaska'], + "topic_per_100k": [20, 60, 40, 50, 50, 50], + "sex": ["Male", "Female", "All", "Male", "Female", "All"], + "state_fips": ["01", "01", "01", "02", "02", "02"], + "state_name": ["Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"], } fake_state_by_sex_data_with_rates_pop_18plus = { - 'topic_per_100k': [20, 60, 40, 50, 50, 50], - 'sex': ['Male', 'Female', 'All', 'Male', 'Female', 'All'], - 'state_fips': ['01', '01', '01', '02', '02', '02'], - 'state_name': ['Alabama', 'Alabama', 'Alabama', 'Alaska', 'Alaska', 'Alaska'], - '18plus_population': [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], + "topic_per_100k": [20, 60, 40, 50, 50, 50], + "sex": ["Male", "Female", "All", "Male", "Female", "All"], + "state_fips": ["01", "01", "01", 
"02", "02", "02"], + "state_name": ["Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"], + "18plus_population": [1878392.0, 2039058.0, 3917450.0, 294462.0, 261021.0, 555483.0], } df = pd.DataFrame(fake_state_by_sex_data_with_only_rates) - (df, intersectional_pop_col) = merge_utils.merge_intersectional_pop(df, 'state', 'sex', age_specific_group='18+') - assert intersectional_pop_col == '18plus_population' + (df, intersectional_pop_col) = merge_utils.merge_intersectional_pop(df, "state", "sex", age_specific_group="18+") + assert intersectional_pop_col == "18plus_population" assert_frame_equal(df, pd.DataFrame(fake_state_by_sex_data_with_rates_pop_18plus), check_like=True) # COUNTY BY RACE DATA fake_county_by_race_data_with_only_rates = { - 'topic_per_100k': [100, 10, 20, 50, 50, 50], - 'race_category_id': ['BLACK_NH', 'WHITE_NH', 'ALL', 'BLACK_NH', 'WHITE_NH', 'ALL'], - 'race_and_ethnicity': [ - 'Black or African American (NH)', - 'White (NH)', - 'All', - 'Black or African American (NH)', - 'White (NH)', - 'All', + "topic_per_100k": [100, 10, 20, 50, 50, 50], + "race_category_id": ["BLACK_NH", "WHITE_NH", "ALL", "BLACK_NH", "WHITE_NH", "ALL"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + "All", + "Black or African American (NH)", + "White (NH)", + "All", ], - 'county_fips': ['01001', '01001', '01001', '01003', '01003', '01003'], - 'county_name': [ - 'Autuga County', - 'Autuga County', - 'Autuga County', - 'Baldwin County', - 'Baldwin County', - 'Baldwin County', + "county_fips": ["01001", "01001", "01001", "01003", "01003", "01003"], + "county_name": [ + "Autuga County", + "Autuga County", + "Autuga County", + "Baldwin County", + "Baldwin County", + "Baldwin County", ], } fake_county_by_race_data_with_rates_and_female_pop = { - 'topic_per_100k': [100, 10, 20, 50, 50, 50], - 'race_category_id': ['BLACK_NH', 'WHITE_NH', 'ALL', 'BLACK_NH', 'WHITE_NH', 'ALL'], - 'race_and_ethnicity': [ - 'Black or African American (NH)', - 'White (NH)', - 'All', - 'Black or African American (NH)', - 'White (NH)', - 'All', + "topic_per_100k": [100, 10, 20, 50, 50, 50], + "race_category_id": ["BLACK_NH", "WHITE_NH", "ALL", "BLACK_NH", "WHITE_NH", "ALL"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + "All", + "Black or African American (NH)", + "White (NH)", + "All", ], - 'county_fips': ['01001', '01001', '01001', '01003', '01003', '01003'], - 'county_name': [ - 'Autuga County', - 'Autuga County', - 'Autuga County', - 'Baldwin County', - 'Baldwin County', - 'Baldwin County', + "county_fips": ["01001", "01001", "01001", "01003", "01003", "01003"], + "county_name": [ + "Autuga County", + "Autuga County", + "Autuga County", + "Baldwin County", + "Baldwin County", + "Baldwin County", ], - 'female_population': [6030.0, 21625.0, 30098.0, 10284.0, 98154.0, 119343.0], + "female_population": [6030.0, 21625.0, 30098.0, 10284.0, 98154.0, 119343.0], } @@ -493,51 +493,51 @@ def test_state_sex_merge_intersectional_pop(): def test_sum_age_groups(): fake_pop_data_all_ages = { - 'county_fips': ['01001'] * 23, - 'county_name': ['Autuga County '] * 23, - 'race_and_ethnicity': ['Black or African American (NH)'] * 23, - 'race_category_id': ['BLACK_NH'] * 23, - 'sex': ['All'] * 23, - 'age': [ - '0-4', - '5-9', - '10-14', - '15-17', - '18-19', - '20-20', - '21-21', - '22-24', - '25-29', - '30-34', - '35-39', - '40-44', - '45-49', - '50-54', - '55-59', - '60-61', - '62-64', - '65-66', - '67-69', - '70-74', - '75-79', - '80-84', - '85+', + "county_fips": 
["01001"] * 23, + "county_name": ["Autuga County "] * 23, + "race_and_ethnicity": ["Black or African American (NH)"] * 23, + "race_category_id": ["BLACK_NH"] * 23, + "sex": ["All"] * 23, + "age": [ + "0-4", + "5-9", + "10-14", + "15-17", + "18-19", + "20-20", + "21-21", + "22-24", + "25-29", + "30-34", + "35-39", + "40-44", + "45-49", + "50-54", + "55-59", + "60-61", + "62-64", + "65-66", + "67-69", + "70-74", + "75-79", + "80-84", + "85+", ], - 'population': [100] * 23, + "population": [100] * 23, } fake_pop_data_summed_18plus = { - 'county_fips': ['01001'] * 5, - 'county_name': ['Autuga County '] * 5, - 'race_and_ethnicity': ['Black or African American (NH)'] * 5, - 'race_category_id': ['BLACK_NH'] * 5, - 'sex': ['All'] * 5, - 'age': ['0-4', '5-9', '10-14', '15-17', '18+'], - 'population': [100, 100, 100, 100, 1900], + "county_fips": ["01001"] * 5, + "county_name": ["Autuga County "] * 5, + "race_and_ethnicity": ["Black or African American (NH)"] * 5, + "race_category_id": ["BLACK_NH"] * 5, + "sex": ["All"] * 5, + "age": ["0-4", "5-9", "10-14", "15-17", "18+"], + "population": [100, 100, 100, 100, 1900], } pop_df = pd.DataFrame(fake_pop_data_all_ages) - pop_df = merge_utils.sum_age_groups(pop_df, '18+') + pop_df = merge_utils.sum_age_groups(pop_df, "18+") expected_summed_pop_df = pd.DataFrame(fake_pop_data_summed_18plus) assert_frame_equal(pop_df, expected_summed_pop_df, check_like=True) @@ -545,51 +545,51 @@ def test_sum_age_groups(): def test_sum_states_to_national(): fake_pop_data_state_level_by_sex_by_race = { - 'state_fips': ['01', '01', '01', '01', '02', '02', '02', '02'], - 'state_name': ['Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alaska', 'Alaska', 'Alaska', 'Alaska'], - 'race_and_ethnicity': [ - 'Black or African American (NH)', - 'White (NH)', - 'Black or African American (NH)', - 'White (NH)', - 'Black or African American (NH)', - 'White (NH)', - 'Black or African American (NH)', - 'White (NH)', + "state_fips": ["01", "01", "01", "01", "02", "02", "02", "02"], + "state_name": ["Alabama", "Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska", "Alaska"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + "Black or African American (NH)", + "White (NH)", + "Black or African American (NH)", + "White (NH)", + "Black or African American (NH)", + "White (NH)", ], - 'race_category_id': [ - 'BLACK_NH', - 'WHITE_NH', - 'BLACK_NH', - 'WHITE_NH', - 'BLACK_NH', - 'WHITE_NH', - 'BLACK_NH', - 'WHITE_NH', + "race_category_id": [ + "BLACK_NH", + "WHITE_NH", + "BLACK_NH", + "WHITE_NH", + "BLACK_NH", + "WHITE_NH", + "BLACK_NH", + "WHITE_NH", ], - 'sex': ['Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'], - 'age': ['All', 'All', 'All', 'All', 'All', 'All', 'All', 'All'], - 'population': [100, 100, 100, 100, 100, 100, 100, 100], + "sex": ["Male", "Male", "Female", "Female", "Male", "Male", "Female", "Female"], + "age": ["All", "All", "All", "All", "All", "All", "All", "All"], + "population": [100, 100, 100, 100, 100, 100, 100, 100], } fake_pop_data_national_by_sex_by_race = { - 'state_fips': ['00', '00', '00', '00'], - 'state_name': ['United States', 'United States', 'United States', 'United States'], - 'race_and_ethnicity': [ - 'Black or African American (NH)', - 'White (NH)', - 'Black or African American (NH)', - 'White (NH)', + "state_fips": ["00", "00", "00", "00"], + "state_name": ["United States", "United States", "United States", "United States"], + "race_and_ethnicity": [ + "Black or African American (NH)", + "White (NH)", + 
"Black or African American (NH)", + "White (NH)", ], - 'race_category_id': [ - 'BLACK_NH', - 'WHITE_NH', - 'BLACK_NH', - 'WHITE_NH', + "race_category_id": [ + "BLACK_NH", + "WHITE_NH", + "BLACK_NH", + "WHITE_NH", ], - 'sex': ['Female', 'Female', 'Male', 'Male'], - 'age': ['All', 'All', 'All', 'All'], - 'population': [200, 200, 200, 200], + "sex": ["Female", "Female", "Male", "Male"], + "age": ["All", "All", "All", "All"], + "population": [200, 200, 200, 200], } df = pd.DataFrame(fake_pop_data_state_level_by_sex_by_race) @@ -601,21 +601,21 @@ def test_sum_states_to_national(): def test_merge_dfs_list(): # Test case: Normal case - df1 = pd.DataFrame({'STATE': ['STATE1', 'STATE2'], 'RACE': ['RACE1', 'RACE2'], 'C': ['C1', 'C2']}) + df1 = pd.DataFrame({"STATE": ["STATE1", "STATE2"], "RACE": ["RACE1", "RACE2"], "C": ["C1", "C2"]}) - df2 = pd.DataFrame({'STATE': ['STATE1', 'STATE2'], 'RACE': ['RACE1', 'RACE2'], 'D': ['D1', 'D2']}) + df2 = pd.DataFrame({"STATE": ["STATE1", "STATE2"], "RACE": ["RACE1", "RACE2"], "D": ["D1", "D2"]}) - df3 = pd.DataFrame({'STATE': ['STATE1', 'STATE2'], 'RACE': ['RACE1', 'RACE2'], 'E': ['E1', 'E2']}) + df3 = pd.DataFrame({"STATE": ["STATE1", "STATE2"], "RACE": ["RACE1", "RACE2"], "E": ["E1", "E2"]}) expected_df = pd.DataFrame( { - 'STATE': ['STATE1', 'STATE2'], - 'RACE': ['RACE1', 'RACE2'], - 'C': ['C1', 'C2'], - 'D': ['D1', 'D2'], - 'E': ['E1', 'E2'], + "STATE": ["STATE1", "STATE2"], + "RACE": ["RACE1", "RACE2"], + "C": ["C1", "C2"], + "D": ["D1", "D2"], + "E": ["E1", "E2"], } ) - result_df = merge_utils.merge_dfs_list([df1, df2, df3], ['STATE', 'RACE']) + result_df = merge_utils.merge_dfs_list([df1, df2, df3], ["STATE", "RACE"]) pd.testing.assert_frame_equal(result_df, expected_df) diff --git a/python/tests/ingestion/test_standardized_columns.py b/python/tests/ingestion/test_standardized_columns.py index c674b9cae6..2647c32c77 100644 --- a/python/tests/ingestion/test_standardized_columns.py +++ b/python/tests/ingestion/test_standardized_columns.py @@ -8,13 +8,13 @@ def test_extract_prefix(): with pytest.raises(ValueError): - extract_prefix('') + extract_prefix("") with pytest.raises(ValueError): - extract_prefix('something_without_any_known_suffix') + extract_prefix("something_without_any_known_suffix") - assert extract_prefix('specific_disease_per_100k') == 'specific_disease' - assert extract_prefix('some_prefix_estimated_total') == 'some_prefix' + assert extract_prefix("specific_disease_per_100k") == "specific_disease" + assert extract_prefix("some_prefix_estimated_total") == "some_prefix" def test_ends_with_suffix_from_list(): diff --git a/python/tests/ingestion/test_url_file_to_gcs.py b/python/tests/ingestion/test_url_file_to_gcs.py index 709911a30b..cc91a730bc 100644 --- a/python/tests/ingestion/test_url_file_to_gcs.py +++ b/python/tests/ingestion/test_url_file_to_gcs.py @@ -25,13 +25,14 @@ def write_to_file(file_to_write, contents): def initialize_mocks(mock_storage_client, mock_requests_get, response_data, gcs_data, blob_download_side_effect=None): if blob_download_side_effect is None: + def blob_download_side_effect(test_old_file): write_to_file(test_old_file, gcs_data) + mock_storage_instance = mock_storage_client.return_value - blob_attrs = { - 'download_to_file.side_effect': blob_download_side_effect} + blob_attrs = {"download_to_file.side_effect": blob_download_side_effect} mock_blob = Mock(**blob_attrs) - bucket_attrs = {'blob.return_value': mock_blob} + bucket_attrs = {"blob.return_value": mock_blob} mock_bucket = Mock(**bucket_attrs) 
mock_storage_instance.get_bucket.return_value = mock_bucket mock_requests_get.return_value = MockResponse(response_data) @@ -39,36 +40,44 @@ def blob_download_side_effect(test_old_file): class URLFileToGCSTest(unittest.TestCase): def testDownloadFirstUrlToGcs_SameFile(self): - test_data = b'fake data' - with patch('ingestion.url_file_to_gcs.storage.Client') as mock_storage_client, \ - patch('requests.get') as mock_requests_get: - initialize_mocks(mock_storage_client, - mock_requests_get, test_data, test_data) + test_data = b"fake data" + with patch("ingestion.url_file_to_gcs.storage.Client") as mock_storage_client, patch( + "requests.get" + ) as mock_requests_get: + initialize_mocks(mock_storage_client, mock_requests_get, test_data, test_data) result = url_file_to_gcs.download_first_url_to_gcs( - ['https://testurl.com'], 'test_bucket', 'test_destination') + ["https://testurl.com"], "test_bucket", "test_destination" + ) self.assertFalse(result) def testDownloadFirstUrlToGcs_DiffFile(self): - with patch('ingestion.url_file_to_gcs.storage.Client') as mock_storage_client, \ - patch('requests.get') as mock_requests_get: - initialize_mocks(mock_storage_client, - mock_requests_get, b'data from url', b'gcs data') + with patch("ingestion.url_file_to_gcs.storage.Client") as mock_storage_client, patch( + "requests.get" + ) as mock_requests_get: + initialize_mocks(mock_storage_client, mock_requests_get, b"data from url", b"gcs data") result = url_file_to_gcs.download_first_url_to_gcs( - ['https://testurl.com'], 'test_bucket', 'test_destination') + ["https://testurl.com"], "test_bucket", "test_destination" + ) self.assertTrue(result) def testDownloadFirstUrlToGcs_NoGCSFile(self): - with patch('ingestion.url_file_to_gcs.storage.Client') as mock_storage_client, \ - patch('requests.get') as mock_requests_get: - initialize_mocks(mock_storage_client, - mock_requests_get, b'data from url', b'gcs data', - blob_download_side_effect=google.cloud.exceptions.NotFound('test error')) + with patch("ingestion.url_file_to_gcs.storage.Client") as mock_storage_client, patch( + "requests.get" + ) as mock_requests_get: + initialize_mocks( + mock_storage_client, + mock_requests_get, + b"data from url", + b"gcs data", + blob_download_side_effect=google.cloud.exceptions.NotFound("test error"), + ) result = url_file_to_gcs.download_first_url_to_gcs( - ['https://testurl.com'], 'test_bucket', 'test_destination') + ["https://testurl.com"], "test_bucket", "test_destination" + ) self.assertTrue(result) diff --git a/run_gcs_to_bq/main.py b/run_gcs_to_bq/main.py index b8531a8478..79506844ad 100644 --- a/run_gcs_to_bq/main.py +++ b/run_gcs_to_bq/main.py @@ -6,28 +6,28 @@ app = Flask(__name__) -@app.route('/', methods=['POST']) +@app.route("/", methods=["POST"]) def ingest_bucket_to_bq(): """Main function for moving data from buckets to bigquery. 
Triggered by notify-data-ingested topic.""" envelope = request.get_json() if not envelope: - logging.error('No Pub/Sub message received.') - return ('', 400) + logging.error("No Pub/Sub message received.") + return ("", 400) - if not isinstance(envelope, dict) or 'message' not in envelope: - logging.error('Invalid Pub/Sub message format') - return ('', 400) + if not isinstance(envelope, dict) or "message" not in envelope: + logging.error("Invalid Pub/Sub message format") + return ("", 400) - event = envelope['message'] + event = envelope["message"] logging.info("Received message: %s", event) try: do_ingestion(event) - return ('', 204) + return ("", 204) except Exception as e: logging.exception(e) - return ('', 400) + return ("", 400) def do_ingestion(event): @@ -37,24 +37,24 @@ def do_ingestion(event): event: Dict containing the Pub/Sub method. The payload will be a base-64 encoded string in the 'data' field with additional attributes in the 'attributes' field.""" - is_airflow_run = event['is_airflow_run'] + is_airflow_run = event["is_airflow_run"] if is_airflow_run: attrs = event else: - if 'attributes' not in event: + if "attributes" not in event: raise RuntimeError("PubSub message missing 'attributes' field") - attrs = event['attributes'] - if 'id' not in attrs or 'gcs_bucket' not in attrs: + attrs = event["attributes"] + if "id" not in attrs or "gcs_bucket" not in attrs: raise RuntimeError("PubSub data missing 'id' or 'gcs_bucket' field") - workflow_id = attrs.pop('id') - gcs_bucket = attrs.pop('gcs_bucket') + workflow_id = attrs.pop("id") + gcs_bucket = attrs.pop("gcs_bucket") - dataset = attrs.pop('dataset', None) + dataset = attrs.pop("dataset", None) if dataset is None: - if 'DATASET_NAME' not in os.environ: + if "DATASET_NAME" not in os.environ: raise RuntimeError("Environment variable DATASET_NAME missing.") - dataset = os.environ['DATASET_NAME'] + dataset = os.environ["DATASET_NAME"] if workflow_id not in DATA_SOURCES_DICT.keys(): raise RuntimeError(f"ID: {workflow_id}, is not a valid id") @@ -66,4 +66,4 @@ def do_ingestion(event): if __name__ == "__main__": - app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) + app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) diff --git a/run_gcs_to_bq/test_run_gcs_to_bq.py b/run_gcs_to_bq/test_run_gcs_to_bq.py index ccd308db9c..80bcbf026c 100644 --- a/run_gcs_to_bq/test_run_gcs_to_bq.py +++ b/run_gcs_to_bq/test_run_gcs_to_bq.py @@ -7,55 +7,55 @@ @pytest.fixture def client(): - main.app.config['TESTING'] = True + main.app.config["TESTING"] = True with main.app.test_client() as client: yield client def test_ingest_bucket_to_bq_no_json(client): - response = client.post('/', content_type='application/json') + response = client.post("/", content_type="application/json") assert response.status_code == 400 def test_ingest_bucket_to_bq_invalid_format(client): - response = client.post('/', json={}) + response = client.post("/", json={}) assert response.status_code == 400 def test_ingest_bucket_to_bq_with_exception(client): - with mock.patch('main.do_ingestion') as mock_do_ingestion: - mock_do_ingestion.side_effect = Exception('Something went wrong') - response = client.post('/', json={'message': {}}) + with mock.patch("main.do_ingestion") as mock_do_ingestion: + mock_do_ingestion.side_effect = Exception("Something went wrong") + response = client.post("/", json={"message": {}}) assert response.status_code == 400 def test_do_ingestion_missing_attributes(): - mock_event = {'is_airflow_run': False} + 
mock_event = {"is_airflow_run": False} with pytest.raises(RuntimeError): main.do_ingestion(mock_event) def test_do_ingestion_missing_id_and_gcs_bucket(): - mock_event = {'is_airflow_run': False, 'attributes': {}} + mock_event = {"is_airflow_run": False, "attributes": {}} with pytest.raises(RuntimeError): main.do_ingestion(mock_event) def test_do_ingestion_missing_dataset_env_variable(monkeypatch): - mock_event = {'is_airflow_run': False, 'attributes': {'id': '123', 'gcs_bucket': 'bucket'}} - monkeypatch.delenv('DATASET_NAME', raising=False) + mock_event = {"is_airflow_run": False, "attributes": {"id": "123", "gcs_bucket": "bucket"}} + monkeypatch.delenv("DATASET_NAME", raising=False) with pytest.raises(RuntimeError): main.do_ingestion(mock_event) # Additional tests for missing 'id' and 'gcs_bucket' def test_do_ingestion_missing_id(): - mock_event = {'is_airflow_run': False, 'attributes': {'gcs_bucket': 'bucket'}} + mock_event = {"is_airflow_run": False, "attributes": {"gcs_bucket": "bucket"}} with pytest.raises(RuntimeError): main.do_ingestion(mock_event) def test_do_ingestion_missing_gcs_bucket(): - mock_event = {'is_airflow_run': False, 'attributes': {'id': '123'}} + mock_event = {"is_airflow_run": False, "attributes": {"id": "123"}} with pytest.raises(RuntimeError): main.do_ingestion(mock_event) diff --git a/run_ingestion/main.py b/run_ingestion/main.py index 040e3b1024..6878ac3682 100644 --- a/run_ingestion/main.py +++ b/run_ingestion/main.py @@ -6,34 +6,35 @@ import os from datasources.data_sources import DATA_SOURCES_DICT from flask import Flask, request + app = Flask(__name__) -@app.route('/', methods=['POST']) +@app.route("/", methods=["POST"]) def ingest_data(): """Main function for data ingestion. Receives Pub/Sub trigger and triages - to the appropriate data ingestion workflow. + to the appropriate data ingestion workflow. - Returns 400 for a bad request and 204 for successful new file downloads - or 201 for successful non file download execution.""" + Returns 400 for a bad request and 204 for successful new file downloads + or 201 for successful non file download execution.""" envelope = request.get_json() if not envelope: - logging.error('No Pub/Sub message received.') - return ('', HTTPStatus.BAD_REQUEST) + logging.error("No Pub/Sub message received.") + return ("", HTTPStatus.BAD_REQUEST) - if not isinstance(envelope, dict) or 'message' not in envelope: - logging.error('Invalid Pub/Sub message format') - return ('', HTTPStatus.BAD_REQUEST) + if not isinstance(envelope, dict) or "message" not in envelope: + logging.error("Invalid Pub/Sub message format") + return ("", HTTPStatus.BAD_REQUEST) - event = envelope['message'] + event = envelope["message"] logging.info(f"message: {event}") try: ingest_data_to_gcs(event) - return ('', HTTPStatus.CREATED) + return ("", HTTPStatus.CREATED) except Exception as e: logging.exception(e) - return ('', HTTPStatus.BAD_REQUEST) + return ("", HTTPStatus.BAD_REQUEST) def ingest_data_to_gcs(event): @@ -45,32 +46,31 @@ def ingest_data_to_gcs(event): event: Dict containing the Pub/Sub method. The payload will be a base-64 encoded string in the 'data' field. 
""" - is_airflow_run = event['is_airflow_run'] + is_airflow_run = event["is_airflow_run"] if is_airflow_run: event_dict = event else: - if 'data' not in event: + if "data" not in event: raise RuntimeError("PubSub message missing 'data' field") - data = base64.b64decode(event['data']).decode('utf-8') + data = base64.b64decode(event["data"]).decode("utf-8") event_dict = json.loads(data) attrs = event_dict.copy() - if 'id' not in attrs or 'gcs_bucket' not in attrs: + if "id" not in attrs or "gcs_bucket" not in attrs: raise RuntimeError("PubSub data missing 'id' or 'gcs_bucket' field") - workflow_id = attrs.pop('id') - gcs_bucket = attrs.pop('gcs_bucket') + workflow_id = attrs.pop("id") + gcs_bucket = attrs.pop("gcs_bucket") logging.info("Data ingestion received message: %s", workflow_id) if workflow_id not in DATA_SOURCES_DICT.keys(): - raise RuntimeError("ID: {}, is not a valid id".format(workflow_id)) + raise RuntimeError(f"ID: {workflow_id}, is not a valid id") data_source = DATA_SOURCES_DICT[workflow_id] data_source.upload_to_gcs(gcs_bucket, **attrs) - logging.info( - "Successfully uploaded data to GCS for workflow %s", workflow_id) + logging.info("Successfully uploaded data to GCS for workflow %s", workflow_id) if __name__ == "__main__": - app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) + app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) diff --git a/run_ingestion/test_run_ingestion.py b/run_ingestion/test_run_ingestion.py index 25c8efbb7f..1180fb56fd 100644 --- a/run_ingestion/test_run_ingestion.py +++ b/run_ingestion/test_run_ingestion.py @@ -6,23 +6,23 @@ @pytest.fixture def client(): - main.app.config['TESTING'] = True + main.app.config["TESTING"] = True with main.app.test_client() as client: yield client def test_ingest_data_no_json(client): - response = client.post('/', content_type='application/json') + response = client.post("/", content_type="application/json") assert response.status_code == HTTPStatus.BAD_REQUEST def test_ingest_data_invalid_format(client): - response = client.post('/', json={}) + response = client.post("/", json={}) assert response.status_code == HTTPStatus.BAD_REQUEST def test_ingest_data_with_exception(client): - with patch('main.ingest_data_to_gcs') as mock_ingest_data_to_gcs: - mock_ingest_data_to_gcs.side_effect = Exception('Something went wrong') - response = client.post('/', json={'message': {}}) + with patch("main.ingest_data_to_gcs") as mock_ingest_data_to_gcs: + mock_ingest_data_to_gcs.side_effect = Exception("Something went wrong") + response = client.post("/", json={"message": {}}) assert response.status_code == HTTPStatus.BAD_REQUEST