From 919673c140ca7036d0385d298849dd521e1a1421 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 21 Oct 2024 11:58:53 +0100 Subject: [PATCH 01/21] Remove obsolete time use data --- scripts/1_prep_synthpop.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/scripts/1_prep_synthpop.py b/scripts/1_prep_synthpop.py index 4d3d0e1..3df2dd8 100644 --- a/scripts/1_prep_synthpop.py +++ b/scripts/1_prep_synthpop.py @@ -27,35 +27,6 @@ def main(config_file): acbm.root_path / f"data/external/spc_output/{region}_people_hh.parquet" ) - # People and time-use data - # Subset of (non-time-use) features to include and unnest - # The features can be found here: https://github.com/alan-turing-institute/uatk-spc/blob/main/synthpop.proto - features = { - "health": [ - "bmi", - "has_cardiovascular_disease", - "has_diabetes", - "has_high_blood_pressure", - "self_assessed_health", - "life_satisfaction", - ], - "demographics": ["age_years", "ethnicity", "sex", "nssec8"], - "employment": ["sic1d2007", "sic2d2007", "pwkstat", "salary_yearly"], - } - - # build the table - spc_people_tu = ( - Builder(path, region, backend="polars", input_type="parquet") - .add_households() - .add_time_use_diaries(features, diary_type="weekday_diaries") - .build() - ) - - # save the output - spc_people_tu.write_parquet( - acbm.root_path / f"data/external/spc_output/{region}_people_tu.parquet" - ) - if __name__ == "__main__": main() From db525102b42df962b2a116e733f5fb3f65950d0d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 21 Oct 2024 12:00:02 +0100 Subject: [PATCH 02/21] Fix hard coded region --- scripts/2_match_households_and_individuals.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index aaf766b..7595cf60 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -39,14 +39,12 @@ def get_interim_path( # ### SPC - # useful variables - region = "leeds" - logger.info("Loading SPC data") # Read in the spc data (parquet format) spc = pd.read_parquet( - acbm.root_path / "data/external/spc_output/" f"{region}_people_hh.parquet" + acbm.root_path / "data/external/spc_output/" + f"{config.region}_people_hh.parquet" ) logger.info("Filtering SPC data to specific columns") From 91e9f07b51c7e34b2228d30b3f6caa2219b21717 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 10:11:11 +0100 Subject: [PATCH 03/21] Refactor loop logic to improve performance --- src/acbm/matching.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 9508fa7..88cafc2 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -264,13 +264,16 @@ def match_individuals( # Remove all unmateched households matches_hh = {key: value for key, value in matches_hh.items() if not pd.isna(value)} - # loop over all rows in the matches_hh dictionary - for i, (key, value) in enumerate(matches_hh.items(), 1): - # Get the rows in df1 and df2 that correspond to the matched hids - rows_df1 = df1[df1[df1_id] == key] + # loop over all groups of df1_id + for i, (key, rows_df1) in df1.groupby(df1_id): + try: + value = matches_hh[key] + except Exception: + # Continue if key not in matches_hh + continue rows_df2 = df2[df2[df2_id] == int(value)] - if show_progress: + if show_progress and i % 100 == 0: # Print the iteration number and the number of keys in the dict print(f"Matching for household {i} out of: {len(matches_hh)}") From 69535ef9cbe3c0c3d2622881f684bdccad89a319 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 10:26:35 +0100 Subject: [PATCH 04/21] fix: add enumerate --- src/acbm/matching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 88cafc2..4b54376 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -265,7 +265,7 @@ def match_individuals( matches_hh = {key: value for key, value in matches_hh.items() if not pd.isna(value)} # loop over all groups of df1_id - for i, (key, rows_df1) in df1.groupby(df1_id): + for i, (key, rows_df1) in enumerate(df1.groupby(df1_id), 1): try: value = matches_hh[key] except Exception: From 0d291d61c9f518fa4f849df7f7e2e6b8cfd3aad0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 10:36:09 +0100 Subject: [PATCH 05/21] Add matching config and option to load household matches --- scripts/2_match_households_and_individuals.py | 333 +++++++++--------- src/acbm/config.py | 6 + 2 files changed, 180 insertions(+), 159 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 7595cf60..c3be6a5 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -685,194 +685,209 @@ def get_interim_path( ) # fill the NaNs with the original values # ## Step 3: Matching at Household Level + if not config.matching.load_hh: + logger.info("Categorical matching: MATCHING HOUSEHOLDS") + + # + # Now that we've prepared all the columns, we can start matching. + + # ### 3.1 Categorical matching + # + # We will match on (a subset of) the following columns: + # + # | Matching variable | NTS column | SPC column | + # | ------------------| ---------- | ---------- | + # | Household income | `HHIncome2002_BO2ID` | `salary_yearly_hh_cat` | + # | Number of adults | `HHoldNumAdults` | `num_adults` | + # | Number of children | `HHoldNumChildren` | `num_children` | + # | Employment status | `HHoldEmploy_B01ID` | `pwkstat_NTS_match` | + # | Car ownership | `NumCar_SPC_match` | `num_cars` | + # | Type of tenancy | `tenure_nts_for_matching` | `tenure_spc_for_matching` | + # | Rural/Urban Classification | `Settlement2011EW_B03ID` | `Settlement2011EW_B03ID_spc_CD` | + + # Prepare SPC df for matching + + # Select multiple columns + spc_matching = spc_edited[ + [ + "hid", + "salary_yearly_hh_cat", + "num_adults", + "num_children", + "num_pension_age", + "pwkstat_NTS_match", + "num_cars", + "tenure_spc_for_matching", + "Settlement2011EW_B03ID_spc_CD", + "Settlement2011EW_B04ID_spc_CD", + ] + ] - logger.info("Categorical matching: MATCHING HOUSEHOLDS") + # edit the df so that we have one row per hid + spc_matching = spc_matching.drop_duplicates(subset="hid") + + spc_matching.head(10) + + # Prepare NTS df for matching + + nts_matching = nts_households[ + [ + "HouseholdID", + "HHIncome2002_B02ID", + "HHoldNumAdults", + "HHoldNumChildren", + "num_pension_age_nts", + "HHoldEmploy_B01ID", + "NumCar_SPC_match", + "tenure_nts_for_matching", + "Settlement2011EW_B03ID", + "Settlement2011EW_B04ID", + ] + ] - # - # Now that we've prepared all the columns, we can start matching. + # Dictionary of matching columns. We extract column names from this dictioary when matching on a subset of the columns - # ### 3.1 Categorical matching - # - # We will match on (a subset of) the following columns: - # - # | Matching variable | NTS column | SPC column | - # | ------------------| ---------- | ---------- | - # | Household income | `HHIncome2002_BO2ID` | `salary_yearly_hh_cat` | - # | Number of adults | `HHoldNumAdults` | `num_adults` | - # | Number of children | `HHoldNumChildren` | `num_children` | - # | Employment status | `HHoldEmploy_B01ID` | `pwkstat_NTS_match` | - # | Car ownership | `NumCar_SPC_match` | `num_cars` | - # | Type of tenancy | `tenure_nts_for_matching` | `tenure_spc_for_matching` | - # | Rural/Urban Classification | `Settlement2011EW_B03ID` | `Settlement2011EW_B03ID_spc_CD` | - - # Prepare SPC df for matching - - # Select multiple columns - spc_matching = spc_edited[ - [ - "hid", - "salary_yearly_hh_cat", - "num_adults", - "num_children", + # column_names (keys) for the dictionary + matching_ids = [ + "household_id", + "yearly_income", + "number_adults", + "number_children", "num_pension_age", - "pwkstat_NTS_match", - "num_cars", - "tenure_spc_for_matching", - "Settlement2011EW_B03ID_spc_CD", - "Settlement2011EW_B04ID_spc_CD", + "employment_status", + "number_cars", + "tenure_status", + "rural_urban_2_categories", + "rural_urban_4_categories", ] - ] - # edit the df so that we have one row per hid - spc_matching = spc_matching.drop_duplicates(subset="hid") - - spc_matching.head(10) + # Dict with value qual to a list with spc_matching and nts_matching column names + matching_dfs_dict = { + column_name: [spc_value, nts_value] + for column_name, spc_value, nts_value in zip( + matching_ids, spc_matching, nts_matching + ) + } - # Prepare NTS df for matching + # We match iteratively on a subset of columns. We start with all columns, and then remove + # one of the optionals columns at a time (relaxing the condition). Once a household has over n + # matches, we stop matching it to more matches. We continue until all optional columns are removed - nts_matching = nts_households[ - [ - "HouseholdID", - "HHIncome2002_B02ID", - "HHoldNumAdults", - "HHoldNumChildren", - "num_pension_age_nts", - "HHoldEmploy_B01ID", - "NumCar_SPC_match", - "tenure_nts_for_matching", - "Settlement2011EW_B03ID", - "Settlement2011EW_B04ID", + # Define required columns for matching + required_columns = [ + "number_adults", + "number_children", ] - ] - # Dictionary of matching columns. We extract column names from this dictioary when matching on a subset of the columns - - # column_names (keys) for the dictionary - matching_ids = [ - "household_id", - "yearly_income", - "number_adults", - "number_children", - "num_pension_age", - "employment_status", - "number_cars", - "tenure_status", - "rural_urban_2_categories", - "rural_urban_4_categories", - ] + # Define optional columns in order of importance (most to least important) + optional_columns = [ + "number_cars", + "num_pension_age", + "rural_urban_2_categories", + "employment_status", + "tenure_status", + ] - # Dict with value qual to a list with spc_matching and nts_matching column names - matching_dfs_dict = { - column_name: [spc_value, nts_value] - for column_name, spc_value, nts_value in zip( - matching_ids, spc_matching, nts_matching + matcher_exact = MatcherExact( + df_pop=spc_matching, + df_pop_id="hid", + df_sample=nts_matching, + df_sample_id="HouseholdID", + matching_dict=matching_dfs_dict, + fixed_cols=required_columns, + optional_cols=optional_columns, + n_matches=10, + chunk_size=50000, + show_progress=True, ) - } - # We match iteratively on a subset of columns. We start with all columns, and then remove - # one of the optionals columns at a time (relaxing the condition). Once a household has over n - # matches, we stop matching it to more matches. We continue until all optional columns are removed + # Match - # Define required columns for matching - required_columns = [ - "number_adults", - "number_children", - ] + matches_hh_level = matcher_exact.iterative_match_categorical() - # Define optional columns in order of importance (most to least important) - optional_columns = [ - "number_cars", - "num_pension_age", - "rural_urban_2_categories", - "employment_status", - "tenure_status", - ] + # Number of unmatched households - matcher_exact = MatcherExact( - df_pop=spc_matching, - df_pop_id="hid", - df_sample=nts_matching, - df_sample_id="HouseholdID", - matching_dict=matching_dfs_dict, - fixed_cols=required_columns, - optional_cols=optional_columns, - n_matches=10, - chunk_size=50000, - show_progress=True, - ) - - # Match - - matches_hh_level = matcher_exact.iterative_match_categorical() + # no. of keys where value is na + na_count = sum([1 for v in matches_hh_level.values() if pd.isna(v).all()]) - # Number of unmatched households - - # no. of keys where value is na - na_count = sum([1 for v in matches_hh_level.values() if pd.isna(v).all()]) - - logger.info(f"Categorical matching: {na_count} households in the SPC had no match") - logger.info( - f"{round((na_count / len(matches_hh_level)) * 100, 1)}% of households in the SPC had no match" - ) + logger.info( + f"Categorical matching: {na_count} households in the SPC had no match" + ) + logger.info( + f"{round((na_count / len(matches_hh_level)) * 100, 1)}% of households in the SPC had no match" + ) - ## add matches_hh_level as a column in spc_edited - spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level) + ## add matches_hh_level as a column in spc_edited + spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level) - # ### Random Sampling from matched households + # ### Random Sampling from matched households - logger.info("Categorical matching: Randomly choosing one match per household") - # - # In categorical matching, many households in the SPC are matched to more than 1 household in the NTS. Which household to choose? We do random sampling + logger.info("Categorical matching: Randomly choosing one match per household") + # + # In categorical matching, many households in the SPC are matched to more than 1 household in the NTS. Which household to choose? We do random sampling - # for each key in the dictionary, sample 1 of the values associated with it and store it in a new dictionary + # for each key in the dictionary, sample 1 of the values associated with it and store it in a new dictionary - """ - - iterate over each key-value pair in the matches_hh_result dictionary. - - For each key-value pair, use np.random.choice(value) to randomly select - one item from the list of values associated with the current key. - - create a new dictionary hid_to_HouseholdID_sample where each key from the - original dictionary is associated with one randomly selected value from the - original list of values. + """ + - iterate over each key-value pair in the matches_hh_result dictionary. + - For each key-value pair, use np.random.choice(value) to randomly select + one item from the list of values associated with the current key. + - create a new dictionary hid_to_HouseholdID_sample where each key from the + original dictionary is associated with one randomly selected value from the + original list of values. - """ - # Randomly sample one match per household if it has one match or more - matches_hh_level_sample = { - key: np.random.choice(value) - for key, value in matches_hh_level.items() - if value - and not pd.isna( - np.random.choice(value) - ) # Ensure the value list is not empty and the selected match is not NaN - } - - # Multiple matches in case we want to try stochastic runs - - # Same logic as above, but repeat it multiple times and store each result as a separate dictionary in a list - matches_hh_level_sample_list = [ - { + """ + # Randomly sample one match per household if it has one match or more + matches_hh_level_sample = { key: np.random.choice(value) for key, value in matches_hh_level.items() - if value and not pd.isna(np.random.choice(value)) + if value + and not pd.isna( + np.random.choice(value) + ) # Ensure the value list is not empty and the selected match is not NaN } - for i in range(25) # Repeat the process 25 times - ] - logger.info("Categorical matching: Random sampling complete") + # Multiple matches in case we want to try stochastic runs - # Save results - logger.info("Categorical matching: Saving results") - # random sample - with open( - get_interim_path("matches_hh_level_categorical_random_sample.pkl"), "wb" - ) as f: - pkl.dump(matches_hh_level_sample, f) + # Same logic as above, but repeat it multiple times and store each result as a separate dictionary in a list + matches_hh_level_sample_list = [ + { + key: np.random.choice(value) + for key, value in matches_hh_level.items() + if value and not pd.isna(np.random.choice(value)) + } + for i in range(25) # Repeat the process 25 times + ] - # multiple random samples - with open( - get_interim_path("matches_hh_level_categorical_random_sample_multiple.pkl"), - "wb", - ) as f: - pkl.dump(matches_hh_level_sample_list, f) + logger.info("Categorical matching: Random sampling complete") + + # Save results + logger.info("Categorical matching: Saving results") + # random sample + with open( + get_interim_path("matches_hh_level_categorical_random_sample.pkl"), "wb" + ) as f: + pkl.dump(matches_hh_level_sample, f) + + # multiple random samples + with open( + get_interim_path("matches_hh_level_categorical_random_sample_multiple.pkl"), + "wb", + ) as f: + pkl.dump(matches_hh_level_sample_list, f) + else: + # Load matching result + with open( + get_interim_path("matches_hh_level_categorical_random_sample.pkl"), "rb" + ) as f: + matches_hh_level_sample = pkl.load(f) + + # multiple random samples + with open( + get_interim_path("matches_hh_level_categorical_random_sample_multiple.pkl"), + "rb", + ) as f: + matches_hh_level_sample_list = pkl.load(f) # Do the same at the df level. Add nts_hh_id_sample column to the spc df diff --git a/src/acbm/config.py b/src/acbm/config.py index 1ae03fd..98e0e58 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -17,6 +17,11 @@ class Parameters(BaseModel): boundary_geography: str +@dataclass(frozen=True) +class MatchingParams(BaseModel): + load_hh: bool + + @dataclass(frozen=True) class WorkAssignmentParams(BaseModel): use_percentages: bool @@ -30,6 +35,7 @@ class Config(BaseModel): work_assignment: WorkAssignmentParams = Field( description="Config: parameters for work assignment." ) + matching: MatchingParams = Field(description="Config: parameters for matching.") @property def seed(self) -> int: From ea5d7f0ac9fe1c12ede9f8918b922fd02c6151c4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 17:19:25 +0100 Subject: [PATCH 06/21] Subset columns for SPC with NTS output dataframe --- scripts/2_match_households_and_individuals.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index c3be6a5..2652428 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -8,6 +8,7 @@ # from joblib import Parallel, delayed # from tqdm import trange import acbm +from acbm.assigning.utils import cols_for_assignment_all from acbm.cli import acbm_cli from acbm.config import load_config from acbm.logger_config import matching_logger as logger @@ -1114,9 +1115,20 @@ def get_interim_path( # convert the nts_ind_id column to int for merging spc_edited_copy["nts_ind_id"] = spc_edited_copy["nts_ind_id"].astype(int) + # Add output columns required for assignment scripts + spc_output_cols = [ + col for col in spc_edited_copy.columns if col in cols_for_assignment_all() + ] + nts_output_cols = [ + col for col in nts_trips.columns if col in cols_for_assignment_all() + ] + # merge the copy with nts_trips using IndividualID - spc_edited_copy = spc_edited_copy.merge( - nts_trips, left_on="nts_ind_id", right_on="IndividualID", how="left" + spc_edited_copy = spc_edited_copy[spc_output_cols].merge( + nts_trips[nts_output_cols], + left_on="nts_ind_id", + right_on="IndividualID", + how="left", ) # save the file as a parquet file From 2dea50dc7eb9720d64c0f55851da9537d2b3c971 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 17:24:22 +0100 Subject: [PATCH 07/21] Add option to load individual matches --- scripts/2_match_households_and_individuals.py | 116 +++++++++--------- src/acbm/config.py | 3 +- 2 files changed, 63 insertions(+), 56 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 2652428..9fb01d1 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -908,69 +908,75 @@ def get_interim_path( # # - logger.info("Statistical matching: MATCHING INDIVIDUALS") - - # Create an 'age' column in the SPC that matches the NTS categories - - # create a dictionary for reference on how the labels for "Age_B04ID" match the actual age brackets - - # dict_nts_ind_age = {-10: 'DEAD', - # -8: 'NA', - # 1: '0-4', - # 2: '5-10', - # 3: '11-16', - # 4: '17-20', - # 5: '21-29', - # 6: '30-39', - # 7: '40-49', - # 8: '50-59', - # 9: '60+' - # } - - # Define the bins and labels based on dict_nts_ind_age - bins = [0, 4, 10, 16, 20, 29, 39, 49, 59, np.inf] - labels = [1, 2, 3, 4, 5, 6, 7, 8, 9] - - # Create a new column in spc_edited that maps the age_years to the keys of dict_nts_ind_age - spc_edited["age_group"] = ( - pd.cut(spc_edited["age_years"], bins=bins, labels=labels) - .astype("int") - .fillna(-8) - ) + if not config.matching.load_ind: + logger.info("Statistical matching: MATCHING INDIVIDUALS") + + # Create an 'age' column in the SPC that matches the NTS categories + + # create a dictionary for reference on how the labels for "Age_B04ID" match the actual age brackets + + # dict_nts_ind_age = {-10: 'DEAD', + # -8: 'NA', + # 1: '0-4', + # 2: '5-10', + # 3: '11-16', + # 4: '17-20', + # 5: '21-29', + # 6: '30-39', + # 7: '40-49', + # 8: '50-59', + # 9: '60+' + # } + + # Define the bins and labels based on dict_nts_ind_age + bins = [0, 4, 10, 16, 20, 29, 39, 49, 59, np.inf] + labels = [1, 2, 3, 4, 5, 6, 7, 8, 9] + + # Create a new column in spc_edited that maps the age_years to the keys of dict_nts_ind_age + spc_edited["age_group"] = ( + pd.cut(spc_edited["age_years"], bins=bins, labels=labels) + .astype("int") + .fillna(-8) + ) - # rename nts columns in preparation for matching + # rename nts columns in preparation for matching - nts_individuals.rename( - columns={"Age_B04ID": "age_group", "Sex_B01ID": "sex"}, inplace=True - ) + nts_individuals.rename( + columns={"Age_B04ID": "age_group", "Sex_B01ID": "sex"}, inplace=True + ) - # PSM matching using internal match_individuals function + # PSM matching using internal match_individuals function - matches_ind = match_individuals( - df1=spc_edited, - df2=nts_individuals, - matching_columns=["age_group", "sex"], - df1_id="hid", - df2_id="HouseholdID", - matches_hh=matches_hh_level_sample, - show_progress=True, - ) + matches_ind = match_individuals( + df1=spc_edited, + df2=nts_individuals, + matching_columns=["age_group", "sex"], + df1_id="hid", + df2_id="HouseholdID", + matches_hh=matches_hh_level_sample, + show_progress=True, + ) - # Add matches_ind values to spc_edited using map - spc_edited["nts_ind_id"] = spc_edited.index.map(matches_ind) + # Add matches_ind values to spc_edited using map + spc_edited["nts_ind_id"] = spc_edited.index.map(matches_ind) - # add the nts_individuals.IndividualID to spc_edit. The current nts_ind_id is the row index of nts_individuals - spc_edited["nts_ind_id"] = spc_edited["nts_ind_id"].map( - nts_individuals["IndividualID"] - ) + # add the nts_individuals.IndividualID to spc_edit. The current nts_ind_id is the row index of nts_individuals + spc_edited["nts_ind_id"] = spc_edited["nts_ind_id"].map( + nts_individuals["IndividualID"] + ) - logger.info("Statistical matching: Matching complete") + logger.info("Statistical matching: Matching complete") - # save random sample - with open( - get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "wb" - ) as f: - pkl.dump(matches_ind, f) + # save random sample + with open( + get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "wb" + ) as f: + pkl.dump(matches_ind, f) + else: + with open( + get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "rb" + ) as f: + matches_ind = pkl.load(f) # ### Match on multiple samples diff --git a/src/acbm/config.py b/src/acbm/config.py index 98e0e58..cf48101 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -19,7 +19,8 @@ class Parameters(BaseModel): @dataclass(frozen=True) class MatchingParams(BaseModel): - load_hh: bool + load_hh: bool | None = False + load_ind: bool | None = False @dataclass(frozen=True) From 3dc8831d5927d33612f6a338221645cf921cfc23 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 17:27:14 +0100 Subject: [PATCH 08/21] Fix missing column required for merge --- scripts/2_match_households_and_individuals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 9fb01d1..056777d 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -1127,7 +1127,7 @@ def get_interim_path( ] nts_output_cols = [ col for col in nts_trips.columns if col in cols_for_assignment_all() - ] + ] + ["IndividualID"] # merge the copy with nts_trips using IndividualID spc_edited_copy = spc_edited_copy[spc_output_cols].merge( From 9cbbe8a770cc332b45159385276cdd5a78a17a38 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 17:31:36 +0100 Subject: [PATCH 09/21] Add logging for loading case --- scripts/2_match_households_and_individuals.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 056777d..9493acf 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -877,6 +877,7 @@ def get_interim_path( ) as f: pkl.dump(matches_hh_level_sample_list, f) else: + logger.info("Categorical matching: loading matched households") # Load matching result with open( get_interim_path("matches_hh_level_categorical_random_sample.pkl"), "rb" @@ -973,6 +974,7 @@ def get_interim_path( ) as f: pkl.dump(matches_ind, f) else: + logger.info("Statistical matching: loading matched individuals") with open( get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "rb" ) as f: From 6e5c396ef287c646fc0e9de4af06f68eb699a6e6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 17:50:48 +0100 Subject: [PATCH 10/21] Reorganise code for matching results to be loaded --- scripts/2_match_households_and_individuals.py | 99 ++++++++++--------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 9493acf..367c696 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -818,9 +818,6 @@ def get_interim_path( f"{round((na_count / len(matches_hh_level)) * 100, 1)}% of households in the SPC had no match" ) - ## add matches_hh_level as a column in spc_edited - spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level) - # ### Random Sampling from matched households logger.info("Categorical matching: Randomly choosing one match per household") @@ -864,6 +861,11 @@ def get_interim_path( # Save results logger.info("Categorical matching: Saving results") + + # matching results + with open(get_interim_path("matches_hh_level_categorical.pkl"), "wb") as f: + pkl.dump(matches_hh_level, f) + # random sample with open( get_interim_path("matches_hh_level_categorical_random_sample.pkl"), "wb" @@ -891,6 +893,10 @@ def get_interim_path( ) as f: matches_hh_level_sample_list = pkl.load(f) + ## add matches_hh_level as a column in spc_edited + # TODO: update other scripts to only add this in-memory + # spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level) + # Do the same at the df level. Add nts_hh_id_sample column to the spc df # # for each hid in spc_edited, sample a value from the nts_hh_id col. @@ -909,45 +915,44 @@ def get_interim_path( # # - if not config.matching.load_ind: - logger.info("Statistical matching: MATCHING INDIVIDUALS") + # Create an 'age' column in the SPC that matches the NTS categories + + # create a dictionary for reference on how the labels for "Age_B04ID" match the actual age brackets + + # dict_nts_ind_age = {-10: 'DEAD', + # -8: 'NA', + # 1: '0-4', + # 2: '5-10', + # 3: '11-16', + # 4: '17-20', + # 5: '21-29', + # 6: '30-39', + # 7: '40-49', + # 8: '50-59', + # 9: '60+' + # } + + # Define the bins and labels based on dict_nts_ind_age + bins = [0, 4, 10, 16, 20, 29, 39, 49, 59, np.inf] + labels = [1, 2, 3, 4, 5, 6, 7, 8, 9] + + # Create a new column in spc_edited that maps the age_years to the keys of dict_nts_ind_age + spc_edited["age_group"] = ( + pd.cut(spc_edited["age_years"], bins=bins, labels=labels) + .astype("int") + .fillna(-8) + ) - # Create an 'age' column in the SPC that matches the NTS categories - - # create a dictionary for reference on how the labels for "Age_B04ID" match the actual age brackets - - # dict_nts_ind_age = {-10: 'DEAD', - # -8: 'NA', - # 1: '0-4', - # 2: '5-10', - # 3: '11-16', - # 4: '17-20', - # 5: '21-29', - # 6: '30-39', - # 7: '40-49', - # 8: '50-59', - # 9: '60+' - # } - - # Define the bins and labels based on dict_nts_ind_age - bins = [0, 4, 10, 16, 20, 29, 39, 49, 59, np.inf] - labels = [1, 2, 3, 4, 5, 6, 7, 8, 9] - - # Create a new column in spc_edited that maps the age_years to the keys of dict_nts_ind_age - spc_edited["age_group"] = ( - pd.cut(spc_edited["age_years"], bins=bins, labels=labels) - .astype("int") - .fillna(-8) - ) + # rename nts columns in preparation for matching - # rename nts columns in preparation for matching + nts_individuals.rename( + columns={"Age_B04ID": "age_group", "Sex_B01ID": "sex"}, inplace=True + ) - nts_individuals.rename( - columns={"Age_B04ID": "age_group", "Sex_B01ID": "sex"}, inplace=True - ) + if not config.matching.load_ind: + logger.info("Statistical matching: MATCHING INDIVIDUALS") # PSM matching using internal match_individuals function - matches_ind = match_individuals( df1=spc_edited, df2=nts_individuals, @@ -958,16 +963,6 @@ def get_interim_path( show_progress=True, ) - # Add matches_ind values to spc_edited using map - spc_edited["nts_ind_id"] = spc_edited.index.map(matches_ind) - - # add the nts_individuals.IndividualID to spc_edit. The current nts_ind_id is the row index of nts_individuals - spc_edited["nts_ind_id"] = spc_edited["nts_ind_id"].map( - nts_individuals["IndividualID"] - ) - - logger.info("Statistical matching: Matching complete") - # save random sample with open( get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "wb" @@ -980,6 +975,16 @@ def get_interim_path( ) as f: matches_ind = pkl.load(f) + # Add matches_ind values to spc_edited using map + spc_edited["nts_ind_id"] = spc_edited.index.map(matches_ind) + + # add the nts_individuals.IndividualID to spc_edit. The current nts_ind_id is the row index of nts_individuals + spc_edited["nts_ind_id"] = spc_edited["nts_ind_id"].map( + nts_individuals["IndividualID"] + ) + + logger.info("Statistical matching: Matching complete") + # ### Match on multiple samples # logger.info("Statistical matching: Matching on multiple samples") From 773d58950a2eb8da2378f6ed7ec46074e032caa1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 18:03:34 +0100 Subject: [PATCH 11/21] Update nts_hh_id to be sample instead of full list --- scripts/2_match_households_and_individuals.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 367c696..04e7a93 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -893,9 +893,12 @@ def get_interim_path( ) as f: matches_hh_level_sample_list = pkl.load(f) + # TODO: check if this: + # - column is required and possibly update other scripts to add this column in-memory since it is large + # - or can use the single sample hh for the new column + # For now, updated to use the sample dictionary ## add matches_hh_level as a column in spc_edited - # TODO: update other scripts to only add this in-memory - # spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level) + spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level_sample) # Do the same at the df level. Add nts_hh_id_sample column to the spc df From b0fbb56f060c558256cc8fea48affe8110566be5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 18:04:41 +0100 Subject: [PATCH 12/21] Add todo for "nts_hh_id" --- scripts/3.2.3_assign_secondary_zone.py | 1 + src/acbm/assigning/utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/3.2.3_assign_secondary_zone.py b/scripts/3.2.3_assign_secondary_zone.py index d649a70..dfc7418 100644 --- a/scripts/3.2.3_assign_secondary_zone.py +++ b/scripts/3.2.3_assign_secondary_zone.py @@ -224,6 +224,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr "id", "household", "nts_ind_id", + # TODO: check if this column is required "nts_hh_id", "age_years", "oact", diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index 24e3561..1fb3ddc 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -14,6 +14,7 @@ def cols_for_assignment_all() -> list[str]: "household", "oact", "nts_ind_id", + # TODO: check if this column is required "nts_hh_id", "age_years", "TripDisIncSW", From de4de69dd8524f83379252d3fa581eec177b001b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 18:20:56 +0100 Subject: [PATCH 13/21] Add comment for individual matching --- src/acbm/matching.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 4b54376..87fea06 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -265,6 +265,9 @@ def match_individuals( matches_hh = {key: value for key, value in matches_hh.items() if not pd.isna(value)} # loop over all groups of df1_id + # note: for large populations looping through the groups (keys) of the + # large dataframe (assumed to be df1) is more efficient than looping + # over keys and subsetting on a key in each iteration. for i, (key, rows_df1) in enumerate(df1.groupby(df1_id), 1): try: value = matches_hh[key] From 38a43e224351736b63c34b3662b3d53b95625980 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 22 Oct 2024 18:30:00 +0100 Subject: [PATCH 14/21] Update config tomls --- config/base_500.toml | 7 +++++-- config/base_all.toml | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/config/base_500.toml b/config/base_500.toml index d9164a4..9c7ee5e 100644 --- a/config/base_500.toml +++ b/config/base_500.toml @@ -1,11 +1,14 @@ [parameters] seed = 0 -region = "leeds" +region = "greater-london" number_of_households = 500 zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" +[matching] +load_hh = false +load_ind = false [work_assignment] use_percentages = true diff --git a/config/base_all.toml b/config/base_all.toml index bb1cc1e..ebe0431 100644 --- a/config/base_all.toml +++ b/config/base_all.toml @@ -1,10 +1,13 @@ [parameters] seed = 0 -region = "leeds" +region = "greater-london" zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" +[matching] +load_hh = false +load_ind = false [work_assignment] use_percentages = false From c8f63962a7dfb4841f8b07b3004de89da607e221 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 23 Oct 2024 21:08:45 +0100 Subject: [PATCH 15/21] Add required and optional columns to config --- scripts/2_match_households_and_individuals.py | 20 ++----------------- src/acbm/config.py | 13 ++++++++++++ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 04e7a93..bc6de99 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -773,30 +773,14 @@ def get_interim_path( # We match iteratively on a subset of columns. We start with all columns, and then remove # one of the optionals columns at a time (relaxing the condition). Once a household has over n # matches, we stop matching it to more matches. We continue until all optional columns are removed - - # Define required columns for matching - required_columns = [ - "number_adults", - "number_children", - ] - - # Define optional columns in order of importance (most to least important) - optional_columns = [ - "number_cars", - "num_pension_age", - "rural_urban_2_categories", - "employment_status", - "tenure_status", - ] - matcher_exact = MatcherExact( df_pop=spc_matching, df_pop_id="hid", df_sample=nts_matching, df_sample_id="HouseholdID", matching_dict=matching_dfs_dict, - fixed_cols=required_columns, - optional_cols=optional_columns, + fixed_cols=list(config.matching.required_columns), + optional_cols=list(config.matching.optional_columns), n_matches=10, chunk_size=50000, show_progress=True, diff --git a/src/acbm/config.py b/src/acbm/config.py index cf48101..61f594b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -21,6 +21,19 @@ class Parameters(BaseModel): class MatchingParams(BaseModel): load_hh: bool | None = False load_ind: bool | None = False + # Define required columns for matching + required_columns: list[str] | tuple[str] = ( + "number_adults", + "number_children", + ) + # Define optional columns in order of importance (most to least important) + optional_columns: list[str] | tuple[str] = ( + "number_cars", + "num_pension_age", + "rural_urban_2_categories", + "employment_status", + "tenure_status", + ) @dataclass(frozen=True) From 073fd460b33fcbcd9ef66a8389be2cffb2d9a5b1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 23 Oct 2024 21:12:13 +0100 Subject: [PATCH 16/21] Extend config for matching and update tomls --- config/base_500.toml | 1 + config/base_5000.toml | 7 ++++++- config/base_all.toml | 1 + scripts/2_match_households_and_individuals.py | 4 ++-- src/acbm/config.py | 2 ++ src/acbm/matching.py | 16 ++++++++++------ 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/config/base_500.toml b/config/base_500.toml index 9c7ee5e..1356bd9 100644 --- a/config/base_500.toml +++ b/config/base_500.toml @@ -9,6 +9,7 @@ boundary_geography = "OA" [matching] load_hh = false load_ind = false +n_matches = 10 [work_assignment] use_percentages = true diff --git a/config/base_5000.toml b/config/base_5000.toml index d4f0134..3e857c4 100644 --- a/config/base_5000.toml +++ b/config/base_5000.toml @@ -3,9 +3,14 @@ seed = 0 region = "leeds" number_of_households = 5000 zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" +[matching] +load_hh = false +load_ind = false +n_matches = 10 + [work_assignment] use_percentages = true weight_max_dev = 0.2 diff --git a/config/base_all.toml b/config/base_all.toml index ebe0431..5bef78a 100644 --- a/config/base_all.toml +++ b/config/base_all.toml @@ -8,6 +8,7 @@ boundary_geography = "OA" [matching] load_hh = false load_ind = false +n_matches = 10 [work_assignment] use_percentages = false diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index bc6de99..0e4fb69 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -781,8 +781,8 @@ def get_interim_path( matching_dict=matching_dfs_dict, fixed_cols=list(config.matching.required_columns), optional_cols=list(config.matching.optional_columns), - n_matches=10, - chunk_size=50000, + n_matches=config.matching.n_matches, + chunk_size=config.matching.chunk_size, show_progress=True, ) diff --git a/src/acbm/config.py b/src/acbm/config.py index 61f594b..b0cf15b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -34,6 +34,8 @@ class MatchingParams(BaseModel): "employment_status", "tenure_status", ) + n_matches: int | None = None + chunk_size: int = 50_000 @dataclass(frozen=True) diff --git a/src/acbm/matching.py b/src/acbm/matching.py index 87fea06..0be0dfd 100644 --- a/src/acbm/matching.py +++ b/src/acbm/matching.py @@ -18,7 +18,7 @@ class MatcherExact: matching_dict: Dict[str, List[str]] fixed_cols: List[str] optional_cols: List[str] - n_matches: int = 5 + n_matches: int | None = 10 chunk_size: int = 50000 show_progress: bool = True matched_dict: Dict[str, List[str]] = field( @@ -147,11 +147,15 @@ def iterative_match_categorical(self) -> Dict[str, List[str]]: self.matched_dict[pop_id].extend(unique_sample_ids) self.match_count[pop_id] += len(unique_sample_ids) - matched_ids = [ - pop_id - for pop_id, count in self.match_count.items() - if count >= self.n_matches - ] + matched_ids = ( + [ + pop_id + for pop_id, count in self.match_count.items() + if count >= self.n_matches + ] + if self.n_matches is not None + else [] + ) self.remaining_df_pop = self.remaining_df_pop[ ~self.remaining_df_pop[self.df_pop_id].isin(matched_ids) ] From 37216aa445ef255557fda34868f2ea5ffaf831ef Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 24 Oct 2024 07:33:00 +0100 Subject: [PATCH 17/21] Add commute_level config --- config/base_all.toml | 2 +- scripts/3.2.2_assign_primary_zone_work.py | 8 ++++++-- src/acbm/config.py | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/config/base_all.toml b/config/base_all.toml index 5bef78a..e2d99c4 100644 --- a/config/base_all.toml +++ b/config/base_all.toml @@ -2,7 +2,7 @@ seed = 0 region = "greater-london" zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = false # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" [matching] diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 82ee9ee..6dc6bb2 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -62,8 +62,12 @@ def main(config_file): # Commuting matrices (from 2021 census) - # TODO: consider making this configurable - commute_level = config.boundary_geography # "OA" or "MSOA" data + # "OA" or "MSOA" data: set as config.boundary_geography if not passed + commute_level = ( + config.boundary_geography + if config.work_assignment.commute_level is None + else config.work_assignment.commute_level + ) logger.info(f"Loading commuting matrices at {commute_level} level") diff --git a/src/acbm/config.py b/src/acbm/config.py index b0cf15b..0d3eb0f 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -44,6 +44,7 @@ class WorkAssignmentParams(BaseModel): weight_max_dev: float weight_total_dev: float max_zones: int + commute_level: str | None class Config(BaseModel): From 6722b1a725effb8817757cfc3fb52b1f7d4e00b5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 24 Oct 2024 10:20:53 +0100 Subject: [PATCH 18/21] Update pre-commit, format, add raise from error --- .pre-commit-config.yaml | 2 +- scripts/3.2.2_assign_primary_zone_work.py | 16 ++++++++-------- src/acbm/__init__.py | 1 + src/acbm/config.py | 2 +- tests/test_matching.py | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 227e436..c1a3dc4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.2.0" + rev: "v0.7.0" hooks: # first, lint + autofix - id: ruff diff --git a/scripts/3.2.2_assign_primary_zone_work.py b/scripts/3.2.2_assign_primary_zone_work.py index 6dc6bb2..1c1da11 100644 --- a/scripts/3.2.2_assign_primary_zone_work.py +++ b/scripts/3.2.2_assign_primary_zone_work.py @@ -260,20 +260,20 @@ def main(config_file): workzone_assignment_opt["pct_of_o_total_actual"] = workzone_assignment_opt.groupby( "origin_zone" )["demand_actual"].transform(lambda x: (x / x.sum()) * 100) - workzone_assignment_opt[ - "pct_of_o_total_assigned" - ] = workzone_assignment_opt.groupby("origin_zone")["demand_assigned"].transform( - lambda x: (x / x.sum()) * 100 + workzone_assignment_opt["pct_of_o_total_assigned"] = ( + workzone_assignment_opt.groupby( + "origin_zone" + )["demand_assigned"].transform(lambda x: (x / x.sum()) * 100) ) # (3) For each OD pair, demand as % of total demand to each destination workzone_assignment_opt["pct_of_d_total_actual"] = workzone_assignment_opt.groupby( "assigned_zone" )["demand_actual"].transform(lambda x: (x / x.sum()) * 100) - workzone_assignment_opt[ - "pct_of_d_total_assigned" - ] = workzone_assignment_opt.groupby("assigned_zone")["demand_assigned"].transform( - lambda x: (x / x.sum()) * 100 + workzone_assignment_opt["pct_of_d_total_assigned"] = ( + workzone_assignment_opt.groupby( + "assigned_zone" + )["demand_assigned"].transform(lambda x: (x / x.sum()) * 100) ) # Define the output file path diff --git a/src/acbm/__init__.py b/src/acbm/__init__.py index 0171ca8..d630574 100644 --- a/src/acbm/__init__.py +++ b/src/acbm/__init__.py @@ -1,6 +1,7 @@ """ acbm: A package to create activity-based models (for transport demand modelling) """ + from __future__ import annotations import os diff --git a/src/acbm/config.py b/src/acbm/config.py index 0d3eb0f..08788e3 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -85,7 +85,7 @@ def init_rng(self): random.seed(self.seed) except Exception as err: msg = f"config does not provide a rng seed with err: {err}" - ValueError(msg) + raise ValueError(msg) from err def load_config(filepath: str | Path) -> Config: diff --git a/tests/test_matching.py b/tests/test_matching.py index fcd17f1..6aa7727 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -4,7 +4,7 @@ from acbm.matching import MatcherExact, match_psm # noqa: F401 -@pytest.fixture() +@pytest.fixture def setup_data(): df_pop = pd.DataFrame( { From 44a87d5c022eda4f10a23bac009d05a5bb8bddb5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 1 Nov 2024 14:16:55 +0000 Subject: [PATCH 19/21] Only retain single base.toml --- config/base_500.toml | 18 ------------------ config/base_5000.toml | 18 ------------------ config/base_all.toml | 17 ----------------- 3 files changed, 53 deletions(-) delete mode 100644 config/base_500.toml delete mode 100644 config/base_5000.toml delete mode 100644 config/base_all.toml diff --git a/config/base_500.toml b/config/base_500.toml deleted file mode 100644 index 1356bd9..0000000 --- a/config/base_500.toml +++ /dev/null @@ -1,18 +0,0 @@ -[parameters] -seed = 0 -region = "greater-london" -number_of_households = 500 -zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography -boundary_geography = "OA" - -[matching] -load_hh = false -load_ind = false -n_matches = 10 - -[work_assignment] -use_percentages = true -weight_max_dev = 0.2 -weight_total_dev = 0.8 -max_zones = 8 diff --git a/config/base_5000.toml b/config/base_5000.toml deleted file mode 100644 index 3e857c4..0000000 --- a/config/base_5000.toml +++ /dev/null @@ -1,18 +0,0 @@ -[parameters] -seed = 0 -region = "leeds" -number_of_households = 5000 -zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography -boundary_geography = "OA" - -[matching] -load_hh = false -load_ind = false -n_matches = 10 - -[work_assignment] -use_percentages = true -weight_max_dev = 0.2 -weight_total_dev = 0.8 -max_zones = 8 diff --git a/config/base_all.toml b/config/base_all.toml deleted file mode 100644 index e2d99c4..0000000 --- a/config/base_all.toml +++ /dev/null @@ -1,17 +0,0 @@ -[parameters] -seed = 0 -region = "greater-london" -zone_id = "OA21CD" -travel_times = false # Only set to true if you have travel time matrix at the level specified in boundary_geography -boundary_geography = "OA" - -[matching] -load_hh = false -load_ind = false -n_matches = 10 - -[work_assignment] -use_percentages = false -weight_max_dev = 0.0 -weight_total_dev = 1.0 -max_zones = 4 From 0ca21949a104bd04643f5fc610a42bf86241fbe0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 1 Nov 2024 14:17:29 +0000 Subject: [PATCH 20/21] Update gitignore --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 13c4a8c..f036118 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,9 @@ logs/ # pyright config pyrightconfig.json + +# scratch +notebooks/scratch* + +# AcBM config +config/ From 72c7c218dc273d786dbdff68473e55afe2a1f067 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 1 Nov 2024 14:23:33 +0000 Subject: [PATCH 21/21] Remove load_hh and load_ind from config --- config/base.toml | 14 ++++++++++++-- scripts/2_match_households_and_individuals.py | 8 ++++++-- src/acbm/config.py | 17 ++--------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/config/base.toml b/config/base.toml index 38a4f2f..9829be3 100644 --- a/config/base.toml +++ b/config/base.toml @@ -1,11 +1,21 @@ [parameters] seed = 0 region = "leeds" -number_of_households = 10000 +number_of_households = 5000 zone_id = "OA21CD" -travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography boundary_geography = "OA" +[matching] +required_columns = ["number_adults", "number_children"] +optional_columns = [ + "number_cars", + "num_pension_age", + "rural_urban_2_categories", + "employment_status", + "tenure_status", +] +n_matches = 10 [work_assignment] use_percentages = true diff --git a/scripts/2_match_households_and_individuals.py b/scripts/2_match_households_and_individuals.py index 0e4fb69..982e97c 100644 --- a/scripts/2_match_households_and_individuals.py +++ b/scripts/2_match_households_and_individuals.py @@ -686,7 +686,9 @@ def get_interim_path( ) # fill the NaNs with the original values # ## Step 3: Matching at Household Level - if not config.matching.load_hh: + # TODO: remove once refactored into two scripts + load_households = False + if not load_households: logger.info("Categorical matching: MATCHING HOUSEHOLDS") # @@ -936,7 +938,9 @@ def get_interim_path( columns={"Age_B04ID": "age_group", "Sex_B01ID": "sex"}, inplace=True ) - if not config.matching.load_ind: + # TODO: remove once refactored into two scripts + load_individuals = False + if not load_individuals: logger.info("Statistical matching: MATCHING INDIVIDUALS") # PSM matching using internal match_individuals function diff --git a/src/acbm/config.py b/src/acbm/config.py index 08788e3..9ea592b 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -19,21 +19,8 @@ class Parameters(BaseModel): @dataclass(frozen=True) class MatchingParams(BaseModel): - load_hh: bool | None = False - load_ind: bool | None = False - # Define required columns for matching - required_columns: list[str] | tuple[str] = ( - "number_adults", - "number_children", - ) - # Define optional columns in order of importance (most to least important) - optional_columns: list[str] | tuple[str] = ( - "number_cars", - "num_pension_age", - "rural_urban_2_categories", - "employment_status", - "tenure_status", - ) + required_columns: list[str] + optional_columns: list[str] n_matches: int | None = None chunk_size: int = 50_000