def formula(household, period, parameters):
    """Assign each household a ZIP code sampled from within its own state.

    ZIP codes are drawn from ``ZIP_CODE_DATASET`` with probability
    proportional to the ``population`` column, restricted to rows whose
    ``state`` matches the household's ``state_code_str``.

    When the simulation has axes, a single ZIP code is drawn per state and
    shared by every household in that state, so the ZIP code does not vary
    along the axes. Otherwise each household gets an independent draw
    (with replacement).

    Households whose state has no rows in ``ZIP_CODE_DATASET`` keep the
    variable's default value, ``"UNKNOWN"``.

    Args:
        household: Population object; called to fetch ``state_code_str``
            and used to reach ``simulation.has_axes``.
        period: Period for which the variable is computed.
        parameters: Unused; kept for the standard formula signature.

    Returns:
        pandas.Series with one entry per household, holding the ZIP code
        as a string left-padded with zeros to five characters.
    """
    state_code = household("state_code_str", period)
    single_draw_per_state = household.simulation.has_axes

    # Start from the default so states missing from the dataset never leak
    # placeholders such as "0None" or "00nan" into the result.
    household_zip_code = np.full(
        np.shape(state_code), "UNKNOWN", dtype=object
    )

    # Only visit states that actually occur in this simulation.
    for state in pd.unique(np.asarray(state_code)):
        state_zip_codes = ZIP_CODE_DATASET[ZIP_CODE_DATASET.state == state]
        if state_zip_codes.empty:
            continue

        in_state = state_code == state
        draws = 1 if single_draw_per_state else int(in_state.sum())
        sampled = state_zip_codes.sample(
            draws, weights="population", replace=True
        ).zip_code
        # Format per state, before mixing with the "UNKNOWN" default, so
        # numeric ZIP codes are never upcast to float by missing values.
        formatted = sampled.astype(str).str.zfill(5).to_numpy()

        if single_draw_per_state:
            # Scalar assignment: every household in the state shares it.
            household_zip_code[in_state] = formatted[0]
        else:
            household_zip_code[in_state] = formatted

    # Always return a Series, including for a single household (the old
    # `.squeeze()` turned that case into a scalar and broke `.str`).
    return pd.Series(household_zip_code, dtype=object)