Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

498 sampled_flag is providing flag for census groups #122

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion mbs_results/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,5 +123,8 @@
"0817":15,
"0867":15,
"0823":16,
"0873":16}
"0873":16},

"census_extra_calibration_group": [5043, 5113, 5123, 5203, 5233,
5403, 5643, 5763, 5783, 5903, 6073]
}
42 changes: 36 additions & 6 deletions mbs_results/estimation/apply_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,19 @@
calculate_design_weight,
)
from mbs_results.estimation.pre_processing_estimation import get_estimation_data
from mbs_results.staging.data_cleaning import is_census

# from mbs_results.estimation.validate_estimation import validate_estimation


def apply_estimation(population_path, sample_path, period, **config):
def apply_estimation(
population_path,
sample_path,
calibration_group,
census_extra_calibration_group,
period,
**config
):
"""
Read population frame and sample, merge key variables onto df then derive
and validate estimation weights.
Expand All @@ -22,6 +30,10 @@ def apply_estimation(population_path, sample_path, period, **config):
filepath for population frame data
sample_path : str
filepath for sample data
calibration_group: str
column name of dimension containing calibration group values
census_extra_calibration_group: list
calibration groups which are census but not band 4 or 5
period : str
name of column containing period

Expand All @@ -45,12 +57,30 @@ def apply_estimation(population_path, sample_path, period, **config):
population_file, sample_file, period, **config
)

estimation_data = calculate_design_weight(estimation_data, period, **config)
estimation_data = calculate_calibration_factor(
estimation_data, period, **config
)
census_df = estimation_data[
is_census(
estimation_data[calibration_group], census_extra_calibration_group
)
]

census_df["design_weight"] = 1
census_df["calibration_factor"] = 1
census_df["sampled"] = 0

non_census_df = estimation_data[
~(
is_census(
estimation_data[calibration_group], census_extra_calibration_group
)
)
]

non_census_df = calculate_design_weight(non_census_df, period, **config)
non_census_df = calculate_calibration_factor(non_census_df, period, **config)

all_together = pd.concat([non_census_df, census_df], ignore_index=True)

estimation_df_list.append(estimation_data)
estimation_df_list.append(all_together)

estimation_df = pd.concat(estimation_df_list, ignore_index=True)

Expand Down
Empty file added mbs_results/staging/__init__.py
Empty file.
75 changes: 17 additions & 58 deletions mbs_results/staging/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,72 +381,31 @@ def create_imputation_class(
return df


# TODO: Can be used when we set defaults in other parts of the pipeline
def correct_values(
df: pd.DataFrame,
columns_to_correct: List[str] or str,
condition_column: str,
condition_values: List[int],
replace_with: int,
) -> pd.DataFrame:
def is_census(calibration_group: pd.Series, extra_bands: List) -> pd.Series:
"""
Sets values in a dataframe column(s) based on a condition, checks if
columns exists prior to correction to avoid creating them.
Returns a bool series indicating if calibration group is considered census
or not.

Calibration groups in extra_bands list are considered census groups.

Calibration groups ending with 4 or 5 are considered census groups.


Parameters
----------
df : pd.DataFrame
Original dataframe to apply the correction.
columns_to_correct : List(str) or str
Column(s) to set values.
condition_column : str
Column for the condition.
condition_values : List(int)
Values which exist in condition_column.
replace_with : int
Value to set if condition is met.
calibration_group : pd.Series
Series with calibration groups
extra_bands: List
Extra calibration groups which are census

Returns
-------
df : pd.DataFrame
Dataframe with values replaced.

Examples
--------
>>> df = pd.DataFrame({'a': [0, 1, 2, 3, 4],
'b': [5, 6, 7, 8, 9],
'band': [1,2,3,4,5]})
>>> df
a b band
0 0 5 1
1 1 6 2
2 2 7 3
3 3 8 4
4 4 9 5

>>> df2 = correct_values(df,["a","b"],"band",[4,5],1)
>>> df2
a b band
0 0 5 1
1 1 6 2
2 2 7 3
3 1 1 4
4 1 1 5
pd.Series
A bool series, TRUE if calibration group is census
"""

df_temp = df.copy() # to avoid changing input df

check_columns = (
columns_to_correct + [condition_column] # list + list(str)
if pd.api.types.is_list_like(columns_to_correct)
else [columns_to_correct, condition_column]
)
rule_band_4_5 = calibration_group.astype(str).map(lambda x: x.endswith(("4", "5")))

# Update value only if columns exist
if set(check_columns).issubset(df.columns):

df_temp.loc[df[condition_column].isin(condition_values), columns_to_correct] = (
replace_with
)
rule_extra_bands = calibration_group.isin(extra_bands)

return df_temp
return rule_band_4_5 | rule_extra_bands
10 changes: 0 additions & 10 deletions tests/data/staging/data_cleaning/test_correct_values.csv

This file was deleted.

21 changes: 21 additions & 0 deletions tests/data/staging/data_cleaning/test_is_cencus.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
calibration_group,is_census
5043,TRUE
5113,TRUE
5123,TRUE
5203,TRUE
5233,TRUE
5403,TRUE
5643,TRUE
5763,TRUE
5783,TRUE
5903,TRUE
6073,TRUE
6005,TRUE
6004,TRUE
6003,FALSE
6001,FALSE
6006,FALSE
6007,FALSE
6008,FALSE
6009,FALSE
6000,FALSE
33 changes: 22 additions & 11 deletions tests/staging/test_data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

import pandas as pd
import pytest
from pandas.testing import assert_frame_equal
from pandas.testing import assert_frame_equal, assert_series_equal

from mbs_results.staging.data_cleaning import (
clean_and_merge,
correct_values,
create_imputation_class,
enforce_datatypes,
is_census,
run_live_or_frozen,
)

Expand Down Expand Up @@ -137,18 +137,29 @@ def test_run_live_or_frozen_exception(filepath):
run_live_or_frozen(df, "target", "error", "love")


def test_correct_values(filepath):
def test_is_census(filepath):

df = pd.read_csv(filepath / "test_correct_values.csv")
df = pd.read_csv(filepath / "test_is_cencus.csv")

df_in = df[["band_no", "value_1", "value_2", "value_3"]]

expected_output = df[
["band_no", "expected_value_1", "expected_value_2", "expected_value_3"]
extra_cal_groups = [
5043,
5113,
5123,
5203,
5233,
5403,
5643,
5763,
5783,
5903,
6073,
]

expected_output.columns = df_in.columns
input_series = df["calibration_group"]
expected_output = df["is_census"]

actual_output = correct_values(df_in, ["value_1", "value_2"], "band_no", [4, 5], 1)
# By default takes name of input series
actual_output = is_census(input_series, extra_cal_groups)
actual_output.name = "is_census"

assert_frame_equal(actual_output, expected_output)
assert_series_equal(actual_output, expected_output)
Loading