-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
331 apply imputation link to target #19
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
import pandas as pd


def create_and_merge_imputation_values(
    df: pd.DataFrame,
    imputation_class: str,
    reference: str,
    period: str,
    marker: str,
    combined_imputation: str,
    target: str,
    cumulative_forward_link: str,
    cumulative_backward_link: str,
    auxiliary: str,
    construction_link: str,
    imputation_types: tuple = ("c", "fir", "bir", "fic"),
) -> pd.DataFrame:
    """
    Loop through different imputation types and merge the results according
    to an imputation marker column.

    For each requested imputation type an intermediate column is created with
    ``create_impute`` and then copied into ``combined_imputation`` for the rows
    whose ``marker`` matches that type (``merge_imputation_type``). The
    intermediate columns are dropped before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        input data; it is not modified, a sorted copy is worked on instead
    imputation_class : str
        column name for the variable that defines the imputation class
    reference : str
        column name for the reference
    period : str
        column name for the period
    marker : str
        column name containing a marker to indicate the type of imputation required
    combined_imputation : str
        column name for the combined imputation types according to the imputation marker
    target : str
        column name for the target variable for imputation
    cumulative_forward_link : str
        column name for the cumulative forward imputation link
    cumulative_backward_link : str
        column name for the cumulative backward imputation link
    auxiliary : str
        column name for auxiliary variable
    construction_link : str
        column name for construction link
    imputation_types : tuple
        types of imputation to run and add to combined_imputation column stored in a
        tuple. If 'fic' is selected 'c' must also be selected and precede 'fic'.
        For 'fic' to produce the correct result, the C marker must be in the first
        period for a given reference.

    Returns
    -------
    pandas.DataFrame
        dataframe with imputation values defined by the imputation marker,
        sorted by imputation_class, reference and period

    Raises
    ------
    KeyError
        if an entry of ``imputation_types`` is not one of
        'c', 'fir', 'bir' or 'fic'
    """

    # constructed has to come first so that forward impute from constructed
    # ("fic") can use its result
    imputation_config = {
        "c": {
            "intermediate_column": "constructed",
            "marker": "C",
            # doesn't actually apply a fill so can be forward or back
            "fill_column": auxiliary,
            "fill_method": "ffill",
            "link_column": construction_link,
        },
        "fir": {
            "intermediate_column": "fir",
            "marker": "FIR",
            "fill_column": target,
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
        "bir": {
            "intermediate_column": "bir",
            "marker": "BIR",
            "fill_column": target,
            "fill_method": "bfill",
            "link_column": cumulative_backward_link,
        },
        "fic": {
            # FIC only works if the C is in the first period of the business being
            # sampled. This is fine for automatic imputation, but should be careful
            # if manual construction imputation is done
            "intermediate_column": "fic",
            "marker": "FIC",
            # this has to have the same name as the intermediate column for constructed
            "fill_column": "constructed",
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
    }

    # Sort into a new frame instead of in place: the caller's dataframe keeps
    # its row order and never receives the intermediate columns added below.
    df = df.sort_values([imputation_class, reference, period])

    intermediate_columns = []

    for imp_type in imputation_types:
        imputation_spec = imputation_config[imp_type]
        df = create_impute(df, [imputation_class, reference], imputation_spec)
        df = merge_imputation_type(
            df, imputation_spec, marker, combined_imputation
        )
        intermediate_columns.append(imputation_spec["intermediate_column"])

    return df.drop(columns=intermediate_columns)
|
||
|
||
def create_impute(df, group, imputation_spec: dict):
    """
    Add a new column to a dataframe of imputed values using ratio imputation.

    The fill column is forward or backward filled within each group and the
    result is multiplied by the link column.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe to add the column to; it is modified in place
    group : str or list
        variables that define the imputation class
    imputation_spec : dict
        dictionary defining the details of the imputation type. The keys
        "intermediate_column", "fill_column", "fill_method" and
        "link_column" are read.

    Returns
    -------
    pandas.DataFrame
        the same dataframe with an added imputation column defined by the
        imputation_spec

    Raises
    ------
    ValueError
        if "fill_method" is not a forward ("ffill"/"pad") or backward
        ("bfill"/"backfill") fill
    """
    # TODO(review): a reviewer suggested renaming this to ``create_imputation``
    # (two verbs in the name imply two responsibilities); the name is kept
    # here so existing callers are unaffected.
    column_name = imputation_spec["intermediate_column"]
    fill_column = imputation_spec["fill_column"]
    fill_method = imputation_spec["fill_method"]
    link_column = imputation_spec["link_column"]

    grouped_fill = df.groupby(group)[fill_column]

    # GroupBy.fillna(method=...) is deprecated in pandas 2.x, so dispatch to
    # the dedicated ffill/bfill methods instead.
    if fill_method in ("ffill", "pad"):
        filled = grouped_fill.ffill()
    elif fill_method in ("bfill", "backfill"):
        filled = grouped_fill.bfill()
    else:
        raise ValueError(
            "Invalid fill method. Expecting 'ffill' or 'bfill'. "
            f"Got {fill_method!r}"
        )

    df[column_name] = filled * df[link_column]
    return df
|
||
|
||
def merge_imputation_type(
    df, imputation_spec: dict, marker: str, combined_imputation: str
):
    """
    Uses an existing column of imputed values and an imputation marker to merge
    values into a single column.

    Rows whose ``marker`` value equals the spec's "marker" receive the value
    from the spec's "intermediate_column"; all other rows are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe holding the marker and intermediate imputation columns; it
        is modified in place
    imputation_spec : dict
        dictionary defining the details of the imputation type. The keys
        "marker" and "intermediate_column" are read.
    marker : str
        column name containing a marker to indicate the type of imputation required
    combined_imputation : str
        column name for the combined imputation types according to the imputation marker

    Returns
    -------
    pandas.DataFrame
        the same dataframe with combined_imputation added or updated
    """

    imputation_marker = imputation_spec["marker"]
    imputation_column = imputation_spec["intermediate_column"]

    # Create the output column explicitly rather than relying on .loc
    # setting-with-enlargement, so it always exists even when no row matches.
    if combined_imputation not in df.columns:
        df[combined_imputation] = float("nan")

    rows_to_fill = df[marker] == imputation_marker
    # The right-hand side is aligned on the index, so only the selected rows
    # receive their own imputed value.
    df.loc[rows_to_fill, combined_imputation] = df[imputation_column]
    return df
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link | ||
100,100000,200,202402,1,2,1,,,R,,, | ||
100,100000,,202403,2,0.6,2,2,0.6,FIR,400,, | ||
100,100000,,202404,3,1,2,6,1,FIR,1200,, | ||
200,100001,,202402,1,4,3,1,2,BIR,600,, | ||
200,100001,,202403,3,0.5,3,3,0.5,BIR,150,, | ||
200,100001,300,202404,0.5,1,4,,,R,,, | ||
300,100002,,202402,1,4,5,1,2,C,600,40,0.1 | ||
300,100002,,202403,3,0.5,5,3,0.5,FIC,150,, | ||
300,100002,,202404,0.5,1,5,2,,FIC,,, |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value | ||
200,100001,,202402,4,2,BIR,600 | ||
200,100001,,202403,0.5,0.5,BIR,150 | ||
200,100001,300,202404,1,,R, |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value | ||
300,100002,,202402,1,,0.1,1000,C,100 | ||
300,100002,,202403,3,3,,,FIC,300 | ||
300,100002,,202404,0.5,1.5,,,FIC,150 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value | ||
100,100000,200,202402,1,,R, | ||
100,100000,,202403,2,2,FIR,400 | ||
100,100000,,202404,3,6,FIR,1200 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value | ||
100,100000,200,202402,1,2,,,,,R, | ||
100,100000,,202403,2,0.6,,,2,0.6,FIR,400 | ||
100,100000,,202404,3,1,,,6,1,FIR,1200 | ||
200,100001,,202402,1,4,,,1,2,BIR,600 | ||
200,100001,,202403,3,0.5,,,3,0.5,BIR,150 | ||
200,100001,300,202404,0.5,1,,,,,R, | ||
300,100002,,202402,1,4,1000,0.1,,2,C,100 | ||
300,100002,,202403,3,0.5,,,3,0.5,FIC,300 | ||
300,100002,,202404,0.5,1,,,1.5,,FIC,150 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from pathlib import Path | ||
|
||
import pytest | ||
from helper_functions import load_and_format | ||
from pandas.testing import assert_frame_equal | ||
|
||
from src.apply_imputation_link import create_and_merge_imputation_values | ||
|
||
|
||
@pytest.fixture(scope="class")
def fir_bir_c_fic_test_data():
    """Load the FIR/BIR/C/FIC test CSV, shared by every test in a class."""
    csv_path = Path("tests", "data", "apply_imputation_link", "FIR_BIR_C_FIC.csv")
    return load_and_format(csv_path)
|
||
|
||
class TestApplyImputationLink:
    """Tests for ``create_and_merge_imputation_values``."""

    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
        """All four imputation types together reproduce ``imputed_value``."""
        expected_output = fir_bir_c_fic_test_data

        # The fixture already holds the expected answer; strip it to build
        # the input frame.
        input_data = expected_output.drop(columns=["imputed_value"])

        actual_output = create_and_merge_imputation_values(
            df=input_data,
            imputation_class="imputation_class",
            reference="reference",
            period="period",
            marker="imputation_marker",
            combined_imputation="imputed_value",
            target="target",
            cumulative_forward_link="cumulative_forward_link",
            cumulative_backward_link="cumulative_backward_link",
            auxiliary="auxiliary_variable",
            construction_link="construction_link",
            imputation_types=("c", "fir", "bir", "fic"),
        )

        assert_frame_equal(actual_output, expected_output)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe this function imputes the values by creating new columns and merging them, so "create and merge" should go in the docstring.
I understand this might cross over with the name of a higher-level function; perhaps rename it to add_imputed_values?