Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

331 apply imputation link to target #19

Merged
merged 5 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions src/apply_imputation_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
def create_and_merge_imputation_values(
    df: "pandas.DataFrame",
    imputation_class: str,
    reference: str,
    period: str,
    marker: str,
    combined_imputation: str,
    target: str,
    cumulative_forward_link: str,
    cumulative_backward_link: str,
    auxiliary: str,
    construction_link: str,
    imputation_types: tuple = ("c", "fir", "bir", "fic"),
) -> "pandas.DataFrame":
    """
    Create each requested imputation type and merge it into a single column.

    For every entry in ``imputation_types`` an intermediate column of imputed
    values is created with ``create_impute`` and then merged into
    ``combined_imputation`` by ``merge_imputation_type`` on the rows whose
    ``marker`` value matches that type. The intermediate columns are dropped
    before the result is returned.

    The data is sorted by ``imputation_class``, ``reference`` and ``period``
    into a new dataframe before any column is added, so the dataframe passed
    in by the caller is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_class : str
        column name for the variable that defines the imputation class
    reference : str
        column name for the reference
    period : str
        column name for the period
    marker : str
        column name containing a marker to indicate the type of imputation
        required
    combined_imputation : str
        column name for the combined imputation types according to the
        imputation marker
    target : str
        column name for the target variable for imputation
    cumulative_forward_link : str
        column name for the cumulative forward imputation link
    cumulative_backward_link : str
        column name for the cumulative backward imputation link
    auxiliary : str
        column name for auxiliary variable
    construction_link : str
        column name for construction link
    imputation_types : tuple
        types of imputation to run and add to the combined_imputation column.
        If 'fic' is selected, 'c' must also be selected and precede 'fic'.
        For 'fic' to produce the correct result, the C marker must be in the
        first period for a given reference.

    Returns
    -------
    pandas.DataFrame
        sorted dataframe with imputation values defined by the imputation
        marker

    Raises
    ------
    KeyError
        If ``imputation_types`` contains an unknown type, or if 'fic' is
        requested without a preceding 'c' and ``df`` has no 'constructed'
        column to fill from.

    Notes
    -----
    Columns named 'constructed', 'fir', 'bir' and 'fic' are used as scratch
    columns for the corresponding imputation types and are dropped from the
    returned dataframe.

    A reviewer suggested renaming this function to ``add_imputed_values``;
    the current name is kept so that existing callers keep working.
    """

    # constructed has to come first to use the result for forward impute from
    # constructed
    imputation_config = {
        "c": {
            "intermediate_column": "constructed",
            "marker": "C",
            # doesn't actually apply a fill so can be forward or back
            "fill_column": auxiliary,
            "fill_method": "ffill",
            "link_column": construction_link,
        },
        "fir": {
            "intermediate_column": "fir",
            "marker": "FIR",
            "fill_column": target,
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
        "bir": {
            "intermediate_column": "bir",
            "marker": "BIR",
            "fill_column": target,
            "fill_method": "bfill",
            "link_column": cumulative_backward_link,
        },
        "fic": {
            # FIC only works if the C is in the first period of the business
            # being sampled. This is fine for automatic imputation, but should
            # be careful if manual construction imputation is done
            "intermediate_column": "fic",
            "marker": "FIC",
            # this has to have the same name as the intermediate column for
            # constructed
            "fill_column": "constructed",
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
    }

    # Materialise once so a one-shot iterable survives validation + the loop.
    imputation_types = tuple(imputation_types)

    # Fail before touching any data, with a message that names the problem.
    unknown_types = [
        imp_type
        for imp_type in imputation_types
        if imp_type not in imputation_config
    ]
    if unknown_types:
        raise KeyError(
            f"Unknown imputation type(s) {unknown_types}; "
            f"expected a subset of {list(imputation_config)}"
        )

    if "fic" in imputation_types:
        constructed_column = imputation_config["c"]["intermediate_column"]
        c_runs_first = "c" in imputation_types[: imputation_types.index("fic")]
        if not c_runs_first and constructed_column not in df.columns:
            raise KeyError(
                "'fic' fills from the 'constructed' column, so 'c' must be "
                "selected and precede 'fic' in imputation_types"
            )

    # Non-inplace sort returns a new frame: the scratch columns and the
    # combined column added below never leak onto the caller's dataframe.
    df = df.sort_values([imputation_class, reference, period])

    intermediate_columns = []

    for imp_type in imputation_types:
        imputation_spec = imputation_config[imp_type]
        df = create_impute(df, [imputation_class, reference], imputation_spec)
        df = merge_imputation_type(
            df, imputation_spec, marker, combined_imputation
        )
        intermediate_columns.append(imputation_spec["intermediate_column"])

    return df.drop(columns=intermediate_columns)


def create_impute(
    df: "pandas.DataFrame", group, imputation_spec: dict
) -> "pandas.DataFrame":
    """
    Add a new column to a dataframe of imputed values using ratio imputation.

    Within each ``group`` the fill column is forward or backward filled and
    the filled value is multiplied by the link column. The result is written
    to ``df`` in place under the intermediate column name, and ``df`` is also
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
    group : str or list
        variables that define the imputation class
    imputation_spec : dict
        dictionary defining the details of the imputation type. The keys read
        here are "intermediate_column" (name of the column to create),
        "fill_column" (column to fill), "fill_method" ("ffill" or "bfill";
        the aliases "pad" and "backfill" are also accepted) and
        "link_column" (column to multiply the filled values by).

    Returns
    -------
    pandas.DataFrame
        dataframe with an added imputation column defined by the
        imputation_spec

    Raises
    ------
    ValueError
        If "fill_method" is not a recognised forward/backward fill method.

    Notes
    -----
    A reviewer suggested renaming this function to ``create_imputation``;
    the current name is kept so that existing callers keep working.
    """
    column_name = imputation_spec["intermediate_column"]
    fill_column = imputation_spec["fill_column"]
    fill_method = imputation_spec["fill_method"]
    link_column = imputation_spec["link_column"]

    grouped_values = df.groupby(group)[fill_column]

    # Call ffill()/bfill() directly: groupby ``fillna(method=...)`` is
    # deprecated in recent pandas releases.
    if fill_method in ("ffill", "pad"):
        filled_values = grouped_values.ffill()
    elif fill_method in ("bfill", "backfill"):
        filled_values = grouped_values.bfill()
    else:
        raise ValueError(
            "Invalid fill method. Expecting 'ffill' or 'bfill'. "
            f"Got {fill_method!r}"
        )

    df[column_name] = filled_values * df[link_column]
    return df


def merge_imputation_type(
    df: "pandas.DataFrame",
    imputation_spec: dict,
    marker: str,
    combined_imputation: str,
) -> "pandas.DataFrame":
    """
    Use an existing column of imputed values and an imputation marker to
    merge values into a single column.

    Rows whose ``marker`` value equals the marker in ``imputation_spec`` get
    their ``combined_imputation`` value overwritten with the value from the
    spec's intermediate column; all other rows are left as they are. The
    ``combined_imputation`` column is created (filled with NaN) if it does
    not exist yet. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_spec : dict
        dictionary defining the details of the imputation type. The keys read
        here are "marker" (marker value selecting the rows) and
        "intermediate_column" (column holding the imputed values).
    marker : str
        column name containing a marker to indicate the type of imputation
        required
    combined_imputation : str
        column name for the combined imputation types according to the
        imputation marker

    Returns
    -------
    pandas.DataFrame
        dataframe with combined_imputation
    """

    imputation_marker = imputation_spec["marker"]
    imputation_column = imputation_spec["intermediate_column"]

    matching_rows = (df[marker] == imputation_marker).to_numpy()

    if combined_imputation not in df.columns:
        df[combined_imputation] = float("nan")

    if matching_rows.any():
        # Copy by position rather than by index label so the result does not
        # depend on label alignment (e.g. when the index has duplicates).
        df.loc[matching_rows, combined_imputation] = df.loc[
            matching_rows, imputation_column
        ].to_numpy()
    return df
10 changes: 10 additions & 0 deletions tests/apply_imputation_link.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link
100,100000,200,202402,1,2,1,,,R,,,
100,100000,,202403,2,0.6,2,2,0.6,FIR,400,,
100,100000,,202404,3,1,2,6,1,FIR,1200,,
200,100001,,202402,1,4,3,1,2,BIR,600,,
200,100001,,202403,3,0.5,3,3,0.5,BIR,150,,
200,100001,300,202404,0.5,1,4,,,R,,,
300,100002,,202402,1,4,5,1,2,C,600,40,0.1
300,100002,,202403,3,0.5,5,3,0.5,FIC,150,,
300,100002,,202404,0.5,1,5,2,,FIC,,,
4 changes: 4 additions & 0 deletions tests/data/apply_imputation_link/BIR.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value
200,100001,,202402,4,2,BIR,600
200,100001,,202403,0.5,0.5,BIR,150
200,100001,300,202404,1,,R,
4 changes: 4 additions & 0 deletions tests/data/apply_imputation_link/C_FIC.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value
300,100002,,202402,1,,0.1,1000,C,100
300,100002,,202403,3,3,,,FIC,300
300,100002,,202404,0.5,1.5,,,FIC,150
4 changes: 4 additions & 0 deletions tests/data/apply_imputation_link/FIR.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value
100,100000,200,202402,1,,R,
100,100000,,202403,2,2,FIR,400
100,100000,,202404,3,6,FIR,1200
10 changes: 10 additions & 0 deletions tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value
100,100000,200,202402,1,2,,,,,R,
100,100000,,202403,2,0.6,,,2,0.6,FIR,400
100,100000,,202404,3,1,,,6,1,FIR,1200
200,100001,,202402,1,4,,,1,2,BIR,600
200,100001,,202403,3,0.5,,,3,0.5,BIR,150
200,100001,300,202404,0.5,1,,,,,R,
300,100002,,202402,1,4,1000,0.1,,2,C,100
300,100002,,202403,3,0.5,,,3,0.5,FIC,300
300,100002,,202404,0.5,1,,,1.5,,FIC,150
37 changes: 37 additions & 0 deletions tests/test_apply_imputation_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pathlib import Path

import pytest
from helper_functions import load_and_format
from pandas.testing import assert_frame_equal

from src.apply_imputation_link import create_and_merge_imputation_values


@pytest.fixture(scope="class")
def fir_bir_c_fic_test_data():
    """Load the FIR/BIR/C/FIC fixture CSV via ``load_and_format``."""
    csv_path = Path("tests", "data", "apply_imputation_link", "FIR_BIR_C_FIC.csv")
    return load_and_format(csv_path)


class TestApplyImputationLink:
    """Tests for ``create_and_merge_imputation_values``."""

    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
        """All four imputation types together reproduce ``imputed_value``."""
        expected_output = fir_bir_c_fic_test_data

        # The fixture holds the expected result; strip the answer column to
        # build the input.
        input_data = expected_output.drop(columns=["imputed_value"])

        actual_output = create_and_merge_imputation_values(
            df=input_data,
            imputation_class="imputation_class",
            reference="reference",
            period="period",
            marker="imputation_marker",
            combined_imputation="imputed_value",
            target="target",
            cumulative_forward_link="cumulative_forward_link",
            cumulative_backward_link="cumulative_backward_link",
            auxiliary="auxiliary_variable",
            construction_link="construction_link",
            imputation_types=("c", "fir", "bir", "fic"),
        )

        assert_frame_equal(actual_output, expected_output)
Loading