diff --git a/mbs_results/apply_imputation_link.py b/mbs_results/apply_imputation_link.py index a1c18f8b..4e499fc6 100755 --- a/mbs_results/apply_imputation_link.py +++ b/mbs_results/apply_imputation_link.py @@ -58,7 +58,7 @@ def create_and_merge_imputation_values( imputation_config = { "c": { "intermediate_column": "constructed", - "marker": "C", + "marker": "c", # doesn't actually apply a fill so can be forward or back "fill_column": auxiliary, "fill_method": "ffill", @@ -66,14 +66,14 @@ def create_and_merge_imputation_values( }, "fir": { "intermediate_column": "fir", - "marker": "FIR", + "marker": "fir", "fill_column": target, "fill_method": "ffill", "link_column": cumulative_forward_link, }, "bir": { "intermediate_column": "bir", - "marker": "BIR", + "marker": "bir", "fill_column": target, "fill_method": "bfill", "link_column": cumulative_backward_link, @@ -83,9 +83,8 @@ def create_and_merge_imputation_values( # sampled. This is fine for automatic imputation, but should be careful # if manual construction imputation is done "intermediate_column": "fic", - "marker": "FIC", - # this has to have the same name as the intermediate column for constructed - "fill_column": "constructed", + "marker": "fic", + "fill_column": "imputed_value", "fill_method": "ffill", "link_column": cumulative_forward_link, }, diff --git a/mbs_results/cumulative_imputation_links.py b/mbs_results/cumulative_imputation_links.py index 1180e925..450b6201 100755 --- a/mbs_results/cumulative_imputation_links.py +++ b/mbs_results/cumulative_imputation_links.py @@ -47,14 +47,20 @@ def get_cumulative_links( dataframe.sort_values([strata, reference, period], inplace=True) dataframe["missing_value"] = np.where(dataframe[target].isnull(), True, False) + # TODO: These conditions are similar to the ones at flags, consider a function for this + marker_diff_con = ( + dataframe["imputation_marker"] + .ne(dataframe["imputation_marker"].shift().bfill()) + .astype(int) + != 0 + ) + + strat_diff_con = 
dataframe[strata].diff(time_difference) != 0 + + reference_diff_con = dataframe[reference].diff(time_difference) != 0 + dataframe["imputation_group"] = ( - ( - (dataframe["missing_value"].diff(time_difference) != 0) - | (dataframe[strata].diff(time_difference) != 0) - | (dataframe[reference].diff(time_difference) != 0) - ) - .astype("int") - .cumsum() + (marker_diff_con | strat_diff_con | reference_diff_con).astype("int").cumsum() ) if forward_or_backward == "f": @@ -72,4 +78,4 @@ def get_cumulative_links( dataframe["cumulative_" + imputation_link], ) - return dataframe[["imputation_group", "cumulative_" + imputation_link]] + return dataframe diff --git a/tests/cumulative_links.csv b/tests/cumulative_links.csv index bef347a5..30c455e0 100755 --- a/tests/cumulative_links.csv +++ b/tests/cumulative_links.csv @@ -1,7 +1,7 @@ -strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link -100,100000,200,202402,1,2,1,, -100,100000,,202403,2,0.6,2,2,0.6 -100,100000,,202404,3,1,2,6,1 -200,100001,,202402,1,4,3,1,2 -200,100001,,202403,3,0.5,3,3,0.5 -200,100001,300,202404,0.5,1,4,, +strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_marker,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link +100,100000,200,202402,1,2,r,1,, +100,100000,,202403,2,0.6,fir,2,2,0.6 +100,100000,,202404,3,1,fir,2,6,1 +200,100001,,202402,1,4,bir,3,1,2 +200,100001,,202403,3,0.5,bir,3,3,0.5 +200,100001,300,202404,0.5,1,r,4,, diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv index 91ec36ec..56ac7c04 100755 --- a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv +++ b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv @@ -1,10 +1,10 @@ 
imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value -100,100000,200,202402,1,2,,,,,R, -100,100000,,202403,2,0.6,,,2,0.6,FIR,400 -100,100000,,202404,3,1,,,6,1,FIR,1200 -200,100001,,202402,1,4,,,1,2,BIR,600 -200,100001,,202403,3,0.5,,,3,0.5,BIR,150 -200,100001,300,202404,0.5,1,,,,,R, -300,100002,,202402,1,4,1000,0.1,,2,C,100 -300,100002,,202403,3,0.5,,,3,0.5,FIC,300 -300,100002,,202404,0.5,1,,,1.5,,FIC,150 +100,100000,200,202402,1,2,,,,,r, +100,100000,,202403,2,0.6,,,2,0.6,fir,400 +100,100000,,202404,3,1,,,6,1,fir,1200 +200,100001,,202402,1,4,,,1,2,bir,600 +200,100001,,202403,3,0.5,,,3,0.5,bir,150 +200,100001,300,202404,0.5,1,,,,,r, +300,100002,,202402,1,4,1000,0.1,,2,c,100 +300,100002,,202403,3,0.5,,,3,0.5,fic,300 +300,100002,,202404,0.5,1,,,1.5,,fic,150 diff --git a/tests/test_cumulative_imputation_links.py b/tests/test_cumulative_imputation_links.py index 43589f42..4cc785f6 100755 --- a/tests/test_cumulative_imputation_links.py +++ b/tests/test_cumulative_imputation_links.py @@ -12,18 +12,19 @@ def cumulative_links_test_data(): return load_and_format(Path("tests") / "cumulative_links.csv") -class TestComulativeLinks: +class TestCumulativeLinks: def test_get_cumulative_links_forward(self, cumulative_links_test_data): input_data = cumulative_links_test_data.drop( - columns=["cumulative_forward_imputation_link", "imputation_group"] - ) - - expected_output = cumulative_links_test_data[ - [ - "imputation_group", + columns=[ + "cumulative_backward_imputation_link", "cumulative_forward_imputation_link", + "imputation_group", ] - ] + ) + + expected_output = cumulative_links_test_data.drop( + columns=["imputation_group", "cumulative_backward_imputation_link"] + ) actual_output = get_cumulative_links( input_data, @@ -36,19 +37,24 @@ def test_get_cumulative_links_forward(self, cumulative_links_test_data): 1, ) + 
actual_output = actual_output.drop( + columns=["imputation_group", "missing_value"] + ) + assert_frame_equal(actual_output, expected_output) def test_get_cumulative_links_backward(self, cumulative_links_test_data): input_data = cumulative_links_test_data.drop( - columns=["cumulative_backward_imputation_link", "imputation_group"] - ) - - expected_output = cumulative_links_test_data[ - [ - "imputation_group", + columns=[ "cumulative_backward_imputation_link", + "cumulative_forward_imputation_link", + "imputation_group", ] - ] + ) + + expected_output = cumulative_links_test_data.drop( + columns=["imputation_group", "cumulative_forward_imputation_link"] + ) actual_output = get_cumulative_links( input_data, @@ -61,4 +67,8 @@ def test_get_cumulative_links_backward(self, cumulative_links_test_data): 1, ) + actual_output = actual_output.drop( + columns=["imputation_group", "missing_value"] + ) + assert_frame_equal(actual_output, expected_output)