From 6a1938bbaea92aac0a5ee482645cdf0ef8b82fbe Mon Sep 17 00:00:00 2001 From: lhubbardONS Date: Mon, 18 Nov 2024 09:38:20 +0000 Subject: [PATCH 1/3] Adding additional outputs wrapper and making edits to additional outputs scripts to work correctly with wrapper. --- mbs_results/main.py | 4 +- mbs_results/outputs/get_additional_outputs.py | 38 +++++++++----- .../outputs/produce_additional_outputs.py | 50 +++++++++++++++++++ mbs_results/outputs/produce_outputs.py | 7 --- .../selective_editing_contributer_output.py | 7 ++- .../selective_editing_question_output.py | 9 ++-- mbs_results/outputs/turnover_analysis.py | 3 ++ .../outputs/weighted_adj_val_time_series.py | 4 +- 8 files changed, 96 insertions(+), 26 deletions(-) create mode 100644 mbs_results/outputs/produce_additional_outputs.py delete mode 100644 mbs_results/outputs/produce_outputs.py diff --git a/mbs_results/main.py b/mbs_results/main.py index 69e114aa..8013fbb2 100755 --- a/mbs_results/main.py +++ b/mbs_results/main.py @@ -1,7 +1,7 @@ from mbs_results.estimation.estimate import estimate from mbs_results.imputation.impute import impute from mbs_results.outlier_detection.detect_outlier import detect_outlier -from mbs_results.outputs.produce_outputs import produce_outputs +from mbs_results.outputs.produce_additional_outputs import produce_additional_outputs from mbs_results.staging.stage_dataframe import stage_dataframe from mbs_results.utilities.inputs import load_config from mbs_results.utilities.validation_checks import ( @@ -32,7 +32,7 @@ def run_mbs_main(): outlier_output = detect_outlier(estimation_output, config) validate_outlier_detection(outlier_output, config) - produce_outputs(outlier_output, "output_path/") + produce_additional_outputs(config) if __name__ == "__main__": diff --git a/mbs_results/outputs/get_additional_outputs.py b/mbs_results/outputs/get_additional_outputs.py index 35c7056d..4cf2ff64 100644 --- a/mbs_results/outputs/get_additional_outputs.py +++ b/mbs_results/outputs/get_additional_outputs.py @@ -27,25 +27,37 @@ def get_additional_outputs(config: dict, function_mapper: dict) -> None: Returns ------- - None + dict + Dictionary of additional outputs, with the keys being the names + of the outputs and the values being the outputs to be exported. Examples -------- - >> example_function = print("Hello world) - >> config = {additional_outputs:["output_name"]} - >> function_mapper = {output_name : example_function} - >> get_additional_outputs(config,function_mapper) + >> example_function = print("Hello world") + >> config = {"additional_outputs" : ["output_name"]} + >> function_mapper = {"output_name" : example_function} + >> get_additional_outputs(config, function_mapper) + >> + >> + >> example_function = function(argA, argB) + >> config = {"additional_outputs" : ["example_output"], + >> "argA": "valueA", + >> "argB": "valueB"} + >> function_mapper = {"example_output" : example_function} + >> get_additional_outputs(config, function_mapper) """ + additional_outputs = dict() + if not isinstance(config["additional_outputs"], list): raise ValueError( """ -In config file additional_outputs must be a list, please use:\n -["all"] to get all outputs\n -[] to get no outputs\n -or a list with the outputs, e.g. ["output_1","output_2"] + In config file additional_outputs must be a list, please use:\n + ["all"] to get all outputs\n + [] to get no outputs\n + or a list with the outputs, e.g. ["output_1","output_2"] """ ) @@ -64,12 +76,14 @@ def get_additional_outputs(config: dict, function_mapper: dict) -> None: if function in function_mapper: - function_mapper[function](**config) + additional_outputs[function] = function_mapper[function](**config) else: raise ValueError( f""" - The function {function} is not registerd, check spelling.\n - Currently the registered functions are:\n {function_mapper} + The function {function} is not registered, check spelling.\n + Currently the registered functions are:\n {function_mapper} """ ) + + return additional_outputs diff --git a/mbs_results/outputs/produce_additional_outputs.py b/mbs_results/outputs/produce_additional_outputs.py new file mode 100644 index 00000000..64ac367a --- /dev/null +++ b/mbs_results/outputs/produce_additional_outputs.py @@ -0,0 +1,50 @@ +from importlib import metadata + +from mbs_results.outputs.get_additional_outputs import get_additional_outputs +from mbs_results.outputs.selective_editing_contributer_output import ( + get_selective_editing_contributer_output, +) +from mbs_results.outputs.selective_editing_question_output import ( + create_selective_editing_question_output, +) +from mbs_results.outputs.turnover_analysis import create_turnover_output +from mbs_results.outputs.weighted_adj_val_time_series import ( + get_weighted_adj_val_time_series, +) + + +def produce_additional_outputs(config: dict): + """ + Function to write additional outputs + + Parameters + ---------- + config : Dict + main pipeline configuration + + Returns + ------- + None. + Outputs are written to output path defined in config + + """ + + additional_outputs = get_additional_outputs( + config, + { + "selective_editing_contributor": get_selective_editing_contributer_output, + "selective_editing_question": create_selective_editing_question_output, + "turnover_output": create_turnover_output, + "weighted_adj_val_time_series": get_weighted_adj_val_time_series, + }, + ) + + # Stop function if no additional_outputs are listed in config. + if additional_outputs is None: + return + + file_version_mbs = metadata.metadata("monthly-business-survey-results")["version"] + snapshot_name = config["mbs_file_name"].split(".")[0] + for output in additional_outputs: + filename = f"{output}_v{file_version_mbs}_{snapshot_name}.csv" + additional_outputs[output].to_csv(config["output_path"] + filename) diff --git a/mbs_results/outputs/produce_outputs.py b/mbs_results/outputs/produce_outputs.py deleted file mode 100644 index caad9cfd..00000000 --- a/mbs_results/outputs/produce_outputs.py +++ /dev/null @@ -1,7 +0,0 @@ -import warnings - -import pandas as pd - - -def produce_outputs(df: pd.DataFrame, output_path: str): - warnings.warn("Temporary function to produce optional outputs") diff --git a/mbs_results/outputs/selective_editing_contributer_output.py b/mbs_results/outputs/selective_editing_contributer_output.py index 8a4c85d3..49ab3032 100644 --- a/mbs_results/outputs/selective_editing_contributer_output.py +++ b/mbs_results/outputs/selective_editing_contributer_output.py @@ -1,6 +1,6 @@ import pandas as pd -from mbs_results.merge_domain import merge_domain +from mbs_results.staging.merge_domain import merge_domain def get_selective_editing_contributer_output( @@ -10,6 +10,7 @@ def get_selective_editing_contributer_output( sic_input: str, sic_mapping: str, period_selected: int, + **config ) -> pd.DataFrame: """ Returns a dataframe containing period, reference, domain_group, and @@ -29,6 +30,10 @@ def get_selective_editing_contributer_output( Name of column in input_filepath csv file containing SIC variable. sic_mapping : str Name of column in domain_filepath csv file containing SIC variable. + period_selected : int + period to include in outputs + **config: Dict + main pipeline configuration. Can be used to input the entire config dictionary Returns ------- diff --git a/mbs_results/outputs/selective_editing_question_output.py b/mbs_results/outputs/selective_editing_question_output.py index 38b4a3b5..2a93e1a6 100644 --- a/mbs_results/outputs/selective_editing_question_output.py +++ b/mbs_results/outputs/selective_editing_question_output.py @@ -1,7 +1,7 @@ import pandas as pd -from mbs_results.merge_domain import merge_domain -from mbs_results.unsorted.selective_editing import create_standardising_factor +from mbs_results.outputs.selective_editing import create_standardising_factor +from mbs_results.staging.merge_domain import merge_domain def create_selective_editing_question_output( @@ -18,6 +18,7 @@ def create_selective_editing_question_output( adjusted_value: str, sic_domain_mapping_path: str, period_selected: int, + **config, ) -> pd.DataFrame: """ creates the selective editing question output. @@ -48,11 +49,13 @@ def create_selective_editing_question_output( adjusted_value : str name of column in dataframe containing adjusted_value variable combined with imputed_values as outputted from Ratio of Means script - sic_domain_mapping_path : str + sic_domain_mapping_path : str path to the sic domain mapping file period_selected : int previous period to take the weights for estimation of standardising factor in the format yyyymm + **config: Dict + main pipeline configuration. Can be used to input the entire config dictionary Returns ------- diff --git a/mbs_results/outputs/turnover_analysis.py b/mbs_results/outputs/turnover_analysis.py index 058c6238..1beaa373 100644 --- a/mbs_results/outputs/turnover_analysis.py +++ b/mbs_results/outputs/turnover_analysis.py @@ -9,6 +9,7 @@ def create_turnover_output( winsorisation_df: pd.DataFrame, winsorisation_period: str, selected_period: int, + **config ) -> pd.DataFrame: """ Creating output for turnover analysis tool. @@ -30,6 +31,8 @@ def create_turnover_output( Name of column displaying period in winsorisation selected_period : int Period to output results for in the format YYYYMM + **config: Dict + main pipeline configuration. Can be used to input the entire config dictionary Returns ------- diff --git a/mbs_results/outputs/weighted_adj_val_time_series.py b/mbs_results/outputs/weighted_adj_val_time_series.py index 927d723b..5a7402b8 100644 --- a/mbs_results/outputs/weighted_adj_val_time_series.py +++ b/mbs_results/outputs/weighted_adj_val_time_series.py @@ -3,7 +3,7 @@ from utilities.utils import convert_column_to_datetime -def get_weighted_adj_val_time_series(filepath: str) -> pd.DataFrame: +def get_weighted_adj_val_time_series(filepath: str, **config) -> pd.DataFrame: """ Time series of weighted adjusted values by classification, question number, and cell number. @@ -13,6 +13,8 @@ def get_weighted_adj_val_time_series(filepath: str) -> pd.DataFrame: filepath : str filepath to csv containing classification, question number, cell number, period, and weighted adjusted value. + **config: Dict + main pipeline configuration. Can be used to input the entire config dictionary Returns ------- From 0b00d569ae72b39d4f2e013215605b4f52dab354 Mon Sep 17 00:00:00 2001 From: lhubbardONS Date: Fri, 6 Dec 2024 11:46:00 +0000 Subject: [PATCH 2/3] Streamlining additional outputs process and updating methods to work correctly with pipeline --- mbs_results/main.py | 8 +- mbs_results/outputs/get_additional_outputs.py | 11 ++- .../outputs/produce_additional_outputs.py | 59 ++++++++++++- .../selective_editing_contributer_output.py | 36 +++----- .../selective_editing_question_output.py | 84 +++++-------------- mbs_results/outputs/turnover_analysis.py | 56 ++++--------- .../outputs/weighted_adj_val_time_series.py | 33 +++++--- .../additional_outputs_df_input.csv | 6 ++ .../outputs/turnover_analysis/cp_input.csv | 4 - .../turnover_analysis/finalsel_input.csv | 4 - .../outputs/turnover_analysis/qv_input.csv | 5 -- .../turnover_analysis_output.csv | 8 +- .../turnover_analysis/winsorisation_input.csv | 6 -- tests/outputs/test_get_additional_outputs.py | 7 +- tests/outputs/test_turnover_analysis.py | 30 +------ 15 files changed, 165 insertions(+), 192 deletions(-) create mode 100644 tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv delete mode 100755 tests/data/outputs/turnover_analysis/cp_input.csv delete mode 100755 tests/data/outputs/turnover_analysis/finalsel_input.csv delete mode 100755 tests/data/outputs/turnover_analysis/qv_input.csv delete mode 100755 tests/data/outputs/turnover_analysis/winsorisation_input.csv diff --git a/mbs_results/main.py b/mbs_results/main.py index 8013fbb2..bdf5daa4 100755 --- a/mbs_results/main.py +++ b/mbs_results/main.py @@ -1,7 +1,10 @@ from mbs_results.estimation.estimate import estimate from mbs_results.imputation.impute import impute from mbs_results.outlier_detection.detect_outlier import detect_outlier -from mbs_results.outputs.produce_additional_outputs import produce_additional_outputs +from mbs_results.outputs.produce_additional_outputs import ( + get_additional_outputs_df, + produce_additional_outputs, +) from mbs_results.staging.stage_dataframe import stage_dataframe from mbs_results.utilities.inputs import load_config from mbs_results.utilities.validation_checks import ( @@ -32,7 +35,8 @@ def run_mbs_main(): outlier_output = detect_outlier(estimation_output, config) validate_outlier_detection(outlier_output, config) - produce_additional_outputs(config) + additional_outputs_df = get_additional_outputs_df(estimation_output, outlier_output) + produce_additional_outputs(config, additional_outputs_df) if __name__ == "__main__": diff --git a/mbs_results/outputs/get_additional_outputs.py b/mbs_results/outputs/get_additional_outputs.py index 4cf2ff64..9155da8c 100644 --- a/mbs_results/outputs/get_additional_outputs.py +++ b/mbs_results/outputs/get_additional_outputs.py @@ -1,4 +1,9 @@ -def get_additional_outputs(config: dict, function_mapper: dict) -> None: +import pandas as pd + + +def get_additional_outputs( + config: dict, function_mapper: dict, additional_outputs_df: pd.DataFrame +) -> dict: """ Runs a set of functions as defined in additional_outputs from the config, the function names must exist in function_mapper which also has the relevant @@ -76,7 +81,9 @@ def get_additional_outputs(config: dict, function_mapper: dict) -> None: if function in function_mapper: - additional_outputs[function] = function_mapper[function](**config) + additional_outputs[function] = function_mapper[function]( + additional_outputs_df=additional_outputs_df, **config + ) else: raise ValueError( diff --git a/mbs_results/outputs/produce_additional_outputs.py b/mbs_results/outputs/produce_additional_outputs.py index 64ac367a..342e944d 100644 --- a/mbs_results/outputs/produce_additional_outputs.py +++ b/mbs_results/outputs/produce_additional_outputs.py @@ -1,5 +1,7 @@ from importlib import metadata +import pandas as pd + from mbs_results.outputs.get_additional_outputs import get_additional_outputs from mbs_results.outputs.selective_editing_contributer_output import ( get_selective_editing_contributer_output, @@ -13,7 +15,59 @@ ) -def produce_additional_outputs(config: dict): +def get_additional_outputs_df( + estimation_output: pd.DataFrame, outlier_output: pd.DataFrame +): + """ + Creating dataframe that contains all variables needed for producing additional + outputs. + + Parameters + ---------- + estimation_output : pd.DataFrame + Dataframe output from the estimation stage of the pipeline + outlier_output : pd.DataFrame + Dataframe output from the outliering stage of the pipeline + + Returns + ------- + pd.DataFrame + + """ + + additional_outputs_df = estimation_output[ + [ + "reference", + "period", + "design_weight", + "frosic2007", + "formtype", + "questioncode", + "frotover", + "calibration_factor", + "adjustedresponse", + "status", + "response", + "froempment", + "cell_no", + "referencename", + "imputation_flags_adjustedresponse", + "f_link_adjustedresponse", + "b_link_adjustedresponse", + "construction_link", + ] + ] + + additional_outputs_df = additional_outputs_df.merge( + outlier_output[["reference", "period", "questioncode", "outlier_weight"]], + how="left", + on=["reference", "period", "questioncode"], + ) + + return additional_outputs_df + + +def produce_additional_outputs(config: dict, additional_outputs_df: pd.DataFrame): """ Function to write additional outputs @@ -21,6 +75,8 @@ def produce_additional_outputs(config: dict): ---------- config : Dict main pipeline configuration + additional_outputs_df : pd.DataFrame + Dataframe to feed in as arguments for additional outputs Returns ------- @@ -37,6 +93,7 @@ def produce_additional_outputs(config: dict): "turnover_output": create_turnover_output, "weighted_adj_val_time_series": get_weighted_adj_val_time_series, }, + additional_outputs_df, ) # Stop function if no additional_outputs are listed in config. diff --git a/mbs_results/outputs/selective_editing_contributer_output.py b/mbs_results/outputs/selective_editing_contributer_output.py index 49ab3032..8f12afc6 100644 --- a/mbs_results/outputs/selective_editing_contributer_output.py +++ b/mbs_results/outputs/selective_editing_contributer_output.py @@ -4,11 +4,9 @@ def get_selective_editing_contributer_output( - input_filepath: str, - domain_filepath: str, + additional_outputs_df: pd.DataFrame, + sic_domain_mapping_path: str, threshold_filepath: str, - sic_input: str, - sic_mapping: str, period_selected: int, **config ) -> pd.DataFrame: @@ -19,17 +17,12 @@ def get_selective_editing_contributer_output( Parameters ---------- - input_filepath : str - Filepath to csv file containing reference, imp_class, period and - SIC columns. - domain_filepath : str + additional_outputs_df : pd.DataFrame + Dataframe containing reference, design_weight, formtype, period and SIC columns. + sic_domain_mapping_path : str Filepath to csv file containing SIC and domain columns. threshold_filepath : str Filepath to csv file containing form type, domain and threshold columns. - sic_input : str - Name of column in input_filepath csv file containing SIC variable. - sic_mapping : str - Name of column in domain_filepath csv file containing SIC variable. period_selected : int period to include in outputs **config: Dict @@ -46,32 +39,29 @@ def get_selective_editing_contributer_output( >> input_filepath=input_filepath, >> domain_filepath=domain_filepath, >> threshold_filepath=threshold_filepath, - >> sic_input="sic_5_digit", - >> sic_mapping="sic_5_digit", >> period_selected=202201 >> ) """ - input_data = pd.read_csv( - input_filepath, - usecols=["period", "reference", "design_weight", sic_input, "form_type"], - ) + input_data = additional_outputs_df[ + ["period", "reference", "design_weight", "frosic2007", "formtype"] + ] - domain_data = pd.read_csv(domain_filepath) + domain_data = pd.read_csv(sic_domain_mapping_path).astype(str) - threshold_mapping = pd.read_csv(threshold_filepath) + threshold_mapping = pd.read_csv(threshold_filepath).astype(str) selective_editing_contributer_output = merge_domain( - input_data, domain_data, sic_input, sic_mapping + input_data, domain_data, "frosic2007", "sic_5_digit" ) selective_editing_contributer_output = pd.merge( selective_editing_contributer_output, threshold_mapping, - left_on=["form_type", "domain"], + left_on=["formtype", "domain"], right_on=["form", "domain"], how="left", - ).drop(columns=["form", "form_type"]) + ).drop(columns=["form", "formtype"]) selective_editing_contributer_output = selective_editing_contributer_output.rename( columns={"reference": "ruref", "domain": "domain_group"} diff --git a/mbs_results/outputs/selective_editing_question_output.py b/mbs_results/outputs/selective_editing_question_output.py index 2a93e1a6..d551fd3b 100644 --- a/mbs_results/outputs/selective_editing_question_output.py +++ b/mbs_results/outputs/selective_editing_question_output.py @@ -5,17 +5,7 @@ def create_selective_editing_question_output( - df: pd.DataFrame, - reference: str, - period: str, - domain: str, - question_no: str, - sic: str, - aux: str, - a_weight: str, - o_weight: str, - g_weight: str, - adjusted_value: str, + additional_outputs_df: pd.DataFrame, sic_domain_mapping_path: str, period_selected: int, **config, @@ -26,29 +16,9 @@ def create_selective_editing_question_output( Parameters ---------- - df : pd.DataFrame - Reference dataframe with domain, a_weights, o_weights, and g_weights - reference : str - name of column in dataframe containing reference variable - period : str - name of column in dataframe containing period variable - domain : str - name of column name containing domain variable in sic_domain_mapping file. - question_no : str - name of column in dataframe containing question number variable - sic : str - name of column in dataframe containing sic variable - aux : str - name of column in dataframe containing auxiliary value variable - a_weight : str - Column name containing the design weight. - o_weight : str - column name containing the outlier weight. - g_weight : str - column name containing the g weight. - adjusted_value : str - name of column in dataframe containing adjusted_value variable combined - with imputed_values as outputted from Ratio of Means script + additional_outputs_df : pd.DataFrame + Reference dataframe with sic, a_weights, o_weights, g_weights, + adjustedresponse, imputation_flags and frotover sic_domain_mapping_path : str path to the sic domain mapping file period_selected : int @@ -65,58 +35,48 @@ def create_selective_editing_question_output( Examples -------- >> output = create_selective_editing_question_output( - >> df=wins_output, - >> reference="reference", - >> period="period", - >> domain="domain", - >> question_no="question_no", - >> sic="sic_5_digit", - >> aux="frotover", - >> a_weight="design_weight", - >> o_weight="outlier_weight", - >> g_weight="calibration_factor", - >> adjusted_value="adjusted_value", + >> additional_outputs_df=wins_output, >> sic_domain_mapping_path="mapping_files/sic_domain_mapping.csv", >> period_selected=202201, >> ) """ - sic_domain_mapping = pd.read_csv(sic_domain_mapping_path).astype(int) + sic_domain_mapping = pd.read_csv(sic_domain_mapping_path).astype(str) df_with_domain = merge_domain( - input_df=df, + input_df=additional_outputs_df, domain_mapping=sic_domain_mapping, - sic_input=sic, + sic_input="frosic2007", sic_mapping="sic_5_digit", ) standardising_factor = create_standardising_factor( dataframe=df_with_domain, - reference=reference, - period=period, - domain=domain, - question_no=question_no, - predicted_value=adjusted_value, - imputation_marker="imputation_flags_adjusted_value", - a_weight=a_weight, - o_weight=o_weight, - g_weight=g_weight, - auxiliary_value=aux, + reference="reference", + period="period", + domain="domain", + question_no="questioncode", + predicted_value="adjustedresponse", + imputation_marker="imputation_flags_adjustedresponse", + a_weight="design_weight", + o_weight="outlier_weight", + g_weight="calibration_factor", + auxiliary_value="frotover", period_selected=period_selected, ) # Survey code is requested on this output, 009 is MBS code standardising_factor["survey_code"] = "009" - standardising_factor["imputation_flags_adjusted_value"] = standardising_factor[ - "imputation_flags_adjusted_value" + standardising_factor["imputation_flags_adjustedresponse"] = standardising_factor[ + "imputation_flags_adjustedresponse" ].str.upper() standardising_factor = standardising_factor.rename( columns={ "reference": "ruref", "domain": "domain_group", - aux: "auxiliary_value", + "frotover": "auxiliary_value", "imputation_flags_adjusted_value": "imputation_marker", - question_no: "question_code", + "questioncode": "question_code", } ) diff --git a/mbs_results/outputs/turnover_analysis.py b/mbs_results/outputs/turnover_analysis.py index 1beaa373..757df959 100644 --- a/mbs_results/outputs/turnover_analysis.py +++ b/mbs_results/outputs/turnover_analysis.py @@ -3,33 +3,16 @@ def create_turnover_output( - cp_df: pd.DataFrame, - qv_df: pd.DataFrame, - finalsel_df: pd.DataFrame, - winsorisation_df: pd.DataFrame, - winsorisation_period: str, - selected_period: int, - **config + additional_outputs_df: pd.DataFrame, current_period: int, **config ) -> pd.DataFrame: """ Creating output for turnover analysis tool. Parameters ---------- - cp_df : pd.DataFrame - cp input dataframe containing reference, sic and error_mkr - qv_df : pd.DataFrame - qv input dataframe containing reference, question_no, adjusted_value and - returned_value - finalsel_df : pd.DataFrame - finalsel input dataframe containing reference, froempment, frotover, cell_no - and entname1 - winsorisation_df : pd.DataFrame - winsorisation input dataframe containing question_no, period, reference, - imputation_marker, design_weight, calibration_factor and outlier_weight - winsorisation_period : str - Name of column displaying period in winsorisation - selected_period : int + additional_outputs_df : pd.DataFrame + estimation input dataframe containing relevant columns for turnover tool + current_period : int Period to output results for in the format YYYYMM **config: Dict main pipeline configuration. Can be used to input the entire config dictionary @@ -40,19 +23,12 @@ def create_turnover_output( dataframe in correct format for populating turnover analysis tool. """ - qv_df = qv_df.query("question_no == 40") - winsorisation_df = winsorisation_df.query( - "{} == {} and question_no == 40".format(winsorisation_period, selected_period) - ) - - turnover_df = ( - cp_df.merge(qv_df, how="left", on="reference") - .merge(finalsel_df, how="left", left_on="reference", right_on="ruref") - .merge(winsorisation_df, how="left", on="reference") + turnover_df = additional_outputs_df.query( + "period == {} and questioncode == 40".format(current_period) ) turnover_df["curr_grossed_value"] = ( - turnover_df["adjusted_value"] + turnover_df["adjustedresponse"] * turnover_df["design_weight"] * turnover_df["outlier_weight"] * turnover_df["calibration_factor"] @@ -61,9 +37,11 @@ def create_turnover_output( # Convert imp_marker to type # Type 1: Return, Type 2: Construction, Type 3: Imputation type_conditions = [ - turnover_df["imputation_marker"] == "r", - turnover_df["imputation_marker"].isin(["c", "mc"]), - turnover_df["imputation_marker"].isin(["fir", "bir", "fic", "fimc"]), + turnover_df["imputation_flags_adjustedresponse"] == "r", + turnover_df["imputation_flags_adjustedresponse"].isin(["c", "mc"]), + turnover_df["imputation_flags_adjustedresponse"].isin( + ["fir", "bir", "fic", "fimc"] + ), ] type_values = [1, 2, 3] @@ -77,19 +55,19 @@ def create_turnover_output( turnover_df = turnover_df[ [ - "sic92", + "frosic2007", "cell_no", "reference", - "entname1", - "adjusted_value", + "referencename", + "adjustedresponse", "type", "curr_grossed_value", "outlier_weight", - "error_mkr", + "status", "error_res_code", "frotover", "froempment", - "returned_value", + "response", ] ] diff --git a/mbs_results/outputs/weighted_adj_val_time_series.py b/mbs_results/outputs/weighted_adj_val_time_series.py index 5a7402b8..e7076f81 100644 --- a/mbs_results/outputs/weighted_adj_val_time_series.py +++ b/mbs_results/outputs/weighted_adj_val_time_series.py @@ -1,44 +1,55 @@ import numpy as np import pandas as pd +from staging.merge_domain import merge_domain from utilities.utils import convert_column_to_datetime -def get_weighted_adj_val_time_series(filepath: str, **config) -> pd.DataFrame: +def get_weighted_adj_val_time_series( + additional_outputs_df: pd.DataFrame, sic_class_mapping: str, **config +) -> pd.DataFrame: """ Time series of weighted adjusted values by classification, question number, and cell number. Parameters ---------- - filepath : str - filepath to csv containing classification, question number, cell number, + additional_outputs_df : pd.DataFrame + dataframe containing classification, question code, cell number, period, and weighted adjusted value. + sic_class_mapping : str + filepath to mapping between SIC and classification **config: Dict main pipeline configuration. Can be used to input the entire config dictionary Returns ------- - pandas.Data.Frame + pd.DataFrame Dataframe containing classification, question number, and cell number, pivoted wider on period with adjusted values. """ - input_data = pd.read_csv( - filepath, - usecols=[ + additional_outputs_df = merge_domain( + additional_outputs_df, sic_class_mapping, "frosic2007", "sic_5_digit" + ) + + # TODO: Find out calculation of weighted adjusted value and derive as necessary + # in function + input_data = additional_outputs_df[ + [ "classification", - "question_no", + "questioncode", "cell_no", "period", "weighted adjusted value", - ], - dtype={ + ] + ].astype( + { "classification": "Int32", "question_no": "Int8", "cell_no": "Int16", "period": "Int32", "weighted adjusted value": "float64", - }, + } ) input_data["period"] = ( diff --git a/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv b/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv new file mode 100644 index 00000000..06ab34c7 --- /dev/null +++ b/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv @@ -0,0 +1,6 @@ +reference,period,design_weight,frosic2007,questioncode,frotover,calibration_factor,adjustedresponse,status,response,froempment,cell_no,referencename,imputation_flags_adjustedresponse,outlier_weight +101,202301,1.1,1,40,594,1.0,4205.4,O,5940,4593,32,NAME 1,r,1.0 +101,202301,1.5,1,49,594,1.2,493.3,O,500,4593,32,NAME 1,r,1.0 +101,202302,1.0,1,40,960,1.3,849.3,O,1000,4592,32,NAME 1,r,1.2 +102,202301,1.0,2,40,43,1.0,448,C,448,62,6,NAME 2,c,1.0 +103,202301,3.0,2,40,509,1.0,84205.9,E,75940,394,19,NAME 3,fir,1.5 \ No newline at end of file diff --git a/tests/data/outputs/turnover_analysis/cp_input.csv b/tests/data/outputs/turnover_analysis/cp_input.csv deleted file mode 100755 index ca373c87..00000000 --- a/tests/data/outputs/turnover_analysis/cp_input.csv +++ /dev/null @@ -1,4 +0,0 @@ -period,reference,sic92,error_mkr -202301,101,1,O -202301,102,2,C -202301,103,2,E diff --git a/tests/data/outputs/turnover_analysis/finalsel_input.csv b/tests/data/outputs/turnover_analysis/finalsel_input.csv deleted file mode 100755 index 3a4ccb89..00000000 --- a/tests/data/outputs/turnover_analysis/finalsel_input.csv +++ /dev/null @@ -1,4 +0,0 @@ -ruref,froempment,frotover,cell_no,entname1 -101,4593,594,32,NAME 1 -102,62,43,6,NAME 2 -103,394,509,19,NAME 3 diff --git a/tests/data/outputs/turnover_analysis/qv_input.csv b/tests/data/outputs/turnover_analysis/qv_input.csv deleted file mode 100755 index d857c39d..00000000 --- a/tests/data/outputs/turnover_analysis/qv_input.csv +++ /dev/null @@ -1,5 +0,0 @@ -period,reference,question_no,returned_value,adjusted_value -202301,101,40,5940.0,4205.4 -202301,101,49,4326.0,4265.4 -202301,102,40,448.0,448.0 -202301,103,40,75940.0,84205.9 diff --git a/tests/data/outputs/turnover_analysis/turnover_analysis_output.csv b/tests/data/outputs/turnover_analysis/turnover_analysis_output.csv index ffaed658..e4b7d527 100755 --- a/tests/data/outputs/turnover_analysis/turnover_analysis_output.csv +++ b/tests/data/outputs/turnover_analysis/turnover_analysis_output.csv @@ -1,4 +1,4 @@ -sic92,cell_no,reference,entname1,adjusted_value,type,curr_grossed_value,outlier_weight,error_mkr,error_res_code,frotover,froempment,returned_value -1,32,101,NAME 1,4205.4,1,4625.94,1.0,O,0,594,4593,5940.0 -2,6,102,NAME 2,448.0,2,448.0,1.0,C,0,43,62,448.0 -2,19,103,NAME 3,84205.9,3,378926.55,1.5,E,0,509,394,75940.0 +frosic2007,cell_no,reference,referencename,adjustedresponse,type,curr_grossed_value,outlier_weight,status,error_res_code,frotover,froempment,response +1,32,101,NAME 1,4205.4,1,4625.94,1,O,0,594,4593,5940 +2,6,102,NAME 2,448,2,448,1,C,0,43,62,448 +2,19,103,NAME 3,84205.9,3,378926.55,1.5,E,0,509,394,75940 diff --git a/tests/data/outputs/turnover_analysis/winsorisation_input.csv b/tests/data/outputs/turnover_analysis/winsorisation_input.csv deleted file mode 100755 index b6873599..00000000 --- a/tests/data/outputs/turnover_analysis/winsorisation_input.csv +++ /dev/null @@ -1,6 +0,0 @@ -question_no,period_x,reference,imputation_marker,design_weight,calibration_factor,outlier_weight -40,202301,101,r,1.0,1.1,1.0 -49,202301,101,r,1.5,1.0,1.2 -40,202302,101,r,0.8,1.0,1.0 -40,202301,102,c,1.0,1.0,1.0 -40,202301,103,fir,1.0,3.0,1.5 diff --git a/tests/outputs/test_get_additional_outputs.py b/tests/outputs/test_get_additional_outputs.py index f1ca29ad..9920adf5 100644 --- a/tests/outputs/test_get_additional_outputs.py +++ b/tests/outputs/test_get_additional_outputs.py @@ -1,4 +1,5 @@ import pytest +import pandas as pd from mbs_results.outputs.get_additional_outputs import get_additional_outputs @@ -27,7 +28,7 @@ def function_mapper(): ) def test_output(capsys, function_mapper, inp, expected): """Test that the right functions were run""" - get_additional_outputs(inp, function_mapper) + get_additional_outputs(inp, function_mapper, pd.DataFrame()) out, err = capsys.readouterr() assert out == expected @@ -37,7 +38,7 @@ def test_raise_errors(function_mapper): function which does not link to a function""" with pytest.raises(ValueError): - get_additional_outputs({"additional_outputs": "not_a_list"}, function_mapper) + get_additional_outputs({"additional_outputs": "not_a_list"}, function_mapper, pd.DataFrame()) with pytest.raises(ValueError): - get_additional_outputs({"additional_outputs": ["test3"]}, function_mapper) + get_additional_outputs({"additional_outputs": ["test3"]}, function_mapper, pd.DataFrame()) diff --git a/tests/outputs/test_turnover_analysis.py b/tests/outputs/test_turnover_analysis.py index a65397e5..ccac652f 100644 --- a/tests/outputs/test_turnover_analysis.py +++ b/tests/outputs/test_turnover_analysis.py @@ -13,23 +13,8 @@ def filepath(): @pytest.fixture(scope="class") -def cp_input_data(filepath): - return pd.read_csv(filepath / "cp_input.csv", index_col=False) - - -@pytest.fixture(scope="class") -def qv_input_data(filepath): - return pd.read_csv(filepath / "qv_input.csv", index_col=False) - - -@pytest.fixture(scope="class") -def finalsel_input_data(filepath): - return pd.read_csv(filepath / "finalsel_input.csv", index_col=False) - - -@pytest.fixture(scope="class") -def winsorisation_input_data(filepath): - return pd.read_csv(filepath / "winsorisation_input.csv", index_col=False) +def additional_outputs_df_input_data(filepath): + return pd.read_csv(filepath / "additional_outputs_df_input.csv", index_col=False) @pytest.fixture(scope="class") @@ -40,20 +25,13 @@ def turnover_analysis_output(filepath): class TestTurnoverAnalysis: def test_turnover_analysis( self, - cp_input_data, - qv_input_data, - finalsel_input_data, - winsorisation_input_data, + additional_outputs_df_input_data, turnover_analysis_output, ): expected_output = turnover_analysis_output actual_output = create_turnover_output( - cp_input_data, - qv_input_data, - finalsel_input_data, - winsorisation_input_data, - "period_x", + additional_outputs_df_input_data, 202301, ) From 6ffd6da9dbe1f24b948d238b0f7237dca9bf81ce Mon Sep 17 00:00:00 2001 From: lhubbardONS Date: Fri, 6 Dec 2024 11:56:00 +0000 Subject: [PATCH 3/3] pre-commit hooks --- .../turnover_analysis/additional_outputs_df_input.csv | 2 +- tests/outputs/test_get_additional_outputs.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv b/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv index 06ab34c7..b6f35194 100644 --- a/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv +++ b/tests/data/outputs/turnover_analysis/additional_outputs_df_input.csv @@ -3,4 +3,4 @@ reference,period,design_weight,frosic2007,questioncode,frotover,calibration_fact 101,202301,1.5,1,49,594,1.2,493.3,O,500,4593,32,NAME 1,r,1.0 101,202302,1.0,1,40,960,1.3,849.3,O,1000,4592,32,NAME 1,r,1.2 102,202301,1.0,2,40,43,1.0,448,C,448,62,6,NAME 2,c,1.0 -103,202301,3.0,2,40,509,1.0,84205.9,E,75940,394,19,NAME 3,fir,1.5 \ No newline at end of file +103,202301,3.0,2,40,509,1.0,84205.9,E,75940,394,19,NAME 3,fir,1.5 diff --git a/tests/outputs/test_get_additional_outputs.py b/tests/outputs/test_get_additional_outputs.py index 9920adf5..74c5b2a3 100644 --- a/tests/outputs/test_get_additional_outputs.py +++ b/tests/outputs/test_get_additional_outputs.py @@ -1,5 +1,5 @@ -import pytest import pandas as pd +import pytest from mbs_results.outputs.get_additional_outputs import get_additional_outputs @@ -38,7 +38,11 @@ def test_raise_errors(function_mapper): function which does not link to a function""" with pytest.raises(ValueError): - get_additional_outputs({"additional_outputs": "not_a_list"}, function_mapper, pd.DataFrame()) + get_additional_outputs( + {"additional_outputs": "not_a_list"}, function_mapper, pd.DataFrame() + ) with pytest.raises(ValueError): - get_additional_outputs({"additional_outputs": ["test3"]}, function_mapper, pd.DataFrame()) + get_additional_outputs( + {"additional_outputs": ["test3"]}, function_mapper, pd.DataFrame() + )