From ba419c9863516adb4a070a196d6385053f33b167 Mon Sep 17 00:00:00 2001 From: daviel9 Date: Thu, 24 Oct 2024 14:57:44 +0100 Subject: [PATCH 1/5] Initial testing code --- mbs_results/csw_to_spp_converter.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mbs_results/csw_to_spp_converter.py diff --git a/mbs_results/csw_to_spp_converter.py b/mbs_results/csw_to_spp_converter.py new file mode 100644 index 00000000..67d932bc --- /dev/null +++ b/mbs_results/csw_to_spp_converter.py @@ -0,0 +1,18 @@ +import glob + +import pandas as pd + + +def csw_to_spp(filepath): + + files = glob.glob(filepath + "qv*") + glob.glob(filepath + "cp*") + + li = [] + + for f in files: + + temp_df = pd.read_csv(f) + + li.append(temp_df) + + print(f"Successfully created dataframe for {f} with shape {temp_df.shape}") From d42bef9af6a7cda7753202b2df2607c5cbf8dd9f Mon Sep 17 00:00:00 2001 From: daviel9 Date: Thu, 31 Oct 2024 14:24:50 +0000 Subject: [PATCH 2/5] Create function and sub functions --- mbs_results/csw_to_spp_converter.py | 18 --- mbs_results/utilities/csw_to_spp_converter.py | 118 ++++++++++++++++++ 2 files changed, 118 insertions(+), 18 deletions(-) delete mode 100644 mbs_results/csw_to_spp_converter.py create mode 100644 mbs_results/utilities/csw_to_spp_converter.py diff --git a/mbs_results/csw_to_spp_converter.py b/mbs_results/csw_to_spp_converter.py deleted file mode 100644 index 67d932bc..00000000 --- a/mbs_results/csw_to_spp_converter.py +++ /dev/null @@ -1,18 +0,0 @@ -import glob - -import pandas as pd - - -def csw_to_spp(filepath): - - files = glob.glob(filepath + "qv*") + glob.glob(filepath + "cp*") - - li = [] - - for f in files: - - temp_df = pd.read_csv(f) - - li.append(temp_df) - - print(f"Successfully created dataframe for {f} with shape {temp_df.shape}") diff --git a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py new file mode 100644 index 00000000..38896cf6 --- /dev/null +++ 
b/mbs_results/utilities/csw_to_spp_converter.py @@ -0,0 +1,118 @@ +import fnmatch +from os import listdir +from os.path import isfile, join +import pandas as pd + +from utils import convert_column_to_datetime + +def get_patern_df( + filepath: str, + pattern: str + ) -> pd.DataFrame: + """Loads as pd dataframe all csv files with pattern. + + Parameters + ---------- + filepath : str + Filepath to folder containg desired files. + pattern : str + Regex pattern to filter files in the folder based on name. + + Returns + ------- + pd.DataFrame + Dataframe containg data from all selected files. + """ + + filenames = [ + filename for filename in listdir(filepath) if isfile(join(filepath, filename)) + ] + filenames = fnmatch.filter(filenames, pattern) + df_list = [pd.read_csv(filepath + "/" + filename) for filename in filenames] + df = pd.concat(df_list, ignore_index=True) + + return df + +def get_qv_and_cp_data( + cp_path: str, + qv_path: str, + ) -> pd.DataFrame: + """Reads and joins qv and cp data. + + Parameters + ---------- + cp_path : str + Filepath to folder containing cp data. + qv_path : str + Filepath to folder containing qv data. + + Returns + ------- + pd.DataFrame + Dataframe containing combined qv and cp data. + """ + + qv_df = get_patern_df(qv_path,"qv*.csv") + cp_df = get_patern_df(cp_path,"cp*.csv") + + qv_and_cp = pd.merge(qv_df,cp_df,how = "left",on = ["period","reference"]) + + return qv_and_cp + +def csw_to_spp( + cp_path: str, + qv_path: str, + output_path: str, + column_map: dict, + period: str, + period_range: int + ) -> None: + """Combines cp and qv files, filters and renames columns based on a mapping, and + then saves the output as a json file. + + Parameters + ---------- + cp_path : str + Filepath to folder containing cp data. + qv_path : str + Filepath to folder containing qv data. + output_path : str + Filepath to save json file. 
+ column_map : dict + Dictionary containing desired columns from qv and cp data as keys and their + desired names as values. + period : str + Date to filter output on (YYYY-MM-DD). + period_range : str + Number of months from the period and previous to include in the output. + """ + qv_and_cp = get_qv_and_cp_data(cp_path,qv_path) + + qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"]) + + period = pd.Timestamp(period) + + qv_and_cp = qv_and_cp[(qv_and_cp['period'] > period - pd.DateOffset(months=period_range)) & (qv_and_cp['period'] <= period)] + + qv_and_cp["period"] = qv_and_cp["period"].dt.strftime('%Y%m') + + qv_and_cp = qv_and_cp[column_map.keys()].rename(columns=column_map) + + qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json") + +col_mapping = { + "reference": "reference", + "period": "period", + "error_mkr": "status", + "question_no": "questioncode", + "returned_value": "response", + "adjusted_value": "adjustedresponse", + } + +filepath = "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-20240813T1530Z" + +csw_to_spp(filepath, filepath, "D:/test", col_mapping, "2023-03-01", 3) + +df = pd.read_json("D:/test_202303_3.json") +print(df.head()) +print(df.tail()) From 4288e8cecbfea5395225b0649a45cb61b3e94f9b Mon Sep 17 00:00:00 2001 From: daviel9 Date: Mon, 4 Nov 2024 10:05:02 +0000 Subject: [PATCH 3/5] Remove reference to personal files --- mbs_results/growth_rate_main.py | 78 +++++++++++++++++++ mbs_results/test.py | 65 ++++++++++++++++ mbs_results/utilities/csw_to_spp_converter.py | 17 ---- 3 files changed, 143 insertions(+), 17 deletions(-) create mode 100644 mbs_results/growth_rate_main.py create mode 100644 mbs_results/test.py diff --git a/mbs_results/growth_rate_main.py b/mbs_results/growth_rate_main.py new file mode 100644 index 00000000..08f4b073 --- /dev/null +++ b/mbs_results/growth_rate_main.py @@ -0,0 +1,78 @@ 
+import numpy as np +import pandas as pd +from utils import convert_column_to_datetime + + +def get_growth_rate_data(filepath: str) -> pd.DataFrame: + """ + Filters and pivots wider winsorisation data on period to return growth rate data. + + Parameters + ---------- + filepath : str + filepath to the asap output. + + Returns + ------- + pandas.Data.Frame + Dataframe containing classification, question number, and cell number, pivoted + wider on period with adjusted values. + """ + + input_data = pd.read_csv( + filepath, + usecols=[ + "classification", + "question_no", + "cell_no", + "period", + "adjusted_value", + "total weight (A*G*O)" + ], + dtype={ + "classification": "Int32", + "question_no": "Int8", + "cell_no": "Int16", + "period": "Int32", + "adjusted_value": "float64", + "total weight (A*G*O)": "float64", + }, + ) + + input_data["weighted_adjusted_value"] = input_data["adjusted_value"] * input_data["total weight (A*G*O)"] + + input_data["period"] = ( + convert_column_to_datetime(input_data["period"]) + .dt.strftime("%Y%b") + .str.upper() + ) + + input_data["sizeband"] = np.where( + input_data["cell_no"].isna(), + input_data["cell_no"], + input_data.cell_no.astype(str).str[-1], + ) + + input_data.drop(columns=["cell_no", "adjusted_value", "total weight (A*G*O)"], inplace=True) + + input_data.sort_values( + ["classification", "question_no", "sizeband", "period"], inplace=True + ) + + growth_rate_output = ( + input_data.pivot_table( + columns="period", + values="weighted_adjusted_value", + index=["classification", "question_no", "sizeband"], + aggfunc="sum", + dropna=False, + ) + .reset_index() + .dropna(how="any") + ) + + return growth_rate_output + + +data = get_growth_rate_data("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/shadow_team/asap_482_df_0.0.2.csv") +data.to_csv("D:/growth_rate_data.csv", index=False) diff --git a/mbs_results/test.py b/mbs_results/test.py new file mode 100644 index 00000000..7614aac3 
--- /dev/null +++ b/mbs_results/test.py @@ -0,0 +1,65 @@ +import pandas as pd + +from mbs_results.merge_domain import merge_domain + + +def get_selective_editing_contributer_output( + input_filepath: str, + domain_filepath: str, + sic_input: str, + sic_mapping: str, +) -> pd.DataFrame: + """ + Returns a dataframe containing period, reference, domain_group, and + design_weight. + + Parameters + ---------- + input_filepath : str + Filepath to csv file containing reference, imp_class, period and + SIC columns. + domain_filepath : str + Filepath to csv file containing SIC and domain columns. + sic_input : str + Name of column in input_filepath csv file containing SIC variable. + sic_mapping : str + Name of column in domain_filepath csv file containing SIC variable. + + Returns + ------- + pd.DataFrame + Dataframe with SIC and domain columns merged. + ` + """ + + input_data = pd.read_csv( + input_filepath, + usecols=[ + "period", + "reference", + "design_weight", + sic_input, + ], + ) + + domain_data = pd.read_csv(domain_filepath) + + selective_editing_contributer_output = merge_domain( + input_data, domain_data, sic_input, sic_mapping + ) + + selective_editing_contributer_output = selective_editing_contributer_output.rename( + columns={"reference": "ruref", "domain": "domain_group"} + ) + + return selective_editing_contributer_output + + +selective_editing_contributer_output = get_selective_editing_contributer_output( + "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/winsorisation/winsorisation_output_0.0.2.csv", + "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/mapping_files/sic_domain_mapping.csv", + "sic_5_digit", + "sic_5_digit" +) + +selective_editing_contributer_output.to_csv("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/selective_editing_outputs/selective_editing_contributer_output.csv", index=False) diff --git 
a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py index 38896cf6..e93644bb 100644 --- a/mbs_results/utilities/csw_to_spp_converter.py +++ b/mbs_results/utilities/csw_to_spp_converter.py @@ -99,20 +99,3 @@ def csw_to_spp( qv_and_cp = qv_and_cp[column_map.keys()].rename(columns=column_map) qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json") - -col_mapping = { - "reference": "reference", - "period": "period", - "error_mkr": "status", - "question_no": "questioncode", - "returned_value": "response", - "adjusted_value": "adjustedresponse", - } - -filepath = "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-20240813T1530Z" - -csw_to_spp(filepath, filepath, "D:/test", col_mapping, "2023-03-01", 3) - -df = pd.read_json("D:/test_202303_3.json") -print(df.head()) -print(df.tail()) From 4633bdaf4d58b28b78c16af2819bbcd3ed6d8ee4 Mon Sep 17 00:00:00 2001 From: daviel9 Date: Mon, 4 Nov 2024 10:07:47 +0000 Subject: [PATCH 4/5] Remove accidently added files --- mbs_results/growth_rate_main.py | 78 --------------------------------- mbs_results/test.py | 65 --------------------------- 2 files changed, 143 deletions(-) delete mode 100644 mbs_results/growth_rate_main.py delete mode 100644 mbs_results/test.py diff --git a/mbs_results/growth_rate_main.py b/mbs_results/growth_rate_main.py deleted file mode 100644 index 08f4b073..00000000 --- a/mbs_results/growth_rate_main.py +++ /dev/null @@ -1,78 +0,0 @@ -import numpy as np -import pandas as pd -from utils import convert_column_to_datetime - - -def get_growth_rate_data(filepath: str) -> pd.DataFrame: - """ - Filters and pivots wider winsorisation data on period to return growth rate data. - - Parameters - ---------- - filepath : str - filepath to the asap output. 
- - Returns - ------- - pandas.Data.Frame - Dataframe containing classification, question number, and cell number, pivoted - wider on period with adjusted values. - """ - - input_data = pd.read_csv( - filepath, - usecols=[ - "classification", - "question_no", - "cell_no", - "period", - "adjusted_value", - "total weight (A*G*O)" - ], - dtype={ - "classification": "Int32", - "question_no": "Int8", - "cell_no": "Int16", - "period": "Int32", - "adjusted_value": "float64", - "total weight (A*G*O)": "float64", - }, - ) - - input_data["weighted_adjusted_value"] = input_data["adjusted_value"] * input_data["total weight (A*G*O)"] - - input_data["period"] = ( - convert_column_to_datetime(input_data["period"]) - .dt.strftime("%Y%b") - .str.upper() - ) - - input_data["sizeband"] = np.where( - input_data["cell_no"].isna(), - input_data["cell_no"], - input_data.cell_no.astype(str).str[-1], - ) - - input_data.drop(columns=["cell_no", "adjusted_value", "total weight (A*G*O)"], inplace=True) - - input_data.sort_values( - ["classification", "question_no", "sizeband", "period"], inplace=True - ) - - growth_rate_output = ( - input_data.pivot_table( - columns="period", - values="weighted_adjusted_value", - index=["classification", "question_no", "sizeband"], - aggfunc="sum", - dropna=False, - ) - .reset_index() - .dropna(how="any") - ) - - return growth_rate_output - - -data = get_growth_rate_data("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/shadow_team/asap_482_df_0.0.2.csv") -data.to_csv("D:/growth_rate_data.csv", index=False) diff --git a/mbs_results/test.py b/mbs_results/test.py deleted file mode 100644 index 7614aac3..00000000 --- a/mbs_results/test.py +++ /dev/null @@ -1,65 +0,0 @@ -import pandas as pd - -from mbs_results.merge_domain import merge_domain - - -def get_selective_editing_contributer_output( - input_filepath: str, - domain_filepath: str, - sic_input: str, - sic_mapping: str, -) -> pd.DataFrame: - """ - Returns a 
dataframe containing period, reference, domain_group, and - design_weight. - - Parameters - ---------- - input_filepath : str - Filepath to csv file containing reference, imp_class, period and - SIC columns. - domain_filepath : str - Filepath to csv file containing SIC and domain columns. - sic_input : str - Name of column in input_filepath csv file containing SIC variable. - sic_mapping : str - Name of column in domain_filepath csv file containing SIC variable. - - Returns - ------- - pd.DataFrame - Dataframe with SIC and domain columns merged. - ` - """ - - input_data = pd.read_csv( - input_filepath, - usecols=[ - "period", - "reference", - "design_weight", - sic_input, - ], - ) - - domain_data = pd.read_csv(domain_filepath) - - selective_editing_contributer_output = merge_domain( - input_data, domain_data, sic_input, sic_mapping - ) - - selective_editing_contributer_output = selective_editing_contributer_output.rename( - columns={"reference": "ruref", "domain": "domain_group"} - ) - - return selective_editing_contributer_output - - -selective_editing_contributer_output = get_selective_editing_contributer_output( - "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/winsorisation/winsorisation_output_0.0.2.csv", - "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/mapping_files/sic_domain_mapping.csv", - "sic_5_digit", - "sic_5_digit" -) - -selective_editing_contributer_output.to_csv("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/selective_editing_outputs/selective_editing_contributer_output.csv", index=False) From 82df270b69be6a4a11ba97ccb6effcd66edc9803 Mon Sep 17 00:00:00 2001 From: Jordan Day Date: Mon, 4 Nov 2024 11:26:41 +0000 Subject: [PATCH 5/5] commit hook changes --- mbs_results/utilities/csw_to_spp_converter.py | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git 
a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py index e93644bb..bb4383bd 100644 --- a/mbs_results/utilities/csw_to_spp_converter.py +++ b/mbs_results/utilities/csw_to_spp_converter.py @@ -1,14 +1,12 @@ import fnmatch from os import listdir from os.path import isfile, join -import pandas as pd +import pandas as pd from utils import convert_column_to_datetime -def get_patern_df( - filepath: str, - pattern: str - ) -> pd.DataFrame: + +def get_patern_df(filepath: str, pattern: str) -> pd.DataFrame: """Loads as pd dataframe all csv files with pattern. Parameters @@ -23,7 +21,7 @@ def get_patern_df( pd.DataFrame Dataframe containg data from all selected files. """ - + filenames = [ filename for filename in listdir(filepath) if isfile(join(filepath, filename)) ] @@ -33,10 +31,11 @@ def get_patern_df( return df + def get_qv_and_cp_data( cp_path: str, qv_path: str, - ) -> pd.DataFrame: +) -> pd.DataFrame: """Reads and joins qv and cp data. Parameters @@ -45,28 +44,29 @@ def get_qv_and_cp_data( Filepath to folder containing cp data. qv_path : str Filepath to folder containing qv data. - + Returns ------- pd.DataFrame Dataframe containing combined qv and cp data. """ - - qv_df = get_patern_df(qv_path,"qv*.csv") - cp_df = get_patern_df(cp_path,"cp*.csv") - - qv_and_cp = pd.merge(qv_df,cp_df,how = "left",on = ["period","reference"]) - + + qv_df = get_patern_df(qv_path, "qv*.csv") + cp_df = get_patern_df(cp_path, "cp*.csv") + + qv_and_cp = pd.merge(qv_df, cp_df, how="left", on=["period", "reference"]) + return qv_and_cp + def csw_to_spp( cp_path: str, qv_path: str, output_path: str, column_map: dict, period: str, - period_range: int - ) -> None: + period_range: int, +) -> None: """Combines cp and qv files, filters and renames columns based on a mapping, and then saves the output as a json file. @@ -79,23 +79,26 @@ def csw_to_spp( output_path : str Filepath to save json file. 
column_map : dict - Dictionary containing desired columns from qv and cp data as keys and their + Dictionary containing desired columns from qv and cp data as keys and their desired names as values. period : str Date to filter output on (YYYY-MM-DD). period_range : str Number of months from the period and previous to include in the output. """ - qv_and_cp = get_qv_and_cp_data(cp_path,qv_path) - + qv_and_cp = get_qv_and_cp_data(cp_path, qv_path) + qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"]) - + period = pd.Timestamp(period) - - qv_and_cp = qv_and_cp[(qv_and_cp['period'] > period - pd.DateOffset(months=period_range)) & (qv_and_cp['period'] <= period)] - - qv_and_cp["period"] = qv_and_cp["period"].dt.strftime('%Y%m') - + + qv_and_cp = qv_and_cp[ + (qv_and_cp["period"] > period - pd.DateOffset(months=period_range)) + & (qv_and_cp["period"] <= period) + ] + + qv_and_cp["period"] = qv_and_cp["period"].dt.strftime("%Y%m") + qv_and_cp = qv_and_cp[column_map.keys()].rename(columns=column_map) - + qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json")