From ba419c9863516adb4a070a196d6385053f33b167 Mon Sep 17 00:00:00 2001 From: daviel9 Date: Thu, 24 Oct 2024 14:57:44 +0100 Subject: [PATCH 1/5] Initial testing code --- mbs_results/csw_to_spp_converter.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mbs_results/csw_to_spp_converter.py diff --git a/mbs_results/csw_to_spp_converter.py b/mbs_results/csw_to_spp_converter.py new file mode 100644 index 00000000..67d932bc --- /dev/null +++ b/mbs_results/csw_to_spp_converter.py @@ -0,0 +1,18 @@ +import glob + +import pandas as pd + + +def csw_to_spp(filepath): + + files = glob.glob(filepath + "qv*") + glob.glob(filepath + "cp*") + + li = [] + + for f in files: + + temp_df = pd.read_csv(f) + + li.append(temp_df) + + print(f"Successfully created dataframe for {f} with shape {temp_df.shape}") From d42bef9af6a7cda7753202b2df2607c5cbf8dd9f Mon Sep 17 00:00:00 2001 From: daviel9 Date: Thu, 31 Oct 2024 14:24:50 +0000 Subject: [PATCH 2/5] Create function and sub functions --- mbs_results/csw_to_spp_converter.py | 18 --- mbs_results/utilities/csw_to_spp_converter.py | 118 ++++++++++++++++++ 2 files changed, 118 insertions(+), 18 deletions(-) delete mode 100644 mbs_results/csw_to_spp_converter.py create mode 100644 mbs_results/utilities/csw_to_spp_converter.py diff --git a/mbs_results/csw_to_spp_converter.py b/mbs_results/csw_to_spp_converter.py deleted file mode 100644 index 67d932bc..00000000 --- a/mbs_results/csw_to_spp_converter.py +++ /dev/null @@ -1,18 +0,0 @@ -import glob - -import pandas as pd - - -def csw_to_spp(filepath): - - files = glob.glob(filepath + "qv*") + glob.glob(filepath + "cp*") - - li = [] - - for f in files: - - temp_df = pd.read_csv(f) - - li.append(temp_df) - - print(f"Successfully created dataframe for {f} with shape {temp_df.shape}") diff --git a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py new file mode 100644 index 00000000..38896cf6 --- /dev/null +++ 
b/mbs_results/utilities/csw_to_spp_converter.py @@ -0,0 +1,118 @@ +import fnmatch +from os import listdir +from os.path import isfile, join +import pandas as pd + +from utils import convert_column_to_datetime + +def get_patern_df( + filepath: str, + pattern: str + ) -> pd.DataFrame: + """Loads as pd dataframe all csv files with pattern. + + Parameters + ---------- + filepath : str + Filepath to folder containg desired files. + pattern : str + Regex pattern to filter files in the folder based on name. + + Returns + ------- + pd.DataFrame + Dataframe containg data from all selected files. + """ + + filenames = [ + filename for filename in listdir(filepath) if isfile(join(filepath, filename)) + ] + filenames = fnmatch.filter(filenames, pattern) + df_list = [pd.read_csv(filepath + "/" + filename) for filename in filenames] + df = pd.concat(df_list, ignore_index=True) + + return df + +def get_qv_and_cp_data( + cp_path: str, + qv_path: str, + ) -> pd.DataFrame: + """Reads and joins qv and cp data. + + Parameters + ---------- + cp_path : str + Filepath to folder containing cp data. + qv_path : str + Filepath to folder containing qv data. + + Returns + ------- + pd.DataFrame + Dataframe containing combined qv and cp data. + """ + + qv_df = get_patern_df(qv_path,"qv*.csv") + cp_df = get_patern_df(cp_path,"cp*.csv") + + qv_and_cp = pd.merge(qv_df,cp_df,how = "left",on = ["period","reference"]) + + return qv_and_cp + +def csw_to_spp( + cp_path: str, + qv_path: str, + output_path: str, + column_map: dict, + period: str, + period_range: int + ) -> None: + """Combines cp and qv files, filters and renames columns based on a mapping, and + then saves the output as a json file. + + Parameters + ---------- + cp_path : str + Filepath to folder containing cp data. + qv_path : str + Filepath to folder containing qv data. + output_path : str + Filepath to save json file. 
+ column_map : dict + Dictionary containing desired columns from qv and cp data as keys and their + desired names as values. + period : str + Date to filter output on (YYYY-MM-DD). + period_range : str + Number of months from the period and previous to include in the output. + """ + qv_and_cp = get_qv_and_cp_data(cp_path,qv_path) + + qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"]) + + period = pd.Timestamp(period) + + qv_and_cp = qv_and_cp[(qv_and_cp['period'] > period - pd.DateOffset(months=period_range)) & (qv_and_cp['period'] <= period)] + + qv_and_cp["period"] = qv_and_cp["period"].dt.strftime('%Y%m') + + qv_and_cp = qv_and_cp[column_map.keys()].rename(columns=column_map) + + qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json") + +col_mapping = { + "reference": "reference", + "period": "period", + "error_mkr": "status", + "question_no": "questioncode", + "returned_value": "response", + "adjusted_value": "adjustedresponse", + } + +filepath = "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-20240813T1530Z" + +csw_to_spp(filepath, filepath, "D:/test", col_mapping, "2023-03-01", 3) + +df = pd.read_json("D:/test_202303_3.json") +print(df.head()) +print(df.tail()) From 4288e8cecbfea5395225b0649a45cb61b3e94f9b Mon Sep 17 00:00:00 2001 From: daviel9 Date: Mon, 4 Nov 2024 10:05:02 +0000 Subject: [PATCH 3/5] Remove reference to personal files --- mbs_results/growth_rate_main.py | 78 +++++++++++++++++++ mbs_results/test.py | 65 ++++++++++++++++ mbs_results/utilities/csw_to_spp_converter.py | 17 ---- 3 files changed, 143 insertions(+), 17 deletions(-) create mode 100644 mbs_results/growth_rate_main.py create mode 100644 mbs_results/test.py diff --git a/mbs_results/growth_rate_main.py b/mbs_results/growth_rate_main.py new file mode 100644 index 00000000..08f4b073 --- /dev/null +++ b/mbs_results/growth_rate_main.py @@ -0,0 +1,78 @@ 
+import numpy as np +import pandas as pd +from utils import convert_column_to_datetime + + +def get_growth_rate_data(filepath: str) -> pd.DataFrame: + """ + Filters and pivots wider winsorisation data on period to return growth rate data. + + Parameters + ---------- + filepath : str + filepath to the asap output. + + Returns + ------- + pandas.Data.Frame + Dataframe containing classification, question number, and cell number, pivoted + wider on period with adjusted values. + """ + + input_data = pd.read_csv( + filepath, + usecols=[ + "classification", + "question_no", + "cell_no", + "period", + "adjusted_value", + "total weight (A*G*O)" + ], + dtype={ + "classification": "Int32", + "question_no": "Int8", + "cell_no": "Int16", + "period": "Int32", + "adjusted_value": "float64", + "total weight (A*G*O)": "float64", + }, + ) + + input_data["weighted_adjusted_value"] = input_data["adjusted_value"] * input_data["total weight (A*G*O)"] + + input_data["period"] = ( + convert_column_to_datetime(input_data["period"]) + .dt.strftime("%Y%b") + .str.upper() + ) + + input_data["sizeband"] = np.where( + input_data["cell_no"].isna(), + input_data["cell_no"], + input_data.cell_no.astype(str).str[-1], + ) + + input_data.drop(columns=["cell_no", "adjusted_value", "total weight (A*G*O)"], inplace=True) + + input_data.sort_values( + ["classification", "question_no", "sizeband", "period"], inplace=True + ) + + growth_rate_output = ( + input_data.pivot_table( + columns="period", + values="weighted_adjusted_value", + index=["classification", "question_no", "sizeband"], + aggfunc="sum", + dropna=False, + ) + .reset_index() + .dropna(how="any") + ) + + return growth_rate_output + + +data = get_growth_rate_data("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/shadow_team/asap_482_df_0.0.2.csv") +data.to_csv("D:/growth_rate_data.csv", index=False) diff --git a/mbs_results/test.py b/mbs_results/test.py new file mode 100644 index 00000000..7614aac3 
--- /dev/null +++ b/mbs_results/test.py @@ -0,0 +1,65 @@ +import pandas as pd + +from mbs_results.merge_domain import merge_domain + + +def get_selective_editing_contributer_output( + input_filepath: str, + domain_filepath: str, + sic_input: str, + sic_mapping: str, +) -> pd.DataFrame: + """ + Returns a dataframe containing period, reference, domain_group, and + design_weight. + + Parameters + ---------- + input_filepath : str + Filepath to csv file containing reference, imp_class, period and + SIC columns. + domain_filepath : str + Filepath to csv file containing SIC and domain columns. + sic_input : str + Name of column in input_filepath csv file containing SIC variable. + sic_mapping : str + Name of column in domain_filepath csv file containing SIC variable. + + Returns + ------- + pd.DataFrame + Dataframe with SIC and domain columns merged. + ` + """ + + input_data = pd.read_csv( + input_filepath, + usecols=[ + "period", + "reference", + "design_weight", + sic_input, + ], + ) + + domain_data = pd.read_csv(domain_filepath) + + selective_editing_contributer_output = merge_domain( + input_data, domain_data, sic_input, sic_mapping + ) + + selective_editing_contributer_output = selective_editing_contributer_output.rename( + columns={"reference": "ruref", "domain": "domain_group"} + ) + + return selective_editing_contributer_output + + +selective_editing_contributer_output = get_selective_editing_contributer_output( + "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/winsorisation/winsorisation_output_0.0.2.csv", + "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/mapping_files/sic_domain_mapping.csv", + "sic_5_digit", + "sic_5_digit" +) + +selective_editing_contributer_output.to_csv("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/selective_editing_outputs/selective_editing_contributer_output.csv", index=False) diff --git 
a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py index 38896cf6..e93644bb 100644 --- a/mbs_results/utilities/csw_to_spp_converter.py +++ b/mbs_results/utilities/csw_to_spp_converter.py @@ -99,20 +99,3 @@ def csw_to_spp( qv_and_cp = qv_and_cp[column_map.keys()].rename(columns=column_map) qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json") - -col_mapping = { - "reference": "reference", - "period": "period", - "error_mkr": "status", - "question_no": "questioncode", - "returned_value": "response", - "adjusted_value": "adjustedresponse", - } - -filepath = "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-20240813T1530Z" - -csw_to_spp(filepath, filepath, "D:/test", col_mapping, "2023-03-01", 3) - -df = pd.read_json("D:/test_202303_3.json") -print(df.head()) -print(df.tail()) From 4633bdaf4d58b28b78c16af2819bbcd3ed6d8ee4 Mon Sep 17 00:00:00 2001 From: daviel9 Date: Mon, 4 Nov 2024 10:07:47 +0000 Subject: [PATCH 4/5] Remove accidently added files --- mbs_results/growth_rate_main.py | 78 --------------------------------- mbs_results/test.py | 65 --------------------------- 2 files changed, 143 deletions(-) delete mode 100644 mbs_results/growth_rate_main.py delete mode 100644 mbs_results/test.py diff --git a/mbs_results/growth_rate_main.py b/mbs_results/growth_rate_main.py deleted file mode 100644 index 08f4b073..00000000 --- a/mbs_results/growth_rate_main.py +++ /dev/null @@ -1,78 +0,0 @@ -import numpy as np -import pandas as pd -from utils import convert_column_to_datetime - - -def get_growth_rate_data(filepath: str) -> pd.DataFrame: - """ - Filters and pivots wider winsorisation data on period to return growth rate data. - - Parameters - ---------- - filepath : str - filepath to the asap output. 
- - Returns - ------- - pandas.Data.Frame - Dataframe containing classification, question number, and cell number, pivoted - wider on period with adjusted values. - """ - - input_data = pd.read_csv( - filepath, - usecols=[ - "classification", - "question_no", - "cell_no", - "period", - "adjusted_value", - "total weight (A*G*O)" - ], - dtype={ - "classification": "Int32", - "question_no": "Int8", - "cell_no": "Int16", - "period": "Int32", - "adjusted_value": "float64", - "total weight (A*G*O)": "float64", - }, - ) - - input_data["weighted_adjusted_value"] = input_data["adjusted_value"] * input_data["total weight (A*G*O)"] - - input_data["period"] = ( - convert_column_to_datetime(input_data["period"]) - .dt.strftime("%Y%b") - .str.upper() - ) - - input_data["sizeband"] = np.where( - input_data["cell_no"].isna(), - input_data["cell_no"], - input_data.cell_no.astype(str).str[-1], - ) - - input_data.drop(columns=["cell_no", "adjusted_value", "total weight (A*G*O)"], inplace=True) - - input_data.sort_values( - ["classification", "question_no", "sizeband", "period"], inplace=True - ) - - growth_rate_output = ( - input_data.pivot_table( - columns="period", - values="weighted_adjusted_value", - index=["classification", "question_no", "sizeband"], - aggfunc="sum", - dropna=False, - ) - .reset_index() - .dropna(how="any") - ) - - return growth_rate_output - - -data = get_growth_rate_data("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/shadow_team/asap_482_df_0.0.2.csv") -data.to_csv("D:/growth_rate_data.csv", index=False) diff --git a/mbs_results/test.py b/mbs_results/test.py deleted file mode 100644 index 7614aac3..00000000 --- a/mbs_results/test.py +++ /dev/null @@ -1,65 +0,0 @@ -import pandas as pd - -from mbs_results.merge_domain import merge_domain - - -def get_selective_editing_contributer_output( - input_filepath: str, - domain_filepath: str, - sic_input: str, - sic_mapping: str, -) -> pd.DataFrame: - """ - Returns a 
dataframe containing period, reference, domain_group, and - design_weight. - - Parameters - ---------- - input_filepath : str - Filepath to csv file containing reference, imp_class, period and - SIC columns. - domain_filepath : str - Filepath to csv file containing SIC and domain columns. - sic_input : str - Name of column in input_filepath csv file containing SIC variable. - sic_mapping : str - Name of column in domain_filepath csv file containing SIC variable. - - Returns - ------- - pd.DataFrame - Dataframe with SIC and domain columns merged. - ` - """ - - input_data = pd.read_csv( - input_filepath, - usecols=[ - "period", - "reference", - "design_weight", - sic_input, - ], - ) - - domain_data = pd.read_csv(domain_filepath) - - selective_editing_contributer_output = merge_domain( - input_data, domain_data, sic_input, sic_mapping - ) - - selective_editing_contributer_output = selective_editing_contributer_output.rename( - columns={"reference": "ruref", "domain": "domain_group"} - ) - - return selective_editing_contributer_output - - -selective_editing_contributer_output = get_selective_editing_contributer_output( - "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/winsorisation/winsorisation_output_0.0.2.csv", - "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/mapping_files/sic_domain_mapping.csv", - "sic_5_digit", - "sic_5_digit" -) - -selective_editing_contributer_output.to_csv("C:/Users/daviel9/Office for National Statistics/Legacy Uplift - Testing outputs from DAP/MBS/selective_editing_outputs/selective_editing_contributer_output.csv", index=False) From 82df270b69be6a4a11ba97ccb6effcd66edc9803 Mon Sep 17 00:00:00 2001 From: Jordan Day Date: Mon, 4 Nov 2024 11:26:41 +0000 Subject: [PATCH 5/5] commit hook changes --- mbs_results/utilities/csw_to_spp_converter.py | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git 
a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py index e93644bb..bb4383bd 100644 --- a/mbs_results/utilities/csw_to_spp_converter.py +++ b/mbs_results/utilities/csw_to_spp_converter.py @@ -1,14 +1,12 @@ import fnmatch from os import listdir from os.path import isfile, join -import pandas as pd +import pandas as pd from utils import convert_column_to_datetime -def get_patern_df( - filepath: str, - pattern: str - ) -> pd.DataFrame: + +def get_patern_df(filepath: str, pattern: str) -> pd.DataFrame: """Loads as pd dataframe all csv files with pattern. Parameters @@ -23,7 +21,7 @@ def get_patern_df( pd.DataFrame Dataframe containg data from all selected files. """ - + filenames = [ filename for filename in listdir(filepath) if isfile(join(filepath, filename)) ] @@ -33,10 +31,11 @@ def get_patern_df( return df + def get_qv_and_cp_data( cp_path: str, qv_path: str, - ) -> pd.DataFrame: +) -> pd.DataFrame: """Reads and joins qv and cp data. Parameters @@ -45,28 +44,29 @@ def get_qv_and_cp_data( Filepath to folder containing cp data. qv_path : str Filepath to folder containing qv data. - + Returns ------- pd.DataFrame Dataframe containing combined qv and cp data. """ - - qv_df = get_patern_df(qv_path,"qv*.csv") - cp_df = get_patern_df(cp_path,"cp*.csv") - - qv_and_cp = pd.merge(qv_df,cp_df,how = "left",on = ["period","reference"]) - + + qv_df = get_patern_df(qv_path, "qv*.csv") + cp_df = get_patern_df(cp_path, "cp*.csv") + + qv_and_cp = pd.merge(qv_df, cp_df, how="left", on=["period", "reference"]) + return qv_and_cp + def csw_to_spp( cp_path: str, qv_path: str, output_path: str, column_map: dict, period: str, - period_range: int - ) -> None: + period_range: int, +) -> None: """Combines cp and qv files, filters and renames columns based on a mapping, and then saves the output as a json file. @@ -79,23 +79,26 @@ def csw_to_spp( output_path : str Filepath to save json file. 
column_map : dict - Dictionary containing desired columns from qv and cp data as keys and their + Dictionary containing desired columns from qv and cp data as keys and their desired names as values. period : str Date to filter output on (YYYY-MM-DD). period_range : str Number of months from the period and previous to include in the output. """ - qv_and_cp = get_qv_and_cp_data(cp_path,qv_path) - + qv_and_cp = get_qv_and_cp_data(cp_path, qv_path) + qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"]) - + period = pd.Timestamp(period) - - qv_and_cp = qv_and_cp[(qv_and_cp['period'] > period - pd.DateOffset(months=period_range)) & (qv_and_cp['period'] <= period)] - - qv_and_cp["period"] = qv_and_cp["period"].dt.strftime('%Y%m') - + + qv_and_cp = qv_and_cp[ + (qv_and_cp["period"] > period - pd.DateOffset(months=period_range)) + & (qv_and_cp["period"] <= period) + ] + + qv_and_cp["period"] = qv_and_cp["period"].dt.strftime("%Y%m") + qv_and_cp = qv_and_cp[column_map.keys()].rename(columns=column_map) - + qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json")