ONSdigital · AntonZogk · Nov 7, 2024 · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/mbs_results/config.json b/mbs_results/config.json
@@ -123,5 +123,8 @@
                     "0817":15,
                     "0867":15,
                     "0823":16,
-                    "0873":16}
+                    "0873":16},
+
+    "census_extra_calibration_group": [5043, 5113, 5123, 5203, 5233,
+     5403, 5643, 5763, 5783, 5903, 6073]
 }
diff --git a/mbs_results/estimation/apply_estimation.py b/mbs_results/estimation/apply_estimation.py
@@ -7,11 +7,19 @@
     calculate_design_weight,
 )
 from mbs_results.estimation.pre_processing_estimation import get_estimation_data
+from mbs_results.staging.data_cleaning import is_census
 
 # from mbs_results.estimation.validate_estimation import validate_estimation
 
 
-def apply_estimation(population_path, sample_path, period, **config):
+def apply_estimation(
+    population_path,
+    sample_path,
+    calibration_group,
+    census_extra_calibration_group,
+    period,
+    **config
+):
     """
     Read population frame and sample, merge key variables onto df then derive
     and validate estimation weights.
@@ -22,6 +30,10 @@ def apply_estimation(population_path, sample_path, period, **config):
         filepath for population frame data
     sample_path : str
         filepath for sample data
+    calibration_group: str
+        column name of dimension containing calibration group values
+    census_extra_calibration_group: list
+        calibration groups which are census but not band 4 or 5
     period : str
         name of column containing period
 
@@ -45,12 +57,30 @@ def apply_estimation(population_path, sample_path, period, **config):
             population_file, sample_file, period, **config
         )
 
-        estimation_data = calculate_design_weight(estimation_data, period, **config)
-        estimation_data = calculate_calibration_factor(
-            estimation_data, period, **config
-        )
+        census_df = estimation_data[
+            is_census(
+                estimation_data[calibration_group], census_extra_calibration_group
+            )
+        ]
+
+        census_df["design_weight"] = 1
+        census_df["calibration_factor"] = 1
+        census_df["sampled"] = 0
+
+        non_census_df = estimation_data[
+            ~(
+                is_census(
+                    estimation_data[calibration_group], census_extra_calibration_group
+                )
+            )
+        ]
+
+        non_census_df = calculate_design_weight(non_census_df, period, **config)
+        non_census_df = calculate_calibration_factor(non_census_df, period, **config)
+
+        all_together = pd.concat([non_census_df, census_df], ignore_index=True)
 
-        estimation_df_list.append(estimation_data)
+        estimation_df_list.append(all_together)
 
     estimation_df = pd.concat(estimation_df_list, ignore_index=True)
 

diff --git a/mbs_results/staging/__init__.py b/mbs_results/staging/__init__.py
diff --git a/mbs_results/staging/data_cleaning.py b/mbs_results/staging/data_cleaning.py
@@ -381,72 +381,31 @@ def create_imputation_class(
     return df
 
 
-# TODO: Can be used when we set defaults in other parts of the pipeline
-def correct_values(
-    df: pd.DataFrame,
-    columns_to_correct: List[str] or str,
-    condition_column: str,
-    condition_values: List[int],
-    replace_with: int,
-) -> pd.DataFrame:
+def is_census(calibration_group: pd.Series, extra_bands: List) -> pd.Series:
     """
-    Sets values in a dataframe column(s) based on a condition, checks if
-    columns exists prior to correction to avoid creating them.
+    Returns a bool series indicating if calibration group is considered census
+    or not.
+
+    Calibration groups in extra_bands list are considered census groups.
+
+    Calibration groups ending with 4 or 5 are considered census groups.
+
 
     Parameters
     ----------
-    df : pd.DataFrame
-        Original dataframe to apply the correction.
-    columns_to_correct : List(str) or str
-        Column(s) to set values.
-    condition_column : str
-        Column for the condition.
-    condition_values : List(int)
-        Values which exist in condition_column.
-    replace_with : int
-        Value to set if condition is met.
+    calibration_group : pd.Series
+        Series with calibration groups
+    extra_bands: List
+        Extra calibration groups which are census
 
     Returns
     -------
-    df : pd.DataFrame
-        Dataframe with values replaced.
-
-    Examples
-    --------
-    >>> df = pd.DataFrame({'a': [0, 1, 2, 3, 4],
-            'b': [5, 6, 7, 8, 9],
-            'band': [1,2,3,4,5]})
-    >>> df
-        a  b  band
-    0  0  5     1
-    1  1  6     2
-    2  2  7     3
-    3  3  8     4
-    4  4  9     5
-
-    >>> df2 = correct_values(df,["a","b"],"band",[4,5],1)
-    >>> df2
-        a  b  band
-    0  0  5     1
-    1  1  6     2
-    2  2  7     3
-    3  1  1     4
-    4  1  1     5
+    pd.Series
+        A bool series, TRUE if calibration group is census
     """
 
-    df_temp = df.copy()  # to avoid changing input df
-
-    check_columns = (
-        columns_to_correct + [condition_column]  # list + list(str)
-        if pd.api.types.is_list_like(columns_to_correct)
-        else [columns_to_correct, condition_column]
-    )
+    rule_band_4_5 = calibration_group.astype(str).map(lambda x: x.endswith(("4", "5")))
 
-    # Update value only if columns exist
-    if set(check_columns).issubset(df.columns):
-
-        df_temp.loc[df[condition_column].isin(condition_values), columns_to_correct] = (
-            replace_with
-        )
+    rule_extra_bands = calibration_group.isin(extra_bands)
 
-    return df_temp
+    return rule_band_4_5 | rule_extra_bands
diff --git a/tests/data/staging/data_cleaning/test_correct_values.csv b/tests/data/staging/data_cleaning/test_correct_values.csv
diff --git a/tests/data/staging/data_cleaning/test_is_cencus.csv b/tests/data/staging/data_cleaning/test_is_cencus.csv
@@ -0,0 +1,21 @@
+calibration_group,is_census
+5043,TRUE
+5113,TRUE
+5123,TRUE
+5203,TRUE
+5233,TRUE
+5403,TRUE
+5643,TRUE
+5763,TRUE
+5783,TRUE
+5903,TRUE
+6073,TRUE
+6005,TRUE
+6004,TRUE
+6003,FALSE
+6001,FALSE
+6006,FALSE
+6007,FALSE
+6008,FALSE
+6009,FALSE
+6000,FALSE
diff --git a/tests/staging/test_data_cleaning.py b/tests/staging/test_data_cleaning.py
@@ -2,13 +2,13 @@
 
 import pandas as pd
 import pytest
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 
 from mbs_results.staging.data_cleaning import (
     clean_and_merge,
-    correct_values,
     create_imputation_class,
     enforce_datatypes,
+    is_census,
     run_live_or_frozen,
 )
 
@@ -137,18 +137,29 @@ def test_run_live_or_frozen_exception(filepath):
         run_live_or_frozen(df, "target", "error", "love")
 
 
-def test_correct_values(filepath):
+def test_is_census(filepath):
 
-    df = pd.read_csv(filepath / "test_correct_values.csv")
+    df = pd.read_csv(filepath / "test_is_cencus.csv")
 
-    df_in = df[["band_no", "value_1", "value_2", "value_3"]]
-
-    expected_output = df[
-        ["band_no", "expected_value_1", "expected_value_2", "expected_value_3"]
+    extra_cal_groups = [
+        5043,
+        5113,
+        5123,
+        5203,
+        5233,
+        5403,
+        5643,
+        5763,
+        5783,
+        5903,
+        6073,
     ]
 
-    expected_output.columns = df_in.columns
+    input_series = df["calibration_group"]
+    expected_output = df["is_census"]
 
-    actual_output = correct_values(df_in, ["value_1", "value_2"], "band_no", [4, 5], 1)
+    # By default takes name of input series
+    actual_output = is_census(input_series, extra_cal_groups)
+    actual_output.name = "is_census"
 
-    assert_frame_equal(actual_output, expected_output)
+    assert_series_equal(actual_output, expected_output)