From 3a8b237293cee5faf9787b80147d74cca627ab94 Mon Sep 17 00:00:00 2001
From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com>
Date: Fri, 10 May 2024 12:49:12 +0100
Subject: [PATCH 01/20] 338 pre commit hooks (#10)

* Auto formatting items in test dir by commit hooks

* Auto formatting test data by commit hooks

* Auto formatting of pre-commit function by commit hooks

* Auto formatting construction and requirements using commit hooks

* Auto formatting and modifying line 112 of
  src/flag_and_count_matched_pairs.py to fix E712 (`!= True` comparison)

* Auto formatting workflow yaml by commit hooks

* Adding mixed-line-endings and detect-secrets commit hooks

* Removing unused commit hook scripts (commit_msg.py, prepare_commit_msg.py)
---
 .github/workflows/main.yaml                   |   2 +-
 .pre-commit-config.yaml                       |  32 +++--
 pre-commits/check_added_large_files.py        |  27 ++--
 pre-commits/check_merge_conflict.py           |  33 +++--
 pre-commits/commit_msg.py                     |  41 -------
 pre-commits/end_of_line_fixer.py              |  20 ++-
 pre-commits/mixed_line_endings.py             |  42 +++----
 pre-commits/prepare_commit_msg.py             |  45 -------
 pre-commits/remove_whitespace.py              |  77 ++++++------
 requirements.txt                              |   2 +-
 src/construction_matches.py                   |   5 +-
 src/flag_and_count_matched_pairs.py           | 100 ++++++++-------
 tests/helper_functions.py                     |   3 +-
 tests/test_construction_matches.py            |  44 ++++---
 .../case1_expected_output.csv                 |   2 +-
 .../case2_expected_output.csv                 |   2 +-
 .../case3_expected_output.csv                 |   2 +-
 tests/test_flag_and_count_matched_pairs.py    | 116 ++++++++++++------
 18 files changed, 288 insertions(+), 307 deletions(-)
 delete mode 100755 pre-commits/commit_msg.py
 delete mode 100755 pre-commits/prepare_commit_msg.py

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 12fe2efe..080d4cd3 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -46,4 +46,4 @@ jobs:
 
       - name: Run pytest
         run: |
-          pytest -v
\ No newline at end of file
+          pytest -v
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 44b4d541..81880b61 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,6 +30,15 @@ repos:
         language: script
         stages: [commit]
 
+#works
+-     repo: local
+      hooks:
+      - id: mixed-line-endings
+        entry: pre-commits/mixed_line_endings.py
+        name: Check for consistent end of line type LF to CRLF to CR (auto-fixes)
+        language: script
+        stages: [commit]
+
 #works
 #if using on different file types, it will need a seperate hook per file type
 -   repo: local
@@ -76,18 +85,17 @@ repos:
         stages: [commit]
 
 
-# #needs to remove the password in hello_world.py
-# -   repo: local
-#     hooks:
-#       - id: detect-secrets
-#         entry: detect-secrets
-#         name: detect-secrets - Detect secrets in staged code
-#         #args: [ "--baseline", ".secrets.baseline" ]
-#         args: [scan, audit]
-#         language: system
-#         types: [python]
-#         stages: [commit]
-#         exclude: .*/tests/.*|^\.cruft\.json$
+# works in testing
+-   repo: local
+    hooks:
+      - id: detect-secrets
+        entry: detect-secrets-hook
+        name: detect-secrets - Detect secrets in staged code
+        args: [ "--baseline", ".secrets.baseline" ]
+        #args: [scan, audit]
+        language: system
+        types: [python]
+        stages: [commit]
 
 
 
diff --git a/pre-commits/check_added_large_files.py b/pre-commits/check_added_large_files.py
index 41fa69b3..59c0353a 100755
--- a/pre-commits/check_added_large_files.py
+++ b/pre-commits/check_added_large_files.py
@@ -4,24 +4,20 @@
 import json
 import math
 import os
-from typing import Optional
-from typing import Sequence
-from typing import Set
+from typing import Optional, Sequence, Set
 
-from pre_commit_hooks.util import added_files
-from pre_commit_hooks.util import CalledProcessError
-from pre_commit_hooks.util import cmd_output
+from pre_commit_hooks.util import CalledProcessError, added_files, cmd_output
 
 
 def _lfs_files() -> Set[str]:
     """Private function."""
     try:
         # Introduced in git-lfs 2.2.0, first working in 2.2.1
-        lfs_ret = cmd_output('git', 'lfs', 'status', '--json')
+        lfs_ret = cmd_output("git", "lfs", "status", "--json")
     except CalledProcessError:  # pragma: no cover (with git-lfs)
         lfs_ret = '{"files":{}}'
 
-    return set(json.loads(lfs_ret)['files'])
+    return set(json.loads(lfs_ret)["files"])
 
 
 def _find_large_added_files(filenames: Sequence[str], maxkb: int) -> int:
@@ -32,7 +28,7 @@ def _find_large_added_files(filenames: Sequence[str], maxkb: int) -> int:
     for filename in (added_files() & set(filenames)) - _lfs_files():
         kb = int(math.ceil(os.stat(filename).st_size / 1024))
         if kb > maxkb:
-            print(f'{filename} ({kb} KB) exceeds {maxkb} KB.')
+            print(f"{filename} ({kb} KB) exceeds {maxkb} KB.")
             retv = 1
 
     return retv
@@ -42,17 +38,20 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        'filenames', nargs='*',
-        help='Filenames pre-commit believes are changed.',
+        "filenames",
+        nargs="*",
+        help="Filenames pre-commit believes are changed.",
     )
     parser.add_argument(
-        '--maxkb', type=int, default=500,
-        help='Maxmimum allowable KB for added files',
+        "--maxkb",
+        type=int,
+        default=500,
+        help="Maxmimum allowable KB for added files",
     )
 
     args = parser.parse_args(argv)
     return _find_large_added_files(args.filenames, args.maxkb)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/pre-commits/check_merge_conflict.py b/pre-commits/check_merge_conflict.py
index 85a29255..e6c67007 100755
--- a/pre-commits/check_merge_conflict.py
+++ b/pre-commits/check_merge_conflict.py
@@ -2,35 +2,30 @@
 """Pre commit hook to check for merge conflict flags in file."""
 import argparse
 import os.path
-from typing import Optional
-from typing import Sequence
-
+from typing import Optional, Sequence
 
 CONFLICT_PATTERNS = [
-    b'<<<<<<< ',
-    b'======= ',
-    b'=======\n',
-    b'>>>>>>> ',
+    b"<<<<<<< ",
+    b"======= ",
+    b"=======\n",
+    b">>>>>>> ",
 ]
 
 
 def _is_in_merge() -> int:
     """Private function."""
-    return (
-        os.path.exists(os.path.join('.git', 'MERGE_MSG')) and
-        (
-            os.path.exists(os.path.join('.git', 'MERGE_HEAD')) or
-            os.path.exists(os.path.join('.git', 'rebase-apply')) or
-            os.path.exists(os.path.join('.git', 'rebase-merge'))
-        )
+    return os.path.exists(os.path.join(".git", "MERGE_MSG")) and (
+        os.path.exists(os.path.join(".git", "MERGE_HEAD"))
+        or os.path.exists(os.path.join(".git", "rebase-apply"))
+        or os.path.exists(os.path.join(".git", "rebase-merge"))
     )
 
 
 def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
-    parser.add_argument('filenames', nargs='*')
-    parser.add_argument('--assume-in-merge', action='store_true')
+    parser.add_argument("filenames", nargs="*")
+    parser.add_argument("--assume-in-merge", action="store_true")
     args = parser.parse_args(argv)
 
     if not _is_in_merge() and not args.assume_in_merge:
@@ -38,18 +33,18 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
 
     retcode = 0
     for filename in args.filenames:
-        with open(filename, 'rb') as inputfile:
+        with open(filename, "rb") as inputfile:
             for i, line in enumerate(inputfile):
                 for pattern in CONFLICT_PATTERNS:
                     if line.startswith(pattern):
                         print(
                             f'Merge conflict string "{pattern.decode()}" '
-                            f'found in {filename}:{i + 1}',
+                            f"found in {filename}:{i + 1}",
                         )
                         retcode = 1
 
     return retcode
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/pre-commits/commit_msg.py b/pre-commits/commit_msg.py
deleted file mode 100755
index e478166a..00000000
--- a/pre-commits/commit_msg.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-"""Git hook to check git commit message has appropriate length subject line.
-
-After removing the jira issue number from the subject line we check that the
-message is longer than 20 characters and shorter than 65.
-"""
-import sys
-
-# Collect the parameters
-commit_msg_filepath = sys.argv[1]
-
-with open(commit_msg_filepath, 'r') as f:
-    lines = f.readlines()
-
-    # The subject is the first line of the message, but we don't count any
-    # Jira issue note
-    commit_subject = lines[0].split(']')[-1]
-
-    if (len(commit_subject) < 20):
-        print(
-            f'''
-            commit-msg: ERROR! The commit subject is too short!
-            subject length = {len(commit_subject)} < 20 characters'
-            '''
-        )
-        sys.exit(1)
-
-    elif (len(commit_subject) > 65):
-        # We check if messages are greater than 65 char, but warn as if
-        # longer than 50
-        print(
-            f'''
-            commit-msg: ERROR!
-            The commit subject is too long!
-            subject length = {len(commit_subject)} > 50 characters'
-            '''
-        )
-        sys.exit(1)
-
-#    for line in lines[2:]:
-#        print(line)
diff --git a/pre-commits/end_of_line_fixer.py b/pre-commits/end_of_line_fixer.py
index 8f39b8c1..eb85f62e 100755
--- a/pre-commits/end_of_line_fixer.py
+++ b/pre-commits/end_of_line_fixer.py
@@ -2,9 +2,7 @@
 """Pre commit hook to ensure single blank line at end of python file."""
 import argparse
 import os
-from typing import IO
-from typing import Optional
-from typing import Sequence
+from typing import IO, Optional, Sequence
 
 
 def _fix_file(file_obj: IO[bytes]) -> int:
@@ -17,13 +15,13 @@ def _fix_file(file_obj: IO[bytes]) -> int:
         return 0
     last_character = file_obj.read(1)
     # last_character will be '' for an empty file
-    if last_character not in {b'\n', b'\r'} and last_character != b'':
+    if last_character not in {b"\n", b"\r"} and last_character != b"":
         # Needs this seek for windows, otherwise IOError
         file_obj.seek(0, os.SEEK_END)
-        file_obj.write(b'\n')
+        file_obj.write(b"\n")
         return 1
 
-    while last_character in {b'\n', b'\r'}:
+    while last_character in {b"\n", b"\r"}:
         # Deal with the beginning of the file
         if file_obj.tell() == 1:
             # If we've reached the beginning of the file and it is all
@@ -40,7 +38,7 @@ def _fix_file(file_obj: IO[bytes]) -> int:
     # newlines.  If we find extraneous newlines, then backtrack and trim them.
     position = file_obj.tell()
     remaining = file_obj.read()
-    for sequence in (b'\n', b'\r\n', b'\r'):
+    for sequence in (b"\n", b"\r\n", b"\r"):
         if remaining == sequence:
             return 0
         elif remaining.startswith(sequence):
@@ -54,21 +52,21 @@ def _fix_file(file_obj: IO[bytes]) -> int:
 def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
-    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    parser.add_argument("filenames", nargs="*", help="Filenames to fix")
     args = parser.parse_args(argv)
 
     retv = 0
 
     for filename in args.filenames:
         # Read as binary so we can read byte-by-byte
-        with open(filename, 'rb+') as file_obj:
+        with open(filename, "rb+") as file_obj:
             ret_for_file = _fix_file(file_obj)
             if ret_for_file:
-                print(f'Fixing {filename}')
+                print(f"Fixing {filename}")
             retv |= ret_for_file
 
     return retv
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/pre-commits/mixed_line_endings.py b/pre-commits/mixed_line_endings.py
index 48afc2e6..8ae44909 100755
--- a/pre-commits/mixed_line_endings.py
+++ b/pre-commits/mixed_line_endings.py
@@ -2,31 +2,28 @@
 """Pre commit hook to ensure all EOL characters are the same."""
 import argparse
 import collections
-from typing import Dict
-from typing import Optional
-from typing import Sequence
+from typing import Dict, Optional, Sequence
 
-
-CRLF = b'\r\n'
-LF = b'\n'
-CR = b'\r'
+CRLF = b"\r\n"
+LF = b"\n"
+CR = b"\r"
 # Prefer LF to CRLF to CR, but detect CRLF before LF
 ALL_ENDINGS = (CR, CRLF, LF)
-FIX_TO_LINE_ENDING = {'cr': CR, 'crlf': CRLF, 'lf': LF}
+FIX_TO_LINE_ENDING = {"cr": CR, "crlf": CRLF, "lf": LF}
 
 
 def _fix(filename: str, contents: bytes, ending: bytes) -> None:
     """Private function."""
-    new_contents = b''.join(
-        line.rstrip(b'\r\n') + ending for line in contents.splitlines(True)
+    new_contents = b"".join(
+        line.rstrip(b"\r\n") + ending for line in contents.splitlines(True)
     )
-    with open(filename, 'wb') as f:
+    with open(filename, "wb") as f:
         f.write(new_contents)
 
 
 def fix_filename(filename: str, fix: str) -> int:
     """Private function."""
-    with open(filename, 'rb') as f:
+    with open(filename, "rb") as f:
         contents = f.read()
 
     counts: Dict[bytes, int] = collections.defaultdict(int)
@@ -40,10 +37,10 @@ def fix_filename(filename: str, fix: str) -> int:
     # Some amount of mixed line endings
     mixed = sum(bool(x) for x in counts.values()) > 1
 
-    if fix == 'no' or (fix == 'auto' and not mixed):
+    if fix == "no" or (fix == "auto" and not mixed):
         return mixed
 
-    if fix == 'auto':
+    if fix == "auto":
         max_ending = LF
         max_lines = 0
         # ordering is important here such that lf > crlf > cr
@@ -70,24 +67,25 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '-f', '--fix',
-        choices=('auto', 'no') + tuple(FIX_TO_LINE_ENDING),
-        default='auto',
+        "-f",
+        "--fix",
+        choices=("auto", "no") + tuple(FIX_TO_LINE_ENDING),
+        default="auto",
         help='Replace line ending with the specified. Default is "auto"',
     )
-    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    parser.add_argument("filenames", nargs="*", help="Filenames to fix")
     args = parser.parse_args(argv)
 
     retv = 0
     for filename in args.filenames:
         if fix_filename(filename, args.fix):
-            if args.fix == 'no':
-                print(f'{filename}: mixed line endings')
+            if args.fix == "no":
+                print(f"{filename}: mixed line endings")
             else:
-                print(f'{filename}: fixed mixed line endings')
+                print(f"{filename}: fixed mixed line endings")
             retv = 1
     return retv
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/pre-commits/prepare_commit_msg.py b/pre-commits/prepare_commit_msg.py
deleted file mode 100755
index 7cc97878..00000000
--- a/pre-commits/prepare_commit_msg.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-"""Git hook to automatically prefix git commit message with Jira issue number.
-
-The issue number (e.g. Jira ticket number) from the current branch name. Works
-with or without specifying -m option at commit time.
-"""
-import re
-import sys
-from subprocess import check_output
-
-
-commit_msg_filepath = sys.argv[1]
-branch = (
-    check_output(["git", "symbolic-ref", "--short", "HEAD"])
-    .decode("utf-8").strip()
-)
-
-# If branch name contains /'s we only want the final part of the branch name
-branch_end = branch.split('/')[-1]
-
-# Regex pattern for matching to Jira issues
-regex = r"[Jj]\d+"
-
-if re.search(regex, branch_end):
-    # Create list of all matches to regex pattern
-    issue_number_matches = re.findall(regex, branch_end)
-
-    # If mutiple issues in branch name we join them together
-    commit_issue = f'{"_".join(issue_number_matches)}'
-
-    with open(commit_msg_filepath, "r+") as f:
-        commit_msg = f.read()
-        f.seek(0, 0)  # correctly position issue_number when writing commit msg
-        f.write(f"[{commit_issue}] {commit_msg}")
-
-else:
-    # If branch does not contain a jira issue number, reject the commit
-    print(
-        f'''
-        prepare-commit-msg: Error!
-        Branch name is {branch}
-        Does not match branch name strategy \'*/jxxx\'
-        '''
-    )
-    sys.exit(1)
diff --git a/pre-commits/remove_whitespace.py b/pre-commits/remove_whitespace.py
index d890b18c..61e5803f 100755
--- a/pre-commits/remove_whitespace.py
+++ b/pre-commits/remove_whitespace.py
@@ -2,21 +2,20 @@
 """Pre commit hook to remove any trailing whitespace."""
 import argparse
 import os
-from typing import Optional
-from typing import Sequence
+from typing import Optional, Sequence
 
 
 def _fix_file(
-        filename: str,
-        is_markdown: bool,
-        chars: Optional[bytes],
+    filename: str,
+    is_markdown: bool,
+    chars: Optional[bytes],
 ) -> bool:
     """Private function."""
-    with open(filename, mode='rb') as file_processed:
+    with open(filename, mode="rb") as file_processed:
         lines = file_processed.readlines()
     newlines = [_process_line(line, is_markdown, chars) for line in lines]
     if newlines != lines:
-        with open(filename, mode='wb') as file_processed:
+        with open(filename, mode="wb") as file_processed:
             for line in newlines:
                 file_processed.write(line)
         return True
@@ -25,22 +24,22 @@ def _fix_file(
 
 
 def _process_line(
-        line: bytes,
-        is_markdown: bool,
-        chars: Optional[bytes],
+    line: bytes,
+    is_markdown: bool,
+    chars: Optional[bytes],
 ) -> bytes:
     """Private function."""
-    if line[-2:] == b'\r\n':
-        eol = b'\r\n'
+    if line[-2:] == b"\r\n":
+        eol = b"\r\n"
         line = line[:-2]
-    elif line[-1:] == b'\n':
-        eol = b'\n'
+    elif line[-1:] == b"\n":
+        eol = b"\n"
         line = line[:-1]
     else:
-        eol = b''
+        eol = b""
     # preserve trailing two-space for non-blank lines in markdown files
-    if is_markdown and (not line.isspace()) and line.endswith(b'  '):
-        return line[:-2].rstrip(chars) + b'  ' + eol
+    if is_markdown and (not line.isspace()) and line.endswith(b"  "):
+        return line[:-2].rstrip(chars) + b"  " + eol
     return line.rstrip(chars) + eol
 
 
@@ -48,48 +47,46 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--no-markdown-linebreak-ext',
-        action='store_true',
+        "--no-markdown-linebreak-ext",
+        action="store_true",
         help=argparse.SUPPRESS,
     )
     parser.add_argument(
-        '--markdown-linebreak-ext',
-        action='append',
+        "--markdown-linebreak-ext",
+        action="append",
         default=[],
-        metavar='*|EXT[,EXT,...]',
+        metavar="*|EXT[,EXT,...]",
         help=(
-            'Markdown extensions (or *) to not strip linebreak spaces.  '
-            'default: %(default)s'
+            "Markdown extensions (or *) to not strip linebreak spaces.  "
+            "default: %(default)s"
         ),
     )
     parser.add_argument(
-        '--chars',
+        "--chars",
         help=(
-            'The set of characters to strip from the end of lines.  '
-            'Defaults to all whitespace characters.'
+            "The set of characters to strip from the end of lines.  "
+            "Defaults to all whitespace characters."
         ),
     )
-    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    parser.add_argument("filenames", nargs="*", help="Filenames to fix")
     args = parser.parse_args(argv)
 
     if args.no_markdown_linebreak_ext:
-        print('--no-markdown-linebreak-ext now does nothing!')
+        print("--no-markdown-linebreak-ext now does nothing!")
 
     md_args = args.markdown_linebreak_ext
-    if '' in md_args:
-        parser.error('--markdown-linebreak-ext requires a non-empty argument')
-    all_markdown = '*' in md_args
+    if "" in md_args:
+        parser.error("--markdown-linebreak-ext requires a non-empty argument")
+    all_markdown = "*" in md_args
     # normalize extensions; split at ',', lowercase, and force 1 leading '.'
-    md_exts = [
-        '.' + x.lower().lstrip('.') for x in ','.join(md_args).split(',')
-    ]
+    md_exts = ["." + x.lower().lstrip(".") for x in ",".join(md_args).split(",")]
 
     # reject probable "eaten" filename as extension: skip leading '.' with [1:]
     for ext in md_exts:
-        if any(c in ext[1:] for c in r'./\:'):
+        if any(c in ext[1:] for c in r"./\:"):
             parser.error(
-                f'bad --markdown-linebreak-ext extension '
-                f'{ext!r} (has . / \\ :)\n'
+                f"bad --markdown-linebreak-ext extension "
+                f"{ext!r} (has . / \\ :)\n"
                 f"  (probably filename; use '--markdown-linebreak-ext=EXT')",
             )
     chars = None if args.chars is None else args.chars.encode()
@@ -98,10 +95,10 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
         _, extension = os.path.splitext(filename.lower())
         md = all_markdown or extension in md_exts
         if _fix_file(filename, md, chars):
-            print(f'Fixing {filename}')
+            print(f"Fixing {filename}")
             return_code = 1
     return return_code
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/requirements.txt b/requirements.txt
index bd9b2879..e26789b2 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,4 @@ nbqa
 pre_commit_hooks
 flake8
 pandas==1.1.5
-numpy
\ No newline at end of file
+numpy
diff --git a/src/construction_matches.py b/src/construction_matches.py
index dc947e0b..41ab2590 100644
--- a/src/construction_matches.py
+++ b/src/construction_matches.py
@@ -1,5 +1,6 @@
 import pandas as pd
 
+
 def flag_construction_matches(dataframe, target, period, auxiliary):
     """
     Add flag to indicate whether the record has non-null target, period and
@@ -22,6 +23,8 @@ def flag_construction_matches(dataframe, target, period, auxiliary):
         dataframe with additional flag_construction_matches column
     """
 
-    dataframe["flag_construction_matches"] = pd.notna(dataframe[[target, period, auxiliary]]).all(axis="columns")
+    dataframe["flag_construction_matches"] = pd.notna(
+        dataframe[[target, period, auxiliary]]
+    ).all(axis="columns")
 
     return dataframe
diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
index 64989781..e3d55c2a 100644
--- a/src/flag_and_count_matched_pairs.py
+++ b/src/flag_and_count_matched_pairs.py
@@ -1,7 +1,10 @@
+import numpy as np
 import pandas as pd
-import numpy as np 
 
-def flag_matched_pair_merge(df, forward_or_backward,target, period, reference, strata, time_difference=1):
+
+def flag_matched_pair_merge(
+    df, forward_or_backward, target, period, reference, strata, time_difference=1
+):
     """
     function to add flag to df if data forms a matched pair
     i.e. data is given for both period and predictive period
@@ -26,38 +29,44 @@ def flag_matched_pair_merge(df, forward_or_backward,target, period, reference, s
     Returns
     -------
     pd.DataFrame
-        dataframe with column added flagging forward matched paris and 
+        dataframe with column added flagging forward matched paris and
         predictive target variable data column
-    """    
+    """
 
-    if forward_or_backward == 'f':
+    if forward_or_backward == "f":
         time_difference = time_difference
-    elif forward_or_backward == 'b':
-        time_difference =  -time_difference
+    elif forward_or_backward == "b":
+        time_difference = -time_difference
 
     # Creating new DF, shifting period for forward or backward
     df_with_predictive_column = df[[reference, strata, target]]
-    df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(months=time_difference) 
-    df_with_predictive_column.rename(columns={target : 'predictive_'+target},inplace = True)
-
-    
-    df = df.merge(df_with_predictive_column,
-                  left_on=[reference, period, strata],
-                  right_on=[reference, "predictive_period", strata],
-                  how="left")
-
-    matched_col_name = forward_or_backward + '_matched_pair'
+    df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
+        months=time_difference
+    )
+    df_with_predictive_column.rename(
+        columns={target: "predictive_" + target}, inplace=True
+    )
+
+    df = df.merge(
+        df_with_predictive_column,
+        left_on=[reference, period, strata],
+        right_on=[reference, "predictive_period", strata],
+        how="left",
+    )
+
+    matched_col_name = forward_or_backward + "_matched_pair"
 
     df[matched_col_name] = np.where(
-        df[[target,'predictive_'+target]].isnull().any(axis=1),
-        False, 
-        True)
-    
-    df.drop(['predictive_period'],axis = 1, inplace=True)
+        df[[target, "predictive_" + target]].isnull().any(axis=1), False, True
+    )
+
+    df.drop(["predictive_period"], axis=1, inplace=True)
     return df
 
 
-def flag_matched_pair_shift(df,forward_or_backward,target, period, reference, strata, shift=1):
+def flag_matched_pair_shift(
+    df, forward_or_backward, target, period, reference, strata, shift=1
+):
     """
     function to flag matched pairs using the shift method
 
@@ -79,35 +88,41 @@ def flag_matched_pair_shift(df,forward_or_backward,target, period, reference, st
     Returns
     -------
     _type_
-        pandas dataframe with column added flagging forward matched pairs and 
+        pandas dataframe with column added flagging forward matched pairs and
         predictive target variable data column
-    """    
-    
-    if forward_or_backward == 'f':
+    """
+
+    if forward_or_backward == "f":
         shift = shift
-    elif forward_or_backward == 'b':
+    elif forward_or_backward == "b":
         shift = -shift
 
-    df = df.sort_values(by = [reference, period])
-    df[["predictive_"+target, "predictive_period"]] = df.groupby([reference, strata]).shift(shift)[[target, period]]
+    df = df.sort_values(by=[reference, period])
+    df[["predictive_" + target, "predictive_period"]] = df.groupby(
+        [reference, strata]
+    ).shift(shift)[[target, period]]
 
-    df["validate_date"] = np.where(df[period].dt.month - df["predictive_period"].dt.month == shift, True, False)
-    matched_col_name = forward_or_backward + '_matched_pair'
+    df["validate_date"] = np.where(
+        df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
+    )
+    matched_col_name = forward_or_backward + "_matched_pair"
 
     df[matched_col_name] = np.where(
-    df[[target,'predictive_target_variable']].isnull().any(axis=1) | (df["validate_date"] != True),
-    False, 
-    True)
+        df[[target, "predictive_target_variable"]].isnull().any(axis=1)
+        | (~df["validate_date"]),
+        False,
+        True,
+    )
 
-    df.drop(['validate_date','predictive_period'],axis = 1, inplace=True)
+    df.drop(["validate_date", "predictive_period"], axis=1, inplace=True)
 
     return df
- 
+
 
 def count_matches(df, flag_column_name, period, strata, count_column_name=None):
     """
     Function to count the number of records with matches per period and stratum
-    
+
     Parameters
     ----------
     df : pd.DataFrame
@@ -126,9 +141,10 @@ def count_matches(df, flag_column_name, period, strata, count_column_name=None):
     -------
     pd.DataFrame
         dataframe with column added for count of records with matches
-    """   
+    """
     if count_column_name is None:
-        count_column_name = flag_column_name.split('_')[0]+'_matched_pair_count'
-    df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform("sum")
+        count_column_name = flag_column_name.split("_")[0] + "_matched_pair_count"
+    df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform(
+        "sum"
+    )
     return df
-
diff --git a/tests/helper_functions.py b/tests/helper_functions.py
index b9006376..83bce07d 100644
--- a/tests/helper_functions.py
+++ b/tests/helper_functions.py
@@ -1,7 +1,8 @@
 import pandas as pd
 
+
 def load_and_format(filename):
     """Load csv as pandas dataframe and cast period column to datetime type"""
     df_loaded = pd.read_csv(filename)
-    df_loaded['period'] = pd.to_datetime(df_loaded['period'], format='%Y%m')
+    df_loaded["period"] = pd.to_datetime(df_loaded["period"], format="%Y%m")
     return df_loaded
diff --git a/tests/test_construction_matches.py b/tests/test_construction_matches.py
index 1ad2bce9..104521c2 100644
--- a/tests/test_construction_matches.py
+++ b/tests/test_construction_matches.py
@@ -1,37 +1,45 @@
-import pytest
-
 from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
 from pandas.testing import assert_frame_equal
 
 from src.construction_matches import flag_construction_matches
 from src.flag_and_count_matched_pairs import count_matches
-from helper_functions import load_and_format
+
 
 @pytest.fixture(scope="class")
 def construction_test_data():
-    return load_and_format(Path("tests")/"construction_matches.csv")
+    return load_and_format(Path("tests") / "construction_matches.csv")
+
 
 class TestConstructionMatches:
     def test_construction_matches_flag(self, construction_test_data):
-        expected_output = construction_test_data[[
-            "target",
-            "period",
-            "auxiliary",
-            "flag_construction_matches",
-        ]]
+        expected_output = construction_test_data[
+            [
+                "target",
+                "period",
+                "auxiliary",
+                "flag_construction_matches",
+            ]
+        ]
 
         input_data = expected_output.drop(columns=["flag_construction_matches"])
-        actual_output = flag_construction_matches(input_data, "target", "period", "auxiliary")
+        actual_output = flag_construction_matches(
+            input_data, "target", "period", "auxiliary"
+        )
 
         assert_frame_equal(actual_output, expected_output)
 
     def test_construction_matches_count(self, construction_test_data):
-        expected_output = construction_test_data[[
-            "period",
-            "flag_construction_matches",
-            "strata",
-            "count_construction_matches",
-        ]]
+        expected_output = construction_test_data[
+            [
+                "period",
+                "flag_construction_matches",
+                "strata",
+                "count_construction_matches",
+            ]
+        ]
 
         input_data = expected_output.drop(columns=["count_construction_matches"])
         actual_output = count_matches(
@@ -39,7 +47,7 @@ def test_construction_matches_count(self, construction_test_data):
             "flag_construction_matches",
             "period",
             "strata",
-            "count_construction_matches"
+            "count_construction_matches",
         )
 
         assert_frame_equal(actual_output, expected_output)
diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv
index bc126f2e..e05d9fe3 100644
--- a/tests/test_data_matched_pair/case1_expected_output.csv
+++ b/tests/test_data_matched_pair/case1_expected_output.csv
@@ -4,4 +4,4 @@ reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_ma
 1,101,202403,,False,1,False,0
 2,101,202401,270,False,0,True,2
 2,101,202402,250,True,2,True,1
-2,101,202403,255,True,1,False,0
\ No newline at end of file
+2,101,202403,255,True,1,False,0
diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv
index c03c0ff6..dbae472b 100644
--- a/tests/test_data_matched_pair/case2_expected_output.csv
+++ b/tests/test_data_matched_pair/case2_expected_output.csv
@@ -6,4 +6,4 @@ reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_ma
 2,101,202402,250,True,2,True,1
 2,101,202403,255,True,1,False,0
 2,102,202404,260,False,0,True,1
-2,102,202405,272,True,1,False,0
\ No newline at end of file
+2,102,202405,272,True,1,False,0
diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv
index 7e40574d..12ad8810 100644
--- a/tests/test_data_matched_pair/case3_expected_output.csv
+++ b/tests/test_data_matched_pair/case3_expected_output.csv
@@ -4,4 +4,4 @@ reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_ma
 1,101,202403,,False,0,False,0
 2,101,202401,270,False,0,True,2
 2,101,202402,250,True,2,False,0
-2,101,202404,255,False,0,False,0
\ No newline at end of file
+2,101,202404,255,False,0,False,0
diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py
index 927ee26a..6d765521 100644
--- a/tests/test_flag_and_count_matched_pairs.py
+++ b/tests/test_flag_and_count_matched_pairs.py
@@ -1,69 +1,113 @@
-import pandas as pd
-import pytest
-
-from pandas.testing import assert_frame_equal
 from pathlib import Path
 
-from src.flag_and_count_matched_pairs import flag_matched_pair_merge, count_matches, flag_matched_pair_shift
+import pytest
 from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.flag_and_count_matched_pairs import (
+    count_matches,
+    flag_matched_pair_merge,
+    flag_matched_pair_shift,
+)
 
 # Case 1 - two businesses, one missing value
-# Case 2 - change in strata (sic) 
-# Case 3 - Missing period for one business 
+# Case 2 - change in strata (sic)
+# Case 3 - Missing period for one business
 
-filepath = Path('tests')/'test_data_matched_pair'
+filepath = Path("tests") / "test_data_matched_pair"
 
 file_name_cases = [
-    (filepath/'case1_expected_output.csv'),
-    (filepath/'case2_expected_output.csv'),
-    (filepath/'case3_expected_output.csv'),
-    ]
+    (filepath / "case1_expected_output.csv"),
+    (filepath / "case2_expected_output.csv"),
+    (filepath / "case3_expected_output.csv"),
+]
+
+pytestmark = pytest.mark.parametrize("expected_output_file", file_name_cases)
 
-pytestmark = pytest.mark.parametrize("expected_output_file",file_name_cases)
 
 class TestMatchedPair:
     def test_flag_matched_pair_merge_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(['f_matched_pair_count','b_matched_pair','b_matched_pair_count'],axis = 1,inplace=True)
-        df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']]
-        df_output = flag_matched_pair_merge(df_input, 'f', 'target_variable', 'period', 'reference', 'strata')
-        df_output.drop(['predictive_target_variable'],axis = 1, inplace=True)
+        df_expected_output.drop(
+            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
+            axis=1,
+            inplace=True,
+        )
+        df_input = df_expected_output[
+            ["reference", "strata", "period", "target_variable"]
+        ]
+        df_output = flag_matched_pair_merge(
+            df_input, "f", "target_variable", "period", "reference", "strata"
+        )
+        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_merge_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(['f_matched_pair_count', 'f_matched_pair', 'b_matched_pair_count'], axis = 1, inplace=True)
-        df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']]
-        df_output = flag_matched_pair_merge(df_input, 'b', 'target_variable', 'period', 'reference', 'strata')
-        df_output.drop(['predictive_target_variable'],axis = 1, inplace=True)
+        df_expected_output.drop(
+            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
+            axis=1,
+            inplace=True,
+        )
+        df_input = df_expected_output[
+            ["reference", "strata", "period", "target_variable"]
+        ]
+        df_output = flag_matched_pair_merge(
+            df_input, "b", "target_variable", "period", "reference", "strata"
+        )
+        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)
 
     def test_count_matched_pair_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(['b_matched_pair','b_matched_pair_count'],axis = 1,inplace=True)
-        df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable', 'f_matched_pair']]
-        df_output = count_matches(df_input,'f_matched_pair','period','strata')
+        df_expected_output.drop(
+            ["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
+        )
+        df_input = df_expected_output[
+            ["reference", "strata", "period", "target_variable", "f_matched_pair"]
+        ]
+        df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
         assert_frame_equal(df_output, df_expected_output)
 
     def test_count_matches_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(['f_matched_pair','f_matched_pair_count'],axis = 1,inplace=True)
-        df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable', 'b_matched_pair']]
-        df_output = count_matches(df_input,'b_matched_pair','period','strata')
+        df_expected_output.drop(
+            ["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
+        )
+        df_input = df_expected_output[
+            ["reference", "strata", "period", "target_variable", "b_matched_pair"]
+        ]
+        df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_shift_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(['f_matched_pair_count','b_matched_pair','b_matched_pair_count'],axis = 1,inplace=True)
-        df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']]
-        df_output = flag_matched_pair_shift(df_input,'f','target_variable','period', 'reference', 'strata')
-        df_output.drop(['predictive_target_variable'],axis=1,inplace=True)
+        df_expected_output.drop(
+            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
+            axis=1,
+            inplace=True,
+        )
+        df_input = df_expected_output[
+            ["reference", "strata", "period", "target_variable"]
+        ]
+        df_output = flag_matched_pair_shift(
+            df_input, "f", "target_variable", "period", "reference", "strata"
+        )
+        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_shift_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(['f_matched_pair_count','f_matched_pair','b_matched_pair_count'],axis = 1,inplace=True)
-        df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']]
-        df_output = flag_matched_pair_shift(df_input,'b','target_variable','period', 'reference', 'strata')
-        df_output.drop(['predictive_target_variable'],axis=1,inplace=True)
-        assert_frame_equal(df_output, df_expected_output)
\ No newline at end of file
+        df_expected_output.drop(
+            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
+            axis=1,
+            inplace=True,
+        )
+        df_input = df_expected_output[
+            ["reference", "strata", "period", "target_variable"]
+        ]
+        df_output = flag_matched_pair_shift(
+            df_input, "b", "target_variable", "period", "reference", "strata"
+        )
+        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
+        assert_frame_equal(df_output, df_expected_output)

From d2d63bf3a441bc16cae43b68299f9635ad974679 Mon Sep 17 00:00:00 2001
From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com>
Date: Mon, 13 May 2024 14:50:53 +0100
Subject: [PATCH 02/20] Adding flag (f/b) to predictive column name (#12)

---
 src/flag_and_count_matched_pairs.py        | 11 ++++++-----
 tests/test_flag_and_count_matched_pairs.py |  8 ++++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
index e3d55c2a..d2b6f8a6 100644
--- a/src/flag_and_count_matched_pairs.py
+++ b/src/flag_and_count_matched_pairs.py
@@ -43,8 +43,9 @@ def flag_matched_pair_merge(
     df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
         months=time_difference
     )
+    predictive_col_name = forward_or_backward + "_predictive_" + target
     df_with_predictive_column.rename(
-        columns={target: "predictive_" + target}, inplace=True
+        columns={target: predictive_col_name}, inplace=True
     )
 
     df = df.merge(
@@ -57,7 +58,7 @@ def flag_matched_pair_merge(
     matched_col_name = forward_or_backward + "_matched_pair"
 
     df[matched_col_name] = np.where(
-        df[[target, "predictive_" + target]].isnull().any(axis=1), False, True
+        df[[target, predictive_col_name]].isnull().any(axis=1), False, True
     )
 
     df.drop(["predictive_period"], axis=1, inplace=True)
@@ -98,7 +99,8 @@ def flag_matched_pair_shift(
         shift = -shift
 
     df = df.sort_values(by=[reference, period])
-    df[["predictive_" + target, "predictive_period"]] = df.groupby(
+    predictive_col_name = forward_or_backward + "_predictive_" + target
+    df[[predictive_col_name, "predictive_period"]] = df.groupby(
         [reference, strata]
     ).shift(shift)[[target, period]]
 
@@ -108,8 +110,7 @@ def flag_matched_pair_shift(
     matched_col_name = forward_or_backward + "_matched_pair"
 
     df[matched_col_name] = np.where(
-        df[[target, "predictive_target_variable"]].isnull().any(axis=1)
-        | (~df["validate_date"]),
+        df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
         False,
         True,
     )
diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py
index 6d765521..cf4b0525 100644
--- a/tests/test_flag_and_count_matched_pairs.py
+++ b/tests/test_flag_and_count_matched_pairs.py
@@ -39,7 +39,7 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):
         df_output = flag_matched_pair_merge(
             df_input, "f", "target_variable", "period", "reference", "strata"
         )
-        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
+        df_output.drop(["f_predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_merge_backward(self, expected_output_file):
@@ -55,7 +55,7 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):
         df_output = flag_matched_pair_merge(
             df_input, "b", "target_variable", "period", "reference", "strata"
         )
-        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
+        df_output.drop(["b_predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)
 
     def test_count_matched_pair_forward(self, expected_output_file):
@@ -93,7 +93,7 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):
         df_output = flag_matched_pair_shift(
             df_input, "f", "target_variable", "period", "reference", "strata"
         )
-        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
+        df_output.drop(["f_predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_shift_backward(self, expected_output_file):
@@ -109,5 +109,5 @@ def test_flag_matched_pair_shift_backward(self, expected_output_file):
         df_output = flag_matched_pair_shift(
             df_input, "b", "target_variable", "period", "reference", "strata"
         )
-        df_output.drop(["predictive_target_variable"], axis=1, inplace=True)
+        df_output.drop(["b_predictive_target_variable"], axis=1, inplace=True)
         assert_frame_equal(df_output, df_expected_output)

From f7ba3968b7037622bc267bdcd89620eec99f3144 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Mon, 13 May 2024 17:21:07 +0100
Subject: [PATCH 03/20] Add function for forward, backward link

---
 src/forward_link.py | 105 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 src/forward_link.py

diff --git a/src/forward_link.py b/src/forward_link.py
new file mode 100644
index 00000000..7b648963
--- /dev/null
+++ b/src/forward_link.py
@@ -0,0 +1,105 @@
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+
+def zerofy_values(
+    df: pd.DataFrame, target_variable: List[str] or str, expr: str
+) -> pd.DataFrame:
+    """Convert values in a dataframe column to 0 based on a python expression
+
+    Parameters
+    ----------
+    df : pd.Dataframe
+        Pandas dataframe of original data.
+    target_variable : List[str] or str
+        Column name(s) containing target variable(s).
+    expr : str
+        The expression to evaluate, see here:
+        https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html
+
+    Returns
+    -------
+    df : pd.Dataframe
+
+
+    """
+
+    try:
+        df.loc[~(df.eval(expr)), target_variable] = 0
+
+    except ValueError:
+        print(
+            f"""{expr} is not a valid expression,
+        the code uses ~(df.eval({expr}) to mask the dataframe, please see here:
+        https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html
+        """
+        )
+
+
+def get_link(
+    df: pd.DataFrame,
+    groups: List[str] or str,
+    match_col: str,
+    target_variable: str,
+    predictive_variable: str,
+    filter_cond: str = None,
+) -> pd.DataFrame:
+    """
+    Calculate link between target_variable and predictive_variable by given groups,
+    a match_col must be supplied which indicates if target_variable and
+    predictive_variable can be linked. If an optional filter_cond is given
+    it excludes them when calculating the links.
+
+    Parameters
+    ----------
+    df : pd.Dataframe
+        Original dataframe.
+    groups : List[str] or str
+        Column name(s) to group by when calculating the sums.
+    match_col : str
+        Column of the matched pair links, this column should be bool,
+        or 0 and 1.
+    target_variable : str
+        Column name of the targeted variable.
+    predictive_variable : str
+        Column name of the predicted target variable.
+    filter_cond : str, optional
+        Expression to exclude specific values from the links.
+        The default is None.
+
+    Returns
+    -------
+    link : pd.Series
+        A pandas series with the links.
+    """
+
+    df_intermediate = df.copy()
+
+    # If condition supplied exclude filtered values from links
+    if filter_cond is not None:
+
+        df_intermediate.zerofy_values(
+            [target_variable, predictive_variable], filter_cond
+        )
+
+    df_intermediate[target_variable] = (
+        df_intermediate[target_variable] * df_intermediate[match_col]
+    )
+
+    df_intermediate[predictive_variable] = (
+        df_intermediate[predictive_variable] * df_intermediate[match_col]
+    )
+
+    numerator = df_intermediate.groupby(groups)[target_variable].transform("sum")
+
+    denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum")
+
+    denominator.replace(0, np.nan, inplace=True)  # cover division with 0
+
+    link = numerator / denominator
+
+    link.replace(np.nan, 1, inplace=True)  # set defaults
+
+    return link

From 8f5c987ea0cb014fd45d36159661cc1db627d052 Mon Sep 17 00:00:00 2001
From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com>
Date: Tue, 14 May 2024 11:45:57 +0100
Subject: [PATCH 04/20] Add pre-commit hooks as test when merging (#11)

---
 .github/workflows/main.yaml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 080d4cd3..dc5b5228 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -47,3 +47,23 @@ jobs:
       - name: Run pytest
         run: |
           pytest -v
+
+  commit-hooks:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/setup-python@v3
+        with:
+          python-version: 3.6.8
+          cache: 'pip'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pre-commit install
+
+      - name: Check commit hooks
+        run: |
+          pre-commit run --all-files

From dd2b3024112a1850bd2ebf4b35dd303e2fbede4b Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Wed, 15 May 2024 13:53:07 +0100
Subject: [PATCH 05/20] Add unit tests for link filters

---
 tests/test_forward_link.py | 144 +++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 tests/test_forward_link.py

diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
new file mode 100644
index 00000000..a7b4009e
--- /dev/null
+++ b/tests/test_forward_link.py
@@ -0,0 +1,144 @@
+import numpy as np
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+from src.forward_link import zerofy_values
+
+
+class TestFilters:
+    # based on 02_C_FI_input
+    df = pd.DataFrame(
+        data={
+            "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
+            "date": [202001, 202002, 202001, 202002, 202001, 202002, 202001, 202002],
+            "group": [100, 100, 100, 100, 100, 100, 100, 100],
+            "question": [2536.0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
+            "other": [35, 35, 72, 72, 77, 77, 30, 30],
+        }
+    )
+
+    def test_basic_filter(self):
+        """Test a basic filter, filters questions with identifier different to 20001"""
+
+        expected = pd.DataFrame(
+            data={
+                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
+                "date": [
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                ],
+                "group": [100, 100, 100, 100, 100, 100, 100, 100],
+                "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
+                "other": [35, 35, 72, 72, 77, 77, 30, 30],
+            }
+        )
+
+        link_filter = "identifier != '20001'"
+
+        df_copy = self.df.copy()
+
+        zerofy_values(df_copy, "question", link_filter)
+
+        assert_frame_equal(df_copy, expected)
+
+    def test_basic_multiple_columns(self):
+        """Test a basic filter in more than 1 column"""
+
+        expected = pd.DataFrame(
+            data={
+                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
+                "date": [
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                ],
+                "group": [100, 100, 100, 100, 100, 100, 100, 100],
+                "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
+                "other": [0, 0, 72, 72, 77, 77, 30, 30],
+            }
+        )
+
+        link_filter = "identifier != '20001'"
+
+        df_copy = self.df.copy()
+
+        zerofy_values(df_copy, ["question", "other"], link_filter)
+
+        assert_frame_equal(df_copy, expected)
+
+    def test_basic_multiple_values(self):
+        """
+        Test a filter in multiple values, filters questions which aren't
+        in ('20001', '20002')
+        """
+
+        expected = pd.DataFrame(
+            data={
+                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
+                "date": [
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                ],
+                "group": [100, 100, 100, 100, 100, 100, 100, 100],
+                "question": [0, 0, 0, 0, 5644.0, 989.0, np.nan, np.nan],
+                "other": [35, 35, 72, 72, 77, 77, 30, 30],
+            }
+        )
+
+        link_filter = "identifier not in ('20001', '20002')"
+
+        df_copy = self.df.copy()
+
+        zerofy_values(df_copy, "question", link_filter)
+
+        assert_frame_equal(df_copy, expected)
+
+    def test_multiple_filters(self):
+        """
+        Test multiple conditions, filters questions which aren't in date 202001
+        and identifier in 20001 at the same time
+        """
+
+        expected = pd.DataFrame(
+            data={
+                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
+                "date": [
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                    202001,
+                    202002,
+                ],
+                "group": [100, 100, 100, 100, 100, 100, 100, 100],
+                "question": [0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
+                "other": [35, 35, 72, 72, 77, 77, 30, 30],
+            }
+        )
+
+        link_filter = "not(date == '202001' and identifier in ('20001'))"
+
+        df_copy = self.df.copy()
+
+        zerofy_values(df_copy, "question", link_filter)
+
+        assert_frame_equal(df_copy, expected)

From 3562d2f5886fb63f70ca7e0c96ae808d9f17ccc9 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Wed, 15 May 2024 13:55:55 +0100
Subject: [PATCH 06/20] Add unit tests for get_link function

---
 tests/test_forward_link.py | 198 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 196 insertions(+), 2 deletions(-)

diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index a7b4009e..bcf240da 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -1,8 +1,8 @@
 import numpy as np
 import pandas as pd
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 
-from src.forward_link import zerofy_values
+from src.forward_link import get_link, zerofy_values
 
 
 class TestFilters:
@@ -142,3 +142,197 @@ def test_multiple_filters(self):
         zerofy_values(df_copy, "question", link_filter)
 
         assert_frame_equal(df_copy, expected)
+
+
+class TestLink:
+
+    # from scenario 33_multi_variable_C_BI_R
+    # We could parametrise this with more scenarios if needed
+    df = pd.DataFrame(
+        data={
+            "identifier": [
+                10001,
+                10001,
+                10001,
+                10002,
+                10002,
+                10002,
+                10001,
+                10001,
+                10001,
+                10002,
+                10002,
+                10002,
+                10005,
+                10005,
+                10005,
+            ],
+            "date": [
+                202001,
+                202002,
+                202003,
+                202001,
+                202002,
+                202003,
+                202001,
+                202002,
+                202003,
+                202001,
+                202002,
+                202003,
+                202001,
+                202002,
+                202003,
+            ],
+            "group": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2],
+            "question": [
+                547.0,
+                362.0,
+                895.0,
+                381.0,
+                573.0,
+                214.0,
+                961.0,
+                267.0,
+                314.0,
+                555.0,
+                628.0,
+                736.0,
+                np.nan,
+                np.nan,
+                100.0,
+            ],
+            "f_predictive_question": [
+                np.nan,
+                547.0,
+                362.0,
+                np.nan,
+                381.0,
+                573.0,
+                np.nan,
+                961.0,
+                267.0,
+                np.nan,
+                555.0,
+                628.0,
+                np.nan,
+                np.nan,
+                np.nan,
+            ],
+            "b_predictive_question": [
+                362.0,
+                895.0,
+                np.nan,
+                573.0,
+                214.0,
+                np.nan,
+                267.0,
+                314.0,
+                np.nan,
+                628.0,
+                736.0,
+                np.nan,
+                np.nan,
+                100.0,
+                np.nan,
+            ],
+            "f_matched_pair": [
+                False,
+                True,
+                True,
+                False,
+                True,
+                True,
+                False,
+                True,
+                True,
+                False,
+                True,
+                True,
+                False,
+                False,
+                False,
+            ],
+            "b_matched_pair": [
+                True,
+                True,
+                False,
+                True,
+                True,
+                False,
+                True,
+                True,
+                False,
+                True,
+                True,
+                False,
+                False,
+                False,
+                False,
+            ],
+        }
+    )
+
+    def test_forward_link(self):
+
+        expected_f_link = pd.Series(
+            [
+                1.0,
+                1.0075431034482758,
+                1.186096256684492,
+                1.0,
+                1.0075431034482758,
+                1.186096256684492,
+                1.0,
+                0.5903693931398417,
+                1.1731843575418994,
+                1.0,
+                0.5903693931398417,
+                1.1731843575418994,
+                1.0,
+                0.5903693931398417,
+                1.1731843575418994,
+            ]
+        )
+
+        f_link = get_link(
+            self.df,
+            ["group", "date"],
+            "f_matched_pair",
+            "question",
+            "f_predictive_question",
+        )
+
+        assert_series_equal(f_link, expected_f_link)
+
+    def test_backward_link(self):
+
+        expected_b_link = pd.Series(
+            [
+                0.9925133689839573,
+                0.8431018935978359,
+                1.0,
+                0.9925133689839573,
+                0.8431018935978359,
+                1.0,
+                1.693854748603352,
+                0.8523809523809524,
+                1.0,
+                1.693854748603352,
+                0.8523809523809524,
+                1.0,
+                0.9925133689839573,
+                0.8523809523809524,
+                1.0,
+            ]
+        )
+
+        b_link = get_link(
+            self.df,
+            ["group", "date"],
+            "b_matched_pair",
+            "question",
+            "b_predictive_question",
+        )
+
+        assert_series_equal(b_link, expected_b_link)

From d31535fa597d8e61fb4f6670cf7ee9afb3c61e24 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Thu, 16 May 2024 11:35:41 +0100
Subject: [PATCH 07/20] Rename zerofy_values function to mask_values

---
 src/forward_link.py        | 10 ++++++----
 tests/test_forward_link.py | 10 +++++-----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/forward_link.py b/src/forward_link.py
index 7b648963..3926abdb 100644
--- a/src/forward_link.py
+++ b/src/forward_link.py
@@ -2,9 +2,10 @@
 
 import numpy as np
 import pandas as pd
+from pandas.core.base import PandasObject
 
 
-def zerofy_values(
+def mask_values(
     df: pd.DataFrame, target_variable: List[str] or str, expr: str
 ) -> pd.DataFrame:
     """Convert values in a dataframe column to 0 based on a python expression
@@ -38,6 +39,9 @@ def zerofy_values(
         )
 
 
+PandasObject.mask_values = mask_values
+
+
 def get_link(
     df: pd.DataFrame,
     groups: List[str] or str,
@@ -80,9 +84,7 @@ def get_link(
     # If condition supplied exclude filtered values from links
     if filter_cond is not None:
 
-        df_intermediate.zerofy_values(
-            [target_variable, predictive_variable], filter_cond
-        )
+        df_intermediate.mask_values([target_variable, predictive_variable], filter_cond)
 
     df_intermediate[target_variable] = (
         df_intermediate[target_variable] * df_intermediate[match_col]
diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index bcf240da..80f35ef9 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal
 
-from src.forward_link import get_link, zerofy_values
+from src.forward_link import get_link, mask_values
 
 
 class TestFilters:
@@ -43,7 +43,7 @@ def test_basic_filter(self):
 
         df_copy = self.df.copy()
 
-        zerofy_values(df_copy, "question", link_filter)
+        mask_values(df_copy, "question", link_filter)
 
         assert_frame_equal(df_copy, expected)
 
@@ -73,7 +73,7 @@ def test_basic_multiple_columns(self):
 
         df_copy = self.df.copy()
 
-        zerofy_values(df_copy, ["question", "other"], link_filter)
+        mask_values(df_copy, ["question", "other"], link_filter)
 
         assert_frame_equal(df_copy, expected)
 
@@ -106,7 +106,7 @@ def test_basic_multiple_values(self):
 
         df_copy = self.df.copy()
 
-        zerofy_values(df_copy, "question", link_filter)
+        mask_values(df_copy, "question", link_filter)
 
         assert_frame_equal(df_copy, expected)
 
@@ -139,7 +139,7 @@ def test_multiple_filters(self):
 
         df_copy = self.df.copy()
 
-        zerofy_values(df_copy, "question", link_filter)
+        mask_values(df_copy, "question", link_filter)
 
         assert_frame_equal(df_copy, expected)
 

From 3509145dddc391e1ce512f214572aa6a8b60e335 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Thu, 16 May 2024 11:43:47 +0100
Subject: [PATCH 08/20] Rename get_link function to calculate_imputation_link

---
 src/forward_link.py        | 2 +-
 tests/test_forward_link.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/forward_link.py b/src/forward_link.py
index 3926abdb..1cd6aca3 100644
--- a/src/forward_link.py
+++ b/src/forward_link.py
@@ -42,7 +42,7 @@ def mask_values(
 PandasObject.mask_values = mask_values
 
 
-def get_link(
+def calculate_imputation_link(
     df: pd.DataFrame,
     groups: List[str] or str,
     match_col: str,
diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index 80f35ef9..583b5485 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal
 
-from src.forward_link import get_link, mask_values
+from src.forward_link import calculate_imputation_link, mask_values
 
 
 class TestFilters:
@@ -295,7 +295,7 @@ def test_forward_link(self):
             ]
         )
 
-        f_link = get_link(
+        f_link = calculate_imputation_link(
             self.df,
             ["group", "date"],
             "f_matched_pair",
@@ -327,7 +327,7 @@ def test_backward_link(self):
             ]
         )
 
-        b_link = get_link(
+        b_link = calculate_imputation_link(
             self.df,
             ["group", "date"],
             "b_matched_pair",

From a3067c1e5faa69548dc859d98dd67d1cc0269010 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Fri, 17 May 2024 10:40:54 +0100
Subject: [PATCH 09/20] Update mask_values to return a series

---
 src/forward_link.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/forward_link.py b/src/forward_link.py
index 1cd6aca3..30e99fb4 100644
--- a/src/forward_link.py
+++ b/src/forward_link.py
@@ -2,12 +2,9 @@
 
 import numpy as np
 import pandas as pd
-from pandas.core.base import PandasObject
 
 
-def mask_values(
-    df: pd.DataFrame, target_variable: List[str] or str, expr: str
-) -> pd.DataFrame:
+def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series:
     """Convert values in a dataframe column to 0 based on a python expression
 
     Parameters
@@ -22,13 +19,14 @@ def mask_values(
 
     Returns
     -------
-    df : pd.Dataframe
+    df : pd.Series
 
 
     """
+    masked_column = df[target_variable].copy()
 
     try:
-        df.loc[~(df.eval(expr)), target_variable] = 0
+        masked_column.loc[~(df.eval(expr))] = np.nan
 
     except ValueError:
         print(
@@ -38,8 +36,7 @@ def mask_values(
         """
         )
 
-
-PandasObject.mask_values = mask_values
+    return masked_column
 
 
 def calculate_imputation_link(

From 4ddd9319b08fafa10ac1cc180f7bab6f996a71ec Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Fri, 17 May 2024 10:44:30 +0100
Subject: [PATCH 10/20] Remove mask_values from calculate_imputation_link function

---
 src/forward_link.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/forward_link.py b/src/forward_link.py
index 30e99fb4..14870292 100644
--- a/src/forward_link.py
+++ b/src/forward_link.py
@@ -45,13 +45,11 @@ def calculate_imputation_link(
     match_col: str,
     target_variable: str,
     predictive_variable: str,
-    filter_cond: str = None,
-) -> pd.DataFrame:
+) -> pd.Series:
     """
     Calculate link between target_variable and predictive_variable by given groups,
     a match_col must be supplied which indicates if target_variable and
-    predictive_variable can be linked. If an optional filter_cond is given
-    it excludes them when calculating the links.
+    predictive_variable can be linked.
 
     Parameters
     ----------
@@ -66,9 +64,6 @@ def calculate_imputation_link(
         Column name of the targeted variable.
     predictive_variable : str
         Column name of the predicted target variable.
-    filter_cond : str, optional
-        Expression to exclude specific values from the links.
-        The default is None.
 
     Returns
     -------
@@ -78,11 +73,6 @@ def calculate_imputation_link(
 
     df_intermediate = df.copy()
 
-    # If condition supplied exclude filtered values from links
-    if filter_cond is not None:
-
-        df_intermediate.mask_values([target_variable, predictive_variable], filter_cond)
-
     df_intermediate[target_variable] = (
         df_intermediate[target_variable] * df_intermediate[match_col]
     )
@@ -99,6 +89,4 @@ def calculate_imputation_link(
 
     link = numerator / denominator
 
-    link.replace(np.nan, 1, inplace=True)  # set defaults
-
     return link

From 22fa19e8fd566a5f23c6e50c4d922ea973a71978 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Fri, 17 May 2024 13:52:16 +0100
Subject: [PATCH 11/20] Add test data for calculate_links

---
 tests/calculate_links_test_data.csv | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100755 tests/calculate_links_test_data.csv

diff --git a/tests/calculate_links_test_data.csv b/tests/calculate_links_test_data.csv
new file mode 100755
index 00000000..72e6408d
--- /dev/null
+++ b/tests/calculate_links_test_data.csv
@@ -0,0 +1,16 @@
+,identifier,period,group,question,f_predictive_question,b_predictive_question,f_matched_pair,b_matched_pair,f_link,b_link
+0,10001,202001,1,547.0,,362.0,False,True,,0.9925133689839573
+1,10001,202002,1,362.0,547.0,895.0,True,True,1.0075431034482758,0.8431018935978359
+2,10001,202003,1,895.0,362.0,,True,False,1.186096256684492,
+3,10002,202001,1,381.0,,573.0,False,True,,0.9925133689839573
+4,10002,202002,1,573.0,381.0,214.0,True,True,1.0075431034482758,0.8431018935978359
+5,10002,202003,1,214.0,573.0,,True,False,1.186096256684492,
+6,10001,202001,2,961.0,,267.0,False,True,,1.693854748603352
+7,10001,202002,2,267.0,961.0,314.0,True,True,0.5903693931398417,0.8523809523809524
+8,10001,202003,2,314.0,267.0,,True,False,1.1731843575418994,
+9,10002,202001,2,555.0,,628.0,False,True,,1.693854748603352
+10,10002,202002,2,628.0,555.0,736.0,True,True,0.5903693931398417,0.8523809523809524
+11,10002,202003,2,736.0,628.0,,True,False,1.1731843575418994,
+12,10005,202001,1,,,,False,False,,0.9925133689839573
+13,10005,202002,2,,,100.0,False,False,0.5903693931398417,0.8523809523809524
+14,10005,202003,2,100.0,,,False,False,1.1731843575418994,

From b2b91e39aac95e3f4b9df52abb3748e1e0a57555 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Fri, 17 May 2024 13:53:19 +0100
Subject: [PATCH 12/20] Adapt tests for calculate_links with test data

---
 tests/test_forward_link.py | 204 +++++--------------------------------
 1 file changed, 26 insertions(+), 178 deletions(-)

diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index 583b5485..d1c0f6cf 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -1,5 +1,7 @@
 import numpy as np
 import pandas as pd
+import pytest
+from helper_functions import load_and_format
 from pandas.testing import assert_frame_equal, assert_series_equal
 
 from src.forward_link import calculate_imputation_link, mask_values
@@ -144,195 +146,41 @@ def test_multiple_filters(self):
         assert_frame_equal(df_copy, expected)
 
 
-class TestLink:
+scenarios = ["calculate_links_test_data"]
 
-    # from scenario 33_multi_variable_C_BI_R
-    # We could parametrise this with more scenarios if needed
-    df = pd.DataFrame(
-        data={
-            "identifier": [
-                10001,
-                10001,
-                10001,
-                10002,
-                10002,
-                10002,
-                10001,
-                10001,
-                10001,
-                10002,
-                10002,
-                10002,
-                10005,
-                10005,
-                10005,
-            ],
-            "date": [
-                202001,
-                202002,
-                202003,
-                202001,
-                202002,
-                202003,
-                202001,
-                202002,
-                202003,
-                202001,
-                202002,
-                202003,
-                202001,
-                202002,
-                202003,
-            ],
-            "group": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2],
-            "question": [
-                547.0,
-                362.0,
-                895.0,
-                381.0,
-                573.0,
-                214.0,
-                961.0,
-                267.0,
-                314.0,
-                555.0,
-                628.0,
-                736.0,
-                np.nan,
-                np.nan,
-                100.0,
-            ],
-            "f_predictive_question": [
-                np.nan,
-                547.0,
-                362.0,
-                np.nan,
-                381.0,
-                573.0,
-                np.nan,
-                961.0,
-                267.0,
-                np.nan,
-                555.0,
-                628.0,
-                np.nan,
-                np.nan,
-                np.nan,
-            ],
-            "b_predictive_question": [
-                362.0,
-                895.0,
-                np.nan,
-                573.0,
-                214.0,
-                np.nan,
-                267.0,
-                314.0,
-                np.nan,
-                628.0,
-                736.0,
-                np.nan,
-                np.nan,
-                100.0,
-                np.nan,
-            ],
-            "f_matched_pair": [
-                False,
-                True,
-                True,
-                False,
-                True,
-                True,
-                False,
-                True,
-                True,
-                False,
-                True,
-                True,
-                False,
-                False,
-                False,
-            ],
-            "b_matched_pair": [
-                True,
-                True,
-                False,
-                True,
-                True,
-                False,
-                True,
-                True,
-                False,
-                True,
-                True,
-                False,
-                False,
-                False,
-                False,
-            ],
-        }
-    )
 
-    def test_forward_link(self):
-
-        expected_f_link = pd.Series(
-            [
-                1.0,
-                1.0075431034482758,
-                1.186096256684492,
-                1.0,
-                1.0075431034482758,
-                1.186096256684492,
-                1.0,
-                0.5903693931398417,
-                1.1731843575418994,
-                1.0,
-                0.5903693931398417,
-                1.1731843575418994,
-                1.0,
-                0.5903693931398417,
-                1.1731843575418994,
-            ]
-        )
+@pytest.mark.parametrize("scenario", scenarios)
+class TestLinks:
+    def test_forward_links(self, scenario):
+        """Test if function returns the f_link column"""
+
+        df_input = load_and_format("tests/" + scenario + ".csv")
 
-        f_link = calculate_imputation_link(
-            self.df,
-            ["group", "date"],
+        expected_link = df_input["f_link"]
+
+        link_to_test = calculate_imputation_link(
+            df_input,
+            ["group", "period"],
             "f_matched_pair",
             "question",
             "f_predictive_question",
         )
 
-        assert_series_equal(f_link, expected_f_link)
-
-    def test_backward_link(self):
-
-        expected_b_link = pd.Series(
-            [
-                0.9925133689839573,
-                0.8431018935978359,
-                1.0,
-                0.9925133689839573,
-                0.8431018935978359,
-                1.0,
-                1.693854748603352,
-                0.8523809523809524,
-                1.0,
-                1.693854748603352,
-                0.8523809523809524,
-                1.0,
-                0.9925133689839573,
-                0.8523809523809524,
-                1.0,
-            ]
-        )
+        assert_series_equal(link_to_test, expected_link, check_names=False)
+
+    def test_back_links(self, scenario):
+        """Test if function returns the b_link column"""
+
+        df_input = load_and_format("tests/" + scenario + ".csv")
+
+        expected_link = df_input["b_link"]
 
-        b_link = calculate_imputation_link(
-            self.df,
-            ["group", "date"],
+        link_to_test = calculate_imputation_link(
+            df_input,
+            ["group", "period"],
             "b_matched_pair",
             "question",
             "b_predictive_question",
         )
 
-        assert_series_equal(b_link, expected_b_link)
+        assert_series_equal(link_to_test, expected_link, check_names=False)

From 4bc39c4ff6a52b70d058e0792a1a03f837cd1750 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Fri, 17 May 2024 17:09:31 +0100
Subject: [PATCH 13/20] Remove mask_values unit tests

---
 tests/test_forward_link.py | 146 +------------------------------------
 1 file changed, 2 insertions(+), 144 deletions(-)

diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index d1c0f6cf..74e32005 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -1,150 +1,8 @@
-import numpy as np
-import pandas as pd
 import pytest
 from helper_functions import load_and_format
-from pandas.testing import assert_frame_equal, assert_series_equal
-
-from src.forward_link import calculate_imputation_link, mask_values
-
-
-class TestFilters:
-    # based on 02_C_FI_input
-    df = pd.DataFrame(
-        data={
-            "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
-            "date": [202001, 202002, 202001, 202002, 202001, 202002, 202001, 202002],
-            "group": [100, 100, 100, 100, 100, 100, 100, 100],
-            "question": [2536.0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
-            "other": [35, 35, 72, 72, 77, 77, 30, 30],
-        }
-    )
-
-    def test_basic_filter(self):
-        """Test a basic filter, filters questions with identifier different to 20001"""
-
-        expected = pd.DataFrame(
-            data={
-                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
-                "date": [
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                ],
-                "group": [100, 100, 100, 100, 100, 100, 100, 100],
-                "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
-                "other": [35, 35, 72, 72, 77, 77, 30, 30],
-            }
-        )
-
-        link_filter = "identifier != '20001'"
-
-        df_copy = self.df.copy()
-
-        mask_values(df_copy, "question", link_filter)
-
-        assert_frame_equal(df_copy, expected)
-
-    def test_basic_multiple_columns(self):
-        """Test a basic filter in more than 1 column"""
-
-        expected = pd.DataFrame(
-            data={
-                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
-                "date": [
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                ],
-                "group": [100, 100, 100, 100, 100, 100, 100, 100],
-                "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
-                "other": [0, 0, 72, 72, 77, 77, 30, 30],
-            }
-        )
-
-        link_filter = "identifier != '20001'"
-
-        df_copy = self.df.copy()
-
-        mask_values(df_copy, ["question", "other"], link_filter)
-
-        assert_frame_equal(df_copy, expected)
-
-    def test_basic_multiple_values(self):
-        """
-        Test a filter in multiple values, filters questions which aren't
-        in ('20001', '20002')
-        """
-
-        expected = pd.DataFrame(
-            data={
-                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
-                "date": [
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                ],
-                "group": [100, 100, 100, 100, 100, 100, 100, 100],
-                "question": [0, 0, 0, 0, 5644.0, 989.0, np.nan, np.nan],
-                "other": [35, 35, 72, 72, 77, 77, 30, 30],
-            }
-        )
-
-        link_filter = "identifier not in ('20001', '20002')"
-
-        df_copy = self.df.copy()
-
-        mask_values(df_copy, "question", link_filter)
-
-        assert_frame_equal(df_copy, expected)
-
-    def test_multiple_filters(self):
-        """
-        Test multiple conditions, filters questions which aren't in date 202001
-        and identifier in 20001 in the same time
-        """
-
-        expected = pd.DataFrame(
-            data={
-                "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004],
-                "date": [
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                    202001,
-                    202002,
-                ],
-                "group": [100, 100, 100, 100, 100, 100, 100, 100],
-                "question": [0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan],
-                "other": [35, 35, 72, 72, 77, 77, 30, 30],
-            }
-        )
-
-        link_filter = "not(date == '202001' and identifier in ('20001'))"
-
-        df_copy = self.df.copy()
-
-        mask_values(df_copy, "question", link_filter)
-
-        assert_frame_equal(df_copy, expected)
+from pandas.testing import assert_series_equal
 
+from src.forward_link import calculate_imputation_link
 
 scenarios = ["calculate_links_test_data"]
 

From 761c28331d921fec6bcf7a00681b43c5b6e9d0d1 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Tue, 21 May 2024 11:15:14 +0100
Subject: [PATCH 14/20] Define strata and period as separate inputs

---
 src/forward_link.py        | 63 ++++++++++----------------------------
 tests/test_forward_link.py |  6 ++--
 2 files changed, 20 insertions(+), 49 deletions(-)

diff --git a/src/forward_link.py b/src/forward_link.py
index 14870292..f58e5512 100644
--- a/src/forward_link.py
+++ b/src/forward_link.py
@@ -1,65 +1,30 @@
-from typing import List
-
 import numpy as np
 import pandas as pd
 
 
-def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series:
-    """Convert values in a dataframe column to 0 based on a python expression
-
-    Parameters
-    ----------
-    df : pd.Dataframe
-        Pandas dataframe of original data.
-    target_variable : List[str] or str
-        Column name(s) containing target variable(s).
-    query : str
-        The expression to evaluate, see here:
-        https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html
-
-    Returns
-    -------
-    df : pd.Series
-
-
-    """
-    masked_column = df[target_variable].copy()
-
-    try:
-        masked_column.loc[~(df.eval(expr))] = np.nan
-
-    except ValueError:
-        print(
-            f"""{expr} is not a valid expression,
-        the code uses ~(df.eval({expr}) to mask the dataframe, please see here:
-        https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html
-        """
-        )
-
-    return masked_column
-
-
 def calculate_imputation_link(
     df: pd.DataFrame,
-    groups: List[str] or str,
+    period: str,
+    strata: str,
     match_col: str,
     target_variable: str,
     predictive_variable: str,
 ) -> pd.Series:
     """
-    Calculate link between target_variable and predictive_variable by given groups,
-    a match_col must be supplied which indicates if target_variable and
-    predictive_variable can be linked.
+    Calculate link between target_variable and predictive_variable by strata,
+    a match_col must be supplied which indicates if target_variable
+    and predictive_variable can be linked.
 
     Parameters
     ----------
     df : pd.Dataframe
         Original dataframe.
-    groups : List[str] or str
-        Column name(s) to calculate the sums.
+    period : str
+        Column name containing time period.
+    strata : str
+        Column name containing strata information (sic).
     match_col : str
-        Column of the matched pair links, this column should be bool,
-        or 0 and 1.
+        Column name of the matched pair links, this column should be bool.
     target_variable : str
         Column name of the targeted variable.
     predictive_variable : str
@@ -81,9 +46,13 @@ def calculate_imputation_link(
         df_intermediate[predictive_variable] * df_intermediate[match_col]
     )
 
-    numerator = df_intermediate.groupby(groups)[target_variable].transform("sum")
+    numerator = df_intermediate.groupby([strata, period])[target_variable].transform(
+        "sum"
+    )
 
-    denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum")
+    denominator = df_intermediate.groupby([strata, period])[
+        predictive_variable
+    ].transform("sum")
 
     denominator.replace(0, np.nan, inplace=True)  # cover division with 0
 
diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index 74e32005..8012d001 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -18,7 +18,8 @@ def test_forward_links(self, scenario):
 
         link_to_test = calculate_imputation_link(
             df_input,
-            ["group", "period"],
+            "period",
+            "group",
             "f_matched_pair",
             "question",
             "f_predictive_question",
@@ -35,7 +36,8 @@ def test_back_links(self, scenario):
 
         link_to_test = calculate_imputation_link(
             df_input,
-            ["group", "period"],
+            "period",
+            "group",
             "b_matched_pair",
             "question",
             "b_predictive_question",

From 1eb616ce4b5e435ddfa8b21607d98a3939eac1d5 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Tue, 21 May 2024 15:42:04 +0100
Subject: [PATCH 15/20] Change return type to dataframe, add exceptions too

---
 src/forward_link.py        | 25 +++++++++++++++----
 tests/test_forward_link.py | 49 ++++++++++++++++++++++++++++++--------
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/forward_link.py b/src/forward_link.py
index f58e5512..1ac97429 100644
--- a/src/forward_link.py
+++ b/src/forward_link.py
@@ -9,7 +9,7 @@ def calculate_imputation_link(
     match_col: str,
     target_variable: str,
     predictive_variable: str,
-) -> pd.Series:
+) -> pd.DataFrame:
     """
     Calculate link between target_variable and predictive_variable by strata,
     a match_col must be supplied which indicates if target_variable
@@ -32,12 +32,27 @@ def calculate_imputation_link(
 
     Returns
     -------
-    link : pd.Series
-        A pandas series with the links.
+    df : pd.DataFrame
+        A pandas DataFrame with a new column containing either f_link or b_link
+        based on the input parameters.
     """
 
     df_intermediate = df.copy()
 
+    if match_col == "f_matched_pair" and predictive_variable == "f_predictive_question":
+        link_col_name = "f_link"
+
+    elif (
+        match_col == "b_matched_pair" and predictive_variable == "b_predictive_question"
+    ):
+        link_col_name = "b_link"
+
+    else:
+        raise ValueError(
+            f"""
+        {match_col} and {predictive_variable} do not have same wildcard."""
+        )
+
     df_intermediate[target_variable] = (
         df_intermediate[target_variable] * df_intermediate[match_col]
     )
@@ -56,6 +71,6 @@ def calculate_imputation_link(
 
     denominator.replace(0, np.nan, inplace=True)  # cover division with 0
 
-    link = numerator / denominator
+    df[link_col_name] = numerator / denominator
 
-    return link
+    return df
diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
index 8012d001..51fa63c8 100644
--- a/tests/test_forward_link.py
+++ b/tests/test_forward_link.py
@@ -1,6 +1,6 @@
 import pytest
 from helper_functions import load_and_format
-from pandas.testing import assert_series_equal
+from pandas.testing import assert_frame_equal
 
 from src.forward_link import calculate_imputation_link
 
@@ -12,11 +12,11 @@ class TestLinks:
     def test_forward_links(self, scenario):
         """Test if function returns the f_link column"""
 
-        df_input = load_and_format("tests/" + scenario + ".csv")
+        df_output = load_and_format("tests/" + scenario + ".csv")
 
-        expected_link = df_input["f_link"]
+        df_input = df_output.drop(columns=["f_link"])
 
-        link_to_test = calculate_imputation_link(
+        df_input = calculate_imputation_link(
             df_input,
             "period",
             "group",
@@ -25,16 +25,15 @@ def test_forward_links(self, scenario):
             "f_predictive_question",
         )
 
-        assert_series_equal(link_to_test, expected_link, check_names=False)
+        assert_frame_equal(df_input, df_output, check_like=True)
 
     def test_back_links(self, scenario):
         """Test if function returns the b_link column"""
+        df_output = load_and_format("tests/" + scenario + ".csv")
 
-        df_input = load_and_format("tests/" + scenario + ".csv")
+        df_input = df_output.drop(columns=["b_link"])
 
-        expected_link = df_input["b_link"]
-
-        link_to_test = calculate_imputation_link(
+        df_input = calculate_imputation_link(
             df_input,
             "period",
             "group",
@@ -43,4 +42,34 @@ def test_back_links(self, scenario):
             "b_predictive_question",
         )
 
-        assert_series_equal(link_to_test, expected_link, check_names=False)
+        assert_frame_equal(df_input, df_output, check_like=True)
+
+    def test_exception(self, scenario):
+
+        df = load_and_format("tests/" + scenario + ".csv")
+
+        with pytest.raises(ValueError):
+            """
+            Test if function is called with wrong arguments, in particular
+            with f_matched_pair and b_predictive_question or with
+            b_matched_pair and f_predictive_question.
+            """
+
+            df = calculate_imputation_link(
+                df,
+                "period",
+                "group",
+                "f_matched_pair",
+                "question",
+                "b_predictive_question",
+            )
+        with pytest.raises(ValueError):
+
+            df = calculate_imputation_link(
+                df,
+                "period",
+                "group",
+                "b_matched_pair",
+                "question",
+                "f_predictive_question",
+            )

From 0e4b26167d6d674b337be72b88757128ad825bcb Mon Sep 17 00:00:00 2001
From: Wil Roberts <47739563+robertswh@users.noreply.github.com>
Date: Tue, 21 May 2024 16:31:19 +0100
Subject: [PATCH 16/20] 330 consecutive imputation links (#15)

* add function for cumulative imputation links

* added tests for forward and backward cumulative links

* adding pre-commit hooks

* changes after review
---
 src/cumulative_imputation_links.py        | 72 +++++++++++++++++++++++
 tests/cumulative_links.csv                |  7 +++
 tests/test_cumulative_imputation_links.py | 64 ++++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100755 src/cumulative_imputation_links.py
 create mode 100755 tests/cumulative_links.csv
 create mode 100755 tests/test_cumulative_imputation_links.py

diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py
new file mode 100755
index 00000000..91dfbed9
--- /dev/null
+++ b/src/cumulative_imputation_links.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+
+def get_cumulative_links(
+    dataframe,
+    forward_or_backward,
+    strata,
+    reference,
+    target,
+    period,
+    imputation_link,
+    time_difference=1,
+):
+    """
+    Create cumulative imputation links for multiple consecutive periods
+    without a return.
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+    forward_or_backward: str
+        either f or b for forward or backward method
+
+    strata : str
+        column name containing strata information (sic)
+    reference : str
+        column name containing business reference id
+    target : str
+        column name containing target variable
+    period : str
+        column name containing time period
+    imputation_link : string
+        column name containing imputation links
+    time_difference : int
+        time difference between predictive and target period in months
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with imputation_group and
+        cumulative_forward/backward_imputation_link column
+    """
+
+    dataframe.sort_values([strata, reference, period], inplace=True)
+    dataframe["missing_value"] = np.where(dataframe[target].isnull(), True, False)
+
+    dataframe["imputation_group"] = (
+        (
+            (dataframe["missing_value"].diff(time_difference) != 0)
+            | (dataframe[strata].diff(time_difference) != 0)
+            | (dataframe[reference].diff(time_difference) != 0)
+        )
+        .astype("int")
+        .cumsum()
+    )
+
+    if forward_or_backward == "f":
+        dataframe["cumulative_" + imputation_link] = dataframe.groupby(
+            "imputation_group"
+        )[imputation_link].cumprod()
+    elif forward_or_backward == "b":
+        dataframe["cumulative_" + imputation_link] = (
+            dataframe[::-1].groupby("imputation_group")[imputation_link].cumprod()[::-1]
+        )
+
+    dataframe["cumulative_" + imputation_link] = np.where(
+        ~dataframe[target].isnull(),
+        np.nan,
+        dataframe["cumulative_" + imputation_link],
+    )
+
+    return dataframe[["imputation_group", "cumulative_" + imputation_link]]
diff --git a/tests/cumulative_links.csv b/tests/cumulative_links.csv
new file mode 100755
index 00000000..bef347a5
--- /dev/null
+++ b/tests/cumulative_links.csv
@@ -0,0 +1,7 @@
+strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link
+100,100000,200,202402,1,2,1,,
+100,100000,,202403,2,0.6,2,2,0.6
+100,100000,,202404,3,1,2,6,1
+200,100001,,202402,1,4,3,1,2
+200,100001,,202403,3,0.5,3,3,0.5
+200,100001,300,202404,0.5,1,4,,
diff --git a/tests/test_cumulative_imputation_links.py b/tests/test_cumulative_imputation_links.py
new file mode 100755
index 00000000..bf31094a
--- /dev/null
+++ b/tests/test_cumulative_imputation_links.py
@@ -0,0 +1,64 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.cumulative_imputation_links import get_cumulative_links
+
+
+@pytest.fixture(scope="class")
+def cumulative_links_test_data():
+    return load_and_format(Path("tests") / "cumulative_links.csv")
+
+
+class TestComulativeLinks:
+    def test_get_cumulative_links_forward(self, cumulative_links_test_data):
+        input_data = cumulative_links_test_data.drop(
+            columns=["cumulative_forward_imputation_link", "imputation_group"]
+        )
+
+        expected_output = cumulative_links_test_data[
+            [
+                "imputation_group",
+                "cumulative_forward_imputation_link",
+            ]
+        ]
+
+        actual_output = get_cumulative_links(
+            input_data,
+            "f",
+            "strata",
+            "reference",
+            "target",
+            "period",
+            "forward_imputation_link",
+            1,
+        )
+
+        assert_frame_equal(actual_output, expected_output)
+
+    def test_get_cumulative_links_backward(self, cumulative_links_test_data):
+        input_data = cumulative_links_test_data.drop(
+            columns=["cumulative_backward_imputation_link", "imputation_group"]
+        )
+
+        expected_output = cumulative_links_test_data[
+            [
+                "imputation_group",
+                "cumulative_backward_imputation_link",
+            ]
+        ]
+
+        actual_output = get_cumulative_links(
+            input_data,
+            "b",
+            "strata",
+            "reference",
+            "target",
+            "period",
+            "backward_imputation_link",
+            1,
+        )
+
+        assert_frame_equal(actual_output, expected_output)

From a27bb91fb921c66b4fb8ef12efead02d59ff7cee Mon Sep 17 00:00:00 2001
From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com>
Date: Wed, 22 May 2024 14:25:55 +0100
Subject: [PATCH 17/20] 353 create imputation markers (#14)

* Change unit tests from dropping to selecting, ready for adding more cols into test data

* Adding module to calculate imputation flag columns

* Creating unit test and test data for imputation flag

* Copying input data to fix pandas copy warnings

* Adding docstrings

* Refactoring `matched_pair` column to include target column in name

* Update impute flags to include impute from construction

* Create function to convert impute flags into single column with strings

* Fixing pandas copy on slice warning

* Updating docstring and handle case where needed columns are not included

* Update error message

* Adding unit test for string flag column

* Renaming imputation flag function to imputation_flag_marker

* Rename column in test data

* Refactor to use dictionary to store imputation markers and conditions (can be extracted to yaml file if needed)

* Refactor to define column names earlier in function

* Add f_predictive_auxiliary variable to test data

* refactor: Add predictive_auxiliary as function argument

Instead of calling flag_matched_pair_merge within the function to create the predictive_auxiliary, it is defined as function argument. Hence flag_matched_pair_merge must be called before create_impute_flags.  This will convert flag_matched_pair_merge  to a low level function and using pandas framework.

* Change period type to int

* Update expected columns in function and tests

---------

Co-authored-by: zogkoa <Anton.Zogkolli@ons.gov.uk>
---
 src/flag_and_count_matched_pairs.py           |   6 +-
 src/imputation_flags.py                       | 137 ++++++++++++++++++
 tests/imputation_flag_data.csv                |  28 ++++
 .../case1_expected_output.csv                 |   2 +-
 .../case2_expected_output.csv                 |   2 +-
 .../case3_expected_output.csv                 |   2 +-
 tests/test_flag_and_count_matched_pairs.py    | 106 ++++++++++----
 tests/test_imputation_flags.py                |  50 +++++++
 8 files changed, 297 insertions(+), 36 deletions(-)
 create mode 100644 src/imputation_flags.py
 create mode 100644 tests/imputation_flag_data.csv
 create mode 100644 tests/test_imputation_flags.py

diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
index d2b6f8a6..7d286892 100644
--- a/src/flag_and_count_matched_pairs.py
+++ b/src/flag_and_count_matched_pairs.py
@@ -39,7 +39,7 @@ def flag_matched_pair_merge(
         time_difference = -time_difference
 
     # Creating new DF, shifting period for forward or backward
-    df_with_predictive_column = df[[reference, strata, target]]
+    df_with_predictive_column = df.copy()[[reference, strata, target]]
     df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
         months=time_difference
     )
@@ -55,7 +55,7 @@ def flag_matched_pair_merge(
         how="left",
     )
 
-    matched_col_name = forward_or_backward + "_matched_pair"
+    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
     df[matched_col_name] = np.where(
         df[[target, predictive_col_name]].isnull().any(axis=1), False, True
@@ -107,7 +107,7 @@ def flag_matched_pair_shift(
     df["validate_date"] = np.where(
         df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
     )
-    matched_col_name = forward_or_backward + "_matched_pair"
+    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
     df[matched_col_name] = np.where(
         df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
diff --git a/src/imputation_flags.py b/src/imputation_flags.py
new file mode 100644
index 00000000..91bc04ad
--- /dev/null
+++ b/src/imputation_flags.py
@@ -0,0 +1,137 @@
+import numpy as np
+import pandas as pd
+
+
+def create_impute_flags(
+    df: pd.DataFrame,
+    target: str,
+    reference: str,
+    strata: str,
+    auxiliary: str,
+    predictive_auxiliary: str,
+):
+
+    """
+    function to create logical columns for each type of imputation
+    output columns are needed to create the string flag column for
+    imputation methods.
+    Function requires f_predictive and b_predictive columns produced
+    by `flag_matched_pair` function.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing forward, backward predictive period columns (
+        These columns are created by calling flag_matched_pair_merge forward
+        and backwards)
+
+    target : str
+        Column name containing target variable.
+    reference : str
+        Column name containing business reference id.
+    strata : str
+        Column name containing strata information (sic).
+    auxiliary : str
+        Column name containing auxiliary data.
+    predictive_auxiliary: str
+        Column name containing predictive auxiliary data; this is created
+        by the flag_matched_pair_merge function.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with five additional logical columns: r_flag (target is a
+        return), fir_flag / bir_flag (forward / backward imputation possible),
+        c_flag (can be constructed) and fic_flag (forward from construction)
+    """
+    for direction in ["f", "b"]:
+        try:
+            df["{}_predictive_{}".format(direction, target)]
+        except KeyError:
+            raise KeyError(
+                "Dataframe needs column '{}_predictive_{}',".format(direction, target)
+                + " run flag_matched_pair function first"
+            )
+    forward_target_roll = "f_predictive_" + target + "_roll"
+    backward_target_roll = "b_predictive_" + target + "_roll"
+    forward_aux_roll = "f_predictive_" + auxiliary + "_roll"
+
+    df[forward_target_roll] = df.groupby([reference, strata])[
+        "f_predictive_" + target
+    ].ffill()
+
+    df[backward_target_roll] = df.groupby([reference, strata])[
+        "b_predictive_" + target
+    ].bfill()
+
+    df["r_flag"] = df[target].notna()
+
+    df["fir_flag"] = np.where(
+        df[forward_target_roll].notna() & df[target].isna(), True, False
+    )
+
+    df["bir_flag"] = np.where(
+        df[backward_target_roll].notna() & df[target].isna(), True, False
+    )
+
+    construction_conditions = df[target].isna() & df[auxiliary].notna()
+    df["c_flag"] = np.where(construction_conditions, True, False)
+
+    df[forward_aux_roll] = df.groupby([reference, strata])[predictive_auxiliary].ffill()
+
+    fic_conditions = df[target].isna() & df[forward_aux_roll].notna()
+    df["fic_flag"] = np.where(fic_conditions, True, False)
+
+    df.drop(
+        [
+            forward_target_roll,
+            backward_target_roll,
+            forward_aux_roll,
+            predictive_auxiliary,
+        ],
+        axis=1,
+        inplace=True,
+    )
+
+    return df
+
+
+def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Function to add a column containing a string indicating the method of
+    imputation to use following the hierarchy in specifications
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing logical columns produced by `create_impute_flags`
+        (r_flag, fir_flag, bir_flag, fic_flag and c_flag)
+
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with additional column containing imputation marker
+        i.e. the type of imputation method that should be used to fill
+        missing returns.
+    """
+
+    imputation_markers_and_conditions = {
+        "r": df["r_flag"],
+        "fir": ~df["r_flag"] & df["fir_flag"],
+        "bir": ~df["r_flag"] & ~df["fir_flag"] & df["bir_flag"],
+        "fic": ~df["r_flag"] & ~df["fir_flag"] & ~df["bir_flag"] & df["fic_flag"],
+        "c": ~df["r_flag"]
+        & ~df["fir_flag"]
+        & ~df["bir_flag"]
+        & ~df["fic_flag"]
+        & df["c_flag"],
+    }
+
+    df["imputation_marker"] = np.select(
+        imputation_markers_and_conditions.values(),
+        imputation_markers_and_conditions.keys(),
+        default="error",
+    )
+
+    return df
diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv
new file mode 100644
index 00000000..31b56aa8
--- /dev/null
+++ b/tests/imputation_flag_data.csv
@@ -0,0 +1,28 @@
+reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker
+1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r
+1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir
+1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r
+1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r
+2,100,202001,,72.0,,,False,False,True,True,False,,bir
+2,100,202002,,,,,False,False,True,False,True,72.0,bir
+2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir
+2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r
+3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir
+3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r
+3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r
+3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r
+4,100,202001,64.0,81.0,,,True,False,False,False,False,,r
+4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir
+4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir
+4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r
+5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r
+5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r
+5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r
+5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r
+6,100,202001,64.0,81.0,,,True,False,False,False,False,,r
+6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir
+6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r
+6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir
+7,100,202001,,40.0,,,False,False,False,True,False,,c
+7,100,202002,,,,,False,False,False,False,True,40.0,fic
+7,100,202003,,,,,False,False,False,False,True,,fic
diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv
index e05d9fe3..4e833e7b 100644
--- a/tests/test_data_matched_pair/case1_expected_output.csv
+++ b/tests/test_data_matched_pair/case1_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,1
 1,101,202403,,False,1,False,0
diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv
index dbae472b..468ad85b 100644
--- a/tests/test_data_matched_pair/case2_expected_output.csv
+++ b/tests/test_data_matched_pair/case2_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,1
 1,101,202403,,False,1,False,0
diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv
index 12ad8810..a94662ca 100644
--- a/tests/test_data_matched_pair/case3_expected_output.csv
+++ b/tests/test_data_matched_pair/case3_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,0
 1,101,202403,,False,0,False,0
diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py
index cf4b0525..79c25eba 100644
--- a/tests/test_flag_and_count_matched_pairs.py
+++ b/tests/test_flag_and_count_matched_pairs.py
@@ -28,11 +28,15 @@
 class TestMatchedPair:
     def test_flag_matched_pair_merge_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):
 
     def test_flag_matched_pair_merge_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):
 
     def test_count_matched_pair_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+                "f_matched_pair_count",
+            ]
+        ]
         df_input = df_expected_output[
-            ["reference", "strata", "period", "target_variable", "f_matched_pair"]
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
         ]
-        df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
+        df_output = count_matches(
+            df_input, "f_matched_pair_target_variable", "period", "strata"
+        )
         assert_frame_equal(df_output, df_expected_output)
 
     def test_count_matches_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+                "b_matched_pair_count",
+            ]
+        ]
         df_input = df_expected_output[
-            ["reference", "strata", "period", "target_variable", "b_matched_pair"]
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
         ]
-        df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
+        df_output = count_matches(
+            df_input, "b_matched_pair_target_variable", "period", "strata"
+        )
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_shift_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):
 
     def test_flag_matched_pair_shift_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
diff --git a/tests/test_imputation_flags.py b/tests/test_imputation_flags.py
new file mode 100644
index 00000000..315b5fa3
--- /dev/null
+++ b/tests/test_imputation_flags.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.imputation_flags import create_impute_flags, generate_imputation_marker
+
+
+@pytest.fixture(scope="class")
+def imputation_flag_test_data():
+    return load_and_format(Path("tests") / "imputation_flag_data.csv")
+
+
+class TestImputationFlags:
+    def test_create_impute_flags(self, imputation_flag_test_data):
+        df_expected_output = imputation_flag_test_data.copy()
+        df_expected_output.drop(["imputation_marker"], axis=1, inplace=True)
+        df_input = df_expected_output.copy()
+        df_input = df_input[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "auxiliary",
+                "f_predictive_target_variable",
+                "b_predictive_target_variable",
+                "f_predictive_auxiliary",
+            ]
+        ]
+        df_output = create_impute_flags(
+            df=df_input,
+            target="target_variable",
+            reference="reference",
+            strata="strata",
+            auxiliary="auxiliary",
+            predictive_auxiliary="f_predictive_auxiliary",
+        )
+
+        df_expected_output.drop(["f_predictive_auxiliary"], axis=1, inplace=True)
+
+        assert_frame_equal(df_output, df_expected_output)
+
+    def test_imputation_marker(self, imputation_flag_test_data):
+        df_expected_output = imputation_flag_test_data.copy()
+        df_input = imputation_flag_test_data.copy()
+        df_input.drop("imputation_marker", axis=1, inplace=True)
+        df_output = generate_imputation_marker(df_input)
+        assert_frame_equal(df_output, df_expected_output)

From d2e8a386284a3f59b50df09c20b053a5fe3a3548 Mon Sep 17 00:00:00 2001
From: Wil Roberts <47739563+robertswh@users.noreply.github.com>
Date: Mon, 3 Jun 2024 13:02:26 +0100
Subject: [PATCH 18/20] 331 apply imputation link to target (#19)

* add test data

* some refactoring before function

* added construction case to test data

* refactored into functions

* add test for higher level function
---
 src/apply_imputation_link.py                  | 161 ++++++++++++++++++
 tests/apply_imputation_link.csv               |  10 ++
 tests/data/apply_imputation_link/BIR.csv      |   4 +
 tests/data/apply_imputation_link/C_FIC.csv    |   4 +
 tests/data/apply_imputation_link/FIR.csv      |   4 +
 .../apply_imputation_link/FIR_BIR_C_FIC.csv   |  10 ++
 tests/test_apply_imputation_link.py           |  37 ++++
 7 files changed, 230 insertions(+)
 create mode 100755 src/apply_imputation_link.py
 create mode 100644 tests/apply_imputation_link.csv
 create mode 100755 tests/data/apply_imputation_link/BIR.csv
 create mode 100755 tests/data/apply_imputation_link/C_FIC.csv
 create mode 100755 tests/data/apply_imputation_link/FIR.csv
 create mode 100755 tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv
 create mode 100755 tests/test_apply_imputation_link.py

diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py
new file mode 100755
index 00000000..e04104fb
--- /dev/null
+++ b/src/apply_imputation_link.py
@@ -0,0 +1,161 @@
+def create_and_merge_imputation_values(
+    df,
+    imputation_class,
+    reference,
+    period,
+    marker,
+    combined_imputation,
+    target,
+    cumulative_forward_link,
+    cumulative_backward_link,
+    auxiliary,
+    construction_link,
+    imputation_types=("c", "fir", "bir", "fic"),
+):
+    """
+    Loop through different imputation types and merge the results according
+    to an imputation marker column
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    imputation_class : str
+        column name for the variable that defines the imputation class
+    reference : str
+        column name for the reference
+    period : str
+        column name for the period
+    marker : str
+        column name containing a marker to indicate the type of imputation required
+    combined_imputation : str
+        column name for the combined imputation types according to the imputation marker
+    target : str
+        column name for the target variable for imputation
+    cumulative_forward_link : str
+        column name for the cumulative forward imputation link
+    cumulative_backward_link : str
+        column name for the cumulative backward imputation link
+    auxiliary : str
+        column name for auxiliary variable
+    construction_link : str
+        column name for construction link
+    imputation_types : tuple
+        types of imputation to run and add to combined_imputation column stored in a
+        tuple. If 'fic' is selected 'c' must also be selected and precede 'fic'.
+        For 'fic' to produce the correct result, the C marker must be in the first
+        period for a given reference.
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with imputation values defined by the imputation marker
+    """
+
+    # constructed must come first so FIC can forward impute from the constructed value
+    imputation_config = {
+        "c": {
+            "intermediate_column": "constructed",
+            "marker": "C",
+            # doesn't actually apply a fill so can be forward or back
+            "fill_column": auxiliary,
+            "fill_method": "ffill",
+            "link_column": construction_link,
+        },
+        "fir": {
+            "intermediate_column": "fir",
+            "marker": "FIR",
+            "fill_column": target,
+            "fill_method": "ffill",
+            "link_column": cumulative_forward_link,
+        },
+        "bir": {
+            "intermediate_column": "bir",
+            "marker": "BIR",
+            "fill_column": target,
+            "fill_method": "bfill",
+            "link_column": cumulative_backward_link,
+        },
+        "fic": {
+            # FIC only works if the C is in the first period of the business being
+            # sampled. This is fine for automatic imputation, but should be careful
+            # if manual construction imputation is done
+            "intermediate_column": "fic",
+            "marker": "FIC",
+            # this has to have the same name as the intermediate column for constructed
+            "fill_column": "constructed",
+            "fill_method": "ffill",
+            "link_column": cumulative_forward_link,
+        },
+    }
+
+    df.sort_values([imputation_class, reference, period], inplace=True)
+
+    intermediate_columns = []
+
+    for imp_type in imputation_types:
+        df = create_impute(
+            df, [imputation_class, reference], imputation_config[imp_type]
+        )
+        df = merge_imputation_type(
+            df, imputation_config[imp_type], marker, combined_imputation
+        )
+
+        intermediate_columns.append(imputation_config[imp_type]["intermediate_column"])
+
+    return df.drop(columns=intermediate_columns)
+
+
+def create_impute(df, group, imputation_spec):
+    """
+    Add a new column to a dataframe of imputed values using ratio imputation.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    group : str or list
+        variables that define the imputation class
+    imputation_spec: dict
+        dictionary defining the details of the imputation type
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with an added imputation column defined by the imputation_spec
+    """
+    column_name = imputation_spec["intermediate_column"]
+    fill_column = imputation_spec["fill_column"]
+    fill_method = imputation_spec["fill_method"]
+    link_column = imputation_spec["link_column"]
+
+    df[column_name] = (
+        df.groupby(group)[fill_column].fillna(method=fill_method) * df[link_column]
+    )
+    return df
+
+
+def merge_imputation_type(df, imputation_spec, marker, combined_imputation):
+    """
+    Uses an existing column of imputed values and an imputation marker to merge values
+    into a single column
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    imputation_spec: dict
+        dictionary defining the details of the imputation type
+    marker : str
+        column name containing a marker to indicate the type of imputation required
+    combined_imputation : str
+        column name for the combined imputation types according to the imputation marker
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with combined_imputation
+    """
+
+    imputation_marker = imputation_spec["marker"]
+    imputation_column = imputation_spec["intermediate_column"]
+
+    df.loc[df[marker] == imputation_marker, combined_imputation] = df[imputation_column]
+    return df
diff --git a/tests/apply_imputation_link.csv b/tests/apply_imputation_link.csv
new file mode 100644
index 00000000..c81711cd
--- /dev/null
+++ b/tests/apply_imputation_link.csv
@@ -0,0 +1,10 @@
+strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link
+100,100000,200,202402,1,2,1,,,R,,,
+100,100000,,202403,2,0.6,2,2,0.6,FIR,400,,
+100,100000,,202404,3,1,2,6,1,FIR,1200,,
+200,100001,,202402,1,4,3,1,2,BIR,600,,
+200,100001,,202403,3,0.5,3,3,0.5,BIR,150,,
+200,100001,300,202404,0.5,1,4,,,R,,,
+300,100002,,202402,1,4,5,1,2,C,600,40,0.1
+300,100002,,202403,3,0.5,5,3,0.5,FIC,150,,
+300,100002,,202404,0.5,1,5,2,,FIC,,,
diff --git a/tests/data/apply_imputation_link/BIR.csv b/tests/data/apply_imputation_link/BIR.csv
new file mode 100755
index 00000000..954700c4
--- /dev/null
+++ b/tests/data/apply_imputation_link/BIR.csv
@@ -0,0 +1,4 @@
+imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value
+200,100001,,202402,4,2,BIR,600
+200,100001,,202403,0.5,0.5,BIR,150
+200,100001,300,202404,1,,R,
diff --git a/tests/data/apply_imputation_link/C_FIC.csv b/tests/data/apply_imputation_link/C_FIC.csv
new file mode 100755
index 00000000..7d2424b2
--- /dev/null
+++ b/tests/data/apply_imputation_link/C_FIC.csv
@@ -0,0 +1,4 @@
+imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value
+300,100002,,202402,1,,0.1,1000,C,100
+300,100002,,202403,3,3,,,FIC,300
+300,100002,,202404,0.5,1.5,,,FIC,150
diff --git a/tests/data/apply_imputation_link/FIR.csv b/tests/data/apply_imputation_link/FIR.csv
new file mode 100755
index 00000000..341ece76
--- /dev/null
+++ b/tests/data/apply_imputation_link/FIR.csv
@@ -0,0 +1,4 @@
+imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value
+100,100000,200,202402,1,,R,
+100,100000,,202403,2,2,FIR,400
+100,100000,,202404,3,6,FIR,1200
diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv
new file mode 100755
index 00000000..91ec36ec
--- /dev/null
+++ b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv
@@ -0,0 +1,10 @@
+imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value
+100,100000,200,202402,1,2,,,,,R,
+100,100000,,202403,2,0.6,,,2,0.6,FIR,400
+100,100000,,202404,3,1,,,6,1,FIR,1200
+200,100001,,202402,1,4,,,1,2,BIR,600
+200,100001,,202403,3,0.5,,,3,0.5,BIR,150
+200,100001,300,202404,0.5,1,,,,,R,
+300,100002,,202402,1,4,1000,0.1,,2,C,100
+300,100002,,202403,3,0.5,,,3,0.5,FIC,300
+300,100002,,202404,0.5,1,,,1.5,,FIC,150
diff --git a/tests/test_apply_imputation_link.py b/tests/test_apply_imputation_link.py
new file mode 100755
index 00000000..568bfcec
--- /dev/null
+++ b/tests/test_apply_imputation_link.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.apply_imputation_link import create_and_merge_imputation_values
+
+
+@pytest.fixture(scope="class")
+def fir_bir_c_fic_test_data():
+    return load_and_format(
+        Path("tests") / "data" / "apply_imputation_link" / "FIR_BIR_C_FIC.csv"
+    )
+
+
+class TestApplyImputationLink:
+    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
+        expected_output = fir_bir_c_fic_test_data
+
+        input_data = expected_output.drop(columns=["imputed_value"])
+        actual_output = create_and_merge_imputation_values(
+            input_data,
+            "imputation_class",
+            "reference",
+            "period",
+            "imputation_marker",
+            "imputed_value",
+            "target",
+            "cumulative_forward_link",
+            "cumulative_backward_link",
+            "auxiliary_variable",
+            "construction_link",
+            imputation_types=("c", "fir", "bir", "fic"),
+        )
+
+        assert_frame_equal(actual_output, expected_output)

From ba25aef807ba59f21bd9b5faccafe58d1a692941 Mon Sep 17 00:00:00 2001
From: Wil Roberts <47739563+robertswh@users.noreply.github.com>
Date: Tue, 4 Jun 2024 11:23:58 +0100
Subject: [PATCH 19/20] added testing guide and function tips (#17)

* added testing guide and function tips

* Add function context and fix external links

---------

Co-authored-by: hemsir <Rowan.Hemsi@ons.gov.uk>
---
 docs/contributor_guide/CONTRIBUTING.md | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/docs/contributor_guide/CONTRIBUTING.md b/docs/contributor_guide/CONTRIBUTING.md
index ee56bcde..dee48d43 100644
--- a/docs/contributor_guide/CONTRIBUTING.md
+++ b/docs/contributor_guide/CONTRIBUTING.md
@@ -36,14 +36,14 @@ documentation][docs-pre-commit-hooks].
 
 ## Code conventions
 
-Code written for this project should follow [PEP 8 coding conventions](pep8), [project naming conventions](docs-naming) and the guidance on [quality assurance of code for analysis and research](duck-book) (also known as the Duck Book).
+Code written for this project should follow [PEP 8 coding conventions][pep8], [project naming conventions][docs-naming] and the guidance on [quality assurance of code for analysis and research][duck-book] (also known as the Duck Book).
 
 ### Git and GitHub
 
 We use Git to version control the source code and out source code is stored on
 GitHub.
 
-We follow the [GitHub flow](github-flow) workflow. This means that we create
+We follow the [GitHub flow][github-flow] workflow. This means that we create
 feature branches of the `main` branch and merge them back to `main` once they
 meet the definition of done. We give our branches short but informative names,
 in lowercase and separated with hypens. Where applicable, we start branch names
@@ -53,16 +53,20 @@ with the respective Jira ticket number. For example,
 We commit regularly, with distinct chunks of work where possible. We write
 short but informative commit messages, starting with a capitalised
 present-tense verb, for example `Add`, `Fix`. When pair-programming, we
-[add co-authors to the commit](git-coauthor). We add
-[longer commit messages](long-commit) for larger or more complex commits, for
+[add co-authors to the commit][git-coauthor]. We add
+[longer commit messages][long-commit] for larger or more complex commits, for
 example (squash) merge commits.
 
 We open a pull request to `main` once we have working code that meets a user
 need, for example meets the definition of done on the Jira ticket. Pull
 requests must be reviewed by at least one member of the team before merging.
-Reviews should follow the [pull request template](pr-template). If we want review on code that does not yet meet the definition of done, we open a draft
+Reviews should follow the [pull request template][pr-template]. If we want review on code that does not yet meet the definition of done, we open a draft
 pull request. Once a branch has been reviewed, it can be merged. We prefer to use squash merges, in order to simplify the `main` branch commit history. After merging the feature branch should be deleted.
 
+### Functions
+
+We prefer writing functions over classes to make it easier for beginners to understand the code. [Type hints][typing] should be used when writing functions. We prefer functions that return `pandas.DataFrame` rather than `pandas.Series`, for example when deriving new (temporary) variables.
+
 ### Markdown
 
 Local links can be written as normal, but external links should be referenced at the
@@ -83,6 +87,10 @@ tests, enter the following command in your terminal:
 ```shell
 pytest
 ```
+Our testing approach is:
+- use `.csv` files containing simple minimal input and output data for a function to be tested
+- individual test cases should be separated into different `.csv` files and grouped into folders
+- the name of the test data `.csv` files should reflect the test case and the folder name should be the same as the module/function
 
 ### Code coverage
 
@@ -139,3 +147,4 @@ build the documentation into an accessible, searchable website.
 [github-flow]: https://docs.github.com/en/get-started/using-github/github-flow
 [git-coauthor]: https://docs.github.com/en/pull-requests/committing-changes-to-your-project/creating-and-editing-commits/creating-a-commit-with-multiple-authors
 [long-commit]: https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
+[typing]: https://docs.python.org/3/library/typing.html

From ecca2a42288afcb0c236e10b95edcb6e45ecd68b Mon Sep 17 00:00:00 2001
From: Anton Zogkolli <110612763+AntonZogk@users.noreply.github.com>
Date: Wed, 5 Jun 2024 14:54:52 +0100
Subject: [PATCH 20/20] 321 flag to ignore response (#18)

* Upload test data for link filters

* build: function to filter rows to ignore values from link

The function adds a new column indicating whether a row should be ignored. It is based on set_index and index comparison.

Added 2 tests: one to check the output, and one to check that an exception is raised when columns do not match.

* build: function to filter rows to ignore values from link

The function adds a new column indicating whether a row should be ignored. It is based on set_index and index comparison.

Added 2 tests: one to check the output, and one to check that an exception is raised when columns do not match.
---
 src/link_filter.py          | 49 +++++++++++++++++++++++++++++++++++++
 tests/test_flag_data.csv    | 29 ++++++++++++++++++++++
 tests/test_flag_filters.csv |  3 +++
 tests/test_link_filter.py   | 39 +++++++++++++++++++++++++++++
 4 files changed, 120 insertions(+)
 create mode 100644 src/link_filter.py
 create mode 100755 tests/test_flag_data.csv
 create mode 100755 tests/test_flag_filters.csv
 create mode 100644 tests/test_link_filter.py

diff --git a/src/link_filter.py b/src/link_filter.py
new file mode 100644
index 00000000..f5ff6383
--- /dev/null
+++ b/src/link_filter.py
@@ -0,0 +1,49 @@
+import pandas as pd
+
+# TODO: Extend function to receive multiple df with *df_with_filters
+
+
+def flag_rows_to_ignore(
+    df: pd.DataFrame, df_with_filters: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Add a new bool column named ignore_from_link to df,
+    set to True for the observations defined in df_with_filters.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe.
+    df_with_filters : pd.DataFrame
+        Dataframe with observations which should be flagged in the original
+        dataframe.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Original dataframe with a bool column containing the flags.
+
+    """
+
+    if not set(df_with_filters.columns).issubset(df.columns):
+
+        raise ValueError(
+            f"""df_with_filters has these columns {list(df_with_filters)} while
+            df has these columns {list(df)}, please
+            double check the column names."""
+        )
+
+    # TODO: Check if values to be ignored exist
+
+    df = df.set_index(list(df_with_filters))
+
+    df_with_filters = df_with_filters.set_index(list(df_with_filters))
+
+    df["ignore_from_link"] = df.index.isin(df_with_filters.index)
+
+    df = df.reset_index()
+
+    # TODO: Consider what should be logged and reroute print to logs
+    print("These values were flagged:\n", df.loc[df["ignore_from_link"]])
+
+    return df
diff --git a/tests/test_flag_data.csv b/tests/test_flag_data.csv
new file mode 100755
index 00000000..2f97b47f
--- /dev/null
+++ b/tests/test_flag_data.csv
@@ -0,0 +1,29 @@
+identifier,date,group,question,other,ignore_from_link
+70001,202001,100,5951.0,39,False
+70001,202002,100,1814.0,39,False
+70001,202003,100,734.0,39,True
+70001,202004,100,96.0,39,False
+70001,202005,100,9086.0,39,True
+70001,202006,100,3949.0,39,False
+70001,202007,100,49.0,39,False
+70002,202001,100,6705.0,94,False
+70002,202002,100,48.0,94,False
+70002,202003,100,5361.0,94,False
+70002,202004,100,8767.0,94,False
+70002,202005,100,9214.0,94,False
+70002,202006,100,7467.0,94,False
+70002,202007,100,3475.0,94,False
+70003,202001,100,6153.0,42,False
+70003,202002,100,7711.0,42,False
+70003,202003,100,5403.0,42,False
+70003,202004,100,7445.0,42,False
+70003,202005,100,7092.0,42,False
+70003,202006,100,2038.0,42,False
+70003,202007,100,8768.0,42,False
+70004,202001,100,,6,False
+70004,202002,100,,6,False
+70004,202003,100,6288.0,6,False
+70004,202004,100,,6,False
+70004,202005,100,,6,False
+70004,202006,100,5875.0,6,False
+70004,202007,100,,6,False
diff --git a/tests/test_flag_filters.csv b/tests/test_flag_filters.csv
new file mode 100755
index 00000000..abdfb4c8
--- /dev/null
+++ b/tests/test_flag_filters.csv
@@ -0,0 +1,3 @@
+identifier,date
+70001,202003
+70001,202005
diff --git a/tests/test_link_filter.py b/tests/test_link_filter.py
new file mode 100644
index 00000000..bbd5cc75
--- /dev/null
+++ b/tests/test_link_filter.py
@@ -0,0 +1,39 @@
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from src.link_filter import flag_rows_to_ignore
+
+
+@pytest.mark.parametrize("scenario", ["test_flag_data"])
+@pytest.mark.parametrize("filters", ["test_flag_filters"])
+class TestFilters:
+    def test_basic_filter(self, scenario, filters):
+        """Test ignore_from_link is correct"""
+
+        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")
+
+        df_filters = pd.read_csv("tests/" + filters + ".csv")
+
+        df_input = df_output_expected.drop(columns=["ignore_from_link"])
+
+        df_output = flag_rows_to_ignore(df_input, df_filters)
+
+        assert_frame_equal(df_output, df_output_expected)
+
+    def test_exception(self, scenario, filters):
+
+        """Test if function raises an exception when the columns in filters
+        do not exist in scenario."""
+
+        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")
+
+        df_filters = pd.read_csv("tests/" + filters + ".csv")
+
+        df_input = df_output_expected.drop(columns=["ignore_from_link"])
+
+        with pytest.raises(ValueError):
+
+            df_filters.columns = df_filters.columns + "_fail"
+
+            flag_rows_to_ignore(df_input, df_filters)