From 3a8b237293cee5faf9787b80147d74cca627ab94 Mon Sep 17 00:00:00 2001 From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Date: Fri, 10 May 2024 12:49:12 +0100 Subject: [PATCH 01/20] 338 pre commit hooks (#10) * Auto formatting items in test dir by commit hooks * Auto formatting test data by commit hooks * Auto formatting of pre-commit function by commit hooks * Auto formatting construction and requirements using commit hooks * Auto formatting and modifying L112 to fix E712 * Auto formatting workflow yaml by commit hooks * Adding mixed line ending and detecting secrets * Removing unused commit hook functions --- .github/workflows/main.yaml | 2 +- .pre-commit-config.yaml | 32 +++-- pre-commits/check_added_large_files.py | 27 ++-- pre-commits/check_merge_conflict.py | 33 +++-- pre-commits/commit_msg.py | 41 ------- pre-commits/end_of_line_fixer.py | 20 ++- pre-commits/mixed_line_endings.py | 42 +++---- pre-commits/prepare_commit_msg.py | 45 ------- pre-commits/remove_whitespace.py | 77 ++++++------ requirements.txt | 2 +- src/construction_matches.py | 5 +- src/flag_and_count_matched_pairs.py | 100 ++++++++------- tests/helper_functions.py | 3 +- tests/test_construction_matches.py | 44 ++++--- .../case1_expected_output.csv | 2 +- .../case2_expected_output.csv | 2 +- .../case3_expected_output.csv | 2 +- tests/test_flag_and_count_matched_pairs.py | 116 ++++++++++++------ 18 files changed, 288 insertions(+), 307 deletions(-) delete mode 100755 pre-commits/commit_msg.py delete mode 100755 pre-commits/prepare_commit_msg.py diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 12fe2efe..080d4cd3 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -46,4 +46,4 @@ jobs: - name: Run pytest run: | - pytest -v \ No newline at end of file + pytest -v diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 44b4d541..81880b61 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,15 @@ repos: language: script stages: [commit] +#works +- repo: local + hooks: + - id: mixed-line-endings + entry: pre-commits/mixed_line_endings.py + name: Check for consistent end of line type LF to CRLF to CR (auto-fixes) + language: script + stages: [commit] + #works #if using on different file types, it will need a seperate hook per file type - repo: local @@ -76,18 +85,17 @@ repos: stages: [commit] -# #needs to remove the password in hello_world.py -# - repo: local -# hooks: -# - id: detect-secrets -# entry: detect-secrets -# name: detect-secrets - Detect secrets in staged code -# #args: [ "--baseline", ".secrets.baseline" ] -# args: [scan, audit] -# language: system -# types: [python] -# stages: [commit] -# exclude: .*/tests/.*|^\.cruft\.json$ +# works in testing +- repo: local + hooks: + - id: detect-secrets + entry: detect-secrets-hook + name: detect-secrets - Detect secrets in staged code + args: [ "--baseline", ".secrets.baseline" ] + #args: [scan, audit] + language: system + types: [python] + stages: [commit] diff --git a/pre-commits/check_added_large_files.py b/pre-commits/check_added_large_files.py index 41fa69b3..59c0353a 100755 --- a/pre-commits/check_added_large_files.py +++ b/pre-commits/check_added_large_files.py @@ -4,24 +4,20 @@ import json import math import os -from typing import Optional -from typing import Sequence -from typing import Set +from typing import Optional, Sequence, Set -from pre_commit_hooks.util import added_files -from pre_commit_hooks.util import CalledProcessError -from pre_commit_hooks.util import cmd_output +from pre_commit_hooks.util import CalledProcessError, added_files, cmd_output def _lfs_files() -> Set[str]: """Private function.""" try: # Introduced in git-lfs 2.2.0, first working in 2.2.1 - lfs_ret = cmd_output('git', 'lfs', 'status', '--json') + lfs_ret = cmd_output("git", "lfs", "status", "--json") except CalledProcessError: # pragma: no cover (with git-lfs) lfs_ret = '{"files":{}}' - return set(json.loads(lfs_ret)['files']) + return set(json.loads(lfs_ret)["files"]) def _find_large_added_files(filenames: Sequence[str], maxkb: int) -> int: @@ -32,7 +28,7 @@ def _find_large_added_files(filenames: Sequence[str], maxkb: int) -> int: for filename in (added_files() & set(filenames)) - _lfs_files(): kb = int(math.ceil(os.stat(filename).st_size / 1024)) if kb > maxkb: - print(f'{filename} ({kb} KB) exceeds {maxkb} KB.') + print(f"{filename} ({kb} KB) exceeds {maxkb} KB.") retv = 1 return retv @@ -42,17 +38,20 @@ def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() parser.add_argument( - 'filenames', nargs='*', - help='Filenames pre-commit believes are changed.', + "filenames", + nargs="*", + help="Filenames pre-commit believes are changed.", ) parser.add_argument( - '--maxkb', type=int, default=500, - help='Maxmimum allowable KB for added files', + "--maxkb", + type=int, + default=500, + help="Maxmimum allowable KB for added files", ) args = parser.parse_args(argv) return _find_large_added_files(args.filenames, args.maxkb) -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/pre-commits/check_merge_conflict.py b/pre-commits/check_merge_conflict.py index 85a29255..e6c67007 100755 --- a/pre-commits/check_merge_conflict.py +++ b/pre-commits/check_merge_conflict.py @@ -2,35 +2,30 @@ """Pre commit hook to check for merge conflict flags in file.""" import argparse import os.path -from typing import Optional -from typing import Sequence - +from typing import Optional, Sequence CONFLICT_PATTERNS = [ - b'<<<<<<< ', - b'======= ', - b'=======\n', - b'>>>>>>> ', + b"<<<<<<< ", + b"======= ", + b"=======\n", + b">>>>>>> ", ] def _is_in_merge() -> int: """Private function.""" - return ( - os.path.exists(os.path.join('.git', 'MERGE_MSG')) and - ( - os.path.exists(os.path.join('.git', 'MERGE_HEAD')) or - os.path.exists(os.path.join('.git', 'rebase-apply')) or - os.path.exists(os.path.join('.git', 'rebase-merge')) - ) + return os.path.exists(os.path.join(".git", "MERGE_MSG")) and ( + os.path.exists(os.path.join(".git", "MERGE_HEAD")) + or os.path.exists(os.path.join(".git", "rebase-apply")) + or os.path.exists(os.path.join(".git", "rebase-merge")) ) def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() - parser.add_argument('filenames', nargs='*') - parser.add_argument('--assume-in-merge', action='store_true') + parser.add_argument("filenames", nargs="*") + parser.add_argument("--assume-in-merge", action="store_true") args = parser.parse_args(argv) if not _is_in_merge() and not args.assume_in_merge: @@ -38,18 +33,18 @@ def main(argv: Optional[Sequence[str]] = None) -> int: retcode = 0 for filename in args.filenames: - with open(filename, 'rb') as inputfile: + with open(filename, "rb") as inputfile: for i, line in enumerate(inputfile): for pattern in CONFLICT_PATTERNS: if line.startswith(pattern): print( f'Merge conflict string "{pattern.decode()}" ' - f'found in {filename}:{i + 1}', + f"found in {filename}:{i + 1}", ) retcode = 1 return retcode -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/pre-commits/commit_msg.py b/pre-commits/commit_msg.py deleted file mode 100755 index e478166a..00000000 --- a/pre-commits/commit_msg.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -"""Git hook to check git commit message has appropriate length subject line. - -After removing the jira issue number from the subject line we check that the -message is longer than 20 characters and shorter than 65. -""" -import sys - -# Collect the parameters -commit_msg_filepath = sys.argv[1] - -with open(commit_msg_filepath, 'r') as f: - lines = f.readlines() - - # The subject is the first line of the message, but we don't count any - # Jira issue note - commit_subject = lines[0].split(']')[-1] - - if (len(commit_subject) < 20): - print( - f''' - commit-msg: ERROR! The commit subject is too short! - subject length = {len(commit_subject)} < 20 characters' - ''' - ) - sys.exit(1) - - elif (len(commit_subject) > 65): - # We check if messages are greater than 65 char, but warn as if - # longer than 50 - print( - f''' - commit-msg: ERROR! - The commit subject is too long! - subject length = {len(commit_subject)} > 50 characters' - ''' - ) - sys.exit(1) - -# for line in lines[2:]: -# print(line) diff --git a/pre-commits/end_of_line_fixer.py b/pre-commits/end_of_line_fixer.py index 8f39b8c1..eb85f62e 100755 --- a/pre-commits/end_of_line_fixer.py +++ b/pre-commits/end_of_line_fixer.py @@ -2,9 +2,7 @@ """Pre commit hook to ensure single blank line at end of python file.""" import argparse import os -from typing import IO -from typing import Optional -from typing import Sequence +from typing import IO, Optional, Sequence def _fix_file(file_obj: IO[bytes]) -> int: @@ -17,13 +15,13 @@ def _fix_file(file_obj: IO[bytes]) -> int: return 0 last_character = file_obj.read(1) # last_character will be '' for an empty file - if last_character not in {b'\n', b'\r'} and last_character != b'': + if last_character not in {b"\n", b"\r"} and last_character != b"": # Needs this seek for windows, otherwise IOError file_obj.seek(0, os.SEEK_END) - file_obj.write(b'\n') + file_obj.write(b"\n") return 1 - while last_character in {b'\n', b'\r'}: + while last_character in {b"\n", b"\r"}: # Deal with the beginning of the file if file_obj.tell() == 1: # If we've reached the beginning of the file and it is all @@ -40,7 +38,7 @@ def _fix_file(file_obj: IO[bytes]) -> int: # newlines. If we find extraneous newlines, then backtrack and trim them. position = file_obj.tell() remaining = file_obj.read() - for sequence in (b'\n', b'\r\n', b'\r'): + for sequence in (b"\n", b"\r\n", b"\r"): if remaining == sequence: return 0 elif remaining.startswith(sequence): @@ -54,21 +52,21 @@ def _fix_file(file_obj: IO[bytes]) -> int: def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() - parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument("filenames", nargs="*", help="Filenames to fix") args = parser.parse_args(argv) retv = 0 for filename in args.filenames: # Read as binary so we can read byte-by-byte - with open(filename, 'rb+') as file_obj: + with open(filename, "rb+") as file_obj: ret_for_file = _fix_file(file_obj) if ret_for_file: - print(f'Fixing {filename}') + print(f"Fixing {filename}") retv |= ret_for_file return retv -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/pre-commits/mixed_line_endings.py b/pre-commits/mixed_line_endings.py index 48afc2e6..8ae44909 100755 --- a/pre-commits/mixed_line_endings.py +++ b/pre-commits/mixed_line_endings.py @@ -2,31 +2,28 @@ """Pre commit hook to ensure all EOL characters are the same.""" import argparse import collections -from typing import Dict -from typing import Optional -from typing import Sequence +from typing import Dict, Optional, Sequence - -CRLF = b'\r\n' -LF = b'\n' -CR = b'\r' +CRLF = b"\r\n" +LF = b"\n" +CR = b"\r" # Prefer LF to CRLF to CR, but detect CRLF before LF ALL_ENDINGS = (CR, CRLF, LF) -FIX_TO_LINE_ENDING = {'cr': CR, 'crlf': CRLF, 'lf': LF} +FIX_TO_LINE_ENDING = {"cr": CR, "crlf": CRLF, "lf": LF} def _fix(filename: str, contents: bytes, ending: bytes) -> None: """Private function.""" - new_contents = b''.join( - line.rstrip(b'\r\n') + ending for line in contents.splitlines(True) + new_contents = b"".join( + line.rstrip(b"\r\n") + ending for line in contents.splitlines(True) ) - with open(filename, 'wb') as f: + with open(filename, "wb") as f: f.write(new_contents) def fix_filename(filename: str, fix: str) -> int: """Private function.""" - with open(filename, 'rb') as f: + with open(filename, "rb") as f: contents = f.read() counts: Dict[bytes, int] = collections.defaultdict(int) @@ -40,10 +37,10 @@ def fix_filename(filename: str, fix: str) -> int: # Some amount of mixed line endings mixed = sum(bool(x) for x in counts.values()) > 1 - if fix == 'no' or (fix == 'auto' and not mixed): + if fix == "no" or (fix == "auto" and not mixed): return mixed - if fix == 'auto': + if fix == "auto": max_ending = LF max_lines = 0 # ordering is important here such that lf > crlf > cr @@ -70,24 +67,25 @@ def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() parser.add_argument( - '-f', '--fix', - choices=('auto', 'no') + tuple(FIX_TO_LINE_ENDING), - default='auto', + "-f", + "--fix", + choices=("auto", "no") + tuple(FIX_TO_LINE_ENDING), + default="auto", help='Replace line ending with the specified. Default is "auto"', ) - parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument("filenames", nargs="*", help="Filenames to fix") args = parser.parse_args(argv) retv = 0 for filename in args.filenames: if fix_filename(filename, args.fix): - if args.fix == 'no': - print(f'{filename}: mixed line endings') + if args.fix == "no": + print(f"{filename}: mixed line endings") else: - print(f'{filename}: fixed mixed line endings') + print(f"{filename}: fixed mixed line endings") retv = 1 return retv -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/pre-commits/prepare_commit_msg.py b/pre-commits/prepare_commit_msg.py deleted file mode 100755 index 7cc97878..00000000 --- a/pre-commits/prepare_commit_msg.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -"""Git hook to automatically prefix git commit message with Jira issue number. - -The issue number (e.g. Jira ticket number) from the current branch name. Works -with or without specifying -m option at commit time. -""" -import re -import sys -from subprocess import check_output - - -commit_msg_filepath = sys.argv[1] -branch = ( - check_output(["git", "symbolic-ref", "--short", "HEAD"]) - .decode("utf-8").strip() -) - -# If branch name contains /'s we only want the final part of the branch name -branch_end = branch.split('/')[-1] - -# Regex pattern for matching to Jira issues -regex = r"[Jj]\d+" - -if re.search(regex, branch_end): - # Create list of all matches to regex pattern - issue_number_matches = re.findall(regex, branch_end) - - # If mutiple issues in branch name we join them together - commit_issue = f'{"_".join(issue_number_matches)}' - - with open(commit_msg_filepath, "r+") as f: - commit_msg = f.read() - f.seek(0, 0) # correctly position issue_number when writing commit msg - f.write(f"[{commit_issue}] {commit_msg}") - -else: - # If branch does not contain a jira issue number, reject the commit - print( - f''' - prepare-commit-msg: Error! - Branch name is {branch} - Does not match branch name strategy \'*/jxxx\' - ''' - ) - sys.exit(1) diff --git a/pre-commits/remove_whitespace.py b/pre-commits/remove_whitespace.py index d890b18c..61e5803f 100755 --- a/pre-commits/remove_whitespace.py +++ b/pre-commits/remove_whitespace.py @@ -2,21 +2,20 @@ """Pre commit hook to remove any trailing whitespace.""" import argparse import os -from typing import Optional -from typing import Sequence +from typing import Optional, Sequence def _fix_file( - filename: str, - is_markdown: bool, - chars: Optional[bytes], + filename: str, + is_markdown: bool, + chars: Optional[bytes], ) -> bool: """Private function.""" - with open(filename, mode='rb') as file_processed: + with open(filename, mode="rb") as file_processed: lines = file_processed.readlines() newlines = [_process_line(line, is_markdown, chars) for line in lines] if newlines != lines: - with open(filename, mode='wb') as file_processed: + with open(filename, mode="wb") as file_processed: for line in newlines: file_processed.write(line) return True @@ -25,22 +24,22 @@ def _fix_file( def _process_line( - line: bytes, - is_markdown: bool, - chars: Optional[bytes], + line: bytes, + is_markdown: bool, + chars: Optional[bytes], ) -> bytes: """Private function.""" - if line[-2:] == b'\r\n': - eol = b'\r\n' + if line[-2:] == b"\r\n": + eol = b"\r\n" line = line[:-2] - elif line[-1:] == b'\n': - eol = b'\n' + elif line[-1:] == b"\n": + eol = b"\n" line = line[:-1] else: - eol = b'' + eol = b"" # preserve trailing two-space for non-blank lines in markdown files - if is_markdown and (not line.isspace()) and line.endswith(b' '): - return line[:-2].rstrip(chars) + b' ' + eol + if is_markdown and (not line.isspace()) and line.endswith(b" "): + return line[:-2].rstrip(chars) + b" " + eol return line.rstrip(chars) + eol @@ -48,48 +47,46 @@ def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() parser.add_argument( - '--no-markdown-linebreak-ext', - action='store_true', + "--no-markdown-linebreak-ext", + action="store_true", help=argparse.SUPPRESS, ) parser.add_argument( - '--markdown-linebreak-ext', - action='append', + "--markdown-linebreak-ext", + action="append", default=[], - metavar='*|EXT[,EXT,...]', + metavar="*|EXT[,EXT,...]", help=( - 'Markdown extensions (or *) to not strip linebreak spaces. ' - 'default: %(default)s' + "Markdown extensions (or *) to not strip linebreak spaces. " + "default: %(default)s" ), ) parser.add_argument( - '--chars', + "--chars", help=( - 'The set of characters to strip from the end of lines. ' - 'Defaults to all whitespace characters.' + "The set of characters to strip from the end of lines. " + "Defaults to all whitespace characters." ), ) - parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument("filenames", nargs="*", help="Filenames to fix") args = parser.parse_args(argv) if args.no_markdown_linebreak_ext: - print('--no-markdown-linebreak-ext now does nothing!') + print("--no-markdown-linebreak-ext now does nothing!") md_args = args.markdown_linebreak_ext - if '' in md_args: - parser.error('--markdown-linebreak-ext requires a non-empty argument') - all_markdown = '*' in md_args + if "" in md_args: + parser.error("--markdown-linebreak-ext requires a non-empty argument") + all_markdown = "*" in md_args # normalize extensions; split at ',', lowercase, and force 1 leading '.' - md_exts = [ - '.' + x.lower().lstrip('.') for x in ','.join(md_args).split(',') - ] + md_exts = ["." + x.lower().lstrip(".") for x in ",".join(md_args).split(",")] # reject probable "eaten" filename as extension: skip leading '.' with [1:] for ext in md_exts: - if any(c in ext[1:] for c in r'./\:'): + if any(c in ext[1:] for c in r"./\:"): parser.error( - f'bad --markdown-linebreak-ext extension ' - f'{ext!r} (has . / \\ :)\n' + f"bad --markdown-linebreak-ext extension " + f"{ext!r} (has . / \\ :)\n" f" (probably filename; use '--markdown-linebreak-ext=EXT')", ) chars = None if args.chars is None else args.chars.encode() @@ -98,10 +95,10 @@ def main(argv: Optional[Sequence[str]] = None) -> int: _, extension = os.path.splitext(filename.lower()) md = all_markdown or extension in md_exts if _fix_file(filename, md, chars): - print(f'Fixing {filename}') + print(f"Fixing {filename}") return_code = 1 return return_code -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/requirements.txt b/requirements.txt index bd9b2879..e26789b2 100755 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ nbqa pre_commit_hooks flake8 pandas==1.1.5 -numpy \ No newline at end of file +numpy diff --git a/src/construction_matches.py b/src/construction_matches.py index dc947e0b..41ab2590 100644 --- a/src/construction_matches.py +++ b/src/construction_matches.py @@ -1,5 +1,6 @@ import pandas as pd + def flag_construction_matches(dataframe, target, period, auxiliary): """ Add flag to indicate whether the record has non-null target, period and @@ -22,6 +23,8 @@ def flag_construction_matches(dataframe, target, period, auxiliary): dataframe with additional flag_construction_matches column """ - dataframe["flag_construction_matches"] = pd.notna(dataframe[[target, period, auxiliary]]).all(axis="columns") + dataframe["flag_construction_matches"] = pd.notna( + dataframe[[target, period, auxiliary]] + ).all(axis="columns") return dataframe diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py index 64989781..e3d55c2a 100644 --- a/src/flag_and_count_matched_pairs.py +++ b/src/flag_and_count_matched_pairs.py @@ -1,7 +1,10 @@ +import numpy as np import pandas as pd -import numpy as np -def flag_matched_pair_merge(df, forward_or_backward,target, period, reference, strata, time_difference=1): + +def flag_matched_pair_merge( + df, forward_or_backward, target, period, reference, strata, time_difference=1 +): """ function to add flag to df if data forms a matched pair i.e. data is given for both period and predictive period @@ -26,38 +29,44 @@ def flag_matched_pair_merge(df, forward_or_backward,target, period, reference, s Returns ------- pd.DataFrame - dataframe with column added flagging forward matched paris and + dataframe with column added flagging forward matched paris and predictive target variable data column - """ + """ - if forward_or_backward == 'f': + if forward_or_backward == "f": time_difference = time_difference - elif forward_or_backward == 'b': - time_difference = -time_difference + elif forward_or_backward == "b": + time_difference = -time_difference # Creating new DF, shifting period for forward or backward df_with_predictive_column = df[[reference, strata, target]] - df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(months=time_difference) - df_with_predictive_column.rename(columns={target : 'predictive_'+target},inplace = True) - - - df = df.merge(df_with_predictive_column, - left_on=[reference, period, strata], - right_on=[reference, "predictive_period", strata], - how="left") - - matched_col_name = forward_or_backward + '_matched_pair' + df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset( + months=time_difference + ) + df_with_predictive_column.rename( + columns={target: "predictive_" + target}, inplace=True + ) + + df = df.merge( + df_with_predictive_column, + left_on=[reference, period, strata], + right_on=[reference, "predictive_period", strata], + how="left", + ) + + matched_col_name = forward_or_backward + "_matched_pair" df[matched_col_name] = np.where( - df[[target,'predictive_'+target]].isnull().any(axis=1), - False, - True) - - df.drop(['predictive_period'],axis = 1, inplace=True) + df[[target, "predictive_" + target]].isnull().any(axis=1), False, True + ) + + df.drop(["predictive_period"], axis=1, inplace=True) return df -def flag_matched_pair_shift(df,forward_or_backward,target, period, reference, strata, shift=1): +def flag_matched_pair_shift( + df, forward_or_backward, target, period, reference, strata, shift=1 +): """ function to flag matched pairs using the shift method @@ -79,35 +88,41 @@ def flag_matched_pair_shift(df,forward_or_backward,target, period, reference, st Returns ------- _type_ - pandas dataframe with column added flagging forward matched pairs and + pandas dataframe with column added flagging forward matched pairs and predictive target variable data column - """ - - if forward_or_backward == 'f': + """ + + if forward_or_backward == "f": shift = shift - elif forward_or_backward == 'b': + elif forward_or_backward == "b": shift = -shift - df = df.sort_values(by = [reference, period]) - df[["predictive_"+target, "predictive_period"]] = df.groupby([reference, strata]).shift(shift)[[target, period]] + df = df.sort_values(by=[reference, period]) + df[["predictive_" + target, "predictive_period"]] = df.groupby( + [reference, strata] + ).shift(shift)[[target, period]] - df["validate_date"] = np.where(df[period].dt.month - df["predictive_period"].dt.month == shift, True, False) - matched_col_name = forward_or_backward + '_matched_pair' + df["validate_date"] = np.where( + df[period].dt.month - df["predictive_period"].dt.month == shift, True, False + ) + matched_col_name = forward_or_backward + "_matched_pair" df[matched_col_name] = np.where( - df[[target,'predictive_target_variable']].isnull().any(axis=1) | (df["validate_date"] != True), - False, - True) + df[[target, "predictive_target_variable"]].isnull().any(axis=1) + | (~df["validate_date"]), + False, + True, + ) - df.drop(['validate_date','predictive_period'],axis = 1, inplace=True) + df.drop(["validate_date", "predictive_period"], axis=1, inplace=True) return df - + def count_matches(df, flag_column_name, period, strata, count_column_name=None): """ Function to count the number of records with matches per period and stratum - + Parameters ---------- df : pd.DataFrame @@ -126,9 +141,10 @@ def count_matches(df, flag_column_name, period, strata, count_column_name=None): ------- pd.DataFrame dataframe with column added for count of records with matches - """ + """ if count_column_name is None: - count_column_name = flag_column_name.split('_')[0]+'_matched_pair_count' - df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform("sum") + count_column_name = flag_column_name.split("_")[0] + "_matched_pair_count" + df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform( + "sum" + ) return df - diff --git a/tests/helper_functions.py b/tests/helper_functions.py index b9006376..83bce07d 100644 --- a/tests/helper_functions.py +++ b/tests/helper_functions.py @@ -1,7 +1,8 @@ import pandas as pd + def load_and_format(filename): """Load csv as pandas dataframe and cast period column to datetime type""" df_loaded = pd.read_csv(filename) - df_loaded['period'] = pd.to_datetime(df_loaded['period'], format='%Y%m') + df_loaded["period"] = pd.to_datetime(df_loaded["period"], format="%Y%m") return df_loaded diff --git a/tests/test_construction_matches.py b/tests/test_construction_matches.py index 1ad2bce9..104521c2 100644 --- a/tests/test_construction_matches.py +++ b/tests/test_construction_matches.py @@ -1,37 +1,45 @@ -import pytest - from pathlib import Path + +import pytest +from helper_functions import load_and_format from pandas.testing import assert_frame_equal from src.construction_matches import flag_construction_matches from src.flag_and_count_matched_pairs import count_matches -from helper_functions import load_and_format + @pytest.fixture(scope="class") def construction_test_data(): - return load_and_format(Path("tests")/"construction_matches.csv") + return load_and_format(Path("tests") / "construction_matches.csv") + class TestConstructionMatches: def test_construction_matches_flag(self, construction_test_data): - expected_output = construction_test_data[[ - "target", - "period", - "auxiliary", - "flag_construction_matches", - ]] + expected_output = construction_test_data[ + [ + "target", + "period", + "auxiliary", + "flag_construction_matches", + ] + ] input_data = expected_output.drop(columns=["flag_construction_matches"]) - actual_output = flag_construction_matches(input_data, "target", "period", "auxiliary") + actual_output = flag_construction_matches( + input_data, "target", "period", "auxiliary" + ) assert_frame_equal(actual_output, expected_output) def test_construction_matches_count(self, construction_test_data): - expected_output = construction_test_data[[ - "period", - "flag_construction_matches", - "strata", - "count_construction_matches", - ]] + expected_output = construction_test_data[ + [ + "period", + "flag_construction_matches", + "strata", + "count_construction_matches", + ] + ] input_data = expected_output.drop(columns=["count_construction_matches"]) actual_output = count_matches( @@ -39,7 +47,7 @@ def test_construction_matches_count(self, construction_test_data): "flag_construction_matches", "period", "strata", - "count_construction_matches" + "count_construction_matches", ) assert_frame_equal(actual_output, expected_output) diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv index bc126f2e..e05d9fe3 100644 --- a/tests/test_data_matched_pair/case1_expected_output.csv +++ b/tests/test_data_matched_pair/case1_expected_output.csv @@ -4,4 +4,4 @@ reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_ma 1,101,202403,,False,1,False,0 2,101,202401,270,False,0,True,2 2,101,202402,250,True,2,True,1 -2,101,202403,255,True,1,False,0 \ No newline at end of file +2,101,202403,255,True,1,False,0 diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv index c03c0ff6..dbae472b 100644 --- a/tests/test_data_matched_pair/case2_expected_output.csv +++ b/tests/test_data_matched_pair/case2_expected_output.csv @@ -6,4 +6,4 @@ reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_ma 2,101,202402,250,True,2,True,1 2,101,202403,255,True,1,False,0 2,102,202404,260,False,0,True,1 -2,102,202405,272,True,1,False,0 \ No newline at end of file +2,102,202405,272,True,1,False,0 diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv index 7e40574d..12ad8810 100644 --- a/tests/test_data_matched_pair/case3_expected_output.csv +++ b/tests/test_data_matched_pair/case3_expected_output.csv @@ -4,4 +4,4 @@ reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_ma 1,101,202403,,False,0,False,0 2,101,202401,270,False,0,True,2 2,101,202402,250,True,2,False,0 -2,101,202404,255,False,0,False,0 \ No newline at end of file +2,101,202404,255,False,0,False,0 diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py index 927ee26a..6d765521 100644 --- a/tests/test_flag_and_count_matched_pairs.py +++ b/tests/test_flag_and_count_matched_pairs.py @@ -1,69 +1,113 @@ -import pandas as pd -import pytest - -from pandas.testing import assert_frame_equal from pathlib import Path -from src.flag_and_count_matched_pairs import flag_matched_pair_merge, count_matches, flag_matched_pair_shift +import pytest from helper_functions import load_and_format +from pandas.testing import assert_frame_equal + +from src.flag_and_count_matched_pairs import ( + count_matches, + flag_matched_pair_merge, + flag_matched_pair_shift, +) # Case 1 - two businesses, one missing value -# Case 2 - change in strata (sic) -# Case 3 - Missing period for one business +# Case 2 - change in strata (sic) +# Case 3 - Missing period for one business -filepath = Path('tests')/'test_data_matched_pair' +filepath = Path("tests") / "test_data_matched_pair" file_name_cases = [ - (filepath/'case1_expected_output.csv'), - (filepath/'case2_expected_output.csv'), - (filepath/'case3_expected_output.csv'), - ] + (filepath / "case1_expected_output.csv"), + (filepath / "case2_expected_output.csv"), + (filepath / "case3_expected_output.csv"), +] + +pytestmark = pytest.mark.parametrize("expected_output_file", file_name_cases) -pytestmark = pytest.mark.parametrize("expected_output_file",file_name_cases) class TestMatchedPair: def test_flag_matched_pair_merge_forward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop(['f_matched_pair_count','b_matched_pair','b_matched_pair_count'],axis = 1,inplace=True) - df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']] - df_output = flag_matched_pair_merge(df_input, 'f', 'target_variable', 'period', 'reference', 'strata') - df_output.drop(['predictive_target_variable'],axis = 1, inplace=True) + df_expected_output.drop( + ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"], + axis=1, + inplace=True, + ) + df_input = df_expected_output[ + ["reference", "strata", "period", "target_variable"] + ] + df_output = flag_matched_pair_merge( + df_input, "f", "target_variable", "period", "reference", "strata" + ) + df_output.drop(["predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) def test_flag_matched_pair_merge_backward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop(['f_matched_pair_count', 'f_matched_pair', 'b_matched_pair_count'], axis = 1, inplace=True) - df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']] - df_output = flag_matched_pair_merge(df_input, 'b', 'target_variable', 'period', 'reference', 'strata') - df_output.drop(['predictive_target_variable'],axis = 1, inplace=True) + df_expected_output.drop( + ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"], + axis=1, + inplace=True, + ) + df_input = df_expected_output[ + ["reference", "strata", "period", "target_variable"] + ] + df_output = flag_matched_pair_merge( + df_input, "b", "target_variable", "period", "reference", "strata" + ) + df_output.drop(["predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) def test_count_matched_pair_forward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop(['b_matched_pair','b_matched_pair_count'],axis = 1,inplace=True) - df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable', 'f_matched_pair']] - df_output = count_matches(df_input,'f_matched_pair','period','strata') + df_expected_output.drop( + ["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True + ) + df_input = df_expected_output[ + ["reference", "strata", "period", "target_variable", "f_matched_pair"] + ] + df_output = count_matches(df_input, "f_matched_pair", "period", "strata") assert_frame_equal(df_output, df_expected_output) def test_count_matches_backward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop(['f_matched_pair','f_matched_pair_count'],axis = 1,inplace=True) - df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable', 'b_matched_pair']] - df_output = count_matches(df_input,'b_matched_pair','period','strata') + df_expected_output.drop( + ["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True + ) + df_input = df_expected_output[ + ["reference", "strata", "period", "target_variable", "b_matched_pair"] + ] + df_output = count_matches(df_input, "b_matched_pair", "period", "strata") assert_frame_equal(df_output, df_expected_output) def test_flag_matched_pair_shift_forward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop(['f_matched_pair_count','b_matched_pair','b_matched_pair_count'],axis = 1,inplace=True) - df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']] - df_output = flag_matched_pair_shift(df_input,'f','target_variable','period', 'reference', 'strata') - df_output.drop(['predictive_target_variable'],axis=1,inplace=True) + df_expected_output.drop( + ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"], + axis=1, + inplace=True, + ) + df_input = df_expected_output[ + ["reference", "strata", "period", "target_variable"] + ] + df_output = flag_matched_pair_shift( + df_input, "f", "target_variable", "period", "reference", "strata" + ) + df_output.drop(["predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) def test_flag_matched_pair_shift_backward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop(['f_matched_pair_count','f_matched_pair','b_matched_pair_count'],axis = 1,inplace=True) - df_input = df_expected_output[['reference', 'strata', 'period', 'target_variable']] - df_output = flag_matched_pair_shift(df_input,'b','target_variable','period', 'reference', 'strata') - df_output.drop(['predictive_target_variable'],axis=1,inplace=True) - assert_frame_equal(df_output, df_expected_output) \ No newline at end of file + df_expected_output.drop( + ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"], + axis=1, + inplace=True, + ) + df_input = df_expected_output[ + ["reference", "strata", "period", "target_variable"] + ] + df_output = flag_matched_pair_shift( + df_input, "b", "target_variable", "period", "reference", "strata" + ) + df_output.drop(["predictive_target_variable"], axis=1, inplace=True) + assert_frame_equal(df_output, df_expected_output) From d2d63bf3a441bc16cae43b68299f9635ad974679 Mon Sep 17 00:00:00 2001 From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Date: Mon, 13 May 2024 14:50:53 +0100 Subject: [PATCH 02/20] Adding flag (f/b) to predictive column name (#12) --- src/flag_and_count_matched_pairs.py | 11 ++++++----- tests/test_flag_and_count_matched_pairs.py | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py index e3d55c2a..d2b6f8a6 100644 --- a/src/flag_and_count_matched_pairs.py +++ b/src/flag_and_count_matched_pairs.py @@ -43,8 +43,9 @@ def flag_matched_pair_merge( df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset( months=time_difference ) + predictive_col_name = forward_or_backward + "_predictive_" + target df_with_predictive_column.rename( - columns={target: "predictive_" + target}, inplace=True + columns={target: predictive_col_name}, inplace=True ) df = df.merge( @@ -57,7 +58,7 @@ def flag_matched_pair_merge( matched_col_name = forward_or_backward + "_matched_pair" df[matched_col_name] = np.where( - df[[target, "predictive_" + target]].isnull().any(axis=1), False, True + df[[target, predictive_col_name]].isnull().any(axis=1), False, True ) df.drop(["predictive_period"], axis=1, inplace=True) @@ -98,7 +99,8 @@ def flag_matched_pair_shift( shift = -shift df = df.sort_values(by=[reference, period]) - df[["predictive_" + target, "predictive_period"]] = df.groupby( + predictive_col_name = forward_or_backward + "_predictive_" + target + df[[predictive_col_name, "predictive_period"]] = df.groupby( [reference, strata] ).shift(shift)[[target, period]] @@ -108,8 +110,7 @@ def flag_matched_pair_shift( matched_col_name = forward_or_backward + "_matched_pair" df[matched_col_name] = np.where( - df[[target, "predictive_target_variable"]].isnull().any(axis=1) - | (~df["validate_date"]), + df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]), False, True, ) diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py index 6d765521..cf4b0525 100644 --- a/tests/test_flag_and_count_matched_pairs.py +++ b/tests/test_flag_and_count_matched_pairs.py @@ -39,7 +39,7 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file): df_output = flag_matched_pair_merge( df_input, "f", "target_variable", "period", "reference", "strata" ) - df_output.drop(["predictive_target_variable"], axis=1, inplace=True) + df_output.drop(["f_predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) def test_flag_matched_pair_merge_backward(self, expected_output_file): @@ -55,7 +55,7 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file): df_output = flag_matched_pair_merge( df_input, "b", "target_variable", "period", "reference", "strata" ) - df_output.drop(["predictive_target_variable"], axis=1, inplace=True) + df_output.drop(["b_predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) def test_count_matched_pair_forward(self, expected_output_file): @@ -93,7 +93,7 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file): df_output = flag_matched_pair_shift( df_input, "f", "target_variable", "period", "reference", "strata" ) - df_output.drop(["predictive_target_variable"], axis=1, inplace=True) + df_output.drop(["f_predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) def test_flag_matched_pair_shift_backward(self, expected_output_file): @@ -109,5 +109,5 @@ def test_flag_matched_pair_shift_backward(self, expected_output_file): df_output = flag_matched_pair_shift( df_input, "b", "target_variable", "period", "reference", "strata" ) - df_output.drop(["predictive_target_variable"], axis=1, inplace=True) + df_output.drop(["b_predictive_target_variable"], axis=1, inplace=True) assert_frame_equal(df_output, df_expected_output) From f7ba3968b7037622bc267bdcd89620eec99f3144 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Mon, 13 May 2024 17:21:07 +0100 Subject: [PATCH 03/20] Add function for forward, backward link --- src/forward_link.py | 105 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/forward_link.py diff --git a/src/forward_link.py b/src/forward_link.py new file mode 100644 index 00000000..7b648963 --- /dev/null +++ b/src/forward_link.py @@ -0,0 +1,105 @@ +from typing import List + +import numpy as np +import pandas as pd + + +def zerofy_values( + df: pd.DataFrame, target_variable: List[str] or str, expr: str +) -> pd.DataFrame: + """Convert values in a dataframe column to 0 based on a python expression + + Parameters + ---------- + df : pd.Dataframe + Pandas dataframe of original data. + target_variable : List[str] or str + Column name(s) containing target variable(s). + query : str + The expression to evaluate, see here: + https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html + + Returns + ------- + df : pd.Dataframe + + + """ + + try: + df.loc[~(df.eval(expr)), target_variable] = 0 + + except ValueError: + print( + f"""{expr} is not a valid expression, + the code uses ~(df.eval({expr}) to mask the dataframe, please see here: + https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html + """ + ) + + +def get_link( + df: pd.DataFrame, + groups: List[str] or str, + match_col: str, + target_variable: str, + predictive_variable: str, + filter_cond: str = None, +) -> pd.DataFrame: + """ + Calculate link between target_variable and predictive_variable by given groups, + a match_col must be supplied which indicates if target_variable and + predictive_variable can be linked. If an optional filter_cond is given + it excludes them when calculating the links. + + Parameters + ---------- + df : pd.Dataframe + Original dataframe. + groups : List[str] or str + Column name(s) to calculate the sums. + match_col : str + Column of the matched pair links, this column should be bool, + or 0 and 1. + target_variable : str + Column name of the targeted variable. + predictive_variable : str + Column name of the predicted target variable. + filter_cond : str, optional + Expression to exclude specific values from the links. + The default is None. + + Returns + ------- + link : pd.Series + A pandas series with the links. + """ + + df_intermediate = df.copy() + + # If condition supplied exclude filtered values from links + if filter_cond is not None: + + df_intermediate.zerofy_values( + [target_variable, predictive_variable], filter_cond + ) + + df_intermediate[target_variable] = ( + df_intermediate[target_variable] * df_intermediate[match_col] + ) + + df_intermediate[predictive_variable] = ( + df_intermediate[predictive_variable] * df_intermediate[match_col] + ) + + numerator = df_intermediate.groupby(groups)[target_variable].transform("sum") + + denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum") + + denominator.replace(0, np.nan, inplace=True) # cover division with 0 + + link = numerator / denominator + + link.replace(np.nan, 1, inplace=True) # set defaults + + return link From 8f5c987ea0cb014fd45d36159661cc1db627d052 Mon Sep 17 00:00:00 2001 From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Date: Tue, 14 May 2024 11:45:57 +0100 Subject: [PATCH 04/20] Add pre-commit hooks as test when merging (#11) --- .github/workflows/main.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 080d4cd3..dc5b5228 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -47,3 +47,23 @@ jobs: - name: Run pytest run: | pytest -v + + commit-hooks: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v3 + with: + python-version: 3.6.8 + cache: 'pip' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pre-commit install + + - name: Check commit hooks + run: | + pre-commit run --all-files From dd2b3024112a1850bd2ebf4b35dd303e2fbede4b Mon Sep 17 00:00:00 2001 From: zogkoa Date: Wed, 15 May 2024 13:53:07 +0100 Subject: [PATCH 05/20] Add unit tests for link filters --- tests/test_forward_link.py | 144 +++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/test_forward_link.py diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py new file mode 100644 index 00000000..a7b4009e --- /dev/null +++ b/tests/test_forward_link.py @@ -0,0 +1,144 @@ +import numpy as np +import pandas as pd +from pandas.testing import assert_frame_equal + +from src.forward_link import zerofy_values + + +class TestFilters: + # based on 02_C_FI_input + df = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [202001, 202002, 202001, 202002, 202001, 202002, 202001, 202002], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [2536.0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + def test_basic_filter(self): + """Test a basic filter, filters questions with identifier different to 20001""" + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "identifier != '20001'" + + df_copy = self.df.copy() + + zerofy_values(df_copy, "question", link_filter) + + assert_frame_equal(df_copy, expected) + + def test_basic_multiple_columns(self): + """Test a basic filter in more than 1 column""" + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [0, 0, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "identifier != '20001'" + + df_copy = self.df.copy() + + zerofy_values(df_copy, ["question", "other"], link_filter) + + assert_frame_equal(df_copy, expected) + + def test_basic_multiple_values(self): + """ + Test a filter in multiple values, filters questions which aren't + in ('20001', '20002') + """ + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 0, 0, 0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "identifier not in ('20001', '20002')" + + df_copy = self.df.copy() + + zerofy_values(df_copy, "question", link_filter) + + assert_frame_equal(df_copy, expected) + + def test_multiple_filters(self): + """ + Test multiple conditions, filters questions which aren't in date 202001 + and identifier in 20001 in the same time + """ + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "not(date == '202001' and identifier in ('20001'))" + + df_copy = self.df.copy() + + zerofy_values(df_copy, "question", link_filter) + + assert_frame_equal(df_copy, expected) From 3562d2f5886fb63f70ca7e0c96ae808d9f17ccc9 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Wed, 15 May 2024 13:55:55 +0100 Subject: [PATCH 06/20] Add unit tests for get_link function --- tests/test_forward_link.py | 198 ++++++++++++++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 2 deletions(-) diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index a7b4009e..bcf240da 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,8 +1,8 @@ import numpy as np import pandas as pd -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal -from src.forward_link import zerofy_values +from src.forward_link import get_link, zerofy_values class TestFilters: @@ -142,3 +142,197 @@ def test_multiple_filters(self): zerofy_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) + + +class TestLink: + + # from scenario 33_multi_variable_C_BI_R + # We could parametrise this with more scenarios if needed + df = pd.DataFrame( + data={ + "identifier": [ + 10001, + 10001, + 10001, + 10002, + 10002, + 10002, + 10001, + 10001, + 10001, + 10002, + 10002, + 10002, + 10005, + 10005, + 10005, + ], + "date": [ + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + ], + "group": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2], + "question": [ + 547.0, + 362.0, + 895.0, + 381.0, + 573.0, + 214.0, + 961.0, + 267.0, + 314.0, + 555.0, + 628.0, + 736.0, + np.nan, + np.nan, + 100.0, + ], + "f_predictive_question": [ + np.nan, + 547.0, + 362.0, + np.nan, + 381.0, + 573.0, + np.nan, + 961.0, + 267.0, + np.nan, + 555.0, + 628.0, + np.nan, + np.nan, + np.nan, + ], + "b_predictive_question": [ + 362.0, + 895.0, + np.nan, + 573.0, + 214.0, + np.nan, + 267.0, + 314.0, + np.nan, + 628.0, + 736.0, + np.nan, + np.nan, + 100.0, + np.nan, + ], + "f_matched_pair": [ + False, + True, + True, + False, + True, + True, + False, + True, + True, + False, + True, + True, + False, + False, + False, + ], + "b_matched_pair": [ + True, + True, + False, + True, + True, + False, + True, + True, + False, + True, + True, + False, + False, + False, + False, + ], + } + ) + + def test_forward_link(self): + + expected_f_link = pd.Series( + [ + 1.0, + 1.0075431034482758, + 1.186096256684492, + 1.0, + 1.0075431034482758, + 1.186096256684492, + 1.0, + 0.5903693931398417, + 1.1731843575418994, + 1.0, + 0.5903693931398417, + 1.1731843575418994, + 1.0, + 0.5903693931398417, + 1.1731843575418994, + ] + ) + + f_link = get_link( + self.df, + ["group", "date"], + "f_matched_pair", + "question", + "f_predictive_question", + ) + + assert_series_equal(f_link, expected_f_link) + + def test_backward_link(self): + + expected_b_link = pd.Series( + [ + 0.9925133689839573, + 0.8431018935978359, + 1.0, + 0.9925133689839573, + 0.8431018935978359, + 1.0, + 1.693854748603352, + 0.8523809523809524, + 1.0, + 1.693854748603352, + 0.8523809523809524, + 1.0, + 0.9925133689839573, + 0.8523809523809524, + 1.0, + ] + ) + + b_link = get_link( + self.df, + ["group", "date"], + "b_matched_pair", + "question", + "b_predictive_question", + ) + + assert_series_equal(b_link, expected_b_link) From d31535fa597d8e61fb4f6670cf7ee9afb3c61e24 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 16 May 2024 11:35:41 +0100 Subject: [PATCH 07/20] Rename zerofy_values function to mask_values --- src/forward_link.py | 10 ++++++---- tests/test_forward_link.py | 10 +++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 7b648963..3926abdb 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -2,9 +2,10 @@ import numpy as np import pandas as pd +from pandas.core.base import PandasObject -def zerofy_values( +def mask_values( df: pd.DataFrame, target_variable: List[str] or str, expr: str ) -> pd.DataFrame: """Convert values in a dataframe column to 0 based on a python expression @@ -38,6 +39,9 @@ def zerofy_values( ) +PandasObject.mask_values = mask_values + + def get_link( df: pd.DataFrame, groups: List[str] or str, @@ -80,9 +84,7 @@ def get_link( # If condition supplied exclude filtered values from links if filter_cond is not None: - df_intermediate.zerofy_values( - [target_variable, predictive_variable], filter_cond - ) + df_intermediate.mask_values([target_variable, predictive_variable], filter_cond) df_intermediate[target_variable] = ( df_intermediate[target_variable] * df_intermediate[match_col] diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index bcf240da..80f35ef9 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -2,7 +2,7 @@ import pandas as pd from pandas.testing import assert_frame_equal, assert_series_equal -from src.forward_link import get_link, zerofy_values +from src.forward_link import get_link, mask_values class TestFilters: @@ -43,7 +43,7 @@ def test_basic_filter(self): df_copy = self.df.copy() - zerofy_values(df_copy, "question", link_filter) + mask_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) @@ -73,7 +73,7 @@ def test_basic_multiple_columns(self): df_copy = self.df.copy() - zerofy_values(df_copy, ["question", "other"], link_filter) + mask_values(df_copy, ["question", "other"], link_filter) assert_frame_equal(df_copy, expected) @@ -106,7 +106,7 @@ def test_basic_multiple_values(self): df_copy = self.df.copy() - zerofy_values(df_copy, "question", link_filter) + mask_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) @@ -139,7 +139,7 @@ def test_multiple_filters(self): df_copy = self.df.copy() - zerofy_values(df_copy, "question", link_filter) + mask_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) From 3509145dddc391e1ce512f214572aa6a8b60e335 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 16 May 2024 11:43:47 +0100 Subject: [PATCH 08/20] Rename get_link function to calculate_imputation_link --- src/forward_link.py | 2 +- tests/test_forward_link.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 3926abdb..1cd6aca3 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -42,7 +42,7 @@ def mask_values( PandasObject.mask_values = mask_values -def get_link( +def calculate_imputation_link( df: pd.DataFrame, groups: List[str] or str, match_col: str, diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 80f35ef9..583b5485 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -2,7 +2,7 @@ import pandas as pd from pandas.testing import assert_frame_equal, assert_series_equal -from src.forward_link import get_link, mask_values +from src.forward_link import calculate_imputation_link, mask_values class TestFilters: @@ -295,7 +295,7 @@ def test_forward_link(self): ] ) - f_link = get_link( + f_link = calculate_imputation_link( self.df, ["group", "date"], "f_matched_pair", @@ -327,7 +327,7 @@ def test_backward_link(self): ] ) - b_link = get_link( + b_link = calculate_imputation_link( self.df, ["group", "date"], "b_matched_pair", From a3067c1e5faa69548dc859d98dd67d1cc0269010 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 10:40:54 +0100 Subject: [PATCH 09/20] Update mask_values to return a series --- src/forward_link.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 1cd6aca3..30e99fb4 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -2,12 +2,9 @@ import numpy as np import pandas as pd -from pandas.core.base import PandasObject -def mask_values( - df: pd.DataFrame, target_variable: List[str] or str, expr: str -) -> pd.DataFrame: +def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series: """Convert values in a dataframe column to 0 based on a python expression Parameters @@ -22,13 +19,14 @@ def mask_values( Returns ------- - df : pd.Dataframe + df : pd.Series """ + masked_column = df[target_variable].copy() try: - df.loc[~(df.eval(expr)), target_variable] = 0 + masked_column.loc[~(df.eval(expr))] = np.nan except ValueError: print( @@ -38,8 +36,7 @@ def mask_values( """ ) - -PandasObject.mask_values = mask_values + return masked_column def calculate_imputation_link( From 4ddd9319b08fafa10ac1cc180f7bab6f996a71ec Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 10:44:30 +0100 Subject: [PATCH 10/20] Remove mask_values from calculate_links function --- src/forward_link.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 30e99fb4..14870292 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -45,13 +45,11 @@ def calculate_imputation_link( match_col: str, target_variable: str, predictive_variable: str, - filter_cond: str = None, -) -> pd.DataFrame: +) -> pd.Series: """ Calculate link between target_variable and predictive_variable by given groups, a match_col must be supplied which indicates if target_variable and - predictive_variable can be linked. If an optional filter_cond is given - it excludes them when calculating the links. + predictive_variable can be linked. Parameters ---------- @@ -66,9 +64,6 @@ def calculate_imputation_link( Column name of the targeted variable. predictive_variable : str Column name of the predicted target variable. - filter_cond : str, optional - Expression to exclude specific values from the links. - The default is None. Returns ------- @@ -78,11 +73,6 @@ def calculate_imputation_link( df_intermediate = df.copy() - # If condition supplied exclude filtered values from links - if filter_cond is not None: - - df_intermediate.mask_values([target_variable, predictive_variable], filter_cond) - df_intermediate[target_variable] = ( df_intermediate[target_variable] * df_intermediate[match_col] ) @@ -99,6 +89,4 @@ def calculate_imputation_link( link = numerator / denominator - link.replace(np.nan, 1, inplace=True) # set defaults - return link From 22fa19e8fd566a5f23c6e50c4d922ea973a71978 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 13:52:16 +0100 Subject: [PATCH 11/20] Add test data for calculate_links --- tests/calculate_links_test_data.csv | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 tests/calculate_links_test_data.csv diff --git a/tests/calculate_links_test_data.csv b/tests/calculate_links_test_data.csv new file mode 100755 index 00000000..72e6408d --- /dev/null +++ b/tests/calculate_links_test_data.csv @@ -0,0 +1,16 @@ +,identifier,period,group,question,f_predictive_question,b_predictive_question,f_matched_pair,b_matched_pair,f_link,b_link +0,10001,202001,1,547.0,,362.0,False,True,,0.9925133689839573 +1,10001,202002,1,362.0,547.0,895.0,True,True,1.0075431034482758,0.8431018935978359 +2,10001,202003,1,895.0,362.0,,True,False,1.186096256684492, +3,10002,202001,1,381.0,,573.0,False,True,,0.9925133689839573 +4,10002,202002,1,573.0,381.0,214.0,True,True,1.0075431034482758,0.8431018935978359 +5,10002,202003,1,214.0,573.0,,True,False,1.186096256684492, +6,10001,202001,2,961.0,,267.0,False,True,,1.693854748603352 +7,10001,202002,2,267.0,961.0,314.0,True,True,0.5903693931398417,0.8523809523809524 +8,10001,202003,2,314.0,267.0,,True,False,1.1731843575418994, +9,10002,202001,2,555.0,,628.0,False,True,,1.693854748603352 +10,10002,202002,2,628.0,555.0,736.0,True,True,0.5903693931398417,0.8523809523809524 +11,10002,202003,2,736.0,628.0,,True,False,1.1731843575418994, +12,10005,202001,1,,,,False,False,,0.9925133689839573 +13,10005,202002,2,,,100.0,False,False,0.5903693931398417,0.8523809523809524 +14,10005,202003,2,100.0,,,False,False,1.1731843575418994, From b2b91e39aac95e3f4b9df52abb3748e1e0a57555 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 13:53:19 +0100 Subject: [PATCH 12/20] Adapt tests for calculate_links with test data --- tests/test_forward_link.py | 204 +++++-------------------------------- 1 file changed, 26 insertions(+), 178 deletions(-) diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 583b5485..d1c0f6cf 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,5 +1,7 @@ import numpy as np import pandas as pd +import pytest +from helper_functions import load_and_format from pandas.testing import assert_frame_equal, assert_series_equal from src.forward_link import calculate_imputation_link, mask_values @@ -144,195 +146,41 @@ def test_multiple_filters(self): assert_frame_equal(df_copy, expected) -class TestLink: +scenarios = ["calculate_links_test_data"] - # from scenario 33_multi_variable_C_BI_R - # We could parametrise this with more scenarios if needed - df = pd.DataFrame( - data={ - "identifier": [ - 10001, - 10001, - 10001, - 10002, - 10002, - 10002, - 10001, - 10001, - 10001, - 10002, - 10002, - 10002, - 10005, - 10005, - 10005, - ], - "date": [ - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - ], - "group": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2], - "question": [ - 547.0, - 362.0, - 895.0, - 381.0, - 573.0, - 214.0, - 961.0, - 267.0, - 314.0, - 555.0, - 628.0, - 736.0, - np.nan, - np.nan, - 100.0, - ], - "f_predictive_question": [ - np.nan, - 547.0, - 362.0, - np.nan, - 381.0, - 573.0, - np.nan, - 961.0, - 267.0, - np.nan, - 555.0, - 628.0, - np.nan, - np.nan, - np.nan, - ], - "b_predictive_question": [ - 362.0, - 895.0, - np.nan, - 573.0, - 214.0, - np.nan, - 267.0, - 314.0, - np.nan, - 628.0, - 736.0, - np.nan, - np.nan, - 100.0, - np.nan, - ], - "f_matched_pair": [ - False, - True, - True, - False, - True, - True, - False, - True, - True, - False, - True, - True, - False, - False, - False, - ], - "b_matched_pair": [ - True, - True, - False, - True, - True, - False, - True, - True, - False, - True, - True, - False, - False, - False, - False, - ], - } - ) - def test_forward_link(self): - - expected_f_link = pd.Series( - [ - 1.0, - 1.0075431034482758, - 1.186096256684492, - 1.0, - 1.0075431034482758, - 1.186096256684492, - 1.0, - 0.5903693931398417, - 1.1731843575418994, - 1.0, - 0.5903693931398417, - 1.1731843575418994, - 1.0, - 0.5903693931398417, - 1.1731843575418994, - ] - ) +@pytest.mark.parametrize("scenario", scenarios) +class TestLinks: + def test_forward_links(self, scenario): + """Test if function returns the f_link column""" + + df_input = load_and_format("tests/" + scenario + ".csv") - f_link = calculate_imputation_link( - self.df, - ["group", "date"], + expected_link = df_input["f_link"] + + link_to_test = calculate_imputation_link( + df_input, + ["group", "period"], "f_matched_pair", "question", "f_predictive_question", ) - assert_series_equal(f_link, expected_f_link) - - def test_backward_link(self): - - expected_b_link = pd.Series( - [ - 0.9925133689839573, - 0.8431018935978359, - 1.0, - 0.9925133689839573, - 0.8431018935978359, - 1.0, - 1.693854748603352, - 0.8523809523809524, - 1.0, - 1.693854748603352, - 0.8523809523809524, - 1.0, - 0.9925133689839573, - 0.8523809523809524, - 1.0, - ] - ) + assert_series_equal(link_to_test, expected_link, check_names=False) + + def test_back_links(self, scenario): + """Test if function returns the b_link column""" + + df_input = load_and_format("tests/" + scenario + ".csv") + + expected_link = df_input["b_link"] - b_link = calculate_imputation_link( - self.df, - ["group", "date"], + link_to_test = calculate_imputation_link( + df_input, + ["group", "period"], "b_matched_pair", "question", "b_predictive_question", ) - assert_series_equal(b_link, expected_b_link) + assert_series_equal(link_to_test, expected_link, check_names=False) From 4bc39c4ff6a52b70d058e0792a1a03f837cd1750 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 17:09:31 +0100 Subject: [PATCH 13/20] Remove mask_values unit tests --- tests/test_forward_link.py | 146 +------------------------------------ 1 file changed, 2 insertions(+), 144 deletions(-) diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index d1c0f6cf..74e32005 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,150 +1,8 @@ -import numpy as np -import pandas as pd import pytest from helper_functions import load_and_format -from pandas.testing import assert_frame_equal, assert_series_equal - -from src.forward_link import calculate_imputation_link, mask_values - - -class TestFilters: - # based on 02_C_FI_input - df = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [202001, 202002, 202001, 202002, 202001, 202002, 202001, 202002], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [2536.0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - def test_basic_filter(self): - """Test a basic filter, filters questions with identifier different to 20001""" - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "identifier != '20001'" - - df_copy = self.df.copy() - - mask_values(df_copy, "question", link_filter) - - assert_frame_equal(df_copy, expected) - - def test_basic_multiple_columns(self): - """Test a basic filter in more than 1 column""" - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [0, 0, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "identifier != '20001'" - - df_copy = self.df.copy() - - mask_values(df_copy, ["question", "other"], link_filter) - - assert_frame_equal(df_copy, expected) - - def test_basic_multiple_values(self): - """ - Test a filter in multiple values, filters questions which aren't - in ('20001', '20002') - """ - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 0, 0, 0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "identifier not in ('20001', '20002')" - - df_copy = self.df.copy() - - mask_values(df_copy, "question", link_filter) - - assert_frame_equal(df_copy, expected) - - def test_multiple_filters(self): - """ - Test multiple conditions, filters questions which aren't in date 202001 - and identifier in 20001 in the same time - """ - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "not(date == '202001' and identifier in ('20001'))" - - df_copy = self.df.copy() - - mask_values(df_copy, "question", link_filter) - - assert_frame_equal(df_copy, expected) +from pandas.testing import assert_series_equal +from src.forward_link import calculate_imputation_link scenarios = ["calculate_links_test_data"] From 761c28331d921fec6bcf7a00681b43c5b6e9d0d1 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 21 May 2024 11:15:14 +0100 Subject: [PATCH 14/20] Define strata and period as seperate inputs --- src/forward_link.py | 63 ++++++++++---------------------------- tests/test_forward_link.py | 6 ++-- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 14870292..f58e5512 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -1,65 +1,30 @@ -from typing import List - import numpy as np import pandas as pd -def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series: - """Convert values in a dataframe column to 0 based on a python expression - - Parameters - ---------- - df : pd.Dataframe - Pandas dataframe of original data. - target_variable : List[str] or str - Column name(s) containing target variable(s). - query : str - The expression to evaluate, see here: - https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html - - Returns - ------- - df : pd.Series - - - """ - masked_column = df[target_variable].copy() - - try: - masked_column.loc[~(df.eval(expr))] = np.nan - - except ValueError: - print( - f"""{expr} is not a valid expression, - the code uses ~(df.eval({expr}) to mask the dataframe, please see here: - https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html - """ - ) - - return masked_column - - def calculate_imputation_link( df: pd.DataFrame, - groups: List[str] or str, + period: str, + strata: str, match_col: str, target_variable: str, predictive_variable: str, ) -> pd.Series: """ - Calculate link between target_variable and predictive_variable by given groups, - a match_col must be supplied which indicates if target_variable and - predictive_variable can be linked. + Calculate link between target_variable and predictive_variable by strata, + a match_col must be supplied which indicates if target_variable + and predictive_variable can be linked. Parameters ---------- df : pd.Dataframe Original dataframe. - groups : List[str] or str - Column name(s) to calculate the sums. + period : str + Column name containing time period. + strata : str + Column name containing strata information (sic). match_col : str - Column of the matched pair links, this column should be bool, - or 0 and 1. + Column name of the matched pair links, this column should be bool. target_variable : str Column name of the targeted variable. predictive_variable : str @@ -81,9 +46,13 @@ def calculate_imputation_link( df_intermediate[predictive_variable] * df_intermediate[match_col] ) - numerator = df_intermediate.groupby(groups)[target_variable].transform("sum") + numerator = df_intermediate.groupby([strata, period])[target_variable].transform( + "sum" + ) - denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum") + denominator = df_intermediate.groupby([strata, period])[ + predictive_variable + ].transform("sum") denominator.replace(0, np.nan, inplace=True) # cover division with 0 diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 74e32005..8012d001 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -18,7 +18,8 @@ def test_forward_links(self, scenario): link_to_test = calculate_imputation_link( df_input, - ["group", "period"], + "period", + "group", "f_matched_pair", "question", "f_predictive_question", @@ -35,7 +36,8 @@ def test_back_links(self, scenario): link_to_test = calculate_imputation_link( df_input, - ["group", "period"], + "period", + "group", "b_matched_pair", "question", "b_predictive_question", From 1eb616ce4b5e435ddfa8b21607d98a3939eac1d5 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 21 May 2024 15:42:04 +0100 Subject: [PATCH 15/20] Change return type to dataframe, add exceptions too --- src/forward_link.py | 25 +++++++++++++++---- tests/test_forward_link.py | 49 ++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index f58e5512..1ac97429 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -9,7 +9,7 @@ def calculate_imputation_link( match_col: str, target_variable: str, predictive_variable: str, -) -> pd.Series: +) -> pd.DataFrame: """ Calculate link between target_variable and predictive_variable by strata, a match_col must be supplied which indicates if target_variable @@ -32,12 +32,27 @@ def calculate_imputation_link( Returns ------- - link : pd.Series - A pandas series with the links. + df : pd.DataFrame + A pandas DataFrame with a new column containing either f_link or b_link + based on the input parameters. """ df_intermediate = df.copy() + if match_col == "f_matched_pair" and predictive_variable == "f_predictive_question": + link_col_name = "f_link" + + elif ( + match_col == "b_matched_pair" and predictive_variable == "b_predictive_question" + ): + link_col_name = "b_link" + + else: + raise ValueError( + f""" + {match_col} and {predictive_variable} do not have same wildcard.""" + ) + df_intermediate[target_variable] = ( df_intermediate[target_variable] * df_intermediate[match_col] ) @@ -56,6 +71,6 @@ def calculate_imputation_link( denominator.replace(0, np.nan, inplace=True) # cover division with 0 - link = numerator / denominator + df[link_col_name] = numerator / denominator - return link + return df diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 8012d001..51fa63c8 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,6 +1,6 @@ import pytest from helper_functions import load_and_format -from pandas.testing import assert_series_equal +from pandas.testing import assert_frame_equal from src.forward_link import calculate_imputation_link @@ -12,11 +12,11 @@ class TestLinks: def test_forward_links(self, scenario): """Test if function returns the f_link column""" - df_input = load_and_format("tests/" + scenario + ".csv") + df_output = load_and_format("tests/" + scenario + ".csv") - expected_link = df_input["f_link"] + df_input = df_output.drop(columns=["f_link"]) - link_to_test = calculate_imputation_link( + df_input = calculate_imputation_link( df_input, "period", "group", @@ -25,16 +25,15 @@ def test_forward_links(self, scenario): "f_predictive_question", ) - assert_series_equal(link_to_test, expected_link, check_names=False) + assert_frame_equal(df_input, df_output, check_like=True) def test_back_links(self, scenario): """Test if function returns the b_link column""" + df_output = load_and_format("tests/" + scenario + ".csv") - df_input = load_and_format("tests/" + scenario + ".csv") + df_input = df_output.drop(columns=["b_link"]) - expected_link = df_input["b_link"] - - link_to_test = calculate_imputation_link( + df_input = calculate_imputation_link( df_input, "period", "group", @@ -43,4 +42,34 @@ def test_back_links(self, scenario): "b_predictive_question", ) - assert_series_equal(link_to_test, expected_link, check_names=False) + assert_frame_equal(df_input, df_output, check_like=True) + + def test_exception(self, scenario): + + df = load_and_format("tests/" + scenario + ".csv") + + with pytest.raises(ValueError): + """ + Test if function is called with wrong arguments, in particular + with f_matched_pair and b_predictive_question or with + b_matched_pair and f_predictive_question. + """ + + df = calculate_imputation_link( + df, + "period", + "group", + "f_matched_pair", + "question", + "b_predictive_question", + ) + with pytest.raises(ValueError): + + df = calculate_imputation_link( + df, + "period", + "group", + "b_matched_pair", + "question", + "f_predictive_question", + ) From 0e4b26167d6d674b337be72b88757128ad825bcb Mon Sep 17 00:00:00 2001 From: Wil Roberts <47739563+robertswh@users.noreply.github.com> Date: Tue, 21 May 2024 16:31:19 +0100 Subject: [PATCH 16/20] 330 consecutive imputation links (#15) * add function for cumulative imputation links * added tests for forward and backward cumulative links * adding pre-commit hooks * changes after review --- src/cumulative_imputation_links.py | 72 +++++++++++++++++++++++ tests/cumulative_links.csv | 7 +++ tests/test_cumulative_imputation_links.py | 64 ++++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100755 src/cumulative_imputation_links.py create mode 100755 tests/cumulative_links.csv create mode 100755 tests/test_cumulative_imputation_links.py diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py new file mode 100755 index 00000000..91dfbed9 --- /dev/null +++ b/src/cumulative_imputation_links.py @@ -0,0 +1,72 @@ +import numpy as np + + +def get_cumulative_links( + dataframe, + forward_or_backward, + strata, + reference, + target, + period, + imputation_link, + time_difference=1, +): + """ + Create cumulative imputation links for multiple consecutive periods + without a return. + + Parameters + ---------- + dataframe : pandas.DataFrame + forward_or_backward: str + either f or b for forward or backward method + + strata : str + column name containing strata information (sic) + reference : str + column name containing business reference id + target : str + column name containing target variable + period : str + column name containing time period + imputation_link : string + column name containing imputation links + time_difference : int + time difference between predictive and target period in months + + Returns + ------- + pandas.DataFrame + dataframe with imputation_group and + cumulative_forward/backward_imputation_link column + """ + + dataframe.sort_values([strata, reference, period], inplace=True) + dataframe["missing_value"] = np.where(dataframe[target].isnull(), True, False) + + dataframe["imputation_group"] = ( + ( + (dataframe["missing_value"].diff(time_difference) != 0) + | (dataframe[strata].diff(time_difference) != 0) + | (dataframe[reference].diff(time_difference) != 0) + ) + .astype("int") + .cumsum() + ) + + if forward_or_backward == "f": + dataframe["cumulative_" + imputation_link] = dataframe.groupby( + "imputation_group" + )[imputation_link].cumprod() + elif forward_or_backward == "b": + dataframe["cumulative_" + imputation_link] = ( + dataframe[::-1].groupby("imputation_group")[imputation_link].cumprod()[::-1] + ) + + dataframe["cumulative_" + imputation_link] = np.where( + ~dataframe[target].isnull(), + np.nan, + dataframe["cumulative_" + imputation_link], + ) + + return dataframe[["imputation_group", "cumulative_" + imputation_link]] diff --git a/tests/cumulative_links.csv b/tests/cumulative_links.csv new file mode 100755 index 00000000..bef347a5 --- /dev/null +++ b/tests/cumulative_links.csv @@ -0,0 +1,7 @@ +strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link +100,100000,200,202402,1,2,1,, +100,100000,,202403,2,0.6,2,2,0.6 +100,100000,,202404,3,1,2,6,1 +200,100001,,202402,1,4,3,1,2 +200,100001,,202403,3,0.5,3,3,0.5 +200,100001,300,202404,0.5,1,4,, diff --git a/tests/test_cumulative_imputation_links.py b/tests/test_cumulative_imputation_links.py new file mode 100755 index 00000000..bf31094a --- /dev/null +++ b/tests/test_cumulative_imputation_links.py @@ -0,0 +1,64 @@ +from pathlib import Path + +import pytest +from helper_functions import load_and_format +from pandas.testing import assert_frame_equal + +from src.cumulative_imputation_links import get_cumulative_links + + +@pytest.fixture(scope="class") +def cumulative_links_test_data(): + return load_and_format(Path("tests") / "cumulative_links.csv") + + +class TestComulativeLinks: + def test_get_cumulative_links_forward(self, cumulative_links_test_data): + input_data = cumulative_links_test_data.drop( + columns=["cumulative_forward_imputation_link", "imputation_group"] + ) + + expected_output = cumulative_links_test_data[ + [ + "imputation_group", + "cumulative_forward_imputation_link", + ] + ] + + actual_output = get_cumulative_links( + input_data, + "f", + "strata", + "reference", + "target", + "period", + "forward_imputation_link", + 1, + ) + + assert_frame_equal(actual_output, expected_output) + + def test_get_cumulative_links_backward(self, cumulative_links_test_data): + input_data = cumulative_links_test_data.drop( + columns=["cumulative_backward_imputation_link", "imputation_group"] + ) + + expected_output = cumulative_links_test_data[ + [ + "imputation_group", + "cumulative_backward_imputation_link", + ] + ] + + actual_output = get_cumulative_links( + input_data, + "b", + "strata", + "reference", + "target", + "period", + "backward_imputation_link", + 1, + ) + + assert_frame_equal(actual_output, expected_output) From a27bb91fb921c66b4fb8ef12efead02d59ff7cee Mon Sep 17 00:00:00 2001 From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Date: Wed, 22 May 2024 14:25:55 +0100 Subject: [PATCH 17/20] 353 create imputation markers (#14) * Change unit tests from dropping to selecting, ready for adding more cols into test data * Adding module to calculate imputation flag columns * Creating unit test and test data for imputation flag * Copying input data to fix pandas copy warnings * Adding docstrings * Refactoring `matched_pair` column to include target column in name * Update impute flags to include impute from construction * Create function to convert impute flags into single column with strings * Fixing pandas copy on slice warning * Updating docstring and handle case where needed columns are not included * Update error message * Adding unit test for string flag column * Renaming imputation flag function to imputation_flag_marker * Rename column in test data * Refactor to use dictionary to store imputation markers and conditions (can be extracted to yaml file if needed) * Refactor to define column names earlier in function * Add f_predictive_auxiliary variable to test data * refactor: Add predictive_auxiliary as function argument Instead of calling flag_matched_pair_merge within the function to create the predictive_auxiliary, it is defined as function argument. Hence flag_matched_pair_merge must be called before create_impute_flags. This will convert flag_matched_pair_merge to a low level function and using pandas framework. * Change period type to int * Update expected columns in function and tests --------- Co-authored-by: zogkoa --- src/flag_and_count_matched_pairs.py | 6 +- src/imputation_flags.py | 137 ++++++++++++++++++ tests/imputation_flag_data.csv | 28 ++++ .../case1_expected_output.csv | 2 +- .../case2_expected_output.csv | 2 +- .../case3_expected_output.csv | 2 +- tests/test_flag_and_count_matched_pairs.py | 106 ++++++++++---- tests/test_imputation_flags.py | 50 +++++++ 8 files changed, 297 insertions(+), 36 deletions(-) create mode 100644 src/imputation_flags.py create mode 100644 tests/imputation_flag_data.csv create mode 100644 tests/test_imputation_flags.py diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py index d2b6f8a6..7d286892 100644 --- a/src/flag_and_count_matched_pairs.py +++ b/src/flag_and_count_matched_pairs.py @@ -39,7 +39,7 @@ def flag_matched_pair_merge( time_difference = -time_difference # Creating new DF, shifting period for forward or backward - df_with_predictive_column = df[[reference, strata, target]] + df_with_predictive_column = df.copy()[[reference, strata, target]] df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset( months=time_difference ) @@ -55,7 +55,7 @@ def flag_matched_pair_merge( how="left", ) - matched_col_name = forward_or_backward + "_matched_pair" + matched_col_name = forward_or_backward + "_matched_pair_" + target df[matched_col_name] = np.where( df[[target, predictive_col_name]].isnull().any(axis=1), False, True @@ -107,7 +107,7 @@ def flag_matched_pair_shift( df["validate_date"] = np.where( df[period].dt.month - df["predictive_period"].dt.month == shift, True, False ) - matched_col_name = forward_or_backward + "_matched_pair" + matched_col_name = forward_or_backward + "_matched_pair_" + target df[matched_col_name] = np.where( df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]), diff --git a/src/imputation_flags.py b/src/imputation_flags.py new file mode 100644 index 00000000..91bc04ad --- /dev/null +++ b/src/imputation_flags.py @@ -0,0 +1,137 @@ +import numpy as np +import pandas as pd + + +def create_impute_flags( + df: pd.DataFrame, + target: str, + reference: str, + strata: str, + auxiliary: str, + predictive_auxiliary: str, +): + + """ + function to create logical columns for each type of imputation + output columns are needed to create the string flag column for + imputation methods. + Function requires f_predictive and b_predictive columns produced + by `flag_matched_pair` function. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing forward, backward predictive period columns ( + These columns are created by calling flag_matched_pair_merge forward + and backwards) + + target : str + Column name containing target variable. + reference : str + Column name containing business reference id. + strata : str + Column name containing strata information (sic). + auxiliary : str + Column name containing auxiliary data. + predictive_auxiliary: str + Column name containing predictive auxiliary data, this is created, + by flag_matched_pair_merge function. + + Returns + ------- + pd.DataFrame + Dataframe with four additional logical columns determining if target + is a return (r_flag) can be imputed by forward imputation (fir_flag), + backward imputation (bir_flag) or can be constructed (c_flag) + """ + for direction in ["f", "b"]: + try: + df["{}_predictive_{}".format(direction, target)] + except KeyError: + raise KeyError( + "Dataframe needs column '{}_predictive_{}',".format(direction, target) + + " run flag_matched_pair function first" + ) + forward_target_roll = "f_predictive_" + target + "_roll" + backward_target_roll = "b_predictive_" + target + "_roll" + forward_aux_roll = "f_predictive_" + auxiliary + "_roll" + + df[forward_target_roll] = df.groupby([reference, strata])[ + "f_predictive_" + target + ].ffill() + + df[backward_target_roll] = df.groupby([reference, strata])[ + "b_predictive_" + target + ].bfill() + + df["r_flag"] = df[target].notna() + + df["fir_flag"] = np.where( + df[forward_target_roll].notna() & df[target].isna(), True, False + ) + + df["bir_flag"] = np.where( + df[backward_target_roll].notna() & df[target].isna(), True, False + ) + + construction_conditions = df[target].isna() & df[auxiliary].notna() + df["c_flag"] = np.where(construction_conditions, True, False) + + df[forward_aux_roll] = df.groupby([reference, strata])[predictive_auxiliary].ffill() + + fic_conditions = df[target].isna() & df[forward_aux_roll].notna() + df["fic_flag"] = np.where(fic_conditions, True, False) + + df.drop( + [ + forward_target_roll, + backward_target_roll, + forward_aux_roll, + predictive_auxiliary, + ], + axis=1, + inplace=True, + ) + + return df + + +def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame: + """ + Function to add column containing the a string indicating the method of + imputation to use following the hierarchy in specifications + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing logical columns produced by `create_imputation_flags` + (r_flag, fir_flag, bir_flag, fic_flag and c_flag) + + + Returns + ------- + pd.DataFrame + Dataframe with additional column containing imputation marker + i.e. the type of imputation method that should be used to fill + missing returns. + """ + + imputation_markers_and_conditions = { + "r": df["r_flag"], + "fir": ~df["r_flag"] & df["fir_flag"], + "bir": ~df["r_flag"] & ~df["fir_flag"] & df["bir_flag"], + "fic": ~df["r_flag"] & ~df["fir_flag"] & ~df["bir_flag"] & df["fic_flag"], + "c": ~df["r_flag"] + & ~df["fir_flag"] + & ~df["bir_flag"] + & ~df["fic_flag"] + & df["c_flag"], + } + + df["imputation_marker"] = np.select( + imputation_markers_and_conditions.values(), + imputation_markers_and_conditions.keys(), + default="error", + ) + + return df diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv new file mode 100644 index 00000000..31b56aa8 --- /dev/null +++ b/tests/imputation_flag_data.csv @@ -0,0 +1,28 @@ +reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker +1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r +1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir +1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r +1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r +2,100,202001,,72.0,,,False,False,True,True,False,,bir +2,100,202002,,,,,False,False,True,False,True,72.0,bir +2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir +2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r +3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir +3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r +3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r +3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r +4,100,202001,64.0,81.0,,,True,False,False,False,False,,r +4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir +4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir +4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r +5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r +5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r +5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r +5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r +6,100,202001,64.0,81.0,,,True,False,False,False,False,,r +6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir +6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r +6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir +7,100,202001,,40.0,,,False,False,False,True,False,,c +7,100,202002,,,,,False,False,False,False,True,40.0,fic +7,100,202003,,,,,False,False,False,False,True,,fic diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv index e05d9fe3..4e833e7b 100644 --- a/tests/test_data_matched_pair/case1_expected_output.csv +++ b/tests/test_data_matched_pair/case1_expected_output.csv @@ -1,4 +1,4 @@ -reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count +reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count 1,101,202401,237,False,0,True,2 1,101,202402,281,True,2,False,1 1,101,202403,,False,1,False,0 diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv index dbae472b..468ad85b 100644 --- a/tests/test_data_matched_pair/case2_expected_output.csv +++ b/tests/test_data_matched_pair/case2_expected_output.csv @@ -1,4 +1,4 @@ -reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count +reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count 1,101,202401,237,False,0,True,2 1,101,202402,281,True,2,False,1 1,101,202403,,False,1,False,0 diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv index 12ad8810..a94662ca 100644 --- a/tests/test_data_matched_pair/case3_expected_output.csv +++ b/tests/test_data_matched_pair/case3_expected_output.csv @@ -1,4 +1,4 @@ -reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count +reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count 1,101,202401,237,False,0,True,2 1,101,202402,281,True,2,False,0 1,101,202403,,False,0,False,0 diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py index cf4b0525..79c25eba 100644 --- a/tests/test_flag_and_count_matched_pairs.py +++ b/tests/test_flag_and_count_matched_pairs.py @@ -28,11 +28,15 @@ class TestMatchedPair: def test_flag_matched_pair_merge_forward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop( - ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"], - axis=1, - inplace=True, - ) + df_expected_output = df_expected_output[ + [ + "reference", + "strata", + "period", + "target_variable", + "f_matched_pair_target_variable", + ] + ] df_input = df_expected_output[ ["reference", "strata", "period", "target_variable"] ] @@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file): def test_flag_matched_pair_merge_backward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop( - ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"], - axis=1, - inplace=True, - ) + df_expected_output = df_expected_output[ + [ + "reference", + "strata", + "period", + "target_variable", + "b_matched_pair_target_variable", + ] + ] df_input = df_expected_output[ ["reference", "strata", "period", "target_variable"] ] @@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file): def test_count_matched_pair_forward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop( - ["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True - ) + df_expected_output = df_expected_output[ + [ + "reference", + "strata", + "period", + "target_variable", + "f_matched_pair_target_variable", + "f_matched_pair_count", + ] + ] df_input = df_expected_output[ - ["reference", "strata", "period", "target_variable", "f_matched_pair"] + [ + "reference", + "strata", + "period", + "target_variable", + "f_matched_pair_target_variable", + ] ] - df_output = count_matches(df_input, "f_matched_pair", "period", "strata") + df_output = count_matches( + df_input, "f_matched_pair_target_variable", "period", "strata" + ) assert_frame_equal(df_output, df_expected_output) def test_count_matches_backward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop( - ["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True - ) + df_expected_output = df_expected_output[ + [ + "reference", + "strata", + "period", + "target_variable", + "b_matched_pair_target_variable", + "b_matched_pair_count", + ] + ] df_input = df_expected_output[ - ["reference", "strata", "period", "target_variable", "b_matched_pair"] + [ + "reference", + "strata", + "period", + "target_variable", + "b_matched_pair_target_variable", + ] ] - df_output = count_matches(df_input, "b_matched_pair", "period", "strata") + df_output = count_matches( + df_input, "b_matched_pair_target_variable", "period", "strata" + ) assert_frame_equal(df_output, df_expected_output) def test_flag_matched_pair_shift_forward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop( - ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"], - axis=1, - inplace=True, - ) + df_expected_output = df_expected_output[ + [ + "reference", + "strata", + "period", + "target_variable", + "f_matched_pair_target_variable", + ] + ] df_input = df_expected_output[ ["reference", "strata", "period", "target_variable"] ] @@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file): def test_flag_matched_pair_shift_backward(self, expected_output_file): df_expected_output = load_and_format(expected_output_file) - df_expected_output.drop( - ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"], - axis=1, - inplace=True, - ) + df_expected_output = df_expected_output[ + [ + "reference", + "strata", + "period", + "target_variable", + "b_matched_pair_target_variable", + ] + ] df_input = df_expected_output[ ["reference", "strata", "period", "target_variable"] ] diff --git a/tests/test_imputation_flags.py b/tests/test_imputation_flags.py new file mode 100644 index 00000000..315b5fa3 --- /dev/null +++ b/tests/test_imputation_flags.py @@ -0,0 +1,50 @@ +from pathlib import Path + +import pytest +from helper_functions import load_and_format +from pandas.testing import assert_frame_equal + +from src.imputation_flags import create_impute_flags, generate_imputation_marker + + +@pytest.fixture(scope="class") +def imputation_flag_test_data(): + return load_and_format(Path("tests") / "imputation_flag_data.csv") + + +class TestImputationFlags: + def test_create_impute_flags(self, imputation_flag_test_data): + df_expected_output = imputation_flag_test_data.copy() + df_expected_output.drop(["imputation_marker"], axis=1, inplace=True) + df_input = df_expected_output.copy() + df_input = df_input[ + [ + "reference", + "strata", + "period", + "target_variable", + "auxiliary", + "f_predictive_target_variable", + "b_predictive_target_variable", + "f_predictive_auxiliary", + ] + ] + df_output = create_impute_flags( + df=df_input, + target="target_variable", + reference="reference", + strata="strata", + auxiliary="auxiliary", + predictive_auxiliary="f_predictive_auxiliary", + ) + + df_expected_output.drop(["f_predictive_auxiliary"], axis=1, inplace=True) + + assert_frame_equal(df_output, df_expected_output) + + def test_imputation_marker(self, imputation_flag_test_data): + df_expected_output = imputation_flag_test_data.copy() + df_input = imputation_flag_test_data.copy() + df_input.drop("imputation_marker", axis=1, inplace=True) + df_output = generate_imputation_marker(df_input) + assert_frame_equal(df_output, df_expected_output) From d2e8a386284a3f59b50df09c20b053a5fe3a3548 Mon Sep 17 00:00:00 2001 From: Wil Roberts <47739563+robertswh@users.noreply.github.com> Date: Mon, 3 Jun 2024 13:02:26 +0100 Subject: [PATCH 18/20] 331 apply imputation link to target (#19) * add test data * some refactoring before function * added construction case to test data * refactored into functions * add test for higher level function --- src/apply_imputation_link.py | 161 ++++++++++++++++++ tests/apply_imputation_link.csv | 10 ++ tests/data/apply_imputation_link/BIR.csv | 4 + tests/data/apply_imputation_link/C_FIC.csv | 4 + tests/data/apply_imputation_link/FIR.csv | 4 + .../apply_imputation_link/FIR_BIR_C_FIC.csv | 10 ++ tests/test_apply_imputation_link.py | 37 ++++ 7 files changed, 230 insertions(+) create mode 100755 src/apply_imputation_link.py create mode 100644 tests/apply_imputation_link.csv create mode 100755 tests/data/apply_imputation_link/BIR.csv create mode 100755 tests/data/apply_imputation_link/C_FIC.csv create mode 100755 tests/data/apply_imputation_link/FIR.csv create mode 100755 tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv create mode 100755 tests/test_apply_imputation_link.py diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py new file mode 100755 index 00000000..e04104fb --- /dev/null +++ b/src/apply_imputation_link.py @@ -0,0 +1,161 @@ +def create_and_merge_imputation_values( + df, + imputation_class, + reference, + period, + marker, + combined_imputation, + target, + cumulative_forward_link, + cumulative_backward_link, + auxiliary, + construction_link, + imputation_types=("c", "fir", "bir", "fic"), +): + """ + Loop through different imputation types and merge the results according + to an imputation marker column + + Parameters + ---------- + df : pandas.DataFrame + imputation_class : str + column name for the variable that defines the imputation class + reference : str + column name for the reference + period : str + column name for the period + marker : str + column name containing a marker to indicate the type of imputation required + combined_imputation : str + column name for the combined imputation types according to the imputation marker + target : str + column name for the target variable for imputation + cumulative_forward_link : str + column name for the cumulative forward imputation link + cumulative_backward_link : str + column name for the cumulative backward imputation link + auxiliary : str + column name for auxiliary variable + construction_link : str + column name for contruction link + imputation_types : tup + types of imputation to run and add to combined_imputation column stored in a + tuple. If 'fic' is selected 'c' must also be selected and proceed 'fic'. + For 'fic' to produce the correct result, the C marker must be in the first + period for a given reference. + + Returns + ------- + pandas.DataFrame + dataframe with imputation values defined by the imputation marker + """ + + # constructed has to come first to use the result for forward impute from contructed + imputation_config = { + "c": { + "intermediate_column": "constructed", + "marker": "C", + # doesn't actually apply a fill so can be forward or back + "fill_column": auxiliary, + "fill_method": "ffill", + "link_column": construction_link, + }, + "fir": { + "intermediate_column": "fir", + "marker": "FIR", + "fill_column": target, + "fill_method": "ffill", + "link_column": cumulative_forward_link, + }, + "bir": { + "intermediate_column": "bir", + "marker": "BIR", + "fill_column": target, + "fill_method": "bfill", + "link_column": cumulative_backward_link, + }, + "fic": { + # FIC only works if the C is in the first period of the business being + # sampled. This is fine for automatic imputation, but should be careful + # if manual construction imputation is done + "intermediate_column": "fic", + "marker": "FIC", + # this has to have the same name as the intermediate column for constructed + "fill_column": "constructed", + "fill_method": "ffill", + "link_column": cumulative_forward_link, + }, + } + + df.sort_values([imputation_class, reference, period], inplace=True) + + intermediate_columns = [] + + for imp_type in imputation_types: + df = create_impute( + df, [imputation_class, reference], imputation_config[imp_type] + ) + df = merge_imputation_type( + df, imputation_config[imp_type], marker, combined_imputation + ) + + intermediate_columns.append(imputation_config[imp_type]["intermediate_column"]) + + return df.drop(columns=intermediate_columns) + + +def create_impute(df, group, imputation_spec): + """ + Add a new column to a dataframe of imputed values using ratio imputation. + + Parameters + ---------- + dataframe : pandas.DataFrame + group : str or list + variables that define the imputation class + imputation_spec: dict + dictionary defining the details of the imputation type + + Returns + ------- + pandas.DataFrame + dataframe with an added imputation column defined by the imputation_spec + """ + column_name = imputation_spec["intermediate_column"] + fill_column = imputation_spec["fill_column"] + fill_method = imputation_spec["fill_method"] + link_column = imputation_spec["link_column"] + + df[column_name] = ( + df.groupby(group)[fill_column].fillna(method=fill_method) * df[link_column] + ) + return df + + +def merge_imputation_type(df, imputation_spec, marker, combined_imputation): + """ + Uses an existing column of imputed values and a imputation marker to merge values + into a single column + + Parameters + ---------- + dataframe : pandas.DataFrame + imputation_spec: dict + dictionary defining the details of the imputation type + marker : str + column name containing a marker to indicate the type of imputation required + combined_imputation : str + column name for the combined imputation types according to the imputation marker + + Returns + ------- + pandas.DataFrame + dataframe with combined_imputation + """ + + imputation_marker = imputation_spec["marker"] + imputation_column = imputation_spec["intermediate_column"] + + df.loc[df[marker] == imputation_marker, combined_imputation] = df[imputation_column] + return df diff --git a/tests/apply_imputation_link.csv b/tests/apply_imputation_link.csv new file mode 100644 index 00000000..c81711cd --- /dev/null +++ b/tests/apply_imputation_link.csv @@ -0,0 +1,10 @@ +strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link +100,100000,200,202402,1,2,1,,,R,,, +100,100000,,202403,2,0.6,2,2,0.6,FIR,400,, +100,100000,,202404,3,1,2,6,1,FIR,1200,, +200,100001,,202402,1,4,3,1,2,BIR,600,, +200,100001,,202403,3,0.5,3,3,0.5,BIR,150,, +200,100001,300,202404,0.5,1,4,,,R,,, +300,100002,,202402,1,4,5,1,2,C,600,40,0.1 +300,100002,,202403,3,0.5,5,3,0.5,FIC,150,, +300,100002,,202404,0.5,1,5,2,,FIC,,, diff --git a/tests/data/apply_imputation_link/BIR.csv b/tests/data/apply_imputation_link/BIR.csv new file mode 100755 index 00000000..954700c4 --- /dev/null +++ b/tests/data/apply_imputation_link/BIR.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value +200,100001,,202402,4,2,BIR,600 +200,100001,,202403,0.5,0.5,BIR,150 +200,100001,300,202404,1,,R, diff --git a/tests/data/apply_imputation_link/C_FIC.csv b/tests/data/apply_imputation_link/C_FIC.csv new file mode 100755 index 00000000..7d2424b2 --- /dev/null +++ b/tests/data/apply_imputation_link/C_FIC.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value +300,100002,,202402,1,,0.1,1000,C,100 +300,100002,,202403,3,3,,,FIC,300 +300,100002,,202404,0.5,1.5,,,FIC,150 diff --git a/tests/data/apply_imputation_link/FIR.csv b/tests/data/apply_imputation_link/FIR.csv new file mode 100755 index 00000000..341ece76 --- /dev/null +++ b/tests/data/apply_imputation_link/FIR.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value +100,100000,200,202402,1,,R, +100,100000,,202403,2,2,FIR,400 +100,100000,,202404,3,6,FIR,1200 diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv new file mode 100755 index 00000000..91ec36ec --- /dev/null +++ b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv @@ -0,0 +1,10 @@ +imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value +100,100000,200,202402,1,2,,,,,R, +100,100000,,202403,2,0.6,,,2,0.6,FIR,400 +100,100000,,202404,3,1,,,6,1,FIR,1200 +200,100001,,202402,1,4,,,1,2,BIR,600 +200,100001,,202403,3,0.5,,,3,0.5,BIR,150 +200,100001,300,202404,0.5,1,,,,,R, +300,100002,,202402,1,4,1000,0.1,,2,C,100 +300,100002,,202403,3,0.5,,,3,0.5,FIC,300 +300,100002,,202404,0.5,1,,,1.5,,FIC,150 diff --git a/tests/test_apply_imputation_link.py b/tests/test_apply_imputation_link.py new file mode 100755 index 00000000..568bfcec --- /dev/null +++ b/tests/test_apply_imputation_link.py @@ -0,0 +1,37 @@ +from pathlib import Path + +import pytest +from helper_functions import load_and_format +from pandas.testing import assert_frame_equal + +from src.apply_imputation_link import create_and_merge_imputation_values + + +@pytest.fixture(scope="class") +def fir_bir_c_fic_test_data(): + return load_and_format( + Path("tests") / "data" / "apply_imputation_link" / "FIR_BIR_C_FIC.csv" + ) + + +class TestApplyImputationLink: + def test_all_imputation_types(self, fir_bir_c_fic_test_data): + expected_output = fir_bir_c_fic_test_data + + input_data = expected_output.drop(columns=["imputed_value"]) + actual_output = create_and_merge_imputation_values( + input_data, + "imputation_class", + "reference", + "period", + "imputation_marker", + "imputed_value", + "target", + "cumulative_forward_link", + "cumulative_backward_link", + "auxiliary_variable", + "construction_link", + imputation_types=("c", "fir", "bir", "fic"), + ) + + assert_frame_equal(actual_output, expected_output) From ba25aef807ba59f21bd9b5faccafe58d1a692941 Mon Sep 17 00:00:00 2001 From: Wil Roberts <47739563+robertswh@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:23:58 +0100 Subject: [PATCH 19/20] added testing guide and function tips (#17) * added testing guide and function tips * Add function context and fix external links --------- Co-authored-by: hemsir --- docs/contributor_guide/CONTRIBUTING.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/contributor_guide/CONTRIBUTING.md b/docs/contributor_guide/CONTRIBUTING.md index ee56bcde..dee48d43 100644 --- a/docs/contributor_guide/CONTRIBUTING.md +++ b/docs/contributor_guide/CONTRIBUTING.md @@ -36,14 +36,14 @@ documentation][docs-pre-commit-hooks]. ## Code conventions -Code written for this project should follow [PEP 8 coding conventions](pep8), [project naming conventions](docs-naming) and the guidance on [quality assurance of code for analysis and research](duck-book) (also known as the Duck Book). +Code written for this project should follow [PEP 8 coding conventions][pep8], [project naming conventions][docs-naming] and the guidance on [quality assurance of code for analysis and research][duck-book] (also known as the Duck Book). ### Git and GitHub We use Git to version control the source code and out source code is stored on GitHub. -We follow the [GitHub flow](github-flow) workflow. This means that we create +We follow the [GitHub flow][github-flow] workflow. This means that we create feature branches of the `main` branch and merge them back to `main` once they meet the definition of done. We give our branches short but informative names, in lowercase and separated with hypens. Where applicable, we start branch names @@ -53,16 +53,20 @@ with the respective Jira ticket number. For example, We commit regularly, with distinct chunks of work where possible. We write short but informative commit messages, starting with a capitalised present-tense verb, for example `Add`, `Fix`. When pair-programming, we -[add co-authors to the commit](git-coauthor). We add -[longer commit messages](long-commit) for larger or more complex commits, for +[add co-authors to the commit][git-coauthor]. We add +[longer commit messages][long-commit] for larger or more complex commits, for example (squash) merge commits. We open a pull request to `main` once we have working code that meets a user need, for example meets the definition of done on the Jira ticket. Pull requests must be reviewed by at least one member of the team before merging. -Reviews should follow the [pull request template](pr-template). If we want review on code that does not yet meet the definition of done, we open a draft +Reviews should follow the [pull request template][pr-template]. If we want review on code that does not yet meet the definition of done, we open a draft pull request. Once a branch has been reviewed, it can be merged. We prefer to use squash merges, in order to simplify the `main` branch commit history. After merging the feature branch should be deleted. +### Functions + +We prefer writing functions over classes to make it easier for beginners to understand the code. [Type hints][typing] should be used when writing functions. We prefer functions to return `pandas.DataFrame` rather than `pandas.Series`, for example when deriving new (temporary) variables. + ### Markdown Local links can be written as normal, but external links should be referenced at the @@ -83,6 +87,10 @@ tests, enter the following command in your terminal: ```shell pytest ``` +Our testing approach is: +- use `.csv` files containing simple minimal input and output data for a function to be tested +- individual test cases should be separated into different `.csv` files and grouped into folders +- the name of the test data `.csv` files should reflect the test case and the folder name should be the same as the module/function ### Code coverage @@ -139,3 +147,4 @@ build the documentation into an accessible, searchable website. [github-flow]: https://docs.github.com/en/get-started/using-github/github-flow [git-coauthor]: https://docs.github.com/en/pull-requests/committing-changes-to-your-project/creating-and-editing-commits/creating-a-commit-with-multiple-authors [long-commit]: https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html +[typing]: https://docs.python.org/3/library/typing.html From ecca2a42288afcb0c236e10b95edcb6e45ecd68b Mon Sep 17 00:00:00 2001 From: Anton Zogkolli <110612763+AntonZogk@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:54:52 +0100 Subject: [PATCH 20/20] 321 flag to ignore response (#18) * Upload test data for link filters * build: function to filter rows to ignore values from link The function adds a new column indicating if row should be ignored. It is based on set_index and index comparison. Added 2 tests, one to check the output, and if exception is raised when columns do not match. * build: function to filter rows to ignore values from link The function adds a new column indicating if row should be ignored. It is based on set_index and index comparison. Added 2 tests, one to check the output, and if exception is raised when columns do not match. --- src/link_filter.py | 49 +++++++++++++++++++++++++++++++++++++ tests/test_flag_data.csv | 29 ++++++++++++++++++++++ tests/test_flag_filters.csv | 3 +++ tests/test_link_filter.py | 39 +++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 src/link_filter.py create mode 100755 tests/test_flag_data.csv create mode 100755 tests/test_flag_filters.csv create mode 100644 tests/test_link_filter.py diff --git a/src/link_filter.py b/src/link_filter.py new file mode 100644 index 00000000..f5ff6383 --- /dev/null +++ b/src/link_filter.py @@ -0,0 +1,49 @@ +import pandas as pd + +# TODO: Extend function to receive multiple df with *df_with_filters + + +def flag_rows_to_ignore( + df: pd.DataFrame, df_with_filters: pd.DataFrame +) -> pd.DataFrame: + """ + Add a new column bool column named ignore_from_link to df + having as TRUE the observations defined in df_with_filters. + + Parameters + ---------- + df : pd.DataFrame + Original dataframe. + df_with_filters : pd.DataFrame + Dataframe with observations which should be flagged in the original + dataframe. + + Returns + ------- + df : pd.DataFrame + Original dataframe with a bool column containing the flags. + + """ + + if not set(df_with_filters.columns).issubset(df.columns): + + raise ValueError( + f"""df_with_filters has these columns {list(df_with_filters)} while + df has these columns {list(df)}, please + double check the column names.""" + ) + + # TODO: Check if values to be ignored exist + + df = df.set_index(list(df_with_filters)) + + df_with_filters = df_with_filters.set_index(list(df_with_filters)) + + df["ignore_from_link"] = df.index.isin(df_with_filters.index) + + df = df.reset_index() + + # TODO: Consider what should be logged and reroute print to logs + print("These values were flagged:\n", df.loc[df["ignore_from_link"]]) + + return df diff --git a/tests/test_flag_data.csv b/tests/test_flag_data.csv new file mode 100755 index 00000000..2f97b47f --- /dev/null +++ b/tests/test_flag_data.csv @@ -0,0 +1,29 @@ +identifier,date,group,question,other,ignore_from_link +70001,202001,100,5951.0,39,False +70001,202002,100,1814.0,39,False +70001,202003,100,734.0,39,True +70001,202004,100,96.0,39,False +70001,202005,100,9086.0,39,True +70001,202006,100,3949.0,39,False +70001,202007,100,49.0,39,False +70002,202001,100,6705.0,94,False +70002,202002,100,48.0,94,False +70002,202003,100,5361.0,94,False +70002,202004,100,8767.0,94,False +70002,202005,100,9214.0,94,False +70002,202006,100,7467.0,94,False +70002,202007,100,3475.0,94,False +70003,202001,100,6153.0,42,False +70003,202002,100,7711.0,42,False +70003,202003,100,5403.0,42,False +70003,202004,100,7445.0,42,False +70003,202005,100,7092.0,42,False +70003,202006,100,2038.0,42,False +70003,202007,100,8768.0,42,False +70004,202001,100,,6,False +70004,202002,100,,6,False +70004,202003,100,6288.0,6,False +70004,202004,100,,6,False +70004,202005,100,,6,False +70004,202006,100,5875.0,6,False +70004,202007,100,,6,False diff --git a/tests/test_flag_filters.csv b/tests/test_flag_filters.csv new file mode 100755 index 00000000..abdfb4c8 --- /dev/null +++ b/tests/test_flag_filters.csv @@ -0,0 +1,3 @@ +identifier,date +70001,202003 +70001,202005 diff --git a/tests/test_link_filter.py b/tests/test_link_filter.py new file mode 100644 index 00000000..bbd5cc75 --- /dev/null +++ b/tests/test_link_filter.py @@ -0,0 +1,39 @@ +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from src.link_filter import flag_rows_to_ignore + + +@pytest.mark.parametrize("scenario", ["test_flag_data"]) +@pytest.mark.parametrize("filters", ["test_flag_filters"]) +class TestFilters: + def test_basic_filter(self, scenario, filters): + """Test ignore_from_link is correct""" + + df_output_expected = pd.read_csv("tests/" + scenario + ".csv") + + df_filters = pd.read_csv("tests/" + filters + ".csv") + + df_input = df_output_expected.drop(columns=["ignore_from_link"]) + + df_output = flag_rows_to_ignore(df_input, df_filters) + + assert_frame_equal(df_output, df_output_expected) + + def test_exception(self, scenario, filters): + + """Test if function raises an exception when the columns in filters + do not exist in scenario.""" + + df_output_expected = pd.read_csv("tests/" + scenario + ".csv") + + df_filters = pd.read_csv("tests/" + filters + ".csv") + + df_input = df_output_expected.drop(columns=["ignore_from_link"]) + + with pytest.raises(ValueError): + + df_filters.columns = df_filters.columns + "_fail" + + flag_rows_to_ignore(df_input, df_filters)