diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 12fe2efe..dc5b5228 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -46,4 +46,24 @@ jobs:
     - name: Run pytest
       run: |
-        pytest -v
\ No newline at end of file
+        pytest -v
+
+  commit-hooks:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/setup-python@v3
+        with:
+          python-version: 3.6.8
+          cache: 'pip'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pre-commit install
+
+      - name: Check commit hooks
+        run: |
+          pre-commit run --all-files
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 44b4d541..81880b61 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,6 +30,15 @@ repos:
     language: script
     stages: [commit]

+#works
+- repo: local
+  hooks:
+  - id: mixed-line-endings
+    entry: pre-commits/mixed_line_endings.py
+    name: Check for consistent line endings, preferring LF to CRLF to CR (auto-fixes)
+    language: script
+    stages: [commit]
+
 #works
 #if using on different file types, it will need a separate hook per file type
 - repo: local
@@ -76,18 +85,17 @@ repos:
     stages: [commit]


-# #needs to remove the password in hello_world.py
-# - repo: local
-#   hooks:
-#   - id: detect-secrets
-#     entry: detect-secrets
-#     name: detect-secrets - Detect secrets in staged code
-#     #args: [ "--baseline", ".secrets.baseline" ]
-#     args: [scan, audit]
-#     language: system
-#     types: [python]
-#     stages: [commit]
-#     exclude: .*/tests/.*|^\.cruft\.json$
+# works in testing
+- repo: local
+  hooks:
+  - id: detect-secrets
+    entry: detect-secrets-hook
+    name: detect-secrets - Detect secrets in staged code
+    args: [ "--baseline", ".secrets.baseline" ]
+    #args: [scan, audit]
+    language: system
+    types: [python]
+    stages: [commit]
diff --git a/docs/contributor_guide/CONTRIBUTING.md b/docs/contributor_guide/CONTRIBUTING.md
index ee56bcde..dee48d43 100644
--- a/docs/contributor_guide/CONTRIBUTING.md
+++ b/docs/contributor_guide/CONTRIBUTING.md
@@ -36,14 +36,14 @@ documentation][docs-pre-commit-hooks].

 ## Code conventions

-Code written for this project should follow [PEP 8 coding conventions](pep8), [project naming conventions](docs-naming) and the guidance on [quality assurance of code for analysis and research](duck-book) (also known as the Duck Book).
+Code written for this project should follow [PEP 8 coding conventions][pep8], [project naming conventions][docs-naming] and the guidance on [quality assurance of code for analysis and research][duck-book] (also known as the Duck Book).

 ### Git and GitHub

-We use Git to version control the source code and out source code is stored on GitHub.
+We use Git to version control the source code, and our source code is stored on GitHub.

-We follow the [GitHub flow](github-flow) workflow. This means that we create
+We follow the [GitHub flow][github-flow] workflow. This means that we create
 feature branches of the `main` branch and merge them back to `main` once they
 meet the definition of done. We give our branches short but informative names,
 in lowercase and separated with hyphens. Where applicable, we start branch names
@@ -53,16 +53,20 @@ with the respective Jira ticket number. For example,

 We commit regularly, with distinct chunks of work where possible. We write
 short but informative commit messages, starting with a capitalised
 present-tense verb, for example `Add`, `Fix`. When pair-programming, we
-[add co-authors to the commit](git-coauthor). We add
-[longer commit messages](long-commit) for larger or more complex commits, for
+[add co-authors to the commit][git-coauthor]. We add
+[longer commit messages][long-commit] for larger or more complex commits, for
 example (squash) merge commits.

 We open a pull request to `main` once we have working code that meets a user
 need, for example meets the definition of done on the Jira ticket. Pull
 requests must be reviewed by at least one member of the team before merging.
-Reviews should follow the [pull request template](pr-template). If we want review on code that does not yet meet the definition of done, we open a draft
+Reviews should follow the [pull request template][pr-template]. If we want review on code that does not yet meet the definition of done, we open a draft
 pull request. Once a branch has been reviewed, it can be merged. We prefer to
 use squash merges in order to simplify the `main` branch commit history. After
 merging, the feature branch should be deleted.
+
+### Functions
+
+We prefer writing functions over classes to make it easier for beginners to understand the code. [Type hints][typing] should be used when writing functions. We prefer functions to return `pandas.DataFrame` rather than `pandas.Series`, for example when deriving new (temporary) variables.

 ### Markdown

 Local links can be written as normal, but external links should be referenced at the
@@ -83,6 +87,10 @@ tests, enter the following command in your terminal:

 ```shell
 pytest
 ```
+Our testing approach is:
+- use `.csv` files containing simple, minimal input and output data for the function to be tested
+- individual test cases should be separated into different `.csv` files and grouped into folders
+- the name of each test data `.csv` file should reflect the test case, and the folder name should match the module/function

 ### Code coverage

@@ -139,3 +147,4 @@ build the documentation into an accessible, searchable website.
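The CSV-driven testing approach described in the contributing guide might look like the following in practice. A minimal sketch, assuming a hypothetical `add_one` function in `src/add_one.py` and a fixture at `tests/data/add_one/basic.csv` that holds both the input column and the expected `value_plus_one` output column:

```python
from pathlib import Path

import pandas as pd
from pandas.testing import assert_frame_equal

from src.add_one import add_one  # hypothetical module under test


def test_add_one_basic():
    # the CSV fixture holds minimal input and expected output for one test case
    expected_output = pd.read_csv(Path("tests") / "data" / "add_one" / "basic.csv")
    input_data = expected_output.drop(columns=["value_plus_one"])

    actual_output = add_one(input_data, "value")

    assert_frame_equal(actual_output, expected_output)
```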
 [github-flow]: https://docs.github.com/en/get-started/using-github/github-flow
 [git-coauthor]: https://docs.github.com/en/pull-requests/committing-changes-to-your-project/creating-and-editing-commits/creating-a-commit-with-multiple-authors
 [long-commit]: https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
+[typing]: https://docs.python.org/3/library/typing.html
diff --git a/pre-commits/check_added_large_files.py b/pre-commits/check_added_large_files.py
index 41fa69b3..59c0353a 100755
--- a/pre-commits/check_added_large_files.py
+++ b/pre-commits/check_added_large_files.py
@@ -4,24 +4,20 @@
 import json
 import math
 import os
-from typing import Optional
-from typing import Sequence
-from typing import Set
+from typing import Optional, Sequence, Set

-from pre_commit_hooks.util import added_files
-from pre_commit_hooks.util import CalledProcessError
-from pre_commit_hooks.util import cmd_output
+from pre_commit_hooks.util import CalledProcessError, added_files, cmd_output


 def _lfs_files() -> Set[str]:
     """Private function."""
     try:
         # Introduced in git-lfs 2.2.0, first working in 2.2.1
-        lfs_ret = cmd_output('git', 'lfs', 'status', '--json')
+        lfs_ret = cmd_output("git", "lfs", "status", "--json")
     except CalledProcessError:  # pragma: no cover (with git-lfs)
         lfs_ret = '{"files":{}}'
-    return set(json.loads(lfs_ret)['files'])
+    return set(json.loads(lfs_ret)["files"])


 def _find_large_added_files(filenames: Sequence[str], maxkb: int) -> int:
@@ -32,7 +28,7 @@ def _find_large_added_files(filenames: Sequence[str], maxkb: int) -> int:
     for filename in (added_files() & set(filenames)) - _lfs_files():
         kb = int(math.ceil(os.stat(filename).st_size / 1024))
         if kb > maxkb:
-            print(f'{filename} ({kb} KB) exceeds {maxkb} KB.')
+            print(f"{filename} ({kb} KB) exceeds {maxkb} KB.")
             retv = 1

     return retv
@@ -42,17 +38,20 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        'filenames', nargs='*',
-        help='Filenames pre-commit believes are changed.',
+        "filenames",
+        nargs="*",
+        help="Filenames pre-commit believes are changed.",
     )
     parser.add_argument(
-        '--maxkb', type=int, default=500,
-        help='Maxmimum allowable KB for added files',
+        "--maxkb",
+        type=int,
+        default=500,
+        help="Maximum allowable KB for added files",
     )
     args = parser.parse_args(argv)

     return _find_large_added_files(args.filenames, args.maxkb)


-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/pre-commits/check_merge_conflict.py b/pre-commits/check_merge_conflict.py
index 85a29255..e6c67007 100755
--- a/pre-commits/check_merge_conflict.py
+++ b/pre-commits/check_merge_conflict.py
@@ -2,35 +2,30 @@
 """Pre commit hook to check for merge conflict flags in file."""
 import argparse
 import os.path
-from typing import Optional
-from typing import Sequence
-
+from typing import Optional, Sequence

 CONFLICT_PATTERNS = [
-    b'<<<<<<< ',
-    b'======= ',
-    b'=======\n',
-    b'>>>>>>> ',
+    b"<<<<<<< ",
+    b"======= ",
+    b"=======\n",
+    b">>>>>>> ",
 ]


 def _is_in_merge() -> int:
     """Private function."""
-    return (
-        os.path.exists(os.path.join('.git', 'MERGE_MSG')) and
-        (
-            os.path.exists(os.path.join('.git', 'MERGE_HEAD')) or
-            os.path.exists(os.path.join('.git', 'rebase-apply')) or
-            os.path.exists(os.path.join('.git', 'rebase-merge'))
-        )
+    return os.path.exists(os.path.join(".git", "MERGE_MSG")) and (
+        os.path.exists(os.path.join(".git", "MERGE_HEAD"))
+        or os.path.exists(os.path.join(".git", "rebase-apply"))
+        or os.path.exists(os.path.join(".git", "rebase-merge"))
     )


 def main(argv: Optional[Sequence[str]] = None) -> int:
     """Entry function for script."""
     parser = argparse.ArgumentParser()
-    parser.add_argument('filenames', nargs='*')
-    parser.add_argument('--assume-in-merge', action='store_true')
+    parser.add_argument("filenames", nargs="*")
+    parser.add_argument("--assume-in-merge", action="store_true")
     args = parser.parse_args(argv)

     if not _is_in_merge() and not args.assume_in_merge:
@@ -38,18 +33,18 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     retcode = 0
     for filename in args.filenames:
-        with open(filename, 'rb') as inputfile:
+        with open(filename, "rb") as inputfile:
             for i, line in enumerate(inputfile):
                 for pattern in CONFLICT_PATTERNS:
                     if line.startswith(pattern):
                         print(
                             f'Merge conflict string "{pattern.decode()}" '
-                            f'found in {filename}:{i + 1}',
+                            f"found in {filename}:{i + 1}",
                         )
                         retcode = 1

     return retcode


-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/pre-commits/commit_msg.py b/pre-commits/commit_msg.py
deleted file mode 100755
index e478166a..00000000
--- a/pre-commits/commit_msg.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-"""Git hook to check git commit message has appropriate length subject line.
-
-After removing the jira issue number from the subject line we check that the
-message is longer than 20 characters and shorter than 65.
-"""
-import sys
-
-# Collect the parameters
-commit_msg_filepath = sys.argv[1]
-
-with open(commit_msg_filepath, 'r') as f:
-    lines = f.readlines()
-
-    # The subject is the first line of the message, but we don't count any
-    # Jira issue note
-    commit_subject = lines[0].split(']')[-1]
-
-    if (len(commit_subject) < 20):
-        print(
-            f'''
-            commit-msg: ERROR! The commit subject is too short!
-            subject length = {len(commit_subject)} < 20 characters'
-            '''
-        )
-        sys.exit(1)
-
-    elif (len(commit_subject) > 65):
-        # We check if messages are greater than 65 char, but warn as if
-        # longer than 50
-        print(
-            f'''
-            commit-msg: ERROR!
-            The commit subject is too long!
-            subject length = {len(commit_subject)} > 50 characters'
-            '''
-        )
-        sys.exit(1)
-
-# for line in lines[2:]:
-#     print(line)
diff --git a/pre-commits/end_of_line_fixer.py b/pre-commits/end_of_line_fixer.py
index 8f39b8c1..eb85f62e 100755
--- a/pre-commits/end_of_line_fixer.py
+++ b/pre-commits/end_of_line_fixer.py
@@ -2,9 +2,7 @@
 """Pre commit hook to ensure single blank line at end of python file."""
 import argparse
 import os
-from typing import IO
-from typing import Optional
-from typing import Sequence
+from typing import IO, Optional, Sequence


 def _fix_file(file_obj: IO[bytes]) -> int:
@@ -17,13 +15,13 @@ def _fix_file(file_obj: IO[bytes]) -> int:
         return 0
     last_character = file_obj.read(1)
     # last_character will be '' for an empty file
-    if last_character not in {b'\n', b'\r'} and last_character != b'':
+    if last_character not in {b"\n", b"\r"} and last_character != b"":
         # Needs this seek for windows, otherwise IOError
         file_obj.seek(0, os.SEEK_END)
-        file_obj.write(b'\n')
+        file_obj.write(b"\n")
         return 1

-    while last_character in {b'\n', b'\r'}:
+    while last_character in {b"\n", b"\r"}:
         # Deal with the beginning of the file
         if file_obj.tell() == 1:
             # If we've reached the beginning of the file and it is all
@@ -40,7 +38,7 @@ def _fix_file(file_obj: IO[bytes]) -> int:
     # newlines. If we find extraneous newlines, then backtrack and trim them.
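     # Everything from the current position to EOF is the trailing run of
     # newline bytes: keep exactly one end-of-line sequence and trim the rest.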
position = file_obj.tell() remaining = file_obj.read() - for sequence in (b'\n', b'\r\n', b'\r'): + for sequence in (b"\n", b"\r\n", b"\r"): if remaining == sequence: return 0 elif remaining.startswith(sequence): @@ -54,21 +52,21 @@ def _fix_file(file_obj: IO[bytes]) -> int: def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() - parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument("filenames", nargs="*", help="Filenames to fix") args = parser.parse_args(argv) retv = 0 for filename in args.filenames: # Read as binary so we can read byte-by-byte - with open(filename, 'rb+') as file_obj: + with open(filename, "rb+") as file_obj: ret_for_file = _fix_file(file_obj) if ret_for_file: - print(f'Fixing {filename}') + print(f"Fixing {filename}") retv |= ret_for_file return retv -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/pre-commits/mixed_line_endings.py b/pre-commits/mixed_line_endings.py index 48afc2e6..8ae44909 100755 --- a/pre-commits/mixed_line_endings.py +++ b/pre-commits/mixed_line_endings.py @@ -2,31 +2,28 @@ """Pre commit hook to ensure all EOL characters are the same.""" import argparse import collections -from typing import Dict -from typing import Optional -from typing import Sequence +from typing import Dict, Optional, Sequence - -CRLF = b'\r\n' -LF = b'\n' -CR = b'\r' +CRLF = b"\r\n" +LF = b"\n" +CR = b"\r" # Prefer LF to CRLF to CR, but detect CRLF before LF ALL_ENDINGS = (CR, CRLF, LF) -FIX_TO_LINE_ENDING = {'cr': CR, 'crlf': CRLF, 'lf': LF} +FIX_TO_LINE_ENDING = {"cr": CR, "crlf": CRLF, "lf": LF} def _fix(filename: str, contents: bytes, ending: bytes) -> None: """Private function.""" - new_contents = b''.join( - line.rstrip(b'\r\n') + ending for line in contents.splitlines(True) + new_contents = b"".join( + line.rstrip(b"\r\n") + ending for line in contents.splitlines(True) ) - with open(filename, 'wb') as f: + with open(filename, "wb") as f: f.write(new_contents) def fix_filename(filename: str, fix: str) -> int: """Private function.""" - with open(filename, 'rb') as f: + with open(filename, "rb") as f: contents = f.read() counts: Dict[bytes, int] = collections.defaultdict(int) @@ -40,10 +37,10 @@ def fix_filename(filename: str, fix: str) -> int: # Some amount of mixed line endings mixed = sum(bool(x) for x in counts.values()) > 1 - if fix == 'no' or (fix == 'auto' and not mixed): + if fix == "no" or (fix == "auto" and not mixed): return mixed - if fix == 'auto': + if fix == "auto": max_ending = LF max_lines = 0 # ordering is important here such that lf > crlf > cr @@ -70,24 +67,25 @@ def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() parser.add_argument( - '-f', '--fix', - choices=('auto', 'no') + tuple(FIX_TO_LINE_ENDING), - default='auto', + "-f", + "--fix", + choices=("auto", "no") + tuple(FIX_TO_LINE_ENDING), + default="auto", help='Replace line ending with the specified. 
Default is "auto"', ) - parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument("filenames", nargs="*", help="Filenames to fix") args = parser.parse_args(argv) retv = 0 for filename in args.filenames: if fix_filename(filename, args.fix): - if args.fix == 'no': - print(f'{filename}: mixed line endings') + if args.fix == "no": + print(f"{filename}: mixed line endings") else: - print(f'{filename}: fixed mixed line endings') + print(f"{filename}: fixed mixed line endings") retv = 1 return retv -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/pre-commits/prepare_commit_msg.py b/pre-commits/prepare_commit_msg.py deleted file mode 100755 index 7cc97878..00000000 --- a/pre-commits/prepare_commit_msg.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -"""Git hook to automatically prefix git commit message with Jira issue number. - -The issue number (e.g. Jira ticket number) from the current branch name. Works -with or without specifying -m option at commit time. -""" -import re -import sys -from subprocess import check_output - - -commit_msg_filepath = sys.argv[1] -branch = ( - check_output(["git", "symbolic-ref", "--short", "HEAD"]) - .decode("utf-8").strip() -) - -# If branch name contains /'s we only want the final part of the branch name -branch_end = branch.split('/')[-1] - -# Regex pattern for matching to Jira issues -regex = r"[Jj]\d+" - -if re.search(regex, branch_end): - # Create list of all matches to regex pattern - issue_number_matches = re.findall(regex, branch_end) - - # If mutiple issues in branch name we join them together - commit_issue = f'{"_".join(issue_number_matches)}' - - with open(commit_msg_filepath, "r+") as f: - commit_msg = f.read() - f.seek(0, 0) # correctly position issue_number when writing commit msg - f.write(f"[{commit_issue}] {commit_msg}") - -else: - # If branch does not contain a jira issue number, reject the commit - print( - f''' - prepare-commit-msg: Error! 
- Branch name is {branch} - Does not match branch name strategy \'*/jxxx\' - ''' - ) - sys.exit(1) diff --git a/pre-commits/remove_whitespace.py b/pre-commits/remove_whitespace.py index d890b18c..61e5803f 100755 --- a/pre-commits/remove_whitespace.py +++ b/pre-commits/remove_whitespace.py @@ -2,21 +2,20 @@ """Pre commit hook to remove any trailing whitespace.""" import argparse import os -from typing import Optional -from typing import Sequence +from typing import Optional, Sequence def _fix_file( - filename: str, - is_markdown: bool, - chars: Optional[bytes], + filename: str, + is_markdown: bool, + chars: Optional[bytes], ) -> bool: """Private function.""" - with open(filename, mode='rb') as file_processed: + with open(filename, mode="rb") as file_processed: lines = file_processed.readlines() newlines = [_process_line(line, is_markdown, chars) for line in lines] if newlines != lines: - with open(filename, mode='wb') as file_processed: + with open(filename, mode="wb") as file_processed: for line in newlines: file_processed.write(line) return True @@ -25,22 +24,22 @@ def _fix_file( def _process_line( - line: bytes, - is_markdown: bool, - chars: Optional[bytes], + line: bytes, + is_markdown: bool, + chars: Optional[bytes], ) -> bytes: """Private function.""" - if line[-2:] == b'\r\n': - eol = b'\r\n' + if line[-2:] == b"\r\n": + eol = b"\r\n" line = line[:-2] - elif line[-1:] == b'\n': - eol = b'\n' + elif line[-1:] == b"\n": + eol = b"\n" line = line[:-1] else: - eol = b'' + eol = b"" # preserve trailing two-space for non-blank lines in markdown files - if is_markdown and (not line.isspace()) and line.endswith(b' '): - return line[:-2].rstrip(chars) + b' ' + eol + if is_markdown and (not line.isspace()) and line.endswith(b" "): + return line[:-2].rstrip(chars) + b" " + eol return line.rstrip(chars) + eol @@ -48,48 +47,46 @@ def main(argv: Optional[Sequence[str]] = None) -> int: """Entry function for script.""" parser = argparse.ArgumentParser() parser.add_argument( - '--no-markdown-linebreak-ext', - action='store_true', + "--no-markdown-linebreak-ext", + action="store_true", help=argparse.SUPPRESS, ) parser.add_argument( - '--markdown-linebreak-ext', - action='append', + "--markdown-linebreak-ext", + action="append", default=[], - metavar='*|EXT[,EXT,...]', + metavar="*|EXT[,EXT,...]", help=( - 'Markdown extensions (or *) to not strip linebreak spaces. ' - 'default: %(default)s' + "Markdown extensions (or *) to not strip linebreak spaces. " + "default: %(default)s" ), ) parser.add_argument( - '--chars', + "--chars", help=( - 'The set of characters to strip from the end of lines. ' - 'Defaults to all whitespace characters.' + "The set of characters to strip from the end of lines. " + "Defaults to all whitespace characters." ), ) - parser.add_argument('filenames', nargs='*', help='Filenames to fix') + parser.add_argument("filenames", nargs="*", help="Filenames to fix") args = parser.parse_args(argv) if args.no_markdown_linebreak_ext: - print('--no-markdown-linebreak-ext now does nothing!') + print("--no-markdown-linebreak-ext now does nothing!") md_args = args.markdown_linebreak_ext - if '' in md_args: - parser.error('--markdown-linebreak-ext requires a non-empty argument') - all_markdown = '*' in md_args + if "" in md_args: + parser.error("--markdown-linebreak-ext requires a non-empty argument") + all_markdown = "*" in md_args # normalize extensions; split at ',', lowercase, and force 1 leading '.' - md_exts = [ - '.' 
+ x.lower().lstrip('.') for x in ','.join(md_args).split(',')
-    ]
+    md_exts = ["." + x.lower().lstrip(".") for x in ",".join(md_args).split(",")]

     # reject probable "eaten" filename as extension: skip leading '.' with [1:]
     for ext in md_exts:
-        if any(c in ext[1:] for c in r'./\:'):
+        if any(c in ext[1:] for c in r"./\:"):
             parser.error(
-                f'bad --markdown-linebreak-ext extension '
-                f'{ext!r} (has . / \\ :)\n'
+                f"bad --markdown-linebreak-ext extension "
+                f"{ext!r} (has . / \\ :)\n"
                 f"  (probably filename; use '--markdown-linebreak-ext=EXT')",
             )
     chars = None if args.chars is None else args.chars.encode()
@@ -98,10 +95,10 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
         _, extension = os.path.splitext(filename.lower())
         md = all_markdown or extension in md_exts
         if _fix_file(filename, md, chars):
-            print(f'Fixing {filename}')
+            print(f"Fixing {filename}")
             return_code = 1
     return return_code


-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(main())
diff --git a/requirements.txt b/requirements.txt
index bd9b2879..e26789b2 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,4 @@ nbqa
 pre_commit_hooks
 flake8
 pandas==1.1.5
-numpy
\ No newline at end of file
+numpy
diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py
new file mode 100755
index 00000000..e04104fb
--- /dev/null
+++ b/src/apply_imputation_link.py
@@ -0,0 +1,161 @@
+def create_and_merge_imputation_values(
+    df,
+    imputation_class,
+    reference,
+    period,
+    marker,
+    combined_imputation,
+    target,
+    cumulative_forward_link,
+    cumulative_backward_link,
+    auxiliary,
+    construction_link,
+    imputation_types=("c", "fir", "bir", "fic"),
+):
+    """
+    Loop through different imputation types and merge the results according
+    to an imputation marker column.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    imputation_class : str
+        column name for the variable that defines the imputation class
+    reference : str
+        column name for the reference
+    period : str
+        column name for the period
+    marker : str
+        column name containing a marker to indicate the type of imputation required
+    combined_imputation : str
+        column name for the combined imputation types according to the imputation marker
+    target : str
+        column name for the target variable for imputation
+    cumulative_forward_link : str
+        column name for the cumulative forward imputation link
+    cumulative_backward_link : str
+        column name for the cumulative backward imputation link
+    auxiliary : str
+        column name for the auxiliary variable
+    construction_link : str
+        column name for the construction link
+    imputation_types : tuple
+        types of imputation to run and add to the combined_imputation column,
+        stored in a tuple. If 'fic' is selected, 'c' must also be selected and
+        must precede 'fic'. For 'fic' to produce the correct result, the C
+        marker must be in the first period for a given reference.
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with imputation values defined by the imputation marker
+    """
+
+    # constructed has to come first so its result can be used for forward
+    # imputation from constructed
+    imputation_config = {
+        "c": {
+            "intermediate_column": "constructed",
+            "marker": "C",
+            # doesn't actually apply a fill so can be forward or back
+            "fill_column": auxiliary,
+            "fill_method": "ffill",
+            "link_column": construction_link,
+        },
+        "fir": {
+            "intermediate_column": "fir",
+            "marker": "FIR",
+            "fill_column": target,
+            "fill_method": "ffill",
+            "link_column": cumulative_forward_link,
+        },
+        "bir": {
+            "intermediate_column": "bir",
+            "marker": "BIR",
+            "fill_column": target,
+            "fill_method": "bfill",
+            "link_column": cumulative_backward_link,
+        },
+        "fic": {
+            # FIC only works if the C is in the first period of the business
+            # being sampled. This is fine for automatic imputation, but care
+            # is needed if manual construction imputation is done
+            "intermediate_column": "fic",
+            "marker": "FIC",
+            # this has to have the same name as the intermediate column for constructed
+            "fill_column": "constructed",
+            "fill_method": "ffill",
+            "link_column": cumulative_forward_link,
+        },
+    }
+
+    df.sort_values([imputation_class, reference, period], inplace=True)
+
+    intermediate_columns = []
+
+    for imp_type in imputation_types:
+        df = create_impute(
+            df, [imputation_class, reference], imputation_config[imp_type]
+        )
+        df = merge_imputation_type(
+            df, imputation_config[imp_type], marker, combined_imputation
+        )
+
+        intermediate_columns.append(imputation_config[imp_type]["intermediate_column"])
+
+    return df.drop(columns=intermediate_columns)
+
+
+def create_impute(df, group, imputation_spec):
+    """
+    Add a new column of imputed values to a dataframe, using ratio imputation.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    group : str or list
+        variables that define the imputation class
+    imputation_spec : dict
+        dictionary defining the details of the imputation type
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with an added imputation column defined by the imputation_spec
+    """
+    column_name = imputation_spec["intermediate_column"]
+    fill_column = imputation_spec["fill_column"]
+    fill_method = imputation_spec["fill_method"]
+    link_column = imputation_spec["link_column"]
+
+    df[column_name] = (
+        df.groupby(group)[fill_column].fillna(method=fill_method) * df[link_column]
+    )
+    return df
+
+
+def merge_imputation_type(df, imputation_spec, marker, combined_imputation):
+    """
+    Use an existing column of imputed values and an imputation marker to merge
+    values into a single column.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    imputation_spec : dict
+        dictionary defining the details of the imputation type
+    marker : str
+        column name containing a marker to indicate the type of imputation required
+    combined_imputation : str
+        column name for the combined imputation types according to the imputation marker
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with combined_imputation
+    """
+
+    imputation_marker = imputation_spec["marker"]
+    imputation_column = imputation_spec["intermediate_column"]
+
+    df.loc[df[marker] == imputation_marker, combined_imputation] = df[imputation_column]
+    return df
diff --git a/src/construction_matches.py b/src/construction_matches.py
index dc947e0b..41ab2590 100644
--- a/src/construction_matches.py
+++ b/src/construction_matches.py
@@ -1,5 +1,6 @@
 import pandas as pd

+
 def flag_construction_matches(dataframe, target, period, auxiliary):
     """
     Add flag to indicate whether the record has non-null target, period and
@@ -22,6 +23,8 @@
         dataframe with additional flag_construction_matches column
     """

-    dataframe["flag_construction_matches"] = pd.notna(dataframe[[target, period, auxiliary]]).all(axis="columns")
+    dataframe["flag_construction_matches"] = pd.notna(
+        dataframe[[target, period, auxiliary]]
+    ).all(axis="columns")

     return dataframe
diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py
new file mode 100755
index 00000000..91dfbed9
--- /dev/null
+++ b/src/cumulative_imputation_links.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+
+def get_cumulative_links(
+    dataframe,
+    forward_or_backward,
+    strata,
+    reference,
+    target,
+    period,
+    imputation_link,
+    time_difference=1,
+):
+    """
+    Create cumulative imputation links for multiple consecutive periods
+    without a return.
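+
+    Links are chained by taking the cumulative product of the period-on-period
+    imputation links within each imputation_group, i.e. within each run of
+    consecutive missing periods for the same strata and reference.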
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+    forward_or_backward : str
+        either f or b for forward or backward method
+    strata : str
+        column name containing strata information (sic)
+    reference : str
+        column name containing business reference id
+    target : str
+        column name containing target variable
+    period : str
+        column name containing time period
+    imputation_link : str
+        column name containing imputation links
+    time_difference : int
+        time difference between predictive and target period in months
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with imputation_group and
+        cumulative_forward/backward_imputation_link column
+    """
+
+    dataframe.sort_values([strata, reference, period], inplace=True)
+    dataframe["missing_value"] = np.where(dataframe[target].isnull(), True, False)
+
+    dataframe["imputation_group"] = (
+        (
+            (dataframe["missing_value"].diff(time_difference) != 0)
+            | (dataframe[strata].diff(time_difference) != 0)
+            | (dataframe[reference].diff(time_difference) != 0)
+        )
+        .astype("int")
+        .cumsum()
+    )
+
+    if forward_or_backward == "f":
+        dataframe["cumulative_" + imputation_link] = dataframe.groupby(
+            "imputation_group"
+        )[imputation_link].cumprod()
+    elif forward_or_backward == "b":
+        dataframe["cumulative_" + imputation_link] = (
+            dataframe[::-1].groupby("imputation_group")[imputation_link].cumprod()[::-1]
+        )
+
+    dataframe["cumulative_" + imputation_link] = np.where(
+        ~dataframe[target].isnull(),
+        np.nan,
+        dataframe["cumulative_" + imputation_link],
+    )
+
+    return dataframe[["imputation_group", "cumulative_" + imputation_link]]
diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
index 9ab4a480..36df9e75 100644
--- a/src/flag_and_count_matched_pairs.py
+++ b/src/flag_and_count_matched_pairs.py
@@ -6,31 +6,32 @@ def flag_matched_pair(
     df, forward_or_backward, target, period, reference, strata, time_difference=1
 ):
     """
-    function to flag matched pairs using the shift method
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        pandas dataframe of original data
-    forward_or_backward : str
-        number of rows to shift up or down
-    target : str
-        column name containing target variable
-    period : str
-        column name containing time period
-    reference : str
-        column name containing business reference id
-    strata : str
-        column name containing strata information (sic)
-    time_difference: int
-        lookup distance for matched pairs
-
-    Returns
-    -------
-    _type_
-    two pandas dataframes: the main dataframe with column added flagging
-    forward matched pairs and
-    predictive target variable data column
+    Function to flag matched pairs using the shift method.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        pandas dataframe of original data
+    forward_or_backward : str
+        direction of the lookup, either 'f' (forward) or 'b' (backward)
+    target : str
+        column name containing target variable
+    period : str
+        column name containing time period
+    reference : str
+        column name containing business reference id
+    strata : str
+        column name containing strata information (sic)
+    time_difference : int
+        lookup distance for matched pairs
+
+    Returns
+    -------
+    pd.DataFrame
+        the main dataframe with added columns flagging matched pairs and
+        holding the predictive target variable data
     """

     df = df.sort_values(by=[reference, period])
@@ -52,6 +53,7 @@ def count_matches(df, flag, period, strata):
     """
     function to count matched pairs

+
     Parameters
     ----------
     df : pd.DataFrame
diff --git a/src/forward_link.py b/src/forward_link.py
new file mode 100644
index 00000000..1ac97429
--- /dev/null
+++ b/src/forward_link.py
@@ -0,0 +1,76 @@
+import numpy as np
+import pandas as pd
+
+
+def calculate_imputation_link(
+    df: pd.DataFrame,
+    period: str,
+    strata: str,
+    match_col: str,
+    target_variable: str,
+    predictive_variable: str,
+) -> pd.DataFrame:
+    """
+    Calculate the link between target_variable and predictive_variable by strata;
+    a match_col must be supplied which indicates if target_variable
+    and predictive_variable can be linked.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe.
+    period : str
+        Column name containing time period.
+    strata : str
+        Column name containing strata information (sic).
+    match_col : str
+        Column name of the matched pair links; this column should be bool.
+    target_variable : str
+        Column name of the target variable.
+    predictive_variable : str
+        Column name of the predictive target variable.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        A pandas DataFrame with a new column containing either f_link or b_link
+        based on the input parameters.
+    """
+
+    df_intermediate = df.copy()
+
+    if match_col == "f_matched_pair" and predictive_variable == "f_predictive_question":
+        link_col_name = "f_link"
+
+    elif (
+        match_col == "b_matched_pair" and predictive_variable == "b_predictive_question"
+    ):
+        link_col_name = "b_link"
+
+    else:
+        raise ValueError(
+            f"""
+            {match_col} and {predictive_variable} do not have the same wildcard."""
+        )
+
+    df_intermediate[target_variable] = (
+        df_intermediate[target_variable] * df_intermediate[match_col]
+    )
+
+    df_intermediate[predictive_variable] = (
+        df_intermediate[predictive_variable] * df_intermediate[match_col]
+    )
+
+    numerator = df_intermediate.groupby([strata, period])[target_variable].transform(
+        "sum"
+    )
+
+    denominator = df_intermediate.groupby([strata, period])[
+        predictive_variable
+    ].transform("sum")
+
+    denominator.replace(0, np.nan, inplace=True)  # avoid division by zero
+
+    df[link_col_name] = numerator / denominator
+
+    return df
diff --git a/src/imputation_flags.py b/src/imputation_flags.py
new file mode 100644
index 00000000..91bc04ad
--- /dev/null
+++ b/src/imputation_flags.py
@@ -0,0 +1,137 @@
+import numpy as np
+import pandas as pd
+
+
+def create_impute_flags(
+    df: pd.DataFrame,
+    target: str,
+    reference: str,
+    strata: str,
+    auxiliary: str,
+    predictive_auxiliary: str,
+):
+    """
+    Function to create logical columns for each type of imputation.
+    The output columns are needed to create the string flag column for
+    imputation methods.
+    The function requires the f_predictive and b_predictive columns produced
+    by the `flag_matched_pair` function.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing forward and backward predictive period columns
+        (these columns are created by calling flag_matched_pair_merge forwards
+        and backwards)
+    target : str
+        Column name containing target variable.
+    reference : str
+        Column name containing business reference id.
+    strata : str
+        Column name containing strata information (sic).
+    auxiliary : str
+        Column name containing auxiliary data.
+    predictive_auxiliary : str
+        Column name containing predictive auxiliary data; this column is
+        created by the flag_matched_pair_merge function.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with five additional logical columns determining if the
+        target is a return (r_flag) or can be imputed by forward imputation
+        (fir_flag), backward imputation (bir_flag), construction (c_flag)
+        or forward imputation from construction (fic_flag).
+    """
+    for direction in ["f", "b"]:
+        try:
+            df["{}_predictive_{}".format(direction, target)]
+        except KeyError:
+            raise KeyError(
+                "Dataframe needs column '{}_predictive_{}',".format(direction, target)
+                + " run flag_matched_pair function first"
+            )
+    forward_target_roll = "f_predictive_" + target + "_roll"
+    backward_target_roll = "b_predictive_" + target + "_roll"
+    forward_aux_roll = "f_predictive_" + auxiliary + "_roll"
+
+    df[forward_target_roll] = df.groupby([reference, strata])[
+        "f_predictive_" + target
+    ].ffill()
+
+    df[backward_target_roll] = df.groupby([reference, strata])[
+        "b_predictive_" + target
+    ].bfill()
+
+    df["r_flag"] = df[target].notna()
+
+    df["fir_flag"] = np.where(
+        df[forward_target_roll].notna() & df[target].isna(), True, False
+    )
+
+    df["bir_flag"] = np.where(
+        df[backward_target_roll].notna() & df[target].isna(), True, False
+    )
+
+    construction_conditions = df[target].isna() & df[auxiliary].notna()
+    df["c_flag"] = np.where(construction_conditions, True, False)
+
+    df[forward_aux_roll] = df.groupby([reference, strata])[predictive_auxiliary].ffill()
+
+    fic_conditions = df[target].isna() & df[forward_aux_roll].notna()
+    df["fic_flag"] = np.where(fic_conditions, True, False)
+
+    df.drop(
+        [
+            forward_target_roll,
+            backward_target_roll,
+            forward_aux_roll,
+            predictive_auxiliary,
+        ],
+        axis=1,
+        inplace=True,
+    )
+
+    return df
+
+
+def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Function to add a column containing a string indicating the method of
+    imputation to use, following the hierarchy in the specifications.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing the logical columns produced by
+        `create_impute_flags` (r_flag, fir_flag, bir_flag, fic_flag and c_flag).
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with an additional column containing the imputation marker,
+        i.e. the type of imputation method that should be used to fill
+        missing returns.
+    """
+
+    imputation_markers_and_conditions = {
+        "r": df["r_flag"],
+        "fir": ~df["r_flag"] & df["fir_flag"],
+        "bir": ~df["r_flag"] & ~df["fir_flag"] & df["bir_flag"],
+        "fic": ~df["r_flag"] & ~df["fir_flag"] & ~df["bir_flag"] & df["fic_flag"],
+        "c": ~df["r_flag"]
+        & ~df["fir_flag"]
+        & ~df["bir_flag"]
+        & ~df["fic_flag"]
+        & df["c_flag"],
+    }
+
+    df["imputation_marker"] = np.select(
+        imputation_markers_and_conditions.values(),
+        imputation_markers_and_conditions.keys(),
+        default="error",
+    )
+
+    return df
diff --git a/src/link_filter.py b/src/link_filter.py
new file mode 100644
index 00000000..f5ff6383
--- /dev/null
+++ b/src/link_filter.py
@@ -0,0 +1,49 @@
+import pandas as pd
+
+# TODO: Extend function to receive multiple df with *df_with_filters
+
+
+def flag_rows_to_ignore(
+    df: pd.DataFrame, df_with_filters: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Add a new bool column named ignore_from_link to df, set to True for the
+    observations defined in df_with_filters.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe.
+    df_with_filters : pd.DataFrame
+        Dataframe with observations which should be flagged in the original
+        dataframe.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Original dataframe with a bool column containing the flags.
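+
+    Examples
+    --------
+    An illustrative call; the column names follow the test fixtures and the
+    values are hypothetical:
+
+    >>> filters = pd.DataFrame({"identifier": [70001], "date": [202003]})
+    >>> flagged = flag_rows_to_ignore(df, filters)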
+ + """ + + if not set(df_with_filters.columns).issubset(df.columns): + + raise ValueError( + f"""df_with_filters has these columns {list(df_with_filters)} while + df has these columns {list(df)}, please + double check the column names.""" + ) + + # TODO: Check if values to be ignored exist + + df = df.set_index(list(df_with_filters)) + + df_with_filters = df_with_filters.set_index(list(df_with_filters)) + + df["ignore_from_link"] = df.index.isin(df_with_filters.index) + + df = df.reset_index() + + # TODO: Consider what should be logged and reroute print to logs + print("These values were flagged:\n", df.loc[df["ignore_from_link"]]) + + return df diff --git a/tests/apply_imputation_link.csv b/tests/apply_imputation_link.csv new file mode 100644 index 00000000..c81711cd --- /dev/null +++ b/tests/apply_imputation_link.csv @@ -0,0 +1,10 @@ +strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link +100,100000,200,202402,1,2,1,,,R,,, +100,100000,,202403,2,0.6,2,2,0.6,FIR,400,, +100,100000,,202404,3,1,2,6,1,FIR,1200,, +200,100001,,202402,1,4,3,1,2,BIR,600,, +200,100001,,202403,3,0.5,3,3,0.5,BIR,150,, +200,100001,300,202404,0.5,1,4,,,R,,, +300,100002,,202402,1,4,5,1,2,C,600,40,0.1 +300,100002,,202403,3,0.5,5,3,0.5,FIC,150,, +300,100002,,202404,0.5,1,5,2,,FIC,,, diff --git a/tests/calculate_links_test_data.csv b/tests/calculate_links_test_data.csv new file mode 100755 index 00000000..72e6408d --- /dev/null +++ b/tests/calculate_links_test_data.csv @@ -0,0 +1,16 @@ +,identifier,period,group,question,f_predictive_question,b_predictive_question,f_matched_pair,b_matched_pair,f_link,b_link +0,10001,202001,1,547.0,,362.0,False,True,,0.9925133689839573 +1,10001,202002,1,362.0,547.0,895.0,True,True,1.0075431034482758,0.8431018935978359 +2,10001,202003,1,895.0,362.0,,True,False,1.186096256684492, +3,10002,202001,1,381.0,,573.0,False,True,,0.9925133689839573 +4,10002,202002,1,573.0,381.0,214.0,True,True,1.0075431034482758,0.8431018935978359 +5,10002,202003,1,214.0,573.0,,True,False,1.186096256684492, +6,10001,202001,2,961.0,,267.0,False,True,,1.693854748603352 +7,10001,202002,2,267.0,961.0,314.0,True,True,0.5903693931398417,0.8523809523809524 +8,10001,202003,2,314.0,267.0,,True,False,1.1731843575418994, +9,10002,202001,2,555.0,,628.0,False,True,,1.693854748603352 +10,10002,202002,2,628.0,555.0,736.0,True,True,0.5903693931398417,0.8523809523809524 +11,10002,202003,2,736.0,628.0,,True,False,1.1731843575418994, +12,10005,202001,1,,,,False,False,,0.9925133689839573 +13,10005,202002,2,,,100.0,False,False,0.5903693931398417,0.8523809523809524 +14,10005,202003,2,100.0,,,False,False,1.1731843575418994, diff --git a/tests/cumulative_links.csv b/tests/cumulative_links.csv new file mode 100755 index 00000000..bef347a5 --- /dev/null +++ b/tests/cumulative_links.csv @@ -0,0 +1,7 @@ +strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link +100,100000,200,202402,1,2,1,, +100,100000,,202403,2,0.6,2,2,0.6 +100,100000,,202404,3,1,2,6,1 +200,100001,,202402,1,4,3,1,2 +200,100001,,202403,3,0.5,3,3,0.5 +200,100001,300,202404,0.5,1,4,, diff --git a/tests/data/apply_imputation_link/BIR.csv b/tests/data/apply_imputation_link/BIR.csv new file mode 100755 index 00000000..954700c4 --- /dev/null +++ b/tests/data/apply_imputation_link/BIR.csv @@ -0,0 
+1,4 @@ +imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value +200,100001,,202402,4,2,BIR,600 +200,100001,,202403,0.5,0.5,BIR,150 +200,100001,300,202404,1,,R, diff --git a/tests/data/apply_imputation_link/C_FIC.csv b/tests/data/apply_imputation_link/C_FIC.csv new file mode 100755 index 00000000..7d2424b2 --- /dev/null +++ b/tests/data/apply_imputation_link/C_FIC.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value +300,100002,,202402,1,,0.1,1000,C,100 +300,100002,,202403,3,3,,,FIC,300 +300,100002,,202404,0.5,1.5,,,FIC,150 diff --git a/tests/data/apply_imputation_link/FIR.csv b/tests/data/apply_imputation_link/FIR.csv new file mode 100755 index 00000000..341ece76 --- /dev/null +++ b/tests/data/apply_imputation_link/FIR.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value +100,100000,200,202402,1,,R, +100,100000,,202403,2,2,FIR,400 +100,100000,,202404,3,6,FIR,1200 diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv new file mode 100755 index 00000000..91ec36ec --- /dev/null +++ b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv @@ -0,0 +1,10 @@ +imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value +100,100000,200,202402,1,2,,,,,R, +100,100000,,202403,2,0.6,,,2,0.6,FIR,400 +100,100000,,202404,3,1,,,6,1,FIR,1200 +200,100001,,202402,1,4,,,1,2,BIR,600 +200,100001,,202403,3,0.5,,,3,0.5,BIR,150 +200,100001,300,202404,0.5,1,,,,,R, +300,100002,,202402,1,4,1000,0.1,,2,C,100 +300,100002,,202403,3,0.5,,,3,0.5,FIC,300 +300,100002,,202404,0.5,1,,,1.5,,FIC,150 diff --git a/tests/helper_functions.py b/tests/helper_functions.py index b9006376..83bce07d 100644 --- a/tests/helper_functions.py +++ b/tests/helper_functions.py @@ -1,7 +1,8 @@ import pandas as pd + def load_and_format(filename): """Load csv as pandas dataframe and cast period column to datetime type""" df_loaded = pd.read_csv(filename) - df_loaded['period'] = pd.to_datetime(df_loaded['period'], format='%Y%m') + df_loaded["period"] = pd.to_datetime(df_loaded["period"], format="%Y%m") return df_loaded diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv new file mode 100644 index 00000000..31b56aa8 --- /dev/null +++ b/tests/imputation_flag_data.csv @@ -0,0 +1,28 @@ +reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker +1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r +1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir +1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r +1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r +2,100,202001,,72.0,,,False,False,True,True,False,,bir +2,100,202002,,,,,False,False,True,False,True,72.0,bir +2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir +2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r +3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir +3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r 
+3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r +3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r +4,100,202001,64.0,81.0,,,True,False,False,False,False,,r +4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir +4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir +4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r +5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r +5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r +5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r +5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r +6,100,202001,64.0,81.0,,,True,False,False,False,False,,r +6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir +6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r +6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir +7,100,202001,,40.0,,,False,False,False,True,False,,c +7,100,202002,,,,,False,False,False,False,True,40.0,fic +7,100,202003,,,,,False,False,False,False,True,,fic diff --git a/tests/test_apply_imputation_link.py b/tests/test_apply_imputation_link.py new file mode 100755 index 00000000..568bfcec --- /dev/null +++ b/tests/test_apply_imputation_link.py @@ -0,0 +1,37 @@ +from pathlib import Path + +import pytest +from helper_functions import load_and_format +from pandas.testing import assert_frame_equal + +from src.apply_imputation_link import create_and_merge_imputation_values + + +@pytest.fixture(scope="class") +def fir_bir_c_fic_test_data(): + return load_and_format( + Path("tests") / "data" / "apply_imputation_link" / "FIR_BIR_C_FIC.csv" + ) + + +class TestApplyImputationLink: + def test_all_imputation_types(self, fir_bir_c_fic_test_data): + expected_output = fir_bir_c_fic_test_data + + input_data = expected_output.drop(columns=["imputed_value"]) + actual_output = create_and_merge_imputation_values( + input_data, + "imputation_class", + "reference", + "period", + "imputation_marker", + "imputed_value", + "target", + "cumulative_forward_link", + "cumulative_backward_link", + "auxiliary_variable", + "construction_link", + imputation_types=("c", "fir", "bir", "fic"), + ) + + assert_frame_equal(actual_output, expected_output) diff --git a/tests/test_construction_matches.py b/tests/test_construction_matches.py index 3daf0260..1378c6ba 100644 --- a/tests/test_construction_matches.py +++ b/tests/test_construction_matches.py @@ -1,25 +1,31 @@ -import pytest - from pathlib import Path + +import pytest +from helper_functions import load_and_format from pandas.testing import assert_frame_equal from src.construction_matches import flag_construction_matches -from helper_functions import load_and_format + @pytest.fixture(scope="class") def construction_test_data(): - return load_and_format(Path("tests")/"construction_matches.csv") - + return load_and_format(Path("tests") / "construction_matches.csv") + + class TestConstructionMatches: def test_construction_matches_flag(self, construction_test_data): - expected_output = construction_test_data[[ - "target", - "period", - "auxiliary", - "flag_construction_matches", - ]] + expected_output = construction_test_data[ + [ + "target", + "period", + "auxiliary", + "flag_construction_matches", + ] + ] input_data = expected_output.drop(columns=["flag_construction_matches"]) - actual_output = flag_construction_matches(input_data, "target", "period", "auxiliary") + actual_output = flag_construction_matches( + input_data, "target", "period", "auxiliary" + ) - 
assert_frame_equal(actual_output, expected_output)
\ No newline at end of file
+        assert_frame_equal(actual_output, expected_output)
diff --git a/tests/test_cumulative_imputation_links.py b/tests/test_cumulative_imputation_links.py
new file mode 100755
index 00000000..bf31094a
--- /dev/null
+++ b/tests/test_cumulative_imputation_links.py
@@ -0,0 +1,64 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.cumulative_imputation_links import get_cumulative_links
+
+
+@pytest.fixture(scope="class")
+def cumulative_links_test_data():
+    return load_and_format(Path("tests") / "cumulative_links.csv")
+
+
+class TestCumulativeLinks:
+    def test_get_cumulative_links_forward(self, cumulative_links_test_data):
+        input_data = cumulative_links_test_data.drop(
+            columns=["cumulative_forward_imputation_link", "imputation_group"]
+        )
+
+        expected_output = cumulative_links_test_data[
+            [
+                "imputation_group",
+                "cumulative_forward_imputation_link",
+            ]
+        ]
+
+        actual_output = get_cumulative_links(
+            input_data,
+            "f",
+            "strata",
+            "reference",
+            "target",
+            "period",
+            "forward_imputation_link",
+            1,
+        )
+
+        assert_frame_equal(actual_output, expected_output)
+
+    def test_get_cumulative_links_backward(self, cumulative_links_test_data):
+        input_data = cumulative_links_test_data.drop(
+            columns=["cumulative_backward_imputation_link", "imputation_group"]
+        )
+
+        expected_output = cumulative_links_test_data[
+            [
+                "imputation_group",
+                "cumulative_backward_imputation_link",
+            ]
+        ]
+
+        actual_output = get_cumulative_links(
+            input_data,
+            "b",
+            "strata",
+            "reference",
+            "target",
+            "period",
+            "backward_imputation_link",
+            1,
+        )
+
+        assert_frame_equal(actual_output, expected_output)
diff --git a/tests/test_flag_data.csv b/tests/test_flag_data.csv
new file mode 100755
index 00000000..2f97b47f
--- /dev/null
+++ b/tests/test_flag_data.csv
@@ -0,0 +1,29 @@
+identifier,date,group,question,other,ignore_from_link
+70001,202001,100,5951.0,39,False
+70001,202002,100,1814.0,39,False
+70001,202003,100,734.0,39,True
+70001,202004,100,96.0,39,False
+70001,202005,100,9086.0,39,True
+70001,202006,100,3949.0,39,False
+70001,202007,100,49.0,39,False
+70002,202001,100,6705.0,94,False
+70002,202002,100,48.0,94,False
+70002,202003,100,5361.0,94,False
+70002,202004,100,8767.0,94,False
+70002,202005,100,9214.0,94,False
+70002,202006,100,7467.0,94,False
+70002,202007,100,3475.0,94,False
+70003,202001,100,6153.0,42,False
+70003,202002,100,7711.0,42,False
+70003,202003,100,5403.0,42,False
+70003,202004,100,7445.0,42,False
+70003,202005,100,7092.0,42,False
+70003,202006,100,2038.0,42,False
+70003,202007,100,8768.0,42,False
+70004,202001,100,,6,False
+70004,202002,100,,6,False
+70004,202003,100,6288.0,6,False
+70004,202004,100,,6,False
+70004,202005,100,,6,False
+70004,202006,100,5875.0,6,False
+70004,202007,100,,6,False
diff --git a/tests/test_flag_filters.csv b/tests/test_flag_filters.csv
new file mode 100755
index 00000000..abdfb4c8
--- /dev/null
+++ b/tests/test_flag_filters.csv
@@ -0,0 +1,3 @@
+identifier,date
+70001,202003
+70001,202005
diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py
new file mode 100644
index 00000000..51fa63c8
--- /dev/null
+++ b/tests/test_forward_link.py
@@ -0,0 +1,75 @@
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.forward_link import calculate_imputation_link
+
+scenarios = ["calculate_links_test_data"]
+
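+# Each scenario name maps to a CSV fixture at tests/<scenario>.csv.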
+@pytest.mark.parametrize("scenario", scenarios)
+class TestLinks:
+    def test_forward_links(self, scenario):
+        """Test if function returns the f_link column"""
+
+        df_output = load_and_format("tests/" + scenario + ".csv")
+
+        df_input = df_output.drop(columns=["f_link"])
+
+        df_input = calculate_imputation_link(
+            df_input,
+            "period",
+            "group",
+            "f_matched_pair",
+            "question",
+            "f_predictive_question",
+        )
+
+        assert_frame_equal(df_input, df_output, check_like=True)
+
+    def test_back_links(self, scenario):
+        """Test if function returns the b_link column"""
+        df_output = load_and_format("tests/" + scenario + ".csv")
+
+        df_input = df_output.drop(columns=["b_link"])
+
+        df_input = calculate_imputation_link(
+            df_input,
+            "period",
+            "group",
+            "b_matched_pair",
+            "question",
+            "b_predictive_question",
+        )
+
+        assert_frame_equal(df_input, df_output, check_like=True)
+
+    def test_exception(self, scenario):
+        """
+        Test if the function raises a ValueError when called with mismatched
+        arguments, in particular with f_matched_pair and b_predictive_question
+        or with b_matched_pair and f_predictive_question.
+        """
+        df = load_and_format("tests/" + scenario + ".csv")
+
+        with pytest.raises(ValueError):
+            df = calculate_imputation_link(
+                df,
+                "period",
+                "group",
+                "f_matched_pair",
+                "question",
+                "b_predictive_question",
+            )
+        with pytest.raises(ValueError):
+            df = calculate_imputation_link(
+                df,
+                "period",
+                "group",
+                "b_matched_pair",
+                "question",
+                "f_predictive_question",
+            )
diff --git a/tests/test_imputation_flags.py b/tests/test_imputation_flags.py
new file mode 100644
index 00000000..315b5fa3
--- /dev/null
+++ b/tests/test_imputation_flags.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.imputation_flags import create_impute_flags, generate_imputation_marker
+
+
+@pytest.fixture(scope="class")
+def imputation_flag_test_data():
+    return load_and_format(Path("tests") / "imputation_flag_data.csv")
+
+
+class TestImputationFlags:
+    def test_create_impute_flags(self, imputation_flag_test_data):
+        df_expected_output = imputation_flag_test_data.copy()
+        df_expected_output.drop(["imputation_marker"], axis=1, inplace=True)
+        df_input = df_expected_output.copy()
+        df_input = df_input[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "auxiliary",
+                "f_predictive_target_variable",
+                "b_predictive_target_variable",
+                "f_predictive_auxiliary",
+            ]
+        ]
+        df_output = create_impute_flags(
+            df=df_input,
+            target="target_variable",
+            reference="reference",
+            strata="strata",
+            auxiliary="auxiliary",
+            predictive_auxiliary="f_predictive_auxiliary",
+        )
+
+        df_expected_output.drop(["f_predictive_auxiliary"], axis=1, inplace=True)
+
+        assert_frame_equal(df_output, df_expected_output)
+
+    def test_imputation_marker(self, imputation_flag_test_data):
+        df_expected_output = imputation_flag_test_data.copy()
+        df_input = imputation_flag_test_data.copy()
+        df_input.drop("imputation_marker", axis=1, inplace=True)
+        df_output = generate_imputation_marker(df_input)
+        assert_frame_equal(df_output, df_expected_output)
diff --git a/tests/test_link_filter.py b/tests/test_link_filter.py
new file mode 100644
index 00000000..bbd5cc75
--- /dev/null
+++ b/tests/test_link_filter.py
@@ -0,0 +1,39 @@
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from src.link_filter import flag_rows_to_ignore
+
+
+@pytest.mark.parametrize("scenario", ["test_flag_data"])
+@pytest.mark.parametrize("filters", ["test_flag_filters"])
+class TestFilters:
+    def test_basic_filter(self, scenario, filters):
+        """Test ignore_from_link is correct"""
+
+        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")
+
+        df_filters = pd.read_csv("tests/" + filters + ".csv")
+
+        df_input = df_output_expected.drop(columns=["ignore_from_link"])
+
+        df_output = flag_rows_to_ignore(df_input, df_filters)
+
+        assert_frame_equal(df_output, df_output_expected)
+
+    def test_exception(self, scenario, filters):
+        """Test if function raises an exception when the columns in filters
+        do not exist in scenario."""
+
+        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")
+
+        df_filters = pd.read_csv("tests/" + filters + ".csv")
+
+        df_input = df_output_expected.drop(columns=["ignore_from_link"])
+
+        with pytest.raises(ValueError):
+            df_filters.columns = df_filters.columns + "_fail"
+            flag_rows_to_ignore(df_input, df_filters)
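The chaining behaviour exercised by tests/cumulative_links.csv can be reproduced directly. A minimal sketch, with the link and target values copied from the first fixture rows (the inline DataFrame itself is illustrative):

```python
import pandas as pd

from src.cumulative_imputation_links import get_cumulative_links

# One business with a return in 202402 followed by two missing periods: the
# forward links are chained by cumulative product within the missing run.
df = pd.DataFrame(
    {
        "strata": [100, 100, 100],
        "reference": [100000, 100000, 100000],
        "target": [200, None, None],
        "period": [202402, 202403, 202404],
        "forward_imputation_link": [1.0, 2.0, 3.0],
    }
)

result = get_cumulative_links(
    df, "f", "strata", "reference", "target", "period", "forward_imputation_link"
)

# cumulative_forward_imputation_link: NaN (period with a return), 2.0, 6.0
print(result)
```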