From d899c73129b60d18c3df9e5e9248168d74b0572c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 18 Apr 2024 20:12:31 -0700 Subject: [PATCH 1/6] KP2023 ETL: Symptom columns data dict update --- lib/seattleflu/id3c/cli/command/clinical.py | 78 +++++++++++++++++---- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index d9e6a2e5..0a5c70a4 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -12,6 +12,7 @@ import os import re import pandas as pd +import numpy as np import id3c.db as db import time import requests @@ -899,12 +900,12 @@ def parse_kp2023(kp2023_filename: str) -> None: 'symptom_cough', 'symptom_fever', 'symptom_chills', - 'symptom__throat', # this typo has been in all KP2023 sheets so far, so expect it; however, if the typo is fixed, it will be let through below - 'sympton_sob', # this typo has been in all KP2023 sheets so far, so expect it; however, if the typo is fixed, it will be let through below + 'symptom_throat', # typo that appeared in early KP2023 metadata will be let through below + 'symptom_sob', # typo that appeared in early KP2023 metadata will be let through below 'symptom_nose', 'symptom_smell_taste', - 'symptom_unk', - 'symptom_no_answer', + #'symptom_unk', # no longer in data dictionary as of January 2024 + #'symptom_no_answer', # no longer in data dictionary as of January 2024 'date_flu_1', 'date_flu_2', 'flu_type_1', @@ -920,11 +921,11 @@ def parse_kp2023(kp2023_filename: str) -> None: # check for missing expected columns missing_cols = list(set(expected_columns).difference(clinical_records.columns)) - # If KP fixes known typos on their end, allow those through. 
(if typo name is missing but fixed name is present) - if 'sympton_sob' in missing_cols and 'symptom_sob' in clinical_records.columns: - missing_cols.remove('sympton_sob') - if 'symptom__throat' in missing_cols and 'symptom_throat' in clinical_records.columns: - missing_cols.remove('symptom__throat') + # Allow typos from early KP2023 metadata sheets through + if 'symptom_sob' in missing_cols and 'sympton_sob' in clinical_records.columns: + missing_cols.remove('symptom_sob') + if 'symptom_throat' in missing_cols and 'symptom__throat' in clinical_records.columns: + missing_cols.remove('symptom_throat') if len(missing_cols) > 0: raise MissingColumn(f'One or more expected columns are missing from the input spreadsheet: {*missing_cols,}') @@ -933,17 +934,14 @@ def parse_kp2023(kp2023_filename: str) -> None: 'marshfield_lab_id': 'collection_id', # will be mapped to lims barcode with id3c clinical match-kp2023 'hispaniclatino': 'ethnicity', 'assigned_sex': 'sex', - 'symptom__throat': 'symptom_throat', # fix extra underscore + 'symptom__throat': 'symptom_throat', # fix typo if present 'censustract': 'census_tract', 'type_of_visit': 'patient_class', - 'sympton_sob': 'symptom_sob' # fix typo + 'sympton_sob': 'symptom_sob' # fix typo if present } clinical_records = clinical_records.rename(columns=column_map) - # check for missing or duplicated barcodes? 
- #barcode_quality_control(clinical_records) - # The collection ids on the tubes from KP have aliquot numbers appended to them (ex: KPWB100001C-1) # but the collection ids in the metadata spreadsheet do not have these aliquot numbers at the end (ex: KPWB100001C) # therefore, we will check that there is no aliquot number at the end of the collection id in the metadata spreadsheet, @@ -957,6 +955,19 @@ def parse_kp2023(kp2023_filename: str) -> None: clinical_records.loc[collection_ids_with_aliquot, 'collection_id'] = clinical_records.loc[ collection_ids_with_aliquot, 'collection_id' ].apply(lambda cid: re.sub(r'-\d+$','', cid)) + + # convert symptom columns from numeric to binary (0/1) + clinical_records = convert_column_set_to_binary(clinical_records, 'symptom_') + + # check that expected binary columns only contain 0/1/None values + # race + if not column_set_is_binary(clinical_records, 'race_'): + raise UnexpectedNumeric(f'One or more columns with prefix "race_" have values other than 0/1/None.\ + These columns are expected to be binary.') + # sex column is binary, but the map function that we use below + # will automatically convert non-0/1 values to None, + # so don't need to check that here + # likewise with patient_class # map high risk codes to ICD-10 codes, and collapse into one column 'icd10' clinical_records = map_icd10_codes(clinical_records) @@ -1071,6 +1082,36 @@ def parse_kp2023(kp2023_filename: str) -> None: dump_ndjson(clinical_records) +def convert_column_set_to_binary(df: pd.DataFrame, prefix: str) -> pd.DataFrame: + """ + Given a DataFrame *df* of clinical records and a string *prefix* with + a prefix denoting which columns to convert, returns a DataFrame where + columns whose names begin with *prefix* contain only values 0, 1, or None. + Any value other than 1 in the input column is converted to None. + See KP2023 data dictionary for details. 
+ """ + cols = [c for c in df.columns if c.startswith(prefix)] + for c in cols: + df.loc[df[c] > 1, c] = None + + return df + + +def column_set_is_binary(df: pd.DataFrame, prefix:str) -> bool: + """ + Given a DataFrame *df* of clinical records and a string *prefix* with + a prefix denoting columns of interest, returns True if all columns + beginning with the provided prefix contain only 0/1/None values, + otherwise returns False. + """ + cols = [c for c in df.columns if c.startswith(prefix)] + for c in cols: + if not np.isin(df[c].dropna().unique(), [0, 1]).all(): + return False + + return True + + def map_icd10_codes(df: pd.DataFrame) -> pd.DataFrame: """ Given a DataFrame *df* of clinical records, returns a DataFrame @@ -1613,4 +1654,11 @@ class MissingColumn(KeyError): Raised by :function: `parse-kp2023` if any expected columns are not found in the input spreadsheet after standardizing column names """ - pass \ No newline at end of file + pass + +class UnexpectedNumeric(KeyError): + """ + Raised by function parse-kp2023 if any columns that are expected to be binary + have values other than 0/1/None + """ + pass From d0febf4d67d3d75b2887c16dfeb2a48db2d3616f Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 18 Apr 2024 21:23:55 -0700 Subject: [PATCH 2/6] Clinical ETL: add encounter reference to ICD-10 condition entries FHIR ETL requires encounter reference to be present in order to process condition entries --- lib/seattleflu/id3c/cli/command/etl/clinical.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/etl/clinical.py b/lib/seattleflu/id3c/cli/command/etl/clinical.py index f4d08016..b068af42 100644 --- a/lib/seattleflu/id3c/cli/command/etl/clinical.py +++ b/lib/seattleflu/id3c/cli/command/etl/clinical.py @@ -302,7 +302,7 @@ def generate_fhir_bundle(db: DatabaseSession, record: dict, site_id: str) -> Opt if record["site"].upper() == 'KP2023': # KP2023 includes some types of metadata that PHSKC 
does not - icd10_condition_entries = create_icd10_conditions_kp2023(record, patient_reference) + icd10_condition_entries = create_icd10_conditions_kp2023(record, patient_reference, encounter_reference) symptom_condition_entries = create_symptom_conditions(record, patient_reference, encounter_reference) immunization_entries = create_immunization_kp2023(record, patient_reference) resource_entries.extend(icd10_condition_entries + symptom_condition_entries + immunization_entries) @@ -513,7 +513,7 @@ def create_symptom_conditions(record: dict, patient_reference: dict, encounter_r return condition_entries -def create_icd10_conditions_kp2023(record:dict, patient_reference: dict) -> list: +def create_icd10_conditions_kp2023(record:dict, patient_reference: dict, encounter_reference: dict) -> list: """ Create a condition resource for each ICD-10 code, following the FHIR format (http://www.hl7.org/implement/standards/fhir/condition.html) @@ -1051,7 +1051,8 @@ def create_icd10_conditions_kp2023(record:dict, patient_reference: dict) -> list create_codeable_concept( system = icd10_codes[icd10_code]["system"], code = icd10_codes[icd10_code]["code"], - display = icd10_codes[icd10_code]["display"]) + display = icd10_codes[icd10_code]["display"]), + encounter_reference ) condition_entries.append(create_resource_entry( From 05a6bb08f75ef4dc48ca48d1901339d61fcc447f Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 18 Apr 2024 21:25:32 -0700 Subject: [PATCH 3/6] KP ETL: ingest comorbidity metadata for original sample stream --- lib/seattleflu/id3c/cli/command/clinical.py | 239 +++++++++++--------- 1 file changed, 131 insertions(+), 108 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 0a5c70a4..43079c12 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -394,6 +394,7 @@ def parse_kp(kp_filename, kp_specimen_manifest_filename, manifest_format, 
output clinical_records = rename_symptoms_columns(clinical_records) clinical_records = collapse_columns(clinical_records, 'symptom') clinical_records = collapse_columns(clinical_records, 'race') + clinical_records = map_icd10_codes(clinical_records, 'kp') clinical_records['FluShot'] = clinical_records['fluvaxdt'].notna() @@ -409,6 +410,7 @@ def parse_kp(kp_filename, kp_specimen_manifest_filename, manifest_format, output "FluShot": "FluShot", "censustract": "census_tract", "_provenance": "_provenance", + "icd10": "ICD10" } if manifest_format=="year1": @@ -970,7 +972,7 @@ def parse_kp2023(kp2023_filename: str) -> None: # likewise with patient_class # map high risk codes to ICD-10 codes, and collapse into one column 'icd10' - clinical_records = map_icd10_codes(clinical_records) + clinical_records = map_icd10_codes(clinical_records, 'kp2023') # collapse race and symptom columns clinical_records = collapse_columns(clinical_records, 'symptom_', 'collection_id') @@ -1112,117 +1114,138 @@ def column_set_is_binary(df: pd.DataFrame, prefix:str) -> bool: return True -def map_icd10_codes(df: pd.DataFrame) -> pd.DataFrame: +def map_icd10_codes(df: pd.DataFrame, sample_stream: str) -> pd.DataFrame: """ Given a DataFrame *df* of clinical records, returns a DataFrame with an icd10 column containing a list of all positive icd10 codes for each record """ - icd10_mapper = { - "chronic ischemic heart disease": "I25", - "heart failure": "I50", - "simple and mucopurulent chronic bronchitis": "J41", - "unspecified chronic bronchitis": "J42", - "other chronic obstructive pulmonary disease": "J44", - "asthma": "J45", - "bronchiectasis": "J47", - "acute respiratory distress syndrome": "J80", - "type 2 diabetes mellitus": "E11", - "encounter for antineoplastic chemotherapy and immunotherapy": "Z51.1", - "transplanted organ and tissue status": "Z94", - "chronic viral hepatitis": "B18", - "alcoholic liver disease": "K70", - "malignant neoplasm of lip": "C00", - "malignant neoplasm of base of 
tongue": "C01", - "malignant neoplasm of other and unspecified parts of tongue": "C02", - "malignant neoplasm of gum": "C03", - "malignant neoplasm of floor of mouth": "C04", - "malignant neoplasm of palate": "C05", - "malignant neoplasm of other and unspecified parts of mouth": "C06", - "malignant neoplasm of parotid gland": "C07", - "malignant neoplasm of other and unspecified major salivary glands": "C08", - "malignant neoplasm of tonsil": "C09", - "malignant neoplasm of oropharynx": "C10", - "malignant neoplasm of nasopharynx": "C11", - "malignant neoplasm of pyriform sinus": "C12", - "malignant neoplasm of hypopharynx": "C13", - "malignant neoplasm of other and ill-defined sites in the lip, oral cavity and pharynx": "C14", - "malignant neoplasm of esophagus": "C15", - "malignant neoplasm of stomach": "C16", - "malignant neoplasm of small intestine": "C17", - "malignant neoplasm of colon": "C18", - "malignant neoplasm of rectosigmoid junction": "C19", - "malignant neoplasm of rectum": "C20", - "malignant neoplasm of anus and anal canal": "C21", - "malignant neoplasm of liver and intrahepatic bile ducts": "C22", - "malignant neoplasm of gallbladder": "C23", - "malignant neoplasm of other and unspecified parts of biliary tract": "C24", - "malignant neoplasm of pancreas": "C25", - "malignant neoplasm of other and ill-defined digestive organs": "C26", - "malignant neoplasm of nasal cavity and middle ear": "C30", - "malignant neoplasm of accessory sinuses": "C31", - "malignant neoplasm of larynx": "C32", - "malignant neoplasm of trachea": "C33", - "malignant neoplasm of bronchus and lung": "C34", - "malignant neoplasm of thymus": "C37", - "malignant neoplasm of heart, mediastinum and pleura": "C38", - "malignant neoplasm of other and ill-defined sites in the respiratory system and intrathoracic organs": "C39", - "malignant neoplasm of bone and articular cartilage of limbs": "C40", - "malignant neoplasm of bone and articular cartilage of other and unspecified sites": 
"C41", - "malignant melanoma of skin": "C43", - "other and unspecified malignant neoplasm of skin": "C44", - "mesothelioma": "C45", - "kaposi's sarcoma": "C46", - "malignant neoplasm of peripheral nerves and autonomic nervous system": "C47", - "malignant neoplasm of retroperitoneum and peritoneum": "C48", - "malignant neoplasm of other connective and soft tissue": "C49", - "merkel cell carcinoma": "C4A", - "malignant neoplasms of breast": "C50", - "malignant neoplasm of vulva": "C51", - "malignant neoplasm of vagina": "C52", - "malignant neoplasm of cervix uteri": "C53", - "malignant neoplasm of corpus uteri": "C54", - "malignant neoplasm of uterus, part unspecified": "C55", - "malignant neoplasm of ovary": "C56", - "malignant neoplasm of other and unspecified female genital organs": "C57", - "malignant neoplasm of placenta": "C58", - "malignant neoplasm of penis": "C60", - "malignant neoplasm of prostate": "C61", - "malignant neoplasm of testis": "C62", - "malignant neoplasm of other and unspecified male genital organs": "C63", - "malignant neoplasm of kidney, except renal pelvis": "C64", - "malignant neoplasm of renal pelvis": "C65", - "malignant neoplasm of ureter": "C66", - "malignant neoplasm of bladder": "C67", - "malignant neoplasm of other and unspecified urinary organs": "C68", - "malignant neoplasm of eye and adnexa": "C69", - "malignant neoplasm of meninges": "C70", - "malignant neoplasm of brain": "C71", - "malignant neoplasm of spinal cord, cranial nerves and other parts of central nervous system": "C72", - "malignant neoplasm of thyroid gland": "C73", - "malignant neoplasm of adrenal gland": "C74", - "malignant neoplasm of other endocrine glands and related structures": "C75", - "malignant neoplasm of other and ill-defined sites": "C76", - "secondary and unspecified malignant neoplasm of lymph nodes": "C77", - "secondary malignant neoplasm of respiratory and digestive organs": "C78", - "secondary malignant neoplasm of other and unspecified sites": 
"C79", - "malignant neuroendocrine tumors": "C7A", - "secondary neuroendocrine tumors": "C7B", - "malignant neoplasm without specification of site": "C80", - "hodgkin lymphoma": "C81", - "follicular lymphoma": "C82", - "non-follicular lymphoma": "C83", - "mature t/nk-cell lymphomas": "C84", - "other specified and unspecified types of non-hodgkin lymphoma": "C85", - "other specified types of t/nk-cell lymphoma": "C86", - "malignant immunoproliferative diseases and certain other b-cell lymphomas": "C88", - "multiple myeloma and malignant plasma cell neoplasms": "C90", - "lymphoid leukemia": "C91", - "myeloid leukemia": "C92", - "monocytic leukemia": "C93", - "other leukemias of specified cell type": "C94", - "leukemia of unspecified cell type": "C95", - "other and unspecified malignant neoplasms of lymphoid, hematopoietic and related tissue": "C96" - } + if sample_stream == 'kp': + icd10_mapper = { + "cvd": "I25.10", + "chf": "I50.9", + "bronch": "J42", # could map to J41 or J42 + "copd": "J44.9", + "asthma": "J45", + "diabetes": "E11.9", + "renal": "E18.9", + "chemo": "Z51.1", + "solidorgan": "Z94", + "hsct": "Z94.84", + "liver": "B18", # this could map to B18 or K70.9 + "cancer": "C", # not sure what to do for original kp sample stream which is not more specific about cancer type + "lungmalig": "C34" + } + + elif sample_stream == 'kp2023': + icd10_mapper = { + "chronic ischemic heart disease": "I25", + "heart failure": "I50", + "simple and mucopurulent chronic bronchitis": "J41", + "unspecified chronic bronchitis": "J42", + "other chronic obstructive pulmonary disease": "J44", + "asthma": "J45", + "bronchiectasis": "J47", + "acute respiratory distress syndrome": "J80", + "type 2 diabetes mellitus": "E11", + "encounter for antineoplastic chemotherapy and immunotherapy": "Z51.1", + "transplanted organ and tissue status": "Z94", + "chronic viral hepatitis": "B18", + "alcoholic liver disease": "K70", + "malignant neoplasm of lip": "C00", + "malignant neoplasm of base of
tongue": "C01", + "malignant neoplasm of other and unspecified parts of tongue": "C02", + "malignant neoplasm of gum": "C03", + "malignant neoplasm of floor of mouth": "C04", + "malignant neoplasm of palate": "C05", + "malignant neoplasm of other and unspecified parts of mouth": "C06", + "malignant neoplasm of parotid gland": "C07", + "malignant neoplasm of other and unspecified major salivary glands": "C08", + "malignant neoplasm of tonsil": "C09", + "malignant neoplasm of oropharynx": "C10", + "malignant neoplasm of nasopharynx": "C11", + "malignant neoplasm of pyriform sinus": "C12", + "malignant neoplasm of hypopharynx": "C13", + "malignant neoplasm of other and ill-defined sites in the lip, oral cavity and pharynx": "C14", + "malignant neoplasm of esophagus": "C15", + "malignant neoplasm of stomach": "C16", + "malignant neoplasm of small intestine": "C17", + "malignant neoplasm of colon": "C18", + "malignant neoplasm of rectosigmoid junction": "C19", + "malignant neoplasm of rectum": "C20", + "malignant neoplasm of anus and anal canal": "C21", + "malignant neoplasm of liver and intrahepatic bile ducts": "C22", + "malignant neoplasm of gallbladder": "C23", + "malignant neoplasm of other and unspecified parts of biliary tract": "C24", + "malignant neoplasm of pancreas": "C25", + "malignant neoplasm of other and ill-defined digestive organs": "C26", + "malignant neoplasm of nasal cavity and middle ear": "C30", + "malignant neoplasm of accessory sinuses": "C31", + "malignant neoplasm of larynx": "C32", + "malignant neoplasm of trachea": "C33", + "malignant neoplasm of bronchus and lung": "C34", + "malignant neoplasm of thymus": "C37", + "malignant neoplasm of heart, mediastinum and pleura": "C38", + "malignant neoplasm of other and ill-defined sites in the respiratory system and intrathoracic organs": "C39", + "malignant neoplasm of bone and articular cartilage of limbs": "C40", + "malignant neoplasm of bone and articular cartilage of other and unspecified sites": 
"C41", + "malignant melanoma of skin": "C43", + "other and unspecified malignant neoplasm of skin": "C44", + "mesothelioma": "C45", + "kaposi's sarcoma": "C46", + "malignant neoplasm of peripheral nerves and autonomic nervous system": "C47", + "malignant neoplasm of retroperitoneum and peritoneum": "C48", + "malignant neoplasm of other connective and soft tissue": "C49", + "merkel cell carcinoma": "C4A", + "malignant neoplasms of breast": "C50", + "malignant neoplasm of vulva": "C51", + "malignant neoplasm of vagina": "C52", + "malignant neoplasm of cervix uteri": "C53", + "malignant neoplasm of corpus uteri": "C54", + "malignant neoplasm of uterus, part unspecified": "C55", + "malignant neoplasm of ovary": "C56", + "malignant neoplasm of other and unspecified female genital organs": "C57", + "malignant neoplasm of placenta": "C58", + "malignant neoplasm of penis": "C60", + "malignant neoplasm of prostate": "C61", + "malignant neoplasm of testis": "C62", + "malignant neoplasm of other and unspecified male genital organs": "C63", + "malignant neoplasm of kidney, except renal pelvis": "C64", + "malignant neoplasm of renal pelvis": "C65", + "malignant neoplasm of ureter": "C66", + "malignant neoplasm of bladder": "C67", + "malignant neoplasm of other and unspecified urinary organs": "C68", + "malignant neoplasm of eye and adnexa": "C69", + "malignant neoplasm of meninges": "C70", + "malignant neoplasm of brain": "C71", + "malignant neoplasm of spinal cord, cranial nerves and other parts of central nervous system": "C72", + "malignant neoplasm of thyroid gland": "C73", + "malignant neoplasm of adrenal gland": "C74", + "malignant neoplasm of other endocrine glands and related structures": "C75", + "malignant neoplasm of other and ill-defined sites": "C76", + "secondary and unspecified malignant neoplasm of lymph nodes": "C77", + "secondary malignant neoplasm of respiratory and digestive organs": "C78", + "secondary malignant neoplasm of other and unspecified sites": 
"C79", + "malignant neuroendocrine tumors": "C7A", + "secondary neuroendocrine tumors": "C7B", + "malignant neoplasm without specification of site": "C80", + "hodgkin lymphoma": "C81", + "follicular lymphoma": "C82", + "non-follicular lymphoma": "C83", + "mature t/nk-cell lymphomas": "C84", + "other specified and unspecified types of non-hodgkin lymphoma": "C85", + "other specified types of t/nk-cell lymphoma": "C86", + "malignant immunoproliferative diseases and certain other b-cell lymphomas": "C88", + "multiple myeloma and malignant plasma cell neoplasms": "C90", + "lymphoid leukemia": "C91", + "myeloid leukemia": "C92", + "monocytic leukemia": "C93", + "other leukemias of specified cell type": "C94", + "leukemia of unspecified cell type": "C95", + "other and unspecified malignant neoplasms of lymphoid, hematopoietic and related tissue": "C96" + } + + else: + raise ValueError(f'Unrecognized sample stream input to function map_icd10_codes: ' + sample_stream) # rename columns df = df.rename(columns=icd10_mapper) From 9b71b08a20902932a2275fc8a6028a7131ff1301 Mon Sep 17 00:00:00 2001 From: sallybg <136365880+sallybg@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:10:56 -0700 Subject: [PATCH 4/6] parse-kp2023: refactor column_set_is_binary() Co-authored-by: Benjamin Capodanno <31941502+bencap@users.noreply.github.com> --- lib/seattleflu/id3c/cli/command/clinical.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 43079c12..34f2eca6 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -1107,11 +1107,7 @@ def column_set_is_binary(df: pd.DataFrame, prefix:str) -> bool: otherwise returns False. 
""" cols = [c for c in df.columns if c.startswith(prefix)] - for c in cols: - if not np.isin(df[c].dropna().unique(), [0, 1]).all(): - return False - - return True + return all([np.isin(df[c].dropna().unique(), [0, 1]).all() for c in cols]) def map_icd10_codes(df: pd.DataFrame, sample_stream: str) -> pd.DataFrame: From 681d6be149545af0e217cc5cf4eb0b13a7a6716f Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 22 Apr 2024 12:24:21 -0700 Subject: [PATCH 5/6] parse-kp: do not convert timestamp to local timezone In summer 2022, all clinical parse functions were updated to convert encountered date timestamps from UTC to local time (see https://github.com/seattleflu/id3c-customizations/commit/3205e0b2e5f24900e59548f30a3d63ef5d78af19) Now, in 2024, we are reingesting kp encounter metadata from 2018-2021. These encounters were processed and uploaded to id3c with UTC encounter dates. Since the encounter identifier depends on the encounter date, and since encounter identifiers should be the same each time a sample is uploaded to id3c, the encounter date should be in UTC for these kp samples. 
--- lib/seattleflu/id3c/cli/command/clinical.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 34f2eca6..3ed80e39 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -424,7 +424,14 @@ def parse_kp(kp_filename, kp_specimen_manifest_filename, manifest_format, output clinical_records = clinical_records[column_map.values()] # Convert dtypes - clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]).dt.tz_localize('America/Los_Angeles') + #clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]).dt.tz_localize('America/Los_Angeles') + # unlike other clinical parse functions, do not convert from UTC to local timezone + # this is because of a reingestion of kp 2018-2021 encounter metadata in 2024, in order to include ICD-10 codes + # timestamp conversion from UTC to local timezone only was added after kp 2018-2021 encounters were processed into id3c + # encounter identifiers are based on encounter date, so need to keep encounter date consistent with old + # records in order to avoid re-uploading the same encounter to id3c with a different encounter identifier than before + + clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]) # Insert static value columns clinical_records["site"] = "KP" From 493c44e88cd4a7485512f9e84b9ee83f6bde14db Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 22 Apr 2024 13:30:45 -0700 Subject: [PATCH 6/6] parse-kp2023: add extra check that symptom column is binary --- lib/seattleflu/id3c/cli/command/clinical.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 3ed80e39..7a599bcb 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ 
b/lib/seattleflu/id3c/cli/command/clinical.py @@ -430,7 +430,7 @@ def parse_kp(kp_filename, kp_specimen_manifest_filename, manifest_format, output # timestamp conversion from UTC to local timezone only was added after kp 2018-2021 encounters were processed into id3c # encounter identifiers are based on encounter date, so need to keep encounter date consistent with old # records in order to avoid re-uploading the same encounter to id3c with a different encounter identifier than before - + clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]) # Insert static value columns @@ -973,6 +973,14 @@ def parse_kp2023(kp2023_filename: str) -> None: if not column_set_is_binary(clinical_records, 'race_'): raise UnexpectedNumeric(f'One or more columns with prefix "race_" have values other than 0/1/None.\ These columns are expected to be binary.') + # since symptoms column could still contain numeric values after call to convert_column_set_to_binary + # if there were any negative values present, + # check that symptoms column only contains binary or None values + if not column_set_is_binary(clinical_records, 'symptom_'): + raise UnexpectedNumeric(f'One or more columns with prefix "symptom_" have values other than 0/1/None\ + after attempted conversion from numeric to binary. \ + Check for negative values present in input symptom columns.') + # sex column is binary, but the map function that we use below # will automatically convert non-0/1 values to None, # so don't need to check that here @@ -1096,7 +1104,8 @@ def convert_column_set_to_binary(df: pd.DataFrame, prefix: str) -> pd.DataFrame: Given a DataFrame *df* of clinical records and a string *prefix* with a prefix denoting which columns to convert, returns a DataFrame where columns whose names begin with *prefix* contain only values 0, 1, or None. - Any value other than 1 in the input column is converted to None. + Any value greater than 1 in the input column is converted to None.
+ Assumes no negative values as input. See KP2023 data dictionary for details. """ cols = [c for c in df.columns if c.startswith(prefix)]