From a0f8dba14c1776348303b0596f835c8a3dba77db Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 25 Apr 2023 15:55:43 -0600 Subject: [PATCH 01/24] begin transform refactor - split off utils and biodomains --- .vscode/settings.json | 2 + .../etl/{ => transform}/transform.py | 293 +----------------- .../transform/transform_genes_biodomains.py | 75 +++++ src/agoradatatools/etl/transform/utils.py | 197 ++++++++++++ src/agoradatatools/process.py | 3 +- tests/test_process.py | 4 +- tests/test_transform.py | 49 +-- 7 files changed, 313 insertions(+), 310 deletions(-) create mode 100644 .vscode/settings.json rename src/agoradatatools/etl/{ => transform}/transform.py (53%) create mode 100644 src/agoradatatools/etl/transform/transform_genes_biodomains.py create mode 100644 src/agoradatatools/etl/transform/utils.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..7a73a41b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,2 @@ +{ +} \ No newline at end of file diff --git a/src/agoradatatools/etl/transform.py b/src/agoradatatools/etl/transform/transform.py similarity index 53% rename from src/agoradatatools/etl/transform.py rename to src/agoradatatools/etl/transform/transform.py index 6cc27601..c9fbced0 100644 --- a/src/agoradatatools/etl/transform.py +++ b/src/agoradatatools/etl/transform/transform.py @@ -1,280 +1,10 @@ import numpy as np import pandas as pd - -def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame: - """Takes in a dataframe replaces problematic characters in column names - and makes column names all lowercase characters - - Args: - df (pd.DataFrame): DataFrame with columns to be standardized - - Returns: - pd.DataFrame: New dataframe with cleaned column names - """ - - df.columns = df.columns.str.replace( - "[#@&*^?()%$#!/]", "", regex=True - ) # the commas were unnessesary and were breaking the prelacement of '-' characters - df.columns = df.columns.str.replace("[ -.]", "_", regex=True) - df.columns = map(str.lower, df.columns) - - return df - - -def standardize_values(df: pd.DataFrame) -> pd.DataFrame: - """Finds non-compliant values and corrects them - *if more data cleaning options need to be added to this, - this needs to be refactored to another function - - Args: - df (pd.DataFrame): DataFrame with values to be standardized - - Returns: - pd.DataFrame: Resulting DataFrame with standardized values - """ - try: - df.replace(["n/a", "N/A", "n/A", "N/a"], np.nan, regex=True, inplace=True) - except TypeError: # I could not get this to trigger without mocking replace - print("Error comparing types.") - - return df - - -def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame: - """Takes in a dataframe and renames columns according to the mapping provided - - Args: - df (pd.DataFrame): DataFrame with columns to be renamed - column_map (dict): Dictionary mapping original column names to new columns - - Returns: - pd.DataFrame: DataFrame with new columns names - """ - try: - df.rename(columns=column_map, inplace=True) - except TypeError: - print("Column mapping must be a dictionary") - return df - - return df - - """ - This will create a dictionary object with the result of the grouping provided - :param df: a dataframe - :param grouping: a string containing the column to group by - :param new_column: a string with the name of the new column that will contain - the nested field - :param drop_columns: a list of column names to drop (remove) from the - nested dictionary. 
Optional argument, defaults to empty list. - :return: a dataframe - """ - - -def nest_fields( - df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] -) -> pd.DataFrame: - """Collapses the provided DataFrame into 2 columns: - 1. The grouping column - 2. A column containing a nested dictionary with the data from the rest of the DataFrame - - Args: - df (pd.DataFrame): DataFrame to be collapsed - grouping (str): The column that you want to group by - new_column (str): the new column created to contain the nested dictionaries created - drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. - - Returns: - pd.DataFrame: New 2 column DataFrame with group and nested dictionaries - """ - return ( - df.groupby(grouping) - .apply( - lambda row: row.replace({np.nan: None}) - .drop(columns=drop_columns) - .to_dict("records") - ) - .reset_index() - .rename(columns={0: new_column}) - ) - - -def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict: - if is_scored: - df = df[df[is_scored] == "Y"] # df does not have the isscored - else: - df = df[df.isin(["Y"]).any(axis=1)] - - if df[col].dtype == object: - df = df.copy() # Necessary to prevent SettingWithCopy warning - df[col] = df[col].astype(float) - - obj = {} - - """ - In order to smooth out the bins and make sure the entire range from 0 - to the theoretical maximum value has been found, we create a copy of the - column with both 0 and that maximum value added to it. We use the copy to calculate - distributions and bins, and subtract the values at the end - """ - distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True) - - obj["distribution"] = list( - pd.cut( - distribution, bins=10, precision=3, include_lowest=True, right=True - ).value_counts(sort=False) - ) - obj["distribution"][0] -= 1 # since this was calculated with the artificial 0 value, we subtract it - obj["distribution"][-1] -= 1 # since this was calculated with the artificial upper_bound, we subtract it - - discard, obj["bins"] = list( - pd.cut(distribution, bins=10, precision=3, retbins=True) - ) - obj["bins"] = np.around(obj["bins"].tolist()[1:], 2) - base = [0, *obj["bins"][:-1]] - obj["bins"] = zip(base, obj["bins"]) - obj["bins"] = list(obj["bins"]) - - obj["min"] = np.around(df[col].min(), 4) - obj["max"] = np.around(df[col].max(), 4) - obj["mean"] = np.around(df[col].mean(), 4) - obj["first_quartile"] = np.around( - df[col].quantile(q=0.25, interpolation="midpoint") - ) - obj["third_quartile"] = np.around( - df[col].quantile(q=0.75, interpolation="midpoint") - ) - - return obj - - -def count_grouped_total(df: pd.DataFrame, - grouping: [str, list], - input_colname: str, - output_colname: str) -> pd.DataFrame: - """For each unique item/combination in the column(s) specified by grouping, - counts the number of unique items in the column [input_colname] that - correspond to that grouping. The calculated counts are put in a new - column and named with [output_colname]. - Args: - df (pd.DataFrame): contains columns listed in grouping and - input_colname. May contain other columns as well, but - these will be dropped from the returned data frame. 
- grouping (str or list): a string with a single column name, or a list of - strings for multiple column names - input_colname (str): the name of the column to count - output_colname (str): the name of the new column with calculated counts - Returns: - pd.DataFrame: a data frame containing the grouping column(s) and a - new column for output_colname, which contains the count of - unique items in input_colname for each grouping item. - """ - df = ( - df.groupby(grouping)[input_colname] - .nunique().reset_index() - .rename(columns={input_colname: output_colname}) - ) - return df - - -def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: - """Takes dictionary of dataset DataFrames, extracts the genes_biodomains - DataFrame, calculates some metrics on GO terms per gene / biodomain, and - performs nest_fields on the final DataFrame. This results in a 2 column - DataFrame grouped by "ensembl_gene_id" and includes a collapsed nested - dictionary field "gene_biodomains" - - Args: - datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame - - Returns: - pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including - a collapsed nested dictionary field "gene_biodomains" - """ - genes_biodomains = datasets["genes_biodomains"] - interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"] - genes_biodomains = genes_biodomains[interesting_columns].dropna() - - # Count the number of go_terms associated with each biodomain - n_biodomain_terms = count_grouped_total(genes_biodomains, - "biodomain", - "go_terms", - "n_biodomain_terms") - - # Count the number of go_terms associated with each gene, ignoring biodomain - n_gene_total_terms = count_grouped_total(genes_biodomains, - "ensembl_gene_id", - "go_terms", - "n_gene_total_terms") - - # Count the number of go_terms associated with each gene / biodomain combo - n_gene_biodomain_terms = count_grouped_total(genes_biodomains, - ["ensembl_gene_id", "biodomain"], - "go_terms", - "n_gene_biodomain_terms") - - # Group rows by ensg and biodomain to produce nested lists of go_terms per ensg/biodomain - genes_biodomains = ( - genes_biodomains.groupby(["ensembl_gene_id", "biodomain"])["go_terms"] - .apply(list) - .reset_index() - ) - - # Merge all the different count metrics into the main data frame so each - # ensembl_gene_id / biodomain combo has an entry for each count - genes_biodomains = ( - genes_biodomains.merge(n_gene_total_terms, on="ensembl_gene_id", how="left") - .merge(n_biodomain_terms, on="biodomain", how="left") - .merge(n_gene_biodomain_terms, on=["ensembl_gene_id", "biodomain"], how="left") - ) - - # Calculate percent linking terms: - # n_gene_biodomain_terms / n_gene_total_terms * 100 - genes_biodomains["pct_linking_terms"] = ( - (genes_biodomains["n_gene_biodomain_terms"] / - genes_biodomains["n_gene_total_terms"] * 100) - .round(decimals=2) - ) - - # Remove n_gene_total_terms column - genes_biodomains = genes_biodomains.drop(columns="n_gene_total_terms") - - genes_biodomains = nest_fields( - df=genes_biodomains, - grouping="ensembl_gene_id", - new_column="gene_biodomains", - drop_columns="ensembl_gene_id", - ) - - return genes_biodomains - - -def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame: - interesting_columns = [ - "ensg", - "hgnc_gene_id", - "overall", - "geneticsscore", - "omicsscore", - "literaturescore", - ] - - # create mapping to deal with missing values as they take different shape across the fields - scored = ["isscored_genetics", "isscored_omics", "isscored_lit"] 
- mapping = dict(zip(interesting_columns[3:], scored)) - - for field, is_scored in mapping.items(): - df.loc[lambda row: row[is_scored] == "N", field] = np.nan - - # LiteratureScore is a string in the source file, so convert to numeric - df["literaturescore"] = pd.to_numeric(df["literaturescore"]) - - # Remove identical rows (see AG-826) - return df[interesting_columns].drop_duplicates() - - -def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str): - return pd.merge(left=left, right=right, how=how, on=on) +from agoradatatools.etl.transform.utils import * +from agoradatatools.etl.transform.transform_genes_biodomains import ( + transform_genes_biodomains, +) def transform_team_info(datasets: dict): @@ -440,15 +170,13 @@ def transform_gene_info( gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1 gene_info["rna_in_ad_brain_change"] = ( - (gene_info["adj_p_val"] <= adjusted_p_value_threshold) & - gene_info["rna_brain_change_studied"] - ) + gene_info["adj_p_val"] <= adjusted_p_value_threshold + ) & gene_info["rna_brain_change_studied"] gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1 gene_info["protein_in_ad_brain_change"] = ( - (gene_info["cor_pval"] <= protein_level_threshold) & - gene_info["protein_brain_change_studied"] - ) + gene_info["cor_pval"] <= protein_level_threshold + ) & gene_info["protein_brain_change_studied"] # create 'nominations' field gene_info["nominations"] = gene_info.apply( @@ -492,7 +220,6 @@ def transform_distribution_data( omics_max_score, lit_max_score, ): - overall_scores = datasets["overall_scores"] interesting_columns = [ "ensg", @@ -544,7 +271,7 @@ def transform_distribution_data( def transform_rna_distribution_data(datasets: dict): # "datasets" contains the unprocessed RNA-seq data, which needs to go - # through the same processing as before in order to use it here. + # through the same processing as before in order to use it here. rna_df = transform_rna_seq_data(datasets) rna_df = rna_df[["tissue", "model", "logfc"]] @@ -611,7 +338,6 @@ def transform_proteomics_distribution_data( def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame: - transformed = [] for name, dataset in datasets.items(): if name == "proteomics": @@ -631,7 +357,6 @@ def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame: def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): - if type(datasets) is not dict or type(dataset_name) is not str: return None diff --git a/src/agoradatatools/etl/transform/transform_genes_biodomains.py b/src/agoradatatools/etl/transform/transform_genes_biodomains.py new file mode 100644 index 00000000..b3149e3e --- /dev/null +++ b/src/agoradatatools/etl/transform/transform_genes_biodomains.py @@ -0,0 +1,75 @@ +import pandas as pd + +from agoradatatools.etl.transform.utils import * + + +def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: + """Takes dictionary of dataset DataFrames, extracts the genes_biodomains + DataFrame, calculates some metrics on GO terms per gene / biodomain, and + performs nest_fields on the final DataFrame. 
This results in a 2 column
+    DataFrame grouped by "ensembl_gene_id" and includes a collapsed nested
+    dictionary field "gene_biodomains"
+
+    Args:
+        datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame
+
+    Returns:
+        pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including
+        a collapsed nested dictionary field "gene_biodomains"
+    """
+    genes_biodomains = datasets["genes_biodomains"]
+    interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"]
+    genes_biodomains = genes_biodomains[interesting_columns].dropna()
+
+    # Count the number of go_terms associated with each biodomain
+    n_biodomain_terms = count_grouped_total(
+        genes_biodomains, "biodomain", "go_terms", "n_biodomain_terms"
+    )
+
+    # Count the number of go_terms associated with each gene, ignoring biodomain
+    n_gene_total_terms = count_grouped_total(
+        genes_biodomains, "ensembl_gene_id", "go_terms", "n_gene_total_terms"
+    )
+
+    # Count the number of go_terms associated with each gene / biodomain combo
+    n_gene_biodomain_terms = count_grouped_total(
+        genes_biodomains,
+        ["ensembl_gene_id", "biodomain"],
+        "go_terms",
+        "n_gene_biodomain_terms",
+    )
+
+    # Group rows by ensg and biodomain to produce nested lists of go_terms per ensg/biodomain
+    genes_biodomains = (
+        genes_biodomains.groupby(["ensembl_gene_id", "biodomain"])["go_terms"]
+        .apply(list)
+        .reset_index()
+    )
+
+    # Merge all the different count metrics into the main data frame so each
+    # ensembl_gene_id / biodomain combo has an entry for each count
+    genes_biodomains = (
+        genes_biodomains.merge(n_gene_total_terms, on="ensembl_gene_id", how="left")
+        .merge(n_biodomain_terms, on="biodomain", how="left")
+        .merge(n_gene_biodomain_terms, on=["ensembl_gene_id", "biodomain"], how="left")
+    )
+
+    # Calculate percent linking terms:
+    # n_gene_biodomain_terms / n_gene_total_terms * 100
+    genes_biodomains["pct_linking_terms"] = (
+        genes_biodomains["n_gene_biodomain_terms"]
+        / genes_biodomains["n_gene_total_terms"]
+        * 100
+    ).round(decimals=2)
+
+    # Remove n_gene_total_terms column
+    genes_biodomains = genes_biodomains.drop(columns="n_gene_total_terms")
+
+    genes_biodomains = nest_fields(
+        df=genes_biodomains,
+        grouping="ensembl_gene_id",
+        new_column="gene_biodomains",
+        drop_columns="ensembl_gene_id",
+    )
+
+    return genes_biodomains
diff --git a/src/agoradatatools/etl/transform/utils.py b/src/agoradatatools/etl/transform/utils.py
new file mode 100644
index 00000000..813ded69
--- /dev/null
+++ b/src/agoradatatools/etl/transform/utils.py
@@ -0,0 +1,197 @@
+import numpy as np
+import pandas as pd
+
+
+def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame:
+    """Takes in a dataframe, replaces problematic characters in column names,
+    and makes all column names lowercase
+
+    Args:
+        df (pd.DataFrame): DataFrame with columns to be standardized
+
+    Returns:
+        pd.DataFrame: New dataframe with cleaned column names
+    """
+
+    df.columns = df.columns.str.replace(
+        "[#@&*^?()%$#!/]", "", regex=True
+    )  # the commas were unnecessary and were breaking the replacement of '-' characters
+    df.columns = df.columns.str.replace("[ -.]", "_", regex=True)
+    df.columns = map(str.lower, df.columns)
+
+    return df
+
+
+def standardize_values(df: pd.DataFrame) -> pd.DataFrame:
+    """Finds non-compliant values and corrects them
+    *if more data cleaning options need to be added to this,
+    this needs to be refactored to another function
+
+    Args:
+        df (pd.DataFrame): DataFrame with values to be standardized
+
+    Returns:
+        
pd.DataFrame: Resulting DataFrame with standardized values + """ + try: + df.replace(["n/a", "N/A", "n/A", "N/a"], np.nan, regex=True, inplace=True) + except TypeError: # I could not get this to trigger without mocking replace + print("Error comparing types.") + + return df + + +def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame: + """Takes in a dataframe and renames columns according to the mapping provided + + Args: + df (pd.DataFrame): DataFrame with columns to be renamed + column_map (dict): Dictionary mapping original column names to new columns + + Returns: + pd.DataFrame: DataFrame with new columns names + """ + try: + df.rename(columns=column_map, inplace=True) + except TypeError: + print("Column mapping must be a dictionary") + return df + + return df + + +def nest_fields( + df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] +) -> pd.DataFrame: + """Collapses the provided DataFrame into 2 columns: + 1. The grouping column + 2. A column containing a nested dictionary with the data from the rest of the DataFrame + + Args: + df (pd.DataFrame): DataFrame to be collapsed + grouping (str): The column that you want to group by + new_column (str): the new column created to contain the nested dictionaries created + drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. + + Returns: + pd.DataFrame: New 2 column DataFrame with group and nested dictionaries + """ + return ( + df.groupby(grouping) + .apply( + lambda row: row.replace({np.nan: None}) + .drop(columns=drop_columns) + .to_dict("records") + ) + .reset_index() + .rename(columns={0: new_column}) + ) + + +def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict: + if is_scored: + df = df[df[is_scored] == "Y"] # df does not have the isscored + else: + df = df[df.isin(["Y"]).any(axis=1)] + + if df[col].dtype == object: + df = df.copy() # Necessary to prevent SettingWithCopy warning + df[col] = df[col].astype(float) + + obj = {} + + # In order to smooth out the bins and make sure the entire range from 0 + # to the theoretical maximum value has been found, we create a copy of the + # column with both 0 and that maximum value added to it. 
We use the copy to calculate + # distributions and bins, and subtract the values at the end + + distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True) + + obj["distribution"] = list( + pd.cut( + distribution, bins=10, precision=3, include_lowest=True, right=True + ).value_counts(sort=False) + ) + obj["distribution"][ + 0 + ] -= 1 # since this was calculated with the artificial 0 value, we subtract it + obj["distribution"][ + -1 + ] -= 1 # since this was calculated with the artificial upper_bound, we subtract it + + discard, obj["bins"] = list( + pd.cut(distribution, bins=10, precision=3, retbins=True) + ) + obj["bins"] = np.around(obj["bins"].tolist()[1:], 2) + base = [0, *obj["bins"][:-1]] + obj["bins"] = zip(base, obj["bins"]) + obj["bins"] = list(obj["bins"]) + + obj["min"] = np.around(df[col].min(), 4) + obj["max"] = np.around(df[col].max(), 4) + obj["mean"] = np.around(df[col].mean(), 4) + obj["first_quartile"] = np.around( + df[col].quantile(q=0.25, interpolation="midpoint") + ) + obj["third_quartile"] = np.around( + df[col].quantile(q=0.75, interpolation="midpoint") + ) + + return obj + + +def count_grouped_total( + df: pd.DataFrame, grouping: [str, list], input_colname: str, output_colname: str +) -> pd.DataFrame: + """For each unique item/combination in the column(s) specified by grouping, + counts the number of unique items in the column [input_colname] that + correspond to that grouping. The calculated counts are put in a new + column and named with [output_colname]. + Args: + df (pd.DataFrame): contains columns listed in grouping and + input_colname. May contain other columns as well, but + these will be dropped from the returned data frame. + grouping (str or list): a string with a single column name, or a list of + strings for multiple column names + input_colname (str): the name of the column to count + output_colname (str): the name of the new column with calculated counts + Returns: + pd.DataFrame: a data frame containing the grouping column(s) and a + new column for output_colname, which contains the count of + unique items in input_colname for each grouping item. 
+ """ + df = ( + df.groupby(grouping)[input_colname] + .nunique() + .reset_index() + .rename(columns={input_colname: output_colname}) + ) + return df + + +def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame: + interesting_columns = [ + "ensg", + "hgnc_gene_id", + "overall", + "geneticsscore", + "omicsscore", + "literaturescore", + ] + + # create mapping to deal with missing values as they take different shape across the fields + scored = ["isscored_genetics", "isscored_omics", "isscored_lit"] + mapping = dict(zip(interesting_columns[3:], scored)) + + for field, is_scored in mapping.items(): + df.loc[lambda row: row[is_scored] == "N", field] = np.nan + + # LiteratureScore is a string in the source file, so convert to numeric + df["literaturescore"] = pd.to_numeric(df["literaturescore"]) + + # Remove identical rows (see AG-826) + return df[interesting_columns].drop_duplicates() + + +def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str): + return pd.merge(left=left, right=right, how=how, on=on) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index ce837a18..a63f1e98 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -6,7 +6,8 @@ import agoradatatools.etl.extract as extract import agoradatatools.etl.load as load -import agoradatatools.etl.transform as transform +import agoradatatools.etl.transform.transform as transform +from agoradatatools.etl.transform.utils import * import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError diff --git a/tests/test_process.py b/tests/test_process.py index 96bc9c60..eabd78c9 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -1,12 +1,12 @@ -import argparse from unittest import mock from unittest.mock import patch import pandas as pd +from agoradatatools.etl.transform import transform import pytest from agoradatatools import process -from agoradatatools.etl import extract, load, transform, utils +from agoradatatools.etl import extract, load, utils from agoradatatools.errors import ADTDataProcessingError diff --git a/tests/test_transform.py b/tests/test_transform.py index e0360bcc..21d0f9ea 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from agoradatatools.etl import transform +from agoradatatools.etl.transform import utils def test_standardize_column_names(): @@ -30,7 +30,7 @@ def test_standardize_column_names(): "AAA": ["test_value"], } ) - standard_df = transform.standardize_column_names(df=df) + standard_df = utils.standardize_column_names(df=df) assert list(standard_df.columns) == [ "a", "b", @@ -63,7 +63,7 @@ class TestStandardizeValues: ) def test_standardize_values_success(self): - standard_df = transform.standardize_values(df=self.df.copy()) + standard_df = utils.standardize_values(df=self.df.copy()) for value in standard_df.iloc[0].tolist(): assert np.isnan(value) @@ -72,7 +72,7 @@ def test_standardize_values_TypeError(self): patch_replace.side_effect = TypeError captured_output = StringIO() sys.stdout = captured_output - standard_df = transform.standardize_values(df=self.df.copy()) + standard_df = utils.standardize_values(df=self.df.copy()) assert "Error comparing types." 
in captured_output.getvalue() assert standard_df.equals(self.df) @@ -90,7 +90,7 @@ class TestRenameColumns: bad_column_map = [] def test_rename_columns_success(self): - renamed_df = transform.rename_columns( + renamed_df = utils.rename_columns( df=self.df.copy(), column_map=self.good_column_map ) assert list(renamed_df.columns) == list(self.good_column_map.values()) @@ -98,7 +98,7 @@ def test_rename_columns_success(self): def test_rename_columns_TypeError(self): captured_output = StringIO() sys.stdout = captured_output - bad_renamed_df = transform.rename_columns( + bad_renamed_df = utils.rename_columns( df=self.df.copy(), column_map=self.bad_column_map ) assert "Column mapping must be a dictionary" in captured_output.getvalue() @@ -120,33 +120,35 @@ def test_nest_fields(): [{"a": "group_3", "b": "1", "c": "1"}, {"a": "group_3", "b": "1", "c": "1"}], ] - nested_df = transform.nest_fields( + nested_df = utils.nest_fields( df=df, grouping="a", new_column="e", drop_columns=["d"] ) assert list(nested_df["e"]) == expected_column_e -class TestCountGroupedTotal(): +class TestCountGroupedTotal: df = pd.DataFrame( { "col_1": ["a", "a", "a", "b", "c", "c", "c"], # 3 'Ensembl IDs' "col_2": ["x", "y", "z", "x", "y", "z", "z"], # 3 'biodomains' "col_3": ["1", "1", "2", "3", "2", "1", "3"], # 3 'go_terms' - "col_4": ["m", "m", "n", "n", "o", "o", "o"] # An extra column that should get ignored + "col_4": [ + "m", + "m", + "n", + "n", + "o", + "o", + "o", + ], # An extra column that should get ignored } ) # How many unique "col_2"'s per unique "col_1" value? def test_count_grouped_total_one_group(self): - expected_df = pd.DataFrame( - { - "col_1": ["a", "b", "c"], - "output": [3, 1, 2] - } - ) - counted = transform.count_grouped_total( - df=self.df, grouping="col_1", - input_colname="col_2", output_colname="output" + expected_df = pd.DataFrame({"col_1": ["a", "b", "c"], "output": [3, 1, 2]}) + counted = utils.count_grouped_total( + df=self.df, grouping="col_1", input_colname="col_2", output_colname="output" ) assert counted.equals(expected_df) @@ -156,18 +158,19 @@ def test_count_grouped_total_two_groups(self): { "col_1": ["a", "a", "a", "b", "c", "c"], "col_2": ["x", "y", "z", "x", "y", "z"], - "output": [1, 1, 1, 1, 1, 2] + "output": [1, 1, 1, 1, 1, 2], } ) - counted = transform.count_grouped_total( - df=self.df, grouping=["col_1", "col_2"], - input_colname="col_3", output_colname="output" + counted = utils.count_grouped_total( + df=self.df, + grouping=["col_1", "col_2"], + input_colname="col_3", + output_colname="output", ) assert counted.equals(expected_df) - # def test_transform_biodomains(): # test_datasets = { # "biodomains": pd.DataFrame( From 03fe225d64e29da0fc8191816bd098f780835d59 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 25 Apr 2023 16:05:15 -0600 Subject: [PATCH 02/24] remove settings.json --- .vscode/settings.json | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 7a73a41b..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,2 +0,0 @@ -{ -} \ No newline at end of file From 2d676fcbe0792e88ff2e7d6bb664c6df2de612d8 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Mon, 1 May 2023 14:46:42 -0600 Subject: [PATCH 03/24] fixes list formatting --- tests/test_transform.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/test_transform.py b/tests/test_transform.py index 21d0f9ea..cbebdf1f 100644 --- a/tests/test_transform.py +++ 
b/tests/test_transform.py
@@ -132,15 +132,7 @@ class TestCountGroupedTotal:
             "col_1": ["a", "a", "a", "b", "c", "c", "c"],  # 3 'Ensembl IDs'
             "col_2": ["x", "y", "z", "x", "y", "z", "z"],  # 3 'biodomains'
             "col_3": ["1", "1", "2", "3", "2", "1", "3"],  # 3 'go_terms'
-            "col_4": [
-                "m",
-                "m",
-                "n",
-                "n",
-                "o",
-                "o",
-                "o",
-            ],  # An extra column that should get ignored
+            "col_4": ["m", "m", "n", "n", "o", "o", "o"],  # An ignored column
         }
     )

From b75dfbdd1807d94c63e0ed9bc1061d35215bde35 Mon Sep 17 00:00:00 2001
From: Brad Macdonald
Date: Tue, 2 May 2023 10:09:01 -0600
Subject: [PATCH 04/24] splits up transforms

---
 .../etl/transform/apply_transform.py          |  57 +++
 .../etl/transform/distribution_data.py        | 110 +++++
 src/agoradatatools/etl/transform/gene_info.py | 177 ++++++++
 ...enes_biodomains.py => genes_biodomains.py} |  34 +-
 .../etl/transform/overall_scores.py           |  26 ++
 .../etl/transform/proteomics_distribution.py  |  62 +++
 .../etl/transform/rna_distribution.py         |  71 ++++
 src/agoradatatools/etl/transform/team_info.py |  24 ++
 src/agoradatatools/etl/transform/transform.py | 399 ------------------
 src/agoradatatools/etl/transform/utils.py     | 137 ------
 src/agoradatatools/process.py                 |  12 +-
 tests/test_process.py                         |  10 +-
 12 files changed, 571 insertions(+), 548 deletions(-)
 create mode 100644 src/agoradatatools/etl/transform/apply_transform.py
 create mode 100644 src/agoradatatools/etl/transform/distribution_data.py
 create mode 100644 src/agoradatatools/etl/transform/gene_info.py
 rename src/agoradatatools/etl/transform/{transform_genes_biodomains.py => genes_biodomains.py} (67%)
 create mode 100644 src/agoradatatools/etl/transform/overall_scores.py
 create mode 100644 src/agoradatatools/etl/transform/proteomics_distribution.py
 create mode 100644 src/agoradatatools/etl/transform/rna_distribution.py
 create mode 100644 src/agoradatatools/etl/transform/team_info.py
 delete mode 100644 src/agoradatatools/etl/transform/transform.py

diff --git a/src/agoradatatools/etl/transform/apply_transform.py b/src/agoradatatools/etl/transform/apply_transform.py
new file mode 100644
index 00000000..bf3549b5
--- /dev/null
+++ b/src/agoradatatools/etl/transform/apply_transform.py
@@ -0,0 +1,57 @@
+from agoradatatools.etl.transform.distribution_data import transform_distribution_data
+from agoradatatools.etl.transform.gene_info import transform_gene_info
+from agoradatatools.etl.transform.genes_biodomains import (
+    transform_genes_biodomains,
+)
+from agoradatatools.etl.transform.overall_scores import transform_overall_scores
+from agoradatatools.etl.transform.proteomics_distribution import (
+    create_proteomics_distribution_data,
+)
+from agoradatatools.etl.transform.rna_distribution import (
+    transform_rna_distribution_data,
+    transform_rna_seq_data,
+)
+from agoradatatools.etl.transform.team_info import transform_team_info
+
+
+def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):
+    if type(datasets) is not dict or type(dataset_name) is not str:
+        return None
+
+    elif dataset_name == "genes_biodomains":
+        return transform_genes_biodomains(datasets=datasets)
+    elif dataset_name == "overall_scores":
+        df = datasets["overall_scores"]
+        return transform_overall_scores(df=df)
+    elif dataset_name == "distribution_data":
+        return transform_distribution_data(
+            datasets=datasets,
+            overall_max_score=dataset_obj["custom_transformations"][
+                "overall_max_score"
+            ],
+            genetics_max_score=dataset_obj["custom_transformations"][
+                "genetics_max_score"
+            ],
+            omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"],
+            lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"],
+        )
+    elif dataset_name == "team_info":
+        return transform_team_info(datasets=datasets)
+    elif dataset_name == "rnaseq_differential_expression":
+        return transform_rna_seq_data(datasets=datasets)
+    elif dataset_name == "gene_info":
+        return transform_gene_info(
+            datasets=datasets,
+            adjusted_p_value_threshold=dataset_obj["custom_transformations"][
+                "adjusted_p_value_threshold"
+            ],
+            protein_level_threshold=dataset_obj["custom_transformations"][
+                "protein_level_threshold"
+            ],
+        )
+    elif dataset_name == "rna_distribution_data":
+        return transform_rna_distribution_data(datasets=datasets)
+    elif dataset_name == "proteomics_distribution_data":
+        return create_proteomics_distribution_data(datasets=datasets)
+    else:
+        return None
diff --git a/src/agoradatatools/etl/transform/distribution_data.py b/src/agoradatatools/etl/transform/distribution_data.py
new file mode 100644
index 00000000..568e1e6e
--- /dev/null
+++ b/src/agoradatatools/etl/transform/distribution_data.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pandas as pd
+
+
+def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict:
+    if is_scored:
+        df = df[df[is_scored] == "Y"]  # df does not have the isscored
+    else:
+        df = df[df.isin(["Y"]).any(axis=1)]
+
+    if df[col].dtype == object:
+        df = df.copy()  # Necessary to prevent SettingWithCopy warning
+        df[col] = df[col].astype(float)
+
+    obj = {}
+
+    # In order to smooth out the bins and make sure the entire range from 0
+    # to the theoretical maximum value has been found, we create a copy of the
+    # column with both 0 and that maximum value added to it. We use the copy to calculate
+    # distributions and bins, and subtract the values at the end
+
+    distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True)
+
+    obj["distribution"] = list(
+        pd.cut(
+            distribution, bins=10, precision=3, include_lowest=True, right=True
+        ).value_counts(sort=False)
+    )
+    obj["distribution"][
+        0
+    ] -= 1  # since this was calculated with the artificial 0 value, we subtract it
+    obj["distribution"][
+        -1
+    ] -= 1  # since this was calculated with the artificial upper_bound, we subtract it
+
+    discard, obj["bins"] = list(
+        pd.cut(distribution, bins=10, precision=3, retbins=True)
+    )
+    obj["bins"] = np.around(obj["bins"].tolist()[1:], 2)
+    base = [0, *obj["bins"][:-1]]
+    obj["bins"] = zip(base, obj["bins"])
+    obj["bins"] = list(obj["bins"])
+
+    obj["min"] = np.around(df[col].min(), 4)
+    obj["max"] = np.around(df[col].max(), 4)
+    obj["mean"] = np.around(df[col].mean(), 4)
+    obj["first_quartile"] = np.around(
+        df[col].quantile(q=0.25, interpolation="midpoint")
+    )
+    obj["third_quartile"] = np.around(
+        df[col].quantile(q=0.75, interpolation="midpoint")
+    )
+
+    return obj
+
+
+def transform_distribution_data(
+    datasets: dict,
+    overall_max_score,
+    genetics_max_score,
+    omics_max_score,
+    lit_max_score,
+):
+    overall_scores = datasets["overall_scores"]
+    interesting_columns = [
+        "ensg",
+        "overall",
+        "geneticsscore",
+        "omicsscore",
+        "literaturescore",
+    ]
+
+    # create mapping to deal with missing values as they take different shape across the fields
+    scored = ["isscored_genetics", "isscored_omics", "isscored_lit"]
+    mapping = dict(zip(interesting_columns[2:], scored))
+    mapping["overall"] = None
+
+    # create mapping for max score values from config
+    max_score = dict(
+        zip(
+            interesting_columns[1:],
+            [overall_max_score, genetics_max_score, omics_max_score, lit_max_score],
+        )
+    )
+
+    overall_scores = overall_scores[interesting_columns + scored]
+
+    neo_matrix = {}
+    for col in interesting_columns[1:]:  # excludes the ENSG
+        neo_matrix[col] = calculate_distribution(
+            overall_scores, col, mapping[col], max_score[col]
+        )
+
+    neo_matrix["target_risk_score"] = neo_matrix.pop("overall")
+    neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore")
+    neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore")
+    
neo_matrix["literature_score"] = neo_matrix.pop("literaturescore") + + additional_data = [ + {"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"}, + {"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"}, + {"name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070"}, + {"name": "Literature Score", "syn_id": "syn25913473", "wiki_id": "613105"}, + ] + for col, additional in zip(neo_matrix.keys(), additional_data): + neo_matrix[col]["name"] = additional["name"] + neo_matrix[col]["syn_id"] = additional["syn_id"] + neo_matrix[col]["wiki_id"] = additional["wiki_id"] + + return neo_matrix diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py new file mode 100644 index 00000000..8832d10c --- /dev/null +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -0,0 +1,177 @@ +import pandas as pd +import numpy as np + + +def nest_fields( + df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] +) -> pd.DataFrame: + """Collapses the provided DataFrame into 2 columns: + 1. The grouping column + 2. A column containing a nested dictionary with the data from the rest of the DataFrame + + Args: + df (pd.DataFrame): DataFrame to be collapsed + grouping (str): The column that you want to group by + new_column (str): the new column created to contain the nested dictionaries created + drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. + + Returns: + pd.DataFrame: New 2 column DataFrame with group and nested dictionaries + """ + return ( + df.groupby(grouping) + .apply( + lambda row: row.replace({np.nan: None}) + .drop(columns=drop_columns) + .to_dict("records") + ) + .reset_index() + .rename(columns={0: new_column}) + ) + + +def transform_gene_info( + datasets: dict, adjusted_p_value_threshold, protein_level_threshold +): + """ + This function will perform transformations and incrementally create a dataset called gene_info. + Each dataset will be left_joined onto gene_info, starting with gene_metadata. + """ + gene_metadata = datasets["gene_metadata"] + igap = datasets["igap"] + eqtl = datasets["eqtl"] + proteomics = datasets["proteomics"] + rna_change = datasets["rna_expression_change"] + proteomics_tmt = datasets["agora_proteomics_tmt"] + target_list = datasets["target_list"] + median_expression = datasets["median_expression"] + druggability = datasets["druggability"] + + # Modify the data before merging + + # All genes in this list should have 'is_igap' = True when added to gene_info. + # Creating the column here automatically adds the column in to gene_info + # during merge, with True values correctly populated. 
+ igap["is_igap"] = True + + # Get the smallest adj_p_val for each gene, to determine significance + rna_change = ( + rna_change.groupby("ensembl_gene_id")["adj_p_val"].agg("min").reset_index() + ) + + # Get the smallest cor_pval for each protein, to determine significance + proteomics_concat = pd.concat([proteomics, proteomics_tmt]) + proteomics_concat = proteomics_concat.dropna( + subset=["log2_fc", "cor_pval", "ci_lwr", "ci_upr"] + ) + proteomics_concat = ( + proteomics_concat.groupby("ensembl_gene_id")["cor_pval"] + .agg("min") + .reset_index() + ) + + # these are the interesting columns of the druggability dataset + useful_columns = [ + "geneid", + "sm_druggability_bucket", + "safety_bucket", + "abability_bucket", + "pharos_class", + "classification", + "safety_bucket_definition", + "abability_bucket_definition", + ] + druggability = druggability[useful_columns] + + target_list = nest_fields( + df=target_list, grouping="ensembl_gene_id", new_column="nominated_target" + ) + + median_expression = nest_fields( + df=median_expression, grouping="ensembl_gene_id", new_column="median_expression" + ) + + druggability = nest_fields( + df=druggability, grouping="geneid", new_column="druggability" + ) + druggability.rename(columns={"geneid": "ensembl_gene_id"}, inplace=True) + + # Merge all the datasets + + gene_info = gene_metadata + + for dataset in [ + igap, + eqtl, + rna_change, + proteomics_concat, + target_list, + median_expression, + druggability, + ]: + gene_info = pd.merge( + left=gene_info, + right=dataset, + on="ensembl_gene_id", + how="outer", + validate="one_to_one", + ) + + # Populate values for rows that didn't exist in the individual datasets + + gene_info.fillna( + {"is_igap": False, "has_eqtl": False, "adj_p_val": -1, "cor_pval": -1}, + inplace=True, + ) + + # fillna doesn't work for creating an empty array, need this function instead + gene_info["alias"] = gene_info.apply( + lambda row: row["alias"] + if isinstance(row["alias"], np.ndarray) + else np.ndarray(0, dtype=object), + axis=1, + ) + + gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1 + gene_info["rna_in_ad_brain_change"] = ( + gene_info["adj_p_val"] <= adjusted_p_value_threshold + ) & gene_info["rna_brain_change_studied"] + + gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1 + gene_info["protein_in_ad_brain_change"] = ( + gene_info["cor_pval"] <= protein_level_threshold + ) & gene_info["protein_brain_change_studied"] + + # create 'nominations' field + gene_info["nominations"] = gene_info.apply( + lambda row: len(row["nominated_target"]) + if isinstance(row["nominated_target"], list) + else np.NaN, + axis=1, + ) + + # Remove some extra columns that got added during merges + gene_info = gene_info[ + [ + "ensembl_gene_id", + "name", + "summary", + "symbol", + "alias", + "is_igap", + "has_eqtl", + "rna_in_ad_brain_change", + "rna_brain_change_studied", + "protein_in_ad_brain_change", + "protein_brain_change_studied", + "nominated_target", + "median_expression", + "druggability", + "nominations", + ] + ] + + # Make sure there are no N/A Ensembl IDs + gene_info = gene_info.dropna(subset=["ensembl_gene_id"]) + + return gene_info diff --git a/src/agoradatatools/etl/transform/transform_genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py similarity index 67% rename from src/agoradatatools/etl/transform/transform_genes_biodomains.py rename to src/agoradatatools/etl/transform/genes_biodomains.py index b3149e3e..693e2672 100644 --- 
a/src/agoradatatools/etl/transform/transform_genes_biodomains.py +++ b/src/agoradatatools/etl/transform/genes_biodomains.py @@ -1,6 +1,38 @@ import pandas as pd -from agoradatatools.etl.transform.utils import * +from typing import Union + + +def count_grouped_total( + df: pd.DataFrame, + grouping: Union[str, list], + input_colname: str, + output_colname: str, +) -> pd.DataFrame: + """For each unique item/combination in the column(s) specified by grouping, + counts the number of unique items in the column [input_colname] that + correspond to that grouping. The calculated counts are put in a new + column and named with [output_colname]. + Args: + df (pd.DataFrame): contains columns listed in grouping and + input_colname. May contain other columns as well, but + these will be dropped from the returned data frame. + grouping (str or list): a string with a single column name, or a list of + strings for multiple column names + input_colname (str): the name of the column to count + output_colname (str): the name of the new column with calculated counts + Returns: + pd.DataFrame: a data frame containing the grouping column(s) and a + new column for output_colname, which contains the count of + unique items in input_colname for each grouping item. + """ + df = ( + df.groupby(grouping)[input_colname] + .nunique() + .reset_index() + .rename(columns={input_colname: output_colname}) + ) + return df def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: diff --git a/src/agoradatatools/etl/transform/overall_scores.py b/src/agoradatatools/etl/transform/overall_scores.py new file mode 100644 index 00000000..c651078b --- /dev/null +++ b/src/agoradatatools/etl/transform/overall_scores.py @@ -0,0 +1,26 @@ +import pandas as pd +import numpy as np + + +def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame: + interesting_columns = [ + "ensg", + "hgnc_gene_id", + "overall", + "geneticsscore", + "omicsscore", + "literaturescore", + ] + + # create mapping to deal with missing values as they take different shape across the fields + scored = ["isscored_genetics", "isscored_omics", "isscored_lit"] + mapping = dict(zip(interesting_columns[3:], scored)) + + for field, is_scored in mapping.items(): + df.loc[lambda row: row[is_scored] == "N", field] = np.nan + + # LiteratureScore is a string in the source file, so convert to numeric + df["literaturescore"] = pd.to_numeric(df["literaturescore"]) + + # Remove identical rows (see AG-826) + return df[interesting_columns].drop_duplicates() diff --git a/src/agoradatatools/etl/transform/proteomics_distribution.py b/src/agoradatatools/etl/transform/proteomics_distribution.py new file mode 100644 index 00000000..067fbd75 --- /dev/null +++ b/src/agoradatatools/etl/transform/proteomics_distribution.py @@ -0,0 +1,62 @@ +import pandas as pd +import numpy as np + + +def transform_proteomics_distribution_data( + proteomics_df: pd.DataFrame, datatype: str +) -> pd.DataFrame: + """Transform proteomics data + Args: + proteomics_df (pd.DataFrame): Dataframe + datatype (str): Data Type + Returns: + pd.DataFrame: Transformed data + """ + proteomics_df = ( + proteomics_df.groupby(["tissue"]) + .agg("describe")["log2_fc"] + .reset_index()[["tissue", "min", "max", "25%", "50%", "75%"]] + ) + + proteomics_df.rename( + columns={"25%": "first_quartile", "50%": "median", "75%": "third_quartile"}, + inplace=True, + ) + + proteomics_df["IQR"] = ( + proteomics_df["third_quartile"] - proteomics_df["first_quartile"] + ) + proteomics_df["min"] = proteomics_df["first_quartile"] - ( + 1.5 
* proteomics_df["IQR"] + ) + proteomics_df["max"] = proteomics_df["third_quartile"] + ( + 1.5 * proteomics_df["IQR"] + ) + + for col in ["min", "max", "median", "first_quartile", "third_quartile"]: + proteomics_df[col] = np.around(proteomics_df[col], 4) + + proteomics_df.drop("IQR", axis=1, inplace=True) + proteomics_df["type"] = datatype + + return proteomics_df + + +# should be own transformation combined with one above +def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame: + transformed = [] + for name, dataset in datasets.items(): + if name == "proteomics": + transformed.append( + transform_proteomics_distribution_data( + proteomics_df=dataset, datatype="LFQ" + ) + ) + elif name == "proteomics_tmt": + transformed.append( + transform_proteomics_distribution_data( + proteomics_df=dataset, datatype="TMT" + ) + ) + + return pd.concat(transformed) diff --git a/src/agoradatatools/etl/transform/rna_distribution.py b/src/agoradatatools/etl/transform/rna_distribution.py new file mode 100644 index 00000000..12e068f3 --- /dev/null +++ b/src/agoradatatools/etl/transform/rna_distribution.py @@ -0,0 +1,71 @@ +import numpy as np + + +def transform_rna_seq_data(datasets: dict): + diff_exp_data = datasets["diff_exp_data"] + + diff_exp_data["study"].replace( + to_replace={"MAYO": "MayoRNAseq", "MSSM": "MSBB"}, regex=True, inplace=True + ) + diff_exp_data["sex"].replace( + to_replace={ + "ALL": "males and females", + "FEMALE": "females only", + "MALE": "males only", + }, + regex=True, + inplace=True, + ) + diff_exp_data["model"].replace( + to_replace="\\.", value=" x ", regex=True, inplace=True + ) + diff_exp_data["model"].replace( + to_replace={"Diagnosis": "AD Diagnosis"}, regex=True, inplace=True + ) + diff_exp_data["fc"] = 2 ** diff_exp_data["logfc"] + diff_exp_data["model"] = diff_exp_data["model"] + " (" + diff_exp_data["sex"] + ")" + + diff_exp_data = diff_exp_data[ + [ + "ensembl_gene_id", + "hgnc_symbol", + "logfc", + "fc", + "ci_l", + "ci_r", + "adj_p_val", + "tissue", + "study", + "model", + ] + ] + + return diff_exp_data + + +def transform_rna_distribution_data(datasets: dict): + # "datasets" contains the unprocessed RNA-seq data, which needs to go + # through the same processing as before in order to use it here. 
+ rna_df = transform_rna_seq_data(datasets) + rna_df = rna_df[["tissue", "model", "logfc"]] + + rna_df = ( + rna_df.groupby(["tissue", "model"]) + .agg("describe")["logfc"] + .reset_index()[["model", "tissue", "min", "max", "25%", "50%", "75%"]] + ) + rna_df.rename( + columns={"25%": "first_quartile", "50%": "median", "75%": "third_quartile"}, + inplace=True, + ) + + rna_df["IQR"] = rna_df["third_quartile"] - rna_df["first_quartile"] + rna_df["min"] = rna_df["first_quartile"] - (1.5 * rna_df["IQR"]) + rna_df["max"] = rna_df["third_quartile"] + (1.5 * rna_df["IQR"]) + + for col in ["min", "max", "median", "first_quartile", "third_quartile"]: + rna_df[col] = np.around(rna_df[col], 4) + + rna_df.drop("IQR", axis=1, inplace=True) + + return rna_df diff --git a/src/agoradatatools/etl/transform/team_info.py b/src/agoradatatools/etl/transform/team_info.py new file mode 100644 index 00000000..55a5c5ab --- /dev/null +++ b/src/agoradatatools/etl/transform/team_info.py @@ -0,0 +1,24 @@ +import pandas as pd + + +def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str): + return pd.merge(left=left, right=right, how=how, on=on) + + +def transform_team_info(datasets: dict): + team_info = datasets["team_info"] + team_member_info = datasets["team_member_info"] + + team_member_info = ( + team_member_info.groupby("team") + .apply( + lambda x: x[x.columns.difference(["team"])] + .fillna("") + .to_dict(orient="records") + ) + .reset_index(name="members") + ) + joined_df = join_datasets( + left=team_info, right=team_member_info, how="left", on="team" + ) + return joined_df diff --git a/src/agoradatatools/etl/transform/transform.py b/src/agoradatatools/etl/transform/transform.py deleted file mode 100644 index c9fbced0..00000000 --- a/src/agoradatatools/etl/transform/transform.py +++ /dev/null @@ -1,399 +0,0 @@ -import numpy as np -import pandas as pd - -from agoradatatools.etl.transform.utils import * -from agoradatatools.etl.transform.transform_genes_biodomains import ( - transform_genes_biodomains, -) - - -def transform_team_info(datasets: dict): - team_info = datasets["team_info"] - team_member_info = datasets["team_member_info"] - - team_member_info = ( - team_member_info.groupby("team") - .apply( - lambda x: x[x.columns.difference(["team"])] - .fillna("") - .to_dict(orient="records") - ) - .reset_index(name="members") - ) - - return join_datasets(left=team_info, right=team_member_info, how="left", on="team") - - -def transform_rna_seq_data(datasets: dict): - diff_exp_data = datasets["diff_exp_data"] - - diff_exp_data["study"].replace( - to_replace={"MAYO": "MayoRNAseq", "MSSM": "MSBB"}, regex=True, inplace=True - ) - diff_exp_data["sex"].replace( - to_replace={ - "ALL": "males and females", - "FEMALE": "females only", - "MALE": "males only", - }, - regex=True, - inplace=True, - ) - diff_exp_data["model"].replace( - to_replace="\\.", value=" x ", regex=True, inplace=True - ) - diff_exp_data["model"].replace( - to_replace={"Diagnosis": "AD Diagnosis"}, regex=True, inplace=True - ) - diff_exp_data["fc"] = 2 ** diff_exp_data["logfc"] - diff_exp_data["model"] = diff_exp_data["model"] + " (" + diff_exp_data["sex"] + ")" - - diff_exp_data = diff_exp_data[ - [ - "ensembl_gene_id", - "hgnc_symbol", - "logfc", - "fc", - "ci_l", - "ci_r", - "adj_p_val", - "tissue", - "study", - "model", - ] - ] - - return diff_exp_data - - -def transform_gene_info( - datasets: dict, adjusted_p_value_threshold, protein_level_threshold -): - """ - This function will perform transformations and incrementally create a 
dataset called gene_info. - Each dataset will be left_joined onto gene_info, starting with gene_metadata. - """ - gene_metadata = datasets["gene_metadata"] - igap = datasets["igap"] - eqtl = datasets["eqtl"] - proteomics = datasets["proteomics"] - rna_change = datasets["rna_expression_change"] - proteomics_tmt = datasets["agora_proteomics_tmt"] - target_list = datasets["target_list"] - median_expression = datasets["median_expression"] - druggability = datasets["druggability"] - - # Modify the data before merging - - # All genes in this list should have 'is_igap' = True when added to gene_info. - # Creating the column here automatically adds the column in to gene_info - # during merge, with True values correctly populated. - igap["is_igap"] = True - - # Get the smallest adj_p_val for each gene, to determine significance - rna_change = ( - rna_change.groupby("ensembl_gene_id")["adj_p_val"].agg("min").reset_index() - ) - - # Get the smallest cor_pval for each protein, to determine significance - proteomics_concat = pd.concat([proteomics, proteomics_tmt]) - proteomics_concat = proteomics_concat.dropna( - subset=["log2_fc", "cor_pval", "ci_lwr", "ci_upr"] - ) - proteomics_concat = ( - proteomics_concat.groupby("ensembl_gene_id")["cor_pval"] - .agg("min") - .reset_index() - ) - - # these are the interesting columns of the druggability dataset - useful_columns = [ - "geneid", - "sm_druggability_bucket", - "safety_bucket", - "abability_bucket", - "pharos_class", - "classification", - "safety_bucket_definition", - "abability_bucket_definition", - ] - druggability = druggability[useful_columns] - - target_list = nest_fields( - df=target_list, grouping="ensembl_gene_id", new_column="nominated_target" - ) - - median_expression = nest_fields( - df=median_expression, grouping="ensembl_gene_id", new_column="median_expression" - ) - - druggability = nest_fields( - df=druggability, grouping="geneid", new_column="druggability" - ) - druggability.rename(columns={"geneid": "ensembl_gene_id"}, inplace=True) - - # Merge all the datasets - - gene_info = gene_metadata - - for dataset in [ - igap, - eqtl, - rna_change, - proteomics_concat, - target_list, - median_expression, - druggability, - ]: - gene_info = pd.merge( - left=gene_info, - right=dataset, - on="ensembl_gene_id", - how="outer", - validate="one_to_one", - ) - - # Populate values for rows that didn't exist in the individual datasets - - gene_info.fillna( - {"is_igap": False, "has_eqtl": False, "adj_p_val": -1, "cor_pval": -1}, - inplace=True, - ) - - # fillna doesn't work for creating an empty array, need this function instead - gene_info["alias"] = gene_info.apply( - lambda row: row["alias"] - if isinstance(row["alias"], np.ndarray) - else np.ndarray(0, dtype=object), - axis=1, - ) - - gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1 - gene_info["rna_in_ad_brain_change"] = ( - gene_info["adj_p_val"] <= adjusted_p_value_threshold - ) & gene_info["rna_brain_change_studied"] - - gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1 - gene_info["protein_in_ad_brain_change"] = ( - gene_info["cor_pval"] <= protein_level_threshold - ) & gene_info["protein_brain_change_studied"] - - # create 'nominations' field - gene_info["nominations"] = gene_info.apply( - lambda row: len(row["nominated_target"]) - if isinstance(row["nominated_target"], list) - else np.NaN, - axis=1, - ) - - # Remove some extra columns that got added during merges - gene_info = gene_info[ - [ - "ensembl_gene_id", - "name", - "summary", - "symbol", - 
"alias", - "is_igap", - "has_eqtl", - "rna_in_ad_brain_change", - "rna_brain_change_studied", - "protein_in_ad_brain_change", - "protein_brain_change_studied", - "nominated_target", - "median_expression", - "druggability", - "nominations", - ] - ] - - # Make sure there are no N/A Ensembl IDs - gene_info = gene_info.dropna(subset=["ensembl_gene_id"]) - - return gene_info - - -def transform_distribution_data( - datasets: dict, - overall_max_score, - genetics_max_score, - omics_max_score, - lit_max_score, -): - overall_scores = datasets["overall_scores"] - interesting_columns = [ - "ensg", - "overall", - "geneticsscore", - "omicsscore", - "literaturescore", - ] - - # create mapping to deal with missing values as they take different shape across the fields - scored = ["isscored_genetics", "isscored_omics", "isscored_lit"] - mapping = dict(zip(interesting_columns[2:], scored)) - mapping["overall"] = None - - # create mapping for max score values from config - max_score = dict( - zip( - interesting_columns[1:], - [overall_max_score, genetics_max_score, omics_max_score, lit_max_score], - ) - ) - - overall_scores = overall_scores[interesting_columns + scored] - - neo_matrix = {} - for col in interesting_columns[1:]: # excludes the ENSG - neo_matrix[col] = calculate_distribution( - overall_scores, col, mapping[col], max_score[col] - ) - - neo_matrix["target_risk_score"] = neo_matrix.pop("overall") - neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore") - neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore") - neo_matrix["literature_score"] = neo_matrix.pop("literaturescore") - - additional_data = [ - {"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"}, - {"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"}, - {"name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070"}, - {"name": "Literature Score", "syn_id": "syn25913473", "wiki_id": "613105"}, - ] - for col, additional in zip(neo_matrix.keys(), additional_data): - neo_matrix[col]["name"] = additional["name"] - neo_matrix[col]["syn_id"] = additional["syn_id"] - neo_matrix[col]["wiki_id"] = additional["wiki_id"] - - return neo_matrix - - -def transform_rna_distribution_data(datasets: dict): - # "datasets" contains the unprocessed RNA-seq data, which needs to go - # through the same processing as before in order to use it here. 
- rna_df = transform_rna_seq_data(datasets) - rna_df = rna_df[["tissue", "model", "logfc"]] - - rna_df = ( - rna_df.groupby(["tissue", "model"]) - .agg("describe")["logfc"] - .reset_index()[["model", "tissue", "min", "max", "25%", "50%", "75%"]] - ) - rna_df.rename( - columns={"25%": "first_quartile", "50%": "median", "75%": "third_quartile"}, - inplace=True, - ) - - rna_df["IQR"] = rna_df["third_quartile"] - rna_df["first_quartile"] - rna_df["min"] = rna_df["first_quartile"] - (1.5 * rna_df["IQR"]) - rna_df["max"] = rna_df["third_quartile"] + (1.5 * rna_df["IQR"]) - - for col in ["min", "max", "median", "first_quartile", "third_quartile"]: - rna_df[col] = np.around(rna_df[col], 4) - - rna_df.drop("IQR", axis=1, inplace=True) - - return rna_df - - -def transform_proteomics_distribution_data( - proteomics_df: pd.DataFrame, datatype: str -) -> pd.DataFrame: - """Transform proteomics data - Args: - proteomics_df (pd.DataFrame): Dataframe - datatype (str): Data Type - Returns: - pd.DataFrame: Transformed data - """ - proteomics_df = ( - proteomics_df.groupby(["tissue"]) - .agg("describe")["log2_fc"] - .reset_index()[["tissue", "min", "max", "25%", "50%", "75%"]] - ) - - proteomics_df.rename( - columns={"25%": "first_quartile", "50%": "median", "75%": "third_quartile"}, - inplace=True, - ) - - proteomics_df["IQR"] = ( - proteomics_df["third_quartile"] - proteomics_df["first_quartile"] - ) - proteomics_df["min"] = proteomics_df["first_quartile"] - ( - 1.5 * proteomics_df["IQR"] - ) - proteomics_df["max"] = proteomics_df["third_quartile"] + ( - 1.5 * proteomics_df["IQR"] - ) - - for col in ["min", "max", "median", "first_quartile", "third_quartile"]: - proteomics_df[col] = np.around(proteomics_df[col], 4) - - proteomics_df.drop("IQR", axis=1, inplace=True) - proteomics_df["type"] = datatype - - return proteomics_df - - -def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame: - transformed = [] - for name, dataset in datasets.items(): - if name == "proteomics": - transformed.append( - transform_proteomics_distribution_data( - proteomics_df=dataset, datatype="LFQ" - ) - ) - elif name == "proteomics_tmt": - transformed.append( - transform_proteomics_distribution_data( - proteomics_df=dataset, datatype="TMT" - ) - ) - - return pd.concat(transformed) - - -def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): - if type(datasets) is not dict or type(dataset_name) is not str: - return None - - elif dataset_name == "genes_biodomains": - return transform_genes_biodomains(datasets=datasets) - if dataset_name == "overall_scores": - df = datasets["overall_scores"] - return transform_overall_scores(df=df) - elif dataset_name == "distribution_data": - return transform_distribution_data( - datasets=datasets, - overall_max_score=dataset_obj["custom_transformations"][ - "overall_max_score" - ], - genetics_max_score=dataset_obj["custom_transformations"][ - "genetics_max_score" - ], - omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"], - lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], - ) - elif dataset_name == "team_info": - return transform_team_info(datasets=datasets) - elif dataset_name == "rnaseq_differential_expression": - return transform_rna_seq_data(datasets=datasets) - elif dataset_name == "gene_info": - return transform_gene_info( - datasets=datasets, - adjusted_p_value_threshold=dataset_obj["custom_transformations"][ - "adjusted_p_value_threshold" - ], - 
protein_level_threshold=dataset_obj["custom_transformations"][ - "protein_level_threshold" - ], - ) - elif dataset_name == "rna_distribution_data": - return transform_rna_distribution_data(datasets=datasets) - elif dataset_name == "proteomics_distribution_data": - return create_proteomics_distribution_data(datasets=datasets) - else: - return None diff --git a/src/agoradatatools/etl/transform/utils.py b/src/agoradatatools/etl/transform/utils.py index 813ded69..4f5484e0 100644 --- a/src/agoradatatools/etl/transform/utils.py +++ b/src/agoradatatools/etl/transform/utils.py @@ -58,140 +58,3 @@ def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame: return df return df - - -def nest_fields( - df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] -) -> pd.DataFrame: - """Collapses the provided DataFrame into 2 columns: - 1. The grouping column - 2. A column containing a nested dictionary with the data from the rest of the DataFrame - - Args: - df (pd.DataFrame): DataFrame to be collapsed - grouping (str): The column that you want to group by - new_column (str): the new column created to contain the nested dictionaries created - drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. - - Returns: - pd.DataFrame: New 2 column DataFrame with group and nested dictionaries - """ - return ( - df.groupby(grouping) - .apply( - lambda row: row.replace({np.nan: None}) - .drop(columns=drop_columns) - .to_dict("records") - ) - .reset_index() - .rename(columns={0: new_column}) - ) - - -def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict: - if is_scored: - df = df[df[is_scored] == "Y"] # df does not have the isscored - else: - df = df[df.isin(["Y"]).any(axis=1)] - - if df[col].dtype == object: - df = df.copy() # Necessary to prevent SettingWithCopy warning - df[col] = df[col].astype(float) - - obj = {} - - # In order to smooth out the bins and make sure the entire range from 0 - # to the theoretical maximum value has been found, we create a copy of the - # column with both 0 and that maximum value added to it. 
We use the copy to calculate - # distributions and bins, and subtract the values at the end - - distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True) - - obj["distribution"] = list( - pd.cut( - distribution, bins=10, precision=3, include_lowest=True, right=True - ).value_counts(sort=False) - ) - obj["distribution"][ - 0 - ] -= 1 # since this was calculated with the artificial 0 value, we subtract it - obj["distribution"][ - -1 - ] -= 1 # since this was calculated with the artificial upper_bound, we subtract it - - discard, obj["bins"] = list( - pd.cut(distribution, bins=10, precision=3, retbins=True) - ) - obj["bins"] = np.around(obj["bins"].tolist()[1:], 2) - base = [0, *obj["bins"][:-1]] - obj["bins"] = zip(base, obj["bins"]) - obj["bins"] = list(obj["bins"]) - - obj["min"] = np.around(df[col].min(), 4) - obj["max"] = np.around(df[col].max(), 4) - obj["mean"] = np.around(df[col].mean(), 4) - obj["first_quartile"] = np.around( - df[col].quantile(q=0.25, interpolation="midpoint") - ) - obj["third_quartile"] = np.around( - df[col].quantile(q=0.75, interpolation="midpoint") - ) - - return obj - - -def count_grouped_total( - df: pd.DataFrame, grouping: [str, list], input_colname: str, output_colname: str -) -> pd.DataFrame: - """For each unique item/combination in the column(s) specified by grouping, - counts the number of unique items in the column [input_colname] that - correspond to that grouping. The calculated counts are put in a new - column and named with [output_colname]. - Args: - df (pd.DataFrame): contains columns listed in grouping and - input_colname. May contain other columns as well, but - these will be dropped from the returned data frame. - grouping (str or list): a string with a single column name, or a list of - strings for multiple column names - input_colname (str): the name of the column to count - output_colname (str): the name of the new column with calculated counts - Returns: - pd.DataFrame: a data frame containing the grouping column(s) and a - new column for output_colname, which contains the count of - unique items in input_colname for each grouping item. 
- """ - df = ( - df.groupby(grouping)[input_colname] - .nunique() - .reset_index() - .rename(columns={input_colname: output_colname}) - ) - return df - - -def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame: - interesting_columns = [ - "ensg", - "hgnc_gene_id", - "overall", - "geneticsscore", - "omicsscore", - "literaturescore", - ] - - # create mapping to deal with missing values as they take different shape across the fields - scored = ["isscored_genetics", "isscored_omics", "isscored_lit"] - mapping = dict(zip(interesting_columns[3:], scored)) - - for field, is_scored in mapping.items(): - df.loc[lambda row: row[is_scored] == "N", field] = np.nan - - # LiteratureScore is a string in the source file, so convert to numeric - df["literaturescore"] = pd.to_numeric(df["literaturescore"]) - - # Remove identical rows (see AG-826) - return df[interesting_columns].drop_duplicates() - - -def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str): - return pd.merge(left=left, right=right, how=how, on=on) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index a63f1e98..b90d876a 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -6,7 +6,7 @@ import agoradatatools.etl.extract as extract import agoradatatools.etl.load as load -import agoradatatools.etl.transform.transform as transform +import agoradatatools.etl.transform.apply_transform as apply_transform from agoradatatools.etl.transform.utils import * import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError @@ -35,19 +35,19 @@ def process_dataset( entity_name = entity["name"] df = extract.get_entity_as_df(syn_id=entity_id, source=entity_format, syn=syn) - df = transform.standardize_column_names(df=df) - df = transform.standardize_values(df=df) + df = apply_transform.standardize_column_names(df=df) + df = apply_transform.standardize_values(df=df) # the column rename gets applied to all entities in a dataset if "column_rename" in dataset_obj[dataset_name].keys(): - df = transform.rename_columns( + df = apply_transform.rename_columns( df=df, column_map=dataset_obj[dataset_name]["column_rename"] ) entities_as_df[entity_name] = df if "custom_transformations" in dataset_obj[dataset_name].keys(): - df = transform.apply_custom_transformations( + df = apply_transform.apply_custom_transformations( datasets=entities_as_df, dataset_name=dataset_name, dataset_obj=dataset_obj[dataset_name], @@ -56,7 +56,7 @@ def process_dataset( df = entities_as_df[list(entities_as_df)[0]] if "agora_rename" in dataset_obj[dataset_name].keys(): - df = transform.rename_columns( + df = apply_transform.rename_columns( df=df, column_map=dataset_obj[dataset_name]["agora_rename"] ) diff --git a/tests/test_process.py b/tests/test_process.py index eabd78c9..5e27b689 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -2,7 +2,7 @@ from unittest.mock import patch import pandas as pd -from agoradatatools.etl.transform import transform +from agoradatatools.etl.transform import apply_transform import pytest from agoradatatools import process @@ -55,20 +55,20 @@ def setup_method(self): extract, "get_entity_as_df", return_value=pd.DataFrame ).start() self.patch_standardize_column_names = patch.object( - transform, "standardize_column_names", return_value=pd.DataFrame + apply_transform, "standardize_column_names", return_value=pd.DataFrame ).start() self.patch_standardize_values = patch.object( - transform, "standardize_values", return_value=pd.DataFrame + 
apply_transform, "standardize_values", return_value=pd.DataFrame ).start() self.patch_rename_columns = patch.object( - transform, "rename_columns", return_value=pd.DataFrame + apply_transform, "rename_columns", return_value=pd.DataFrame ).start() self.patch_df_to_json = patch.object( load, "df_to_json", return_value="path/to/json" ).start() self.patch_load = patch.object(load, "load", return_value=None).start() self.patch_custom_transform = patch.object( - transform, "apply_custom_transformations", return_value=pd.DataFrame + apply_transform, "apply_custom_transformations", return_value=pd.DataFrame ).start() self.patch_dict_to_json = patch.object( load, "dict_to_json", return_value="path/to/json" From 882ed5dd8ac166fd1e31d47a02c365980690a4fd Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 10:16:54 -0600 Subject: [PATCH 05/24] simplifies team_info --- .../transform/custom_transforms/team_info.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/agoradatatools/etl/transform/custom_transforms/team_info.py diff --git a/src/agoradatatools/etl/transform/custom_transforms/team_info.py b/src/agoradatatools/etl/transform/custom_transforms/team_info.py new file mode 100644 index 00000000..82e045bb --- /dev/null +++ b/src/agoradatatools/etl/transform/custom_transforms/team_info.py @@ -0,0 +1,18 @@ +import pandas as pd + + +def transform_team_info(datasets: dict): + team_info = datasets["team_info"] + team_member_info = datasets["team_member_info"] + + team_member_info = ( + team_member_info.groupby("team") + .apply( + lambda x: x[x.columns.difference(["team"])] + .fillna("") + .to_dict(orient="records") + ) + .reset_index(name="members") + ) + joined_df = pd.merge(left=team_info, right=team_member_info, how="left", on="team") + return joined_df From cd0115f1e618a9d862d0c6e6c5392488089b2777 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 10:23:49 -0600 Subject: [PATCH 06/24] adds `custom` transform submodule, rounds up functions for import into `transform/apply.py` --- src/agoradatatools/etl/transform/apply.py | 47 +++++++++++++++++++ .../etl/transform/custom/__init__.py | 31 ++++++++++++ .../{ => custom}/distribution_data.py | 0 .../etl/transform/{ => custom}/gene_info.py | 0 .../{ => custom}/genes_biodomains.py | 0 .../transform/{ => custom}/overall_scores.py | 0 .../{ => custom}/proteomics_distribution.py | 0 .../{ => custom}/rna_distribution.py | 0 .../team_info.py | 0 src/agoradatatools/etl/transform/team_info.py | 24 ---------- tests/test_process.py | 10 ++-- 11 files changed, 83 insertions(+), 29 deletions(-) create mode 100644 src/agoradatatools/etl/transform/apply.py create mode 100644 src/agoradatatools/etl/transform/custom/__init__.py rename src/agoradatatools/etl/transform/{ => custom}/distribution_data.py (100%) rename src/agoradatatools/etl/transform/{ => custom}/gene_info.py (100%) rename src/agoradatatools/etl/transform/{ => custom}/genes_biodomains.py (100%) rename src/agoradatatools/etl/transform/{ => custom}/overall_scores.py (100%) rename src/agoradatatools/etl/transform/{ => custom}/proteomics_distribution.py (100%) rename src/agoradatatools/etl/transform/{ => custom}/rna_distribution.py (100%) rename src/agoradatatools/etl/transform/{custom_transforms => custom}/team_info.py (100%) delete mode 100644 src/agoradatatools/etl/transform/team_info.py diff --git a/src/agoradatatools/etl/transform/apply.py b/src/agoradatatools/etl/transform/apply.py new file mode 100644 index 00000000..adf79b49 --- /dev/null +++ 
b/src/agoradatatools/etl/transform/apply.py @@ -0,0 +1,47 @@ +import numpy as np +import pandas as pd + +from agoradatatools.etl.transform.utils import * +from agoradatatools.etl.transform.custom import * + + +def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): + if type(datasets) is not dict or type(dataset_name) is not str: + return None + elif dataset_name == "genes_biodomains": + return transform_genes_biodomains(datasets=datasets) + if dataset_name == "overall_scores": + df = datasets["overall_scores"] + return transform_overall_scores(df=df) + elif dataset_name == "distribution_data": + return transform_distribution_data( + datasets=datasets, + overall_max_score=dataset_obj["custom_transformations"][ + "overall_max_score" + ], + genetics_max_score=dataset_obj["custom_transformations"][ + "genetics_max_score" + ], + omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"], + lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], + ) + elif dataset_name == "team_info": + return transform_team_info(datasets=datasets) + elif dataset_name == "rnaseq_differential_expression": + return transform_rna_seq_data(datasets=datasets) + elif dataset_name == "gene_info": + return transform_gene_info( + datasets=datasets, + adjusted_p_value_threshold=dataset_obj["custom_transformations"][ + "adjusted_p_value_threshold" + ], + protein_level_threshold=dataset_obj["custom_transformations"][ + "protein_level_threshold" + ], + ) + elif dataset_name == "rna_distribution_data": + return transform_rna_distribution_data(datasets=datasets) + elif dataset_name == "proteomics_distribution_data": + return create_proteomics_distribution_data(datasets=datasets) + else: + return None diff --git a/src/agoradatatools/etl/transform/custom/__init__.py b/src/agoradatatools/etl/transform/custom/__init__.py new file mode 100644 index 00000000..fed2e9b7 --- /dev/null +++ b/src/agoradatatools/etl/transform/custom/__init__.py @@ -0,0 +1,31 @@ +"""Submodule for Agora Data Tools Custom Transformations""" + +from agoradatatools.etl.transform.custom.distribution_data import ( + transform_distribution_data, +) +from agoradatatools.etl.transform.custom.gene_info import transform_gene_info +from agoradatatools.etl.transform.custom.genes_biodomains import ( + transform_genes_biodomains, +) +from agoradatatools.etl.transform.custom.overall_scores import ( + transform_overall_scores, +) +from agoradatatools.etl.transform.custom.proteomics_distribution import ( + create_proteomics_distribution_data, +) +from agoradatatools.etl.transform.custom.rna_distribution import ( + transform_rna_distribution_data, + transform_rna_seq_data, +) +from agoradatatools.etl.transform.custom.team_info import transform_team_info + +__all__ = [ + "transform_distribution_data", + "transform_gene_info", + "transform_genes_biodomains", + "transform_overall_scores", + "create_proteomics_distribution_data", + "transform_rna_distribution_data", + "transform_rna_seq_data", + "transform_team_info", +] diff --git a/src/agoradatatools/etl/transform/distribution_data.py b/src/agoradatatools/etl/transform/custom/distribution_data.py similarity index 100% rename from src/agoradatatools/etl/transform/distribution_data.py rename to src/agoradatatools/etl/transform/custom/distribution_data.py diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/custom/gene_info.py similarity index 100% rename from src/agoradatatools/etl/transform/gene_info.py rename to 
src/agoradatatools/etl/transform/custom/gene_info.py diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/custom/genes_biodomains.py similarity index 100% rename from src/agoradatatools/etl/transform/genes_biodomains.py rename to src/agoradatatools/etl/transform/custom/genes_biodomains.py diff --git a/src/agoradatatools/etl/transform/overall_scores.py b/src/agoradatatools/etl/transform/custom/overall_scores.py similarity index 100% rename from src/agoradatatools/etl/transform/overall_scores.py rename to src/agoradatatools/etl/transform/custom/overall_scores.py diff --git a/src/agoradatatools/etl/transform/proteomics_distribution.py b/src/agoradatatools/etl/transform/custom/proteomics_distribution.py similarity index 100% rename from src/agoradatatools/etl/transform/proteomics_distribution.py rename to src/agoradatatools/etl/transform/custom/proteomics_distribution.py diff --git a/src/agoradatatools/etl/transform/rna_distribution.py b/src/agoradatatools/etl/transform/custom/rna_distribution.py similarity index 100% rename from src/agoradatatools/etl/transform/rna_distribution.py rename to src/agoradatatools/etl/transform/custom/rna_distribution.py diff --git a/src/agoradatatools/etl/transform/custom_transforms/team_info.py b/src/agoradatatools/etl/transform/custom/team_info.py similarity index 100% rename from src/agoradatatools/etl/transform/custom_transforms/team_info.py rename to src/agoradatatools/etl/transform/custom/team_info.py diff --git a/src/agoradatatools/etl/transform/team_info.py b/src/agoradatatools/etl/transform/team_info.py deleted file mode 100644 index 55a5c5ab..00000000 --- a/src/agoradatatools/etl/transform/team_info.py +++ /dev/null @@ -1,24 +0,0 @@ -import pandas as pd - - -def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str): - return pd.merge(left=left, right=right, how=how, on=on) - - -def transform_team_info(datasets: dict): - team_info = datasets["team_info"] - team_member_info = datasets["team_member_info"] - - team_member_info = ( - team_member_info.groupby("team") - .apply( - lambda x: x[x.columns.difference(["team"])] - .fillna("") - .to_dict(orient="records") - ) - .reset_index(name="members") - ) - joined_df = join_datasets( - left=team_info, right=team_member_info, how="left", on="team" - ) - return joined_df diff --git a/tests/test_process.py b/tests/test_process.py index 5e27b689..b9dffd00 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -2,7 +2,7 @@ from unittest.mock import patch import pandas as pd -from agoradatatools.etl.transform import apply_transform +from agoradatatools.etl.transform import apply import pytest from agoradatatools import process @@ -55,20 +55,20 @@ def setup_method(self): extract, "get_entity_as_df", return_value=pd.DataFrame ).start() self.patch_standardize_column_names = patch.object( - apply_transform, "standardize_column_names", return_value=pd.DataFrame + apply, "standardize_column_names", return_value=pd.DataFrame ).start() self.patch_standardize_values = patch.object( - apply_transform, "standardize_values", return_value=pd.DataFrame + apply, "standardize_values", return_value=pd.DataFrame ).start() self.patch_rename_columns = patch.object( - apply_transform, "rename_columns", return_value=pd.DataFrame + apply, "rename_columns", return_value=pd.DataFrame ).start() self.patch_df_to_json = patch.object( load, "df_to_json", return_value="path/to/json" ).start() self.patch_load = patch.object(load, "load", return_value=None).start() 
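         # each patch.object(...).start() call activates a mock without a
         # context manager, so every patcher started here must be stopped
         # again in teardown (via the patchers' .stop() or mock.patch.stopall())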
        self.patch_custom_transform = patch.object(
-            apply_transform, "apply_custom_transformations", return_value=pd.DataFrame
+            apply, "apply_custom_transformations", return_value=pd.DataFrame
         ).start()
         self.patch_dict_to_json = patch.object(
             load, "dict_to_json", return_value="path/to/json"

From 5c7e021ef75d1921b9ccb11df3c4a5ad86ab61db Mon Sep 17 00:00:00 2001
From: Brad Macdonald
Date: Tue, 2 May 2023 10:24:12 -0600
Subject: [PATCH 07/24] removes util import from apply.py

---
 src/agoradatatools/etl/transform/apply.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/agoradatatools/etl/transform/apply.py b/src/agoradatatools/etl/transform/apply.py
index adf79b49..6e3d15da 100644
--- a/src/agoradatatools/etl/transform/apply.py
+++ b/src/agoradatatools/etl/transform/apply.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 
-from agoradatatools.etl.transform.utils import *
 from agoradatatools.etl.transform.custom import *
 

From f1f1c566a391ecd9d4c68105f501ca1f1f8d0516 Mon Sep 17 00:00:00 2001
From: Brad Macdonald
Date: Tue, 2 May 2023 10:27:35 -0600
Subject: [PATCH 08/24] cleans up apply.py

---
 src/agoradatatools/etl/transform/apply.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/agoradatatools/etl/transform/apply.py b/src/agoradatatools/etl/transform/apply.py
index 6e3d15da..7a7ec4a6 100644
--- a/src/agoradatatools/etl/transform/apply.py
+++ b/src/agoradatatools/etl/transform/apply.py
@@ -4,15 +4,16 @@
 from agoradatatools.etl.transform.custom import *
 
 
+# TODO refactor to avoid so many if's - maybe some sort of mapping to callables
 def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):
-    if type(datasets) is not dict or type(dataset_name) is not str:
+    if not isinstance(datasets, dict) or not isinstance(dataset_name, str):
         return None
-    elif dataset_name == "genes_biodomains":
+    if dataset_name == "genes_biodomains":
         return transform_genes_biodomains(datasets=datasets)
     if dataset_name == "overall_scores":
         df = datasets["overall_scores"]
         return transform_overall_scores(df=df)
-    elif dataset_name == "distribution_data":
+    if dataset_name == "distribution_data":
         return transform_distribution_data(
             datasets=datasets,
             overall_max_score=dataset_obj["custom_transformations"][
                 "overall_max_score"
             ],
             genetics_max_score=dataset_obj["custom_transformations"][
                 "genetics_max_score"
             ],
             omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"],
             lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"],
         )
-    elif dataset_name == "team_info":
+    if dataset_name == "team_info":
         return transform_team_info(datasets=datasets)
-    elif dataset_name == "rnaseq_differential_expression":
+    if dataset_name == "rnaseq_differential_expression":
         return transform_rna_seq_data(datasets=datasets)
-    elif dataset_name == "gene_info":
+    if dataset_name == "gene_info":
         return transform_gene_info(
             datasets=datasets,
             adjusted_p_value_threshold=dataset_obj["custom_transformations"][
                 "adjusted_p_value_threshold"
             ],
             protein_level_threshold=dataset_obj["custom_transformations"][
                 "protein_level_threshold"
             ],
         )
-    elif dataset_name == "rna_distribution_data":
+    if dataset_name == "rna_distribution_data":
         return transform_rna_distribution_data(datasets=datasets)
-    elif dataset_name == "proteomics_distribution_data":
+    if dataset_name == "proteomics_distribution_data":
         return create_proteomics_distribution_data(datasets=datasets)
     else:
         return None

From b2634e4b9e34cf40ad7fb11b5449a9c5d9a24717 Mon Sep 17 00:00:00 2001
From: Brad Macdonald
Date: Tue, 2 May 2023 10:29:01 -0600 Subject: [PATCH 09/24] renamed apply --- .../etl/transform/apply_transform.py | 50 ------------------- 1 file changed, 50 deletions(-) delete mode 100644 src/agoradatatools/etl/transform/apply_transform.py diff --git a/src/agoradatatools/etl/transform/apply_transform.py b/src/agoradatatools/etl/transform/apply_transform.py deleted file mode 100644 index bf3549b5..00000000 --- a/src/agoradatatools/etl/transform/apply_transform.py +++ /dev/null @@ -1,50 +0,0 @@ -import numpy as np -import pandas as pd - -from agoradatatools.etl.transform.utils import * -from agoradatatools.etl.transform.genes_biodomains import ( - transform_genes_biodomains, -) - - -def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): - if type(datasets) is not dict or type(dataset_name) is not str: - return None - - elif dataset_name == "genes_biodomains": - return transform_genes_biodomains(datasets=datasets) - if dataset_name == "overall_scores": - df = datasets["overall_scores"] - return transform_overall_scores(df=df) - elif dataset_name == "distribution_data": - return transform_distribution_data( - datasets=datasets, - overall_max_score=dataset_obj["custom_transformations"][ - "overall_max_score" - ], - genetics_max_score=dataset_obj["custom_transformations"][ - "genetics_max_score" - ], - omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"], - lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], - ) - elif dataset_name == "team_info": - return transform_team_info(datasets=datasets) - elif dataset_name == "rnaseq_differential_expression": - return transform_rna_seq_data(datasets=datasets) - elif dataset_name == "gene_info": - return transform_gene_info( - datasets=datasets, - adjusted_p_value_threshold=dataset_obj["custom_transformations"][ - "adjusted_p_value_threshold" - ], - protein_level_threshold=dataset_obj["custom_transformations"][ - "protein_level_threshold" - ], - ) - elif dataset_name == "rna_distribution_data": - return transform_rna_distribution_data(datasets=datasets) - elif dataset_name == "proteomics_distribution_data": - return create_proteomics_distribution_data(datasets=datasets) - else: - return None From 69744c7dbbd96174d8ea6726d72e2a588751b813 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 10:32:20 -0600 Subject: [PATCH 10/24] updates process.py --- src/agoradatatools/process.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index b90d876a..249d4918 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -6,8 +6,8 @@ import agoradatatools.etl.extract as extract import agoradatatools.etl.load as load -import agoradatatools.etl.transform.apply_transform as apply_transform -from agoradatatools.etl.transform.utils import * +from agoradatatools.etl.transform.apply import apply_custom_transformations +from agoradatatools.etl.transform import utils as transform_utils import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError @@ -35,19 +35,19 @@ def process_dataset( entity_name = entity["name"] df = extract.get_entity_as_df(syn_id=entity_id, source=entity_format, syn=syn) - df = apply_transform.standardize_column_names(df=df) - df = apply_transform.standardize_values(df=df) + df = transform_utils.standardize_column_names(df=df) + df = transform_utils.standardize_values(df=df) # the column rename gets applied to all 
entities in a dataset if "column_rename" in dataset_obj[dataset_name].keys(): - df = apply_transform.rename_columns( + df = transform_utils.rename_columns( df=df, column_map=dataset_obj[dataset_name]["column_rename"] ) entities_as_df[entity_name] = df if "custom_transformations" in dataset_obj[dataset_name].keys(): - df = apply_transform.apply_custom_transformations( + df = apply_custom_transformations( datasets=entities_as_df, dataset_name=dataset_name, dataset_obj=dataset_obj[dataset_name], @@ -56,7 +56,7 @@ def process_dataset( df = entities_as_df[list(entities_as_df)[0]] if "agora_rename" in dataset_obj[dataset_name].keys(): - df = apply_transform.rename_columns( + df = transform_utils.rename_columns( df=df, column_map=dataset_obj[dataset_name]["agora_rename"] ) From 019924980441a1a9d6e9f31160a5842598727600 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 10:44:23 -0600 Subject: [PATCH 11/24] move nest_fields back to utils --- .../etl/transform/custom/distribution_data.py | 1 + .../etl/transform/custom/gene_info.py | 28 +------------------ .../etl/transform/custom/genes_biodomains.py | 3 +- src/agoradatatools/etl/transform/utils.py | 28 +++++++++++++++++++ 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/agoradatatools/etl/transform/custom/distribution_data.py b/src/agoradatatools/etl/transform/custom/distribution_data.py index 568e1e6e..17e7e3fb 100644 --- a/src/agoradatatools/etl/transform/custom/distribution_data.py +++ b/src/agoradatatools/etl/transform/custom/distribution_data.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict: diff --git a/src/agoradatatools/etl/transform/custom/gene_info.py b/src/agoradatatools/etl/transform/custom/gene_info.py index 8832d10c..7997e96f 100644 --- a/src/agoradatatools/etl/transform/custom/gene_info.py +++ b/src/agoradatatools/etl/transform/custom/gene_info.py @@ -1,33 +1,7 @@ import pandas as pd import numpy as np - -def nest_fields( - df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] -) -> pd.DataFrame: - """Collapses the provided DataFrame into 2 columns: - 1. The grouping column - 2. A column containing a nested dictionary with the data from the rest of the DataFrame - - Args: - df (pd.DataFrame): DataFrame to be collapsed - grouping (str): The column that you want to group by - new_column (str): the new column created to contain the nested dictionaries created - drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. 
- - Returns: - pd.DataFrame: New 2 column DataFrame with group and nested dictionaries - """ - return ( - df.groupby(grouping) - .apply( - lambda row: row.replace({np.nan: None}) - .drop(columns=drop_columns) - .to_dict("records") - ) - .reset_index() - .rename(columns={0: new_column}) - ) +from agoradatatools.etl.transform.utils import nest_fields def transform_gene_info( diff --git a/src/agoradatatools/etl/transform/custom/genes_biodomains.py b/src/agoradatatools/etl/transform/custom/genes_biodomains.py index 693e2672..b1cf47a6 100644 --- a/src/agoradatatools/etl/transform/custom/genes_biodomains.py +++ b/src/agoradatatools/etl/transform/custom/genes_biodomains.py @@ -1,7 +1,8 @@ import pandas as pd - from typing import Union +from agoradatatools.etl.transform.utils import nest_fields + def count_grouped_total( df: pd.DataFrame, diff --git a/src/agoradatatools/etl/transform/utils.py b/src/agoradatatools/etl/transform/utils.py index 4f5484e0..3f34c82d 100644 --- a/src/agoradatatools/etl/transform/utils.py +++ b/src/agoradatatools/etl/transform/utils.py @@ -58,3 +58,31 @@ def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame: return df return df + + +def nest_fields( + df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] +) -> pd.DataFrame: + """Collapses the provided DataFrame into 2 columns: + 1. The grouping column + 2. A column containing a nested dictionary with the data from the rest of the DataFrame + + Args: + df (pd.DataFrame): DataFrame to be collapsed + grouping (str): The column that you want to group by + new_column (str): the new column created to contain the nested dictionaries created + drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. + + Returns: + pd.DataFrame: New 2 column DataFrame with group and nested dictionaries + """ + return ( + df.groupby(grouping) + .apply( + lambda row: row.replace({np.nan: None}) + .drop(columns=drop_columns) + .to_dict("records") + ) + .reset_index() + .rename(columns={0: new_column}) + ) From 3b5052247c3f3492d656dcfcb6967abc6319db93 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 10:56:12 -0600 Subject: [PATCH 12/24] updates test_process --- src/agoradatatools/process.py | 4 ++-- tests/test_process.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 249d4918..07a2dcd9 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -6,7 +6,7 @@ import agoradatatools.etl.extract as extract import agoradatatools.etl.load as load -from agoradatatools.etl.transform.apply import apply_custom_transformations +import agoradatatools.etl.transform.apply as apply from agoradatatools.etl.transform import utils as transform_utils import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError @@ -47,7 +47,7 @@ def process_dataset( entities_as_df[entity_name] = df if "custom_transformations" in dataset_obj[dataset_name].keys(): - df = apply_custom_transformations( + df = apply.apply_custom_transformations( datasets=entities_as_df, dataset_name=dataset_name, dataset_obj=dataset_obj[dataset_name], diff --git a/tests/test_process.py b/tests/test_process.py index b9dffd00..2eddd9ed 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -2,7 +2,7 @@ from unittest.mock import patch import pandas as pd -from agoradatatools.etl.transform import apply +from agoradatatools.etl.transform import 
apply, utils as transform_utils import pytest from agoradatatools import process @@ -55,13 +55,13 @@ def setup_method(self): extract, "get_entity_as_df", return_value=pd.DataFrame ).start() self.patch_standardize_column_names = patch.object( - apply, "standardize_column_names", return_value=pd.DataFrame + transform_utils, "standardize_column_names", return_value=pd.DataFrame ).start() self.patch_standardize_values = patch.object( - apply, "standardize_values", return_value=pd.DataFrame + transform_utils, "standardize_values", return_value=pd.DataFrame ).start() self.patch_rename_columns = patch.object( - apply, "rename_columns", return_value=pd.DataFrame + transform_utils, "rename_columns", return_value=pd.DataFrame ).start() self.patch_df_to_json = patch.object( load, "df_to_json", return_value="path/to/json" From 23aecc1f22dbc97f3b4fe8db460b50c3acda7ca6 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 11:02:33 -0600 Subject: [PATCH 13/24] updates tests --- .../test_transform_genes_biodomains.py | 98 +++++++++++++++++++ .../test_transform_utils.py} | 93 ------------------ 2 files changed, 98 insertions(+), 93 deletions(-) create mode 100644 tests/transform/test_transform_genes_biodomains.py rename tests/{test_transform.py => transform/test_transform_utils.py} (50%) diff --git a/tests/transform/test_transform_genes_biodomains.py b/tests/transform/test_transform_genes_biodomains.py new file mode 100644 index 00000000..e4ca5d58 --- /dev/null +++ b/tests/transform/test_transform_genes_biodomains.py @@ -0,0 +1,98 @@ +from unittest.mock import patch + +import pandas as pd + +from agoradatatools.etl.transform.custom.genes_biodomains import * + + +class TestCountGroupedTotal: + df = pd.DataFrame( + { + "col_1": ["a", "a", "a", "b", "c", "c", "c"], # 3 'Ensembl IDs' + "col_2": ["x", "y", "z", "x", "y", "z", "z"], # 3 'biodomains' + "col_3": ["1", "1", "2", "3", "2", "1", "3"], # 3 'go_terms' + "col_4": ["m", "m", "n", "n", "o", "o", "o"], # An ignored column + } + ) + + # How many unique "col_2"'s per unique "col_1" value? + def test_count_grouped_total_one_group(self): + expected_df = pd.DataFrame({"col_1": ["a", "b", "c"], "output": [3, 1, 2]}) + counted = count_grouped_total( + df=self.df, grouping="col_1", input_colname="col_2", output_colname="output" + ) + assert counted.equals(expected_df) + + # How many unique "col_3"'s per unique combination of "col_1" + "col_2"? 
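+    # (only ("c", "z") occurs with two distinct col_3 values, "1" and "3",
+    # so its count is 2; every other (col_1, col_2) pair counts exactly 1)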
+ def test_count_grouped_total_two_groups(self): + expected_df = pd.DataFrame( + { + "col_1": ["a", "a", "a", "b", "c", "c"], + "col_2": ["x", "y", "z", "x", "y", "z"], + "output": [1, 1, 1, 1, 1, 2], + } + ) + + counted = count_grouped_total( + df=self.df, + grouping=["col_1", "col_2"], + input_colname="col_3", + output_colname="output", + ) + assert counted.equals(expected_df) + + +# def test_transform_biodomains(): +# test_datasets = { +# "biodomains": pd.DataFrame( +# { +# "ensembl_gene_id": ["1", "1", "2", "2", "3", "3"], +# "biodomain": ["a", "b", "c", "d", "e", "f"], +# "go_terms": ["a", "b", "c", "d", "e", "f"], +# } +# ) +# } +# expected_gene_biodomains_col = [ +# [{"biodomain": "a", "go_terms": ["a"]}, {"biodomain": "b", "go_terms": ["b"]}], +# [{"biodomain": "c", "go_terms": ["c"]}, {"biodomain": "d", "go_terms": ["d"]}], +# [{"biodomain": "e", "go_terms": ["e"]}, {"biodomain": "f", "go_terms": ["f"]}], +# ] +# test_biodomains = transform.transform_biodomains(datasets=test_datasets) +# assert list(test_biodomains["gene_biodomains"]) == expected_gene_biodomains_col + + +# df = pd.DataFrame( +# {'team id': [np.nan, 0, 1, 2], +# 'team.Name': ['MSN', 'Team 1', 'Team 2', np.nan], +# 'team-Sco@#&': ['x', 'y', 'z', "na"]}) + +# def test_standardize_column_names(): + +# result_df = transform.standardize_column_names(df) +# assert type(result_df) is pd.core.frame.DataFrame +# assert list(result_df.columns) == ['team_id', 'team_name', 'team-sco'] + + +# def test_standardize_values(): + +# assert df.isna().sum().sum() == 2 + +# result_df = transform.standardize_values(df) + +# assert type(result_df) is pd.core.frame.DataFrame +# assert result_df.isna().sum().sum() == 0 +# assert result_df.shape == (4, 3) + +# def test_rename_columns(): +# refresh_df = pd.DataFrame( +# {'team id': [np.nan, 0, 1, 2], +# 'team.Name': ['MSN', 'Team 1', 'Team 2', np.nan], +# 'team-Sco@#&': ['x', 'y', 'z', "na"]}) + +# bad_result_df = transform.rename_columns(df=refresh_df, column_map={"team-Sco@#&"}) +# assert type(bad_result_df) is pd.core.frame.DataFrame +# assert list(bad_result_df.columns) == ["team id", "team.Name", "team-Sco@#&"] + +# partial_good_result_df = transform.rename_columns(df=refresh_df, column_map={"team-Sco@#&": "team_scope"}) +# assert list(partial_good_result_df.columns) == ['team id', 'team.Name', 'team_scope'] +# assert type(partial_good_result_df) is pd.core.frame.DataFrame diff --git a/tests/test_transform.py b/tests/transform/test_transform_utils.py similarity index 50% rename from tests/test_transform.py rename to tests/transform/test_transform_utils.py index cbebdf1f..5ab9c396 100644 --- a/tests/test_transform.py +++ b/tests/transform/test_transform_utils.py @@ -124,96 +124,3 @@ def test_nest_fields(): df=df, grouping="a", new_column="e", drop_columns=["d"] ) assert list(nested_df["e"]) == expected_column_e - - -class TestCountGroupedTotal: - df = pd.DataFrame( - { - "col_1": ["a", "a", "a", "b", "c", "c", "c"], # 3 'Ensembl IDs' - "col_2": ["x", "y", "z", "x", "y", "z", "z"], # 3 'biodomains' - "col_3": ["1", "1", "2", "3", "2", "1", "3"], # 3 'go_terms' - "col_4": ["m", "m", "n", "n", "o", "o", "o"], # An ignored column - } - ) - - # How many unique "col_2"'s per unique "col_1" value? 
- def test_count_grouped_total_one_group(self): - expected_df = pd.DataFrame({"col_1": ["a", "b", "c"], "output": [3, 1, 2]}) - counted = utils.count_grouped_total( - df=self.df, grouping="col_1", input_colname="col_2", output_colname="output" - ) - assert counted.equals(expected_df) - - # How many unique "col_3"'s per unique combination of "col_1" + "col_2"? - def test_count_grouped_total_two_groups(self): - expected_df = pd.DataFrame( - { - "col_1": ["a", "a", "a", "b", "c", "c"], - "col_2": ["x", "y", "z", "x", "y", "z"], - "output": [1, 1, 1, 1, 1, 2], - } - ) - - counted = utils.count_grouped_total( - df=self.df, - grouping=["col_1", "col_2"], - input_colname="col_3", - output_colname="output", - ) - assert counted.equals(expected_df) - - -# def test_transform_biodomains(): -# test_datasets = { -# "biodomains": pd.DataFrame( -# { -# "ensembl_gene_id": ["1", "1", "2", "2", "3", "3"], -# "biodomain": ["a", "b", "c", "d", "e", "f"], -# "go_terms": ["a", "b", "c", "d", "e", "f"], -# } -# ) -# } -# expected_gene_biodomains_col = [ -# [{"biodomain": "a", "go_terms": ["a"]}, {"biodomain": "b", "go_terms": ["b"]}], -# [{"biodomain": "c", "go_terms": ["c"]}, {"biodomain": "d", "go_terms": ["d"]}], -# [{"biodomain": "e", "go_terms": ["e"]}, {"biodomain": "f", "go_terms": ["f"]}], -# ] -# test_biodomains = transform.transform_biodomains(datasets=test_datasets) -# assert list(test_biodomains["gene_biodomains"]) == expected_gene_biodomains_col - - -# df = pd.DataFrame( -# {'team id': [np.nan, 0, 1, 2], -# 'team.Name': ['MSN', 'Team 1', 'Team 2', np.nan], -# 'team-Sco@#&': ['x', 'y', 'z', "na"]}) - -# def test_standardize_column_names(): - -# result_df = transform.standardize_column_names(df) -# assert type(result_df) is pd.core.frame.DataFrame -# assert list(result_df.columns) == ['team_id', 'team_name', 'team-sco'] - - -# def test_standardize_values(): - -# assert df.isna().sum().sum() == 2 - -# result_df = transform.standardize_values(df) - -# assert type(result_df) is pd.core.frame.DataFrame -# assert result_df.isna().sum().sum() == 0 -# assert result_df.shape == (4, 3) - -# def test_rename_columns(): -# refresh_df = pd.DataFrame( -# {'team id': [np.nan, 0, 1, 2], -# 'team.Name': ['MSN', 'Team 1', 'Team 2', np.nan], -# 'team-Sco@#&': ['x', 'y', 'z', "na"]}) - -# bad_result_df = transform.rename_columns(df=refresh_df, column_map={"team-Sco@#&"}) -# assert type(bad_result_df) is pd.core.frame.DataFrame -# assert list(bad_result_df.columns) == ["team id", "team.Name", "team-Sco@#&"] - -# partial_good_result_df = transform.rename_columns(df=refresh_df, column_map={"team-Sco@#&": "team_scope"}) -# assert list(partial_good_result_df.columns) == ['team id', 'team.Name', 'team_scope'] -# assert type(partial_good_result_df) is pd.core.frame.DataFrame From 4efb42c9cf04528bd795f5032034b3d860a6b02d Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 11:17:28 -0600 Subject: [PATCH 14/24] cleans up tests --- tests/test_process.py | 28 ++++++++++--------- .../test_transform_genes_biodomains.py | 4 +-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/test_process.py b/tests/test_process.py index 2eddd9ed..66d47492 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -2,12 +2,14 @@ from unittest.mock import patch import pandas as pd -from agoradatatools.etl.transform import apply, utils as transform_utils import pytest +from typing import Any from agoradatatools import process -from agoradatatools.etl import extract, load, utils from agoradatatools.errors import 
ADTDataProcessingError +from agoradatatools.etl import extract, load, utils +from agoradatatools.etl.transform import apply +from agoradatatools.etl.transform import utils as transform_utils class TestProcessDataset: @@ -84,7 +86,7 @@ def teardown_method(self): self.patch_custom_transform.stop() self.patch_dict_to_json.stop() - def test_process_dataset_with_column_rename(self, syn): + def test_process_dataset_with_column_rename(self, syn: Any): process.process_dataset( dataset_obj=self.dataset_object_col_rename, staging_path="./staging", @@ -96,7 +98,7 @@ def test_process_dataset_with_column_rename(self, syn): self.patch_custom_transform.assert_not_called() self.patch_dict_to_json.assert_not_called() - def test_process_dataset_custom_transformations(self, syn): + def test_process_dataset_custom_transformations(self, syn: Any): process.process_dataset( dataset_obj=self.dataset_object_custom_transform, staging_path="./staging", @@ -116,7 +118,7 @@ def test_process_dataset_custom_transformations(self, syn): self.patch_rename_columns.assert_not_called() self.patch_dict_to_json.assert_not_called() - def test_process_dataset_with_agora_rename(self, syn): + def test_process_dataset_with_agora_rename(self, syn: Any): process.process_dataset( dataset_obj=self.dataset_object_col_rename, staging_path="./staging", @@ -128,7 +130,7 @@ def test_process_dataset_with_agora_rename(self, syn): self.patch_custom_transform.assert_not_called() self.patch_dict_to_json.assert_not_called() - def test_process_dataset_type_dict(self, syn): + def test_process_dataset_type_dict(self, syn: Any): self.patch_standardize_values.return_value = ( dict() ) # test if it is a dictionary later @@ -145,7 +147,7 @@ def test_process_dataset_type_dict(self, syn): class TestCreateDataManifest: @pytest.fixture(scope="function", autouse=True) - def setup_method(self, syn): + def setup_method(self, syn: Any): self.patch_syn_login = patch.object( utils, "_login_to_synapse", return_value=syn ).start() @@ -156,7 +158,7 @@ def setup_method(self, syn): def teardown_method(self): mock.patch.stopall() - def test_create_data_manifest_parent_none(self, syn): + def test_create_data_manifest_parent_none(self, syn: Any): assert process.create_data_manifest(parent=None, syn=syn) is None self.patch_syn_login.assert_not_called() @@ -164,7 +166,7 @@ def test_create_data_manifest_syn_none(self): process.create_data_manifest(parent="syn1111111", syn=None) self.patch_syn_login.assert_called_once() - def test_create_data_manifest_no_none(self, syn): + def test_create_data_manifest_no_none(self, syn: Any): df = process.create_data_manifest(parent="syn1111111", syn=syn) self.patch_get_children.assert_called_once_with("syn1111111") self.patch_syn_login.assert_not_called() @@ -201,21 +203,21 @@ def setup_method(self): def teardown_method(self): mock.patch.stopall() - def test_process_all_files_config_path(self, syn): + def test_process_all_files_config_path(self, syn: Any): process.process_all_files(config_path="path/to/config", syn=syn) self.patch_get_config.assert_called_once_with(config_path="path/to/config") - def test_process_all_files_no_config_path(self, syn): + def test_process_all_files_no_config_path(self, syn: Any): process.process_all_files(config_path=None, syn=syn) self.patch_get_config.assert_called_once_with() - def test_process_all_files_process_dataset_fails(self, syn): + def test_process_all_files_process_dataset_fails(self, syn: Any): with pytest.raises(ADTDataProcessingError): self.patch_process_dataset.side_effect = Exception 
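             # every mocked process_dataset call now raises, so
             # process_all_files is expected to collect the failures and
             # surface them as a single ADTDataProcessingError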
process.process_all_files(config_path="path/to/config", syn=syn) self.patch_create_data_manifest.assert_not_called() - def test_process_all_files_full(self, syn): + def test_process_all_files_full(self, syn: Any): process.process_all_files(config_path=None, syn=syn) self.patch_process_dataset.assert_any_call( dataset_obj={"a": {"b": "c"}}, staging_path="./staging", syn=syn diff --git a/tests/transform/test_transform_genes_biodomains.py b/tests/transform/test_transform_genes_biodomains.py index e4ca5d58..6cd59ef3 100644 --- a/tests/transform/test_transform_genes_biodomains.py +++ b/tests/transform/test_transform_genes_biodomains.py @@ -1,8 +1,6 @@ -from unittest.mock import patch - import pandas as pd -from agoradatatools.etl.transform.custom.genes_biodomains import * +from agoradatatools.etl.transform.custom.genes_biodomains import count_grouped_total class TestCountGroupedTotal: From 12a1a940abd584ce2fa236773eb83d17da8815ef Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 11:19:14 -0600 Subject: [PATCH 15/24] adds init to transform module --- src/agoradatatools/etl/transform/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/agoradatatools/etl/transform/__init__.py diff --git a/src/agoradatatools/etl/transform/__init__.py b/src/agoradatatools/etl/transform/__init__.py new file mode 100644 index 00000000..e69de29b From 8b3adfbf1e2b36e8d66c6274dcdb365eabf92a13 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 12:37:56 -0600 Subject: [PATCH 16/24] test_transform --- .../test_transform_genes_biodomains.py | 0 tests/{transform => test_transform}/test_transform_utils.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/{transform => test_transform}/test_transform_genes_biodomains.py (100%) rename tests/{transform => test_transform}/test_transform_utils.py (100%) diff --git a/tests/transform/test_transform_genes_biodomains.py b/tests/test_transform/test_transform_genes_biodomains.py similarity index 100% rename from tests/transform/test_transform_genes_biodomains.py rename to tests/test_transform/test_transform_genes_biodomains.py diff --git a/tests/transform/test_transform_utils.py b/tests/test_transform/test_transform_utils.py similarity index 100% rename from tests/transform/test_transform_utils.py rename to tests/test_transform/test_transform_utils.py From 418bdbfd87e014ce08a598cc990488d3b1efeddf Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 12:52:43 -0600 Subject: [PATCH 17/24] cleans up imports --- src/agoradatatools/etl/transform/apply.py | 3 --- src/agoradatatools/process.py | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/agoradatatools/etl/transform/apply.py b/src/agoradatatools/etl/transform/apply.py index 7a7ec4a6..ea075a0a 100644 --- a/src/agoradatatools/etl/transform/apply.py +++ b/src/agoradatatools/etl/transform/apply.py @@ -1,6 +1,3 @@ -import numpy as np -import pandas as pd - from agoradatatools.etl.transform.custom import * diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 07a2dcd9..f85f821d 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -1,5 +1,3 @@ -import os - import synapseclient from pandas import DataFrame from typer import Argument, Option, Typer @@ -7,9 +5,9 @@ import agoradatatools.etl.extract as extract import agoradatatools.etl.load as load import agoradatatools.etl.transform.apply as apply -from agoradatatools.etl.transform import utils as transform_utils import 
agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError +from agoradatatools.etl.transform import utils as transform_utils def process_dataset( From d5900c4cada6875fac5a110a7b7a5be0779509e5 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Tue, 2 May 2023 16:04:34 -0600 Subject: [PATCH 18/24] change `test_load` and `test_transform` to `load` and `transform` --- tests/{test_load => load}/test_load.py | 0 tests/{test_load => load}/test_numpyencoder.py | 0 .../test_transform_genes_biodomains.py | 0 tests/{test_transform => transform}/test_transform_utils.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_load => load}/test_load.py (100%) rename tests/{test_load => load}/test_numpyencoder.py (100%) rename tests/{test_transform => transform}/test_transform_genes_biodomains.py (100%) rename tests/{test_transform => transform}/test_transform_utils.py (100%) diff --git a/tests/test_load/test_load.py b/tests/load/test_load.py similarity index 100% rename from tests/test_load/test_load.py rename to tests/load/test_load.py diff --git a/tests/test_load/test_numpyencoder.py b/tests/load/test_numpyencoder.py similarity index 100% rename from tests/test_load/test_numpyencoder.py rename to tests/load/test_numpyencoder.py diff --git a/tests/test_transform/test_transform_genes_biodomains.py b/tests/transform/test_transform_genes_biodomains.py similarity index 100% rename from tests/test_transform/test_transform_genes_biodomains.py rename to tests/transform/test_transform_genes_biodomains.py diff --git a/tests/test_transform/test_transform_utils.py b/tests/transform/test_transform_utils.py similarity index 100% rename from tests/test_transform/test_transform_utils.py rename to tests/transform/test_transform_utils.py From 2bf52aa54052d27b6598ec0b00db24ecbe249652 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 3 May 2023 12:51:30 -0600 Subject: [PATCH 19/24] moves transform utils to etl utils --- .../etl/transform/custom/gene_info.py | 2 +- .../etl/transform/custom/genes_biodomains.py | 2 +- src/agoradatatools/etl/transform/utils.py | 88 ------------------ src/agoradatatools/etl/utils.py | 89 +++++++++++++++++++ src/agoradatatools/process.py | 9 +- 5 files changed, 95 insertions(+), 95 deletions(-) delete mode 100644 src/agoradatatools/etl/transform/utils.py diff --git a/src/agoradatatools/etl/transform/custom/gene_info.py b/src/agoradatatools/etl/transform/custom/gene_info.py index 7997e96f..183bdd45 100644 --- a/src/agoradatatools/etl/transform/custom/gene_info.py +++ b/src/agoradatatools/etl/transform/custom/gene_info.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -from agoradatatools.etl.transform.utils import nest_fields +from agoradatatools.etl.utils import nest_fields def transform_gene_info( diff --git a/src/agoradatatools/etl/transform/custom/genes_biodomains.py b/src/agoradatatools/etl/transform/custom/genes_biodomains.py index b1cf47a6..e34f3a35 100644 --- a/src/agoradatatools/etl/transform/custom/genes_biodomains.py +++ b/src/agoradatatools/etl/transform/custom/genes_biodomains.py @@ -1,7 +1,7 @@ import pandas as pd from typing import Union -from agoradatatools.etl.transform.utils import nest_fields +from agoradatatools.etl.utils import nest_fields def count_grouped_total( diff --git a/src/agoradatatools/etl/transform/utils.py b/src/agoradatatools/etl/transform/utils.py deleted file mode 100644 index 3f34c82d..00000000 --- a/src/agoradatatools/etl/transform/utils.py +++ /dev/null @@ -1,88 +0,0 @@ -import numpy as np 
-import pandas as pd - - -def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame: - """Takes in a dataframe replaces problematic characters in column names - and makes column names all lowercase characters - - Args: - df (pd.DataFrame): DataFrame with columns to be standardized - - Returns: - pd.DataFrame: New dataframe with cleaned column names - """ - - df.columns = df.columns.str.replace( - "[#@&*^?()%$#!/]", "", regex=True - ) # the commas were unnessesary and were breaking the prelacement of '-' characters - df.columns = df.columns.str.replace("[ -.]", "_", regex=True) - df.columns = map(str.lower, df.columns) - - return df - - -def standardize_values(df: pd.DataFrame) -> pd.DataFrame: - """Finds non-compliant values and corrects them - *if more data cleaning options need to be added to this, - this needs to be refactored to another function - - Args: - df (pd.DataFrame): DataFrame with values to be standardized - - Returns: - pd.DataFrame: Resulting DataFrame with standardized values - """ - try: - df.replace(["n/a", "N/A", "n/A", "N/a"], np.nan, regex=True, inplace=True) - except TypeError: # I could not get this to trigger without mocking replace - print("Error comparing types.") - - return df - - -def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame: - """Takes in a dataframe and renames columns according to the mapping provided - - Args: - df (pd.DataFrame): DataFrame with columns to be renamed - column_map (dict): Dictionary mapping original column names to new columns - - Returns: - pd.DataFrame: DataFrame with new columns names - """ - try: - df.rename(columns=column_map, inplace=True) - except TypeError: - print("Column mapping must be a dictionary") - return df - - return df - - -def nest_fields( - df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = [] -) -> pd.DataFrame: - """Collapses the provided DataFrame into 2 columns: - 1. The grouping column - 2. A column containing a nested dictionary with the data from the rest of the DataFrame - - Args: - df (pd.DataFrame): DataFrame to be collapsed - grouping (str): The column that you want to group by - new_column (str): the new column created to contain the nested dictionaries created - drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to []. - - Returns: - pd.DataFrame: New 2 column DataFrame with group and nested dictionaries - """ - return ( - df.groupby(grouping) - .apply( - lambda row: row.replace({np.nan: None}) - .drop(columns=drop_columns) - .to_dict("records") - ) - .reset_index() - .rename(columns={0: new_column}) - ) diff --git a/src/agoradatatools/etl/utils.py b/src/agoradatatools/etl/utils.py index 6d50985c..e2ef4661 100644 --- a/src/agoradatatools/etl/utils.py +++ b/src/agoradatatools/etl/utils.py @@ -1,7 +1,10 @@ import synapseclient import yaml +import pandas as pd +import numpy as np +# TODO these utils functions are not protected... refactor removing "_" def _login_to_synapse(token: str = None) -> synapseclient.Synapse: """Logs into Synapse python client, returns authenticated Synapse session. 
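Note: the hunk below consolidates the generic DataFrame helpers into
etl/utils.py. A minimal sketch of how the first two helpers chain once they
live here (column names and values are made up for illustration):

    import pandas as pd
    from agoradatatools.etl.utils import standardize_column_names, standardize_values

    raw = pd.DataFrame({"Team Name!": ["n/a"], "Score%": [1.5]})
    df = standardize_values(standardize_column_names(raw))
    list(df.columns)        # ['team_name', 'score']
    df.loc[0, "team_name"]  # NaN, because the "n/a" placeholder is nulled out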
@@ -68,3 +71,89 @@ def _find_config_by_name(config: list, name: str):
         if name in item.keys():
             return item[name]
     return None
+
+
+def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame:
+    """Takes in a dataframe and replaces problematic characters in column names,
+    and makes all column names lowercase
+
+    Args:
+        df (pd.DataFrame): DataFrame with columns to be standardized
+
+    Returns:
+        pd.DataFrame: New dataframe with cleaned column names
+    """
+
+    df.columns = df.columns.str.replace(
+        "[#@&*^?()%$#!/]", "", regex=True
+    )  # the commas were unnecessary and were breaking the replacement of '-' characters
+    df.columns = df.columns.str.replace("[ -.]", "_", regex=True)
+    df.columns = map(str.lower, df.columns)
+
+    return df
+
+
+def standardize_values(df: pd.DataFrame) -> pd.DataFrame:
+    """Finds non-compliant values and corrects them
+    *if more data cleaning options need to be added to this,
+    this needs to be refactored to another function
+
+    Args:
+        df (pd.DataFrame): DataFrame with values to be standardized
+
+    Returns:
+        pd.DataFrame: Resulting DataFrame with standardized values
+    """
+    try:
+        df.replace(["n/a", "N/A", "n/A", "N/a"], np.nan, regex=True, inplace=True)
+    except TypeError:  # I could not get this to trigger without mocking replace
+        print("Error comparing types.")
+
+    return df
+
+
+def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
+    """Takes in a dataframe and renames columns according to the mapping provided
+
+    Args:
+        df (pd.DataFrame): DataFrame with columns to be renamed
+        column_map (dict): Dictionary mapping original column names to new columns
+
+    Returns:
+        pd.DataFrame: DataFrame with new column names
+    """
+    try:
+        df.rename(columns=column_map, inplace=True)
+    except TypeError:
+        print("Column mapping must be a dictionary")
+        return df
+
+    return df
+
+
+def nest_fields(
+    df: pd.DataFrame, grouping: str, new_column: str, drop_columns: list = []
+) -> pd.DataFrame:
+    """Collapses the provided DataFrame into 2 columns:
+    1. The grouping column
+    2. A column containing a nested dictionary with the data from the rest of the DataFrame
+
+    Args:
+        df (pd.DataFrame): DataFrame to be collapsed
+        grouping (str): The column that you want to group by
+        new_column (str): the new column created to contain the nested dictionaries
+        drop_columns (list, optional): List of columns to leave out of the new nested dictionary. Defaults to [].
+ + Returns: + pd.DataFrame: New 2 column DataFrame with group and nested dictionaries + """ + return ( + df.groupby(grouping) + .apply( + lambda row: row.replace({np.nan: None}) + .drop(columns=drop_columns) + .to_dict("records") + ) + .reset_index() + .rename(columns={0: new_column}) + ) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index f85f821d..22eb3cf7 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -7,7 +7,6 @@ import agoradatatools.etl.transform.apply as apply import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError -from agoradatatools.etl.transform import utils as transform_utils def process_dataset( @@ -33,12 +32,12 @@ def process_dataset( entity_name = entity["name"] df = extract.get_entity_as_df(syn_id=entity_id, source=entity_format, syn=syn) - df = transform_utils.standardize_column_names(df=df) - df = transform_utils.standardize_values(df=df) + df = utils.standardize_column_names(df=df) + df = utils.standardize_values(df=df) # the column rename gets applied to all entities in a dataset if "column_rename" in dataset_obj[dataset_name].keys(): - df = transform_utils.rename_columns( + df = utils.rename_columns( df=df, column_map=dataset_obj[dataset_name]["column_rename"] ) @@ -54,7 +53,7 @@ def process_dataset( df = entities_as_df[list(entities_as_df)[0]] if "agora_rename" in dataset_obj[dataset_name].keys(): - df = transform_utils.rename_columns( + df = utils.rename_columns( df=df, column_map=dataset_obj[dataset_name]["agora_rename"] ) From 2a8183cf201a40f676aa930821c49d65754dfc53 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 3 May 2023 12:56:03 -0600 Subject: [PATCH 20/24] moves apply_custom_transformations to process.py --- src/agoradatatools/etl/transform/apply.py | 44 --------------------- src/agoradatatools/process.py | 48 ++++++++++++++++++++++- 2 files changed, 46 insertions(+), 46 deletions(-) delete mode 100644 src/agoradatatools/etl/transform/apply.py diff --git a/src/agoradatatools/etl/transform/apply.py b/src/agoradatatools/etl/transform/apply.py deleted file mode 100644 index ea075a0a..00000000 --- a/src/agoradatatools/etl/transform/apply.py +++ /dev/null @@ -1,44 +0,0 @@ -from agoradatatools.etl.transform.custom import * - - -# TODO refactor to avoid so many if's - maybe some sort of mapping to callables -def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): - if not isinstance(datasets, dict) or not isinstance(dataset_name, str): - return None - if dataset_name == "genes_biodomains": - return transform_genes_biodomains(datasets=datasets) - if dataset_name == "overall_scores": - df = datasets["overall_scores"] - return transform_overall_scores(df=df) - if dataset_name == "distribution_data": - return transform_distribution_data( - datasets=datasets, - overall_max_score=dataset_obj["custom_transformations"][ - "overall_max_score" - ], - genetics_max_score=dataset_obj["custom_transformations"][ - "genetics_max_score" - ], - omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"], - lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], - ) - if dataset_name == "team_info": - return transform_team_info(datasets=datasets) - if dataset_name == "rnaseq_differential_expression": - return transform_rna_seq_data(datasets=datasets) - if dataset_name == "gene_info": - return transform_gene_info( - datasets=datasets, - adjusted_p_value_threshold=dataset_obj["custom_transformations"][ - 
"adjusted_p_value_threshold" - ], - protein_level_threshold=dataset_obj["custom_transformations"][ - "protein_level_threshold" - ], - ) - if dataset_name == "rna_distribution_data": - return transform_rna_distribution_data(datasets=datasets) - if dataset_name == "proteomics_distribution_data": - return create_proteomics_distribution_data(datasets=datasets) - else: - return None diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 22eb3cf7..019266f9 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -4,10 +4,54 @@ import agoradatatools.etl.extract as extract import agoradatatools.etl.load as load -import agoradatatools.etl.transform.apply as apply import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError +from agoradatatools.etl.transform.custom import * + + +# TODO refactor to avoid so many if's - maybe some sort of mapping to callables +def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): + if not isinstance(datasets, dict) or not isinstance(dataset_name, str): + return None + if dataset_name == "genes_biodomains": + return transform_genes_biodomains(datasets=datasets) + if dataset_name == "overall_scores": + df = datasets["overall_scores"] + return transform_overall_scores(df=df) + if dataset_name == "distribution_data": + return transform_distribution_data( + datasets=datasets, + overall_max_score=dataset_obj["custom_transformations"][ + "overall_max_score" + ], + genetics_max_score=dataset_obj["custom_transformations"][ + "genetics_max_score" + ], + omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"], + lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], + ) + if dataset_name == "team_info": + return transform_team_info(datasets=datasets) + if dataset_name == "rnaseq_differential_expression": + return transform_rna_seq_data(datasets=datasets) + if dataset_name == "gene_info": + return transform_gene_info( + datasets=datasets, + adjusted_p_value_threshold=dataset_obj["custom_transformations"][ + "adjusted_p_value_threshold" + ], + protein_level_threshold=dataset_obj["custom_transformations"][ + "protein_level_threshold" + ], + ) + if dataset_name == "rna_distribution_data": + return transform_rna_distribution_data(datasets=datasets) + if dataset_name == "proteomics_distribution_data": + return create_proteomics_distribution_data(datasets=datasets) + else: + return None + def process_dataset( dataset_obj: dict, staging_path: str, syn: synapseclient.Synapse @@ -44,7 +88,7 @@ def process_dataset( entities_as_df[entity_name] = df if "custom_transformations" in dataset_obj[dataset_name].keys(): - df = apply.apply_custom_transformations( + df = apply_custom_transformations( datasets=entities_as_df, dataset_name=dataset_name, dataset_obj=dataset_obj[dataset_name], From 88c1f8850b1be381236b2f9cd3e275f7b3bc42ac Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 3 May 2023 13:01:31 -0600 Subject: [PATCH 21/24] move `custom` submodule up to `transform` --- src/agoradatatools/etl/transform/__init__.py | 31 +++++++++++++++++++ .../etl/transform/custom/__init__.py | 31 ------------------- .../{custom => }/distribution_data.py | 0 .../etl/transform/{custom => }/gene_info.py | 0 .../{custom => }/genes_biodomains.py | 0 .../transform/{custom => }/overall_scores.py | 0 .../{custom => }/proteomics_distribution.py | 0 .../{custom => }/rna_distribution.py | 0 .../etl/transform/{custom => }/team_info.py | 0 
src/agoradatatools/process.py | 2 +- 10 files changed, 32 insertions(+), 32 deletions(-) delete mode 100644 src/agoradatatools/etl/transform/custom/__init__.py rename src/agoradatatools/etl/transform/{custom => }/distribution_data.py (100%) rename src/agoradatatools/etl/transform/{custom => }/gene_info.py (100%) rename src/agoradatatools/etl/transform/{custom => }/genes_biodomains.py (100%) rename src/agoradatatools/etl/transform/{custom => }/overall_scores.py (100%) rename src/agoradatatools/etl/transform/{custom => }/proteomics_distribution.py (100%) rename src/agoradatatools/etl/transform/{custom => }/rna_distribution.py (100%) rename src/agoradatatools/etl/transform/{custom => }/team_info.py (100%) diff --git a/src/agoradatatools/etl/transform/__init__.py b/src/agoradatatools/etl/transform/__init__.py index e69de29b..7d687806 100644 --- a/src/agoradatatools/etl/transform/__init__.py +++ b/src/agoradatatools/etl/transform/__init__.py @@ -0,0 +1,31 @@ +"""Submodule for Agora Data Tools Transformations""" + +from agoradatatools.etl.transform.distribution_data import ( + transform_distribution_data, +) +from agoradatatools.etl.transform.gene_info import transform_gene_info +from agoradatatools.etl.transform.genes_biodomains import ( + transform_genes_biodomains, +) +from agoradatatools.etl.transform.overall_scores import ( + transform_overall_scores, +) +from agoradatatools.etl.transform.proteomics_distribution import ( + create_proteomics_distribution_data, +) +from agoradatatools.etl.transform.rna_distribution import ( + transform_rna_distribution_data, + transform_rna_seq_data, +) +from agoradatatools.etl.transform.team_info import transform_team_info + +__all__ = [ + "transform_distribution_data", + "transform_gene_info", + "transform_genes_biodomains", + "transform_overall_scores", + "create_proteomics_distribution_data", + "transform_rna_distribution_data", + "transform_rna_seq_data", + "transform_team_info", +] diff --git a/src/agoradatatools/etl/transform/custom/__init__.py b/src/agoradatatools/etl/transform/custom/__init__.py deleted file mode 100644 index fed2e9b7..00000000 --- a/src/agoradatatools/etl/transform/custom/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Submodule for Agora Data Tools Custom Transformations""" - -from agoradatatools.etl.transform.custom.distribution_data import ( - transform_distribution_data, -) -from agoradatatools.etl.transform.custom.gene_info import transform_gene_info -from agoradatatools.etl.transform.custom.genes_biodomains import ( - transform_genes_biodomains, -) -from agoradatatools.etl.transform.custom.overall_scores import ( - transform_overall_scores, -) -from agoradatatools.etl.transform.custom.proteomics_distribution import ( - create_proteomics_distribution_data, -) -from agoradatatools.etl.transform.custom.rna_distribution import ( - transform_rna_distribution_data, - transform_rna_seq_data, -) -from agoradatatools.etl.transform.custom.team_info import transform_team_info - -__all__ = [ - "transform_distribution_data", - "transform_gene_info", - "transform_genes_biodomains", - "transform_overall_scores", - "create_proteomics_distribution_data", - "transform_rna_distribution_data", - "transform_rna_seq_data", - "transform_team_info", -] diff --git a/src/agoradatatools/etl/transform/custom/distribution_data.py b/src/agoradatatools/etl/transform/distribution_data.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/distribution_data.py rename to src/agoradatatools/etl/transform/distribution_data.py diff --git 
a/src/agoradatatools/etl/transform/custom/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/gene_info.py rename to src/agoradatatools/etl/transform/gene_info.py diff --git a/src/agoradatatools/etl/transform/custom/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/genes_biodomains.py rename to src/agoradatatools/etl/transform/genes_biodomains.py diff --git a/src/agoradatatools/etl/transform/custom/overall_scores.py b/src/agoradatatools/etl/transform/overall_scores.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/overall_scores.py rename to src/agoradatatools/etl/transform/overall_scores.py diff --git a/src/agoradatatools/etl/transform/custom/proteomics_distribution.py b/src/agoradatatools/etl/transform/proteomics_distribution.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/proteomics_distribution.py rename to src/agoradatatools/etl/transform/proteomics_distribution.py diff --git a/src/agoradatatools/etl/transform/custom/rna_distribution.py b/src/agoradatatools/etl/transform/rna_distribution.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/rna_distribution.py rename to src/agoradatatools/etl/transform/rna_distribution.py diff --git a/src/agoradatatools/etl/transform/custom/team_info.py b/src/agoradatatools/etl/transform/team_info.py similarity index 100% rename from src/agoradatatools/etl/transform/custom/team_info.py rename to src/agoradatatools/etl/transform/team_info.py diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 019266f9..d7b970ee 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -7,7 +7,7 @@ import agoradatatools.etl.utils as utils from agoradatatools.errors import ADTDataProcessingError -from agoradatatools.etl.transform.custom import * +from agoradatatools.etl.transform import * # TODO refactor to avoid so many if's - maybe some sort of mapping to callables From 6e30f8d37295b7414d480a77c45917bd65e0f090 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 3 May 2023 13:11:11 -0600 Subject: [PATCH 22/24] updates process.py imports --- src/agoradatatools/process.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index d7b970ee..618056c8 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -2,25 +2,21 @@ from pandas import DataFrame from typer import Argument, Option, Typer -import agoradatatools.etl.extract as extract -import agoradatatools.etl.load as load -import agoradatatools.etl.utils as utils +from agoradatatools.etl import extract, load, utils, transform from agoradatatools.errors import ADTDataProcessingError -from agoradatatools.etl.transform import * - # TODO refactor to avoid so many if's - maybe some sort of mapping to callables def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): if not isinstance(datasets, dict) or not isinstance(dataset_name, str): return None if dataset_name == "genes_biodomains": - return transform_genes_biodomains(datasets=datasets) + return transform.transform_genes_biodomains(datasets=datasets) if dataset_name == "overall_scores": df = datasets["overall_scores"] - return transform_overall_scores(df=df) + return 
transform.transform_overall_scores(df=df) if dataset_name == "distribution_data": - return transform_distribution_data( + return transform.transform_distribution_data( datasets=datasets, overall_max_score=dataset_obj["custom_transformations"][ "overall_max_score" @@ -32,11 +28,11 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], ) if dataset_name == "team_info": - return transform_team_info(datasets=datasets) + return transform.transform_team_info(datasets=datasets) if dataset_name == "rnaseq_differential_expression": - return transform_rna_seq_data(datasets=datasets) + return transform.transform_rna_seq_data(datasets=datasets) if dataset_name == "gene_info": - return transform_gene_info( + return transform.transform_gene_info( datasets=datasets, adjusted_p_value_threshold=dataset_obj["custom_transformations"][ "adjusted_p_value_threshold" @@ -46,9 +42,9 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: ], ) if dataset_name == "rna_distribution_data": - return transform_rna_distribution_data(datasets=datasets) + return transform.transform_rna_distribution_data(datasets=datasets) if dataset_name == "proteomics_distribution_data": - return create_proteomics_distribution_data(datasets=datasets) + return transform.create_proteomics_distribution_data(datasets=datasets) else: return None From a35e7f61ebd91f73fb2c9c3982521fd5f39fa953 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 3 May 2023 13:17:08 -0600 Subject: [PATCH 23/24] updates tests --- tests/test_process.py | 10 +- tests/test_utils.py | 124 +++++++++++++++++ ...biodomains.py => test_genes_biodomains.py} | 2 +- tests/transform/test_transform_utils.py | 126 ------------------ 4 files changed, 129 insertions(+), 133 deletions(-) rename tests/transform/{test_transform_genes_biodomains.py => test_genes_biodomains.py} (97%) delete mode 100644 tests/transform/test_transform_utils.py diff --git a/tests/test_process.py b/tests/test_process.py index 66d47492..7d597e8c 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -8,8 +8,6 @@ from agoradatatools import process from agoradatatools.errors import ADTDataProcessingError from agoradatatools.etl import extract, load, utils -from agoradatatools.etl.transform import apply -from agoradatatools.etl.transform import utils as transform_utils class TestProcessDataset: @@ -57,20 +55,20 @@ def setup_method(self): extract, "get_entity_as_df", return_value=pd.DataFrame ).start() self.patch_standardize_column_names = patch.object( - transform_utils, "standardize_column_names", return_value=pd.DataFrame + utils, "standardize_column_names", return_value=pd.DataFrame ).start() self.patch_standardize_values = patch.object( - transform_utils, "standardize_values", return_value=pd.DataFrame + utils, "standardize_values", return_value=pd.DataFrame ).start() self.patch_rename_columns = patch.object( - transform_utils, "rename_columns", return_value=pd.DataFrame + utils, "rename_columns", return_value=pd.DataFrame ).start() self.patch_df_to_json = patch.object( load, "df_to_json", return_value="path/to/json" ).start() self.patch_load = patch.object(load, "load", return_value=None).start() self.patch_custom_transform = patch.object( - apply, "apply_custom_transformations", return_value=pd.DataFrame + process, "apply_custom_transformations", return_value=pd.DataFrame ).start() self.patch_dict_to_json = patch.object( load, "dict_to_json", 
return_value="path/to/json" diff --git a/tests/test_utils.py b/tests/test_utils.py index 7bb1c8e1..f380153d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,8 +3,14 @@ import pytest import synapseclient +import sys import yaml +import numpy as np +import pandas as pd + +from io import StringIO + from agoradatatools.etl import utils @@ -69,3 +75,121 @@ def test_find_config_by_name_where_name_not_in_config(): config = [{"a": "b"}, {"c": "d"}] returned_object = utils._find_config_by_name(config=config, name="z") assert returned_object is None + + +def test_standardize_column_names(): + df = pd.DataFrame( + { + "a#": ["test_value"], + "b@": ["test_value"], + "c&": ["test_value"], + "d*": ["test_value"], + "e^": ["test_value"], + "f?": ["test_value"], + "g(": ["test_value"], + "h)": ["test_value"], + "i%": ["test_value"], + "j$": ["test_value"], + "k#": ["test_value"], + "l!": ["test_value"], + "m/": ["test_value"], + "n ": ["test_value"], + "o-": ["test_value"], + "p.": ["test_value"], + "AAA": ["test_value"], + } + ) + standard_df = utils.standardize_column_names(df=df) + assert list(standard_df.columns) == [ + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n_", + "o_", + "p_", + "aaa", + ] + + +class TestStandardizeValues: + df = pd.DataFrame( + { + "a": ["n/a"], + "b": ["N/A"], + "c": ["n/A"], + "d": ["N/a"], + } + ) + + def test_standardize_values_success(self): + standard_df = utils.standardize_values(df=self.df.copy()) + for value in standard_df.iloc[0].tolist(): + assert np.isnan(value) + + def test_standardize_values_TypeError(self): + with patch.object(pd.DataFrame, "replace") as patch_replace: + patch_replace.side_effect = TypeError + captured_output = StringIO() + sys.stdout = captured_output + standard_df = utils.standardize_values(df=self.df.copy()) + assert "Error comparing types." 
in captured_output.getvalue() + assert standard_df.equals(self.df) + + +class TestRenameColumns: + df = pd.DataFrame( + { + "a": ["test_value"], + "b": ["test_value"], + "c": ["test_value"], + "d": ["test_value"], + } + ) + good_column_map = {"a": "e", "b": "f", "c": "g", "d": "h"} + bad_column_map = [] + + def test_rename_columns_success(self): + renamed_df = utils.rename_columns( + df=self.df.copy(), column_map=self.good_column_map + ) + assert list(renamed_df.columns) == list(self.good_column_map.values()) + + def test_rename_columns_TypeError(self): + captured_output = StringIO() + sys.stdout = captured_output + bad_renamed_df = utils.rename_columns( + df=self.df.copy(), column_map=self.bad_column_map + ) + assert "Column mapping must be a dictionary" in captured_output.getvalue() + assert list(bad_renamed_df.columns) == list(self.good_column_map.keys()) + + +def test_nest_fields(): + df = pd.DataFrame( + { + "a": ["group_1", "group_1", "group_2", "group_2", "group_3", "group_3"], + "b": ["1", "1", "1", "1", "1", "1"], + "c": ["1", "1", "1", "1", "1", "1"], + "d": ["1", "1", "1", "1", "1", "1"], + } + ) + expected_column_e = [ + [{"a": "group_1", "b": "1", "c": "1"}, {"a": "group_1", "b": "1", "c": "1"}], + [{"a": "group_2", "b": "1", "c": "1"}, {"a": "group_2", "b": "1", "c": "1"}], + [{"a": "group_3", "b": "1", "c": "1"}, {"a": "group_3", "b": "1", "c": "1"}], + ] + + nested_df = utils.nest_fields( + df=df, grouping="a", new_column="e", drop_columns=["d"] + ) + assert list(nested_df["e"]) == expected_column_e diff --git a/tests/transform/test_transform_genes_biodomains.py b/tests/transform/test_genes_biodomains.py similarity index 97% rename from tests/transform/test_transform_genes_biodomains.py rename to tests/transform/test_genes_biodomains.py index 6cd59ef3..feb80811 100644 --- a/tests/transform/test_transform_genes_biodomains.py +++ b/tests/transform/test_genes_biodomains.py @@ -1,6 +1,6 @@ import pandas as pd -from agoradatatools.etl.transform.custom.genes_biodomains import count_grouped_total +from agoradatatools.etl.transform.genes_biodomains import count_grouped_total class TestCountGroupedTotal: diff --git a/tests/transform/test_transform_utils.py b/tests/transform/test_transform_utils.py deleted file mode 100644 index 5ab9c396..00000000 --- a/tests/transform/test_transform_utils.py +++ /dev/null @@ -1,126 +0,0 @@ -import sys -from io import StringIO -from unittest.mock import patch - -import numpy as np -import pandas as pd - -from agoradatatools.etl.transform import utils - - -def test_standardize_column_names(): - df = pd.DataFrame( - { - "a#": ["test_value"], - "b@": ["test_value"], - "c&": ["test_value"], - "d*": ["test_value"], - "e^": ["test_value"], - "f?": ["test_value"], - "g(": ["test_value"], - "h)": ["test_value"], - "i%": ["test_value"], - "j$": ["test_value"], - "k#": ["test_value"], - "l!": ["test_value"], - "m/": ["test_value"], - "n ": ["test_value"], - "o-": ["test_value"], - "p.": ["test_value"], - "AAA": ["test_value"], - } - ) - standard_df = utils.standardize_column_names(df=df) - assert list(standard_df.columns) == [ - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n_", - "o_", - "p_", - "aaa", - ] - - -class TestStandardizeValues: - df = pd.DataFrame( - { - "a": ["n/a"], - "b": ["N/A"], - "c": ["n/A"], - "d": ["N/a"], - } - ) - - def test_standardize_values_success(self): - standard_df = utils.standardize_values(df=self.df.copy()) - for value in standard_df.iloc[0].tolist(): - assert np.isnan(value) - - 
def test_standardize_values_TypeError(self): - with patch.object(pd.DataFrame, "replace") as patch_replace: - patch_replace.side_effect = TypeError - captured_output = StringIO() - sys.stdout = captured_output - standard_df = utils.standardize_values(df=self.df.copy()) - assert "Error comparing types." in captured_output.getvalue() - assert standard_df.equals(self.df) - - -class TestRenameColumns: - df = pd.DataFrame( - { - "a": ["test_value"], - "b": ["test_value"], - "c": ["test_value"], - "d": ["test_value"], - } - ) - good_column_map = {"a": "e", "b": "f", "c": "g", "d": "h"} - bad_column_map = [] - - def test_rename_columns_success(self): - renamed_df = utils.rename_columns( - df=self.df.copy(), column_map=self.good_column_map - ) - assert list(renamed_df.columns) == list(self.good_column_map.values()) - - def test_rename_columns_TypeError(self): - captured_output = StringIO() - sys.stdout = captured_output - bad_renamed_df = utils.rename_columns( - df=self.df.copy(), column_map=self.bad_column_map - ) - assert "Column mapping must be a dictionary" in captured_output.getvalue() - assert list(bad_renamed_df.columns) == list(self.good_column_map.keys()) - - -def test_nest_fields(): - df = pd.DataFrame( - { - "a": ["group_1", "group_1", "group_2", "group_2", "group_3", "group_3"], - "b": ["1", "1", "1", "1", "1", "1"], - "c": ["1", "1", "1", "1", "1", "1"], - "d": ["1", "1", "1", "1", "1", "1"], - } - ) - expected_column_e = [ - [{"a": "group_1", "b": "1", "c": "1"}, {"a": "group_1", "b": "1", "c": "1"}], - [{"a": "group_2", "b": "1", "c": "1"}, {"a": "group_2", "b": "1", "c": "1"}], - [{"a": "group_3", "b": "1", "c": "1"}, {"a": "group_3", "b": "1", "c": "1"}], - ] - - nested_df = utils.nest_fields( - df=df, grouping="a", new_column="e", drop_columns=["d"] - ) - assert list(nested_df["e"]) == expected_column_e From 5e54c1ada3c8f0b21c0ccf3a5f0b661fa1e092bc Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 3 May 2023 17:42:04 -0600 Subject: [PATCH 24/24] edits Utils comment --- src/agoradatatools/etl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agoradatatools/etl/utils.py b/src/agoradatatools/etl/utils.py index e2ef4661..b69220fa 100644 --- a/src/agoradatatools/etl/utils.py +++ b/src/agoradatatools/etl/utils.py @@ -4,7 +4,7 @@ import numpy as np -# TODO these utils functions are not protected... refactor removing "_" +# TODO remove "_" - these utils functions are not only used internally def _login_to_synapse(token: str = None) -> synapseclient.Synapse: """Logs into Synapse python client, returns authenticated Synapse session.
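
The TODO carried through these patches ("refactor to avoid so many if's - maybe some sort of mapping to callables") points at a dispatch-table design for apply_custom_transformations. The sketch below is illustrative only and is not part of this patch series: the CUSTOM_TRANSFORMATIONS name and the lambda wrappers are assumptions, while the transform functions and the dataset_obj["custom_transformations"] keys are exactly those already used in process.py.

# Hypothetical sketch (not part of these patches): dataset names mapped to
# callables, replacing the if-chain in apply_custom_transformations.
from agoradatatools.etl import transform

# Each value takes (datasets, dataset_obj) and returns the transformed frame;
# dataset_obj here is the per-dataset config block, as in process_dataset.
CUSTOM_TRANSFORMATIONS = {
    "genes_biodomains": lambda datasets, obj: transform.transform_genes_biodomains(
        datasets=datasets
    ),
    "overall_scores": lambda datasets, obj: transform.transform_overall_scores(
        df=datasets["overall_scores"]
    ),
    "distribution_data": lambda datasets, obj: transform.transform_distribution_data(
        datasets=datasets,
        overall_max_score=obj["custom_transformations"]["overall_max_score"],
        genetics_max_score=obj["custom_transformations"]["genetics_max_score"],
        omics_max_score=obj["custom_transformations"]["omics_max_score"],
        lit_max_score=obj["custom_transformations"]["lit_max_score"],
    ),
    "team_info": lambda datasets, obj: transform.transform_team_info(datasets=datasets),
    "rnaseq_differential_expression": lambda datasets, obj: transform.transform_rna_seq_data(
        datasets=datasets
    ),
    "gene_info": lambda datasets, obj: transform.transform_gene_info(
        datasets=datasets,
        adjusted_p_value_threshold=obj["custom_transformations"]["adjusted_p_value_threshold"],
        protein_level_threshold=obj["custom_transformations"]["protein_level_threshold"],
    ),
    "rna_distribution_data": lambda datasets, obj: transform.transform_rna_distribution_data(
        datasets=datasets
    ),
    "proteomics_distribution_data": lambda datasets, obj: transform.create_proteomics_distribution_data(
        datasets=datasets
    ),
}


def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):
    if not isinstance(datasets, dict) or not isinstance(dataset_name, str):
        return None
    handler = CUSTOM_TRANSFORMATIONS.get(dataset_name)
    # Unknown dataset names fall through to None, matching the current behavior.
    return handler(datasets, dataset_obj) if handler else None

With this shape, supporting a new dataset is one dictionary entry instead of another if branch, and the full set of recognized dataset names is discoverable in one place.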
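
For readers following the consolidation of standardize_column_names, standardize_values, and rename_columns into agoradatatools.etl.utils, a short usage sketch follows. The sample DataFrame and column map are invented for illustration; the behavior shown matches the docstrings and the tests added in tests/test_utils.py.

# Hypothetical usage of the consolidated helpers after this series is applied;
# the DataFrame contents and column map below are made up for illustration.
import pandas as pd

from agoradatatools.etl import utils

df = pd.DataFrame({"Gene Name#": ["APOE"], "Score%": ["n/a"]})

df = utils.standardize_column_names(df=df)  # columns become: gene_name, score
df = utils.standardize_values(df=df)        # "n/a" is replaced with np.nan
df = utils.rename_columns(df=df, column_map={"gene_name": "hgnc_symbol"})

This mirrors the sequence process_dataset applies to every entity: column-name cleanup, value standardization, then the optional column_rename mapping from the dataset config.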