[AG-838] Support independent transforms #70

Merged
merged 25 commits into from May 5, 2023
Changes from 2 commits
@@ -1,280 +1,10 @@
from typing import Optional, Union

import numpy as np
import pandas as pd


def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Takes in a dataframe, replaces problematic characters in column names,
    and makes all column names lowercase

    Args:
        df (pd.DataFrame): DataFrame with columns to be standardized

    Returns:
        pd.DataFrame: New dataframe with cleaned column names
    """

    df.columns = df.columns.str.replace(
        "[#@&*^?()%$#!/]", "", regex=True
    )  # the commas were unnecessary and were breaking the replacement of '-' characters
    # escape the '-' so it is a literal character rather than a character range
    df.columns = df.columns.str.replace(r"[ \-.]", "_", regex=True)
    df.columns = df.columns.str.lower()

    return df
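
# Example usage (editor's sketch, hypothetical column names):
# >>> df = pd.DataFrame(columns=["Ensembl Gene ID", "Adj.P-Val", "GO#Terms"])
# >>> standardize_column_names(df).columns.tolist()
# ['ensembl_gene_id', 'adj_p_val', 'goterms']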


def standardize_values(df: pd.DataFrame) -> pd.DataFrame:
    """Finds non-compliant values and corrects them.
    Note: if more data cleaning options need to be added,
    this should be refactored into separate functions.

    Args:
        df (pd.DataFrame): DataFrame with values to be standardized

    Returns:
        pd.DataFrame: Resulting DataFrame with standardized values
    """
    try:
        df.replace(["n/a", "N/A", "n/A", "N/a"], np.nan, regex=True, inplace=True)
    except TypeError:  # this could not be triggered without mocking replace
        print("Error comparing types.")

    return df
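
# Example (editor's sketch): any casing of "n/a" collapses to NaN, other values pass through.
# >>> df = pd.DataFrame({"score": ["n/a", "N/A", "1.2"]})
# >>> standardize_values(df)["score"].tolist()
# [nan, nan, '1.2']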


def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    """Takes in a dataframe and renames columns according to the mapping provided

    Args:
        df (pd.DataFrame): DataFrame with columns to be renamed
        column_map (dict): Dictionary mapping original column names to new names

    Returns:
        pd.DataFrame: DataFrame with renamed columns
    """
    try:
        df.rename(columns=column_map, inplace=True)
    except TypeError:
        print("Column mapping must be a dictionary")

    return df

"""
This will create a dictionary object with the result of the grouping provided
:param df: a dataframe
:param grouping: a string containing the column to group by
:param new_column: a string with the name of the new column that will contain
the nested field
:param drop_columns: a list of column names to drop (remove) from the
nested dictionary. Optional argument, defaults to empty list.
:return: a dataframe
"""


def nest_fields(
    df: pd.DataFrame, grouping: str, new_column: str, drop_columns: Optional[list] = None
) -> pd.DataFrame:
    """Collapses the provided DataFrame into 2 columns:
    1. The grouping column
    2. A column containing a nested dictionary with the data from the rest of the DataFrame

    Args:
        df (pd.DataFrame): DataFrame to be collapsed
        grouping (str): The column that you want to group by
        new_column (str): the new column created to contain the nested dictionaries created
        drop_columns (list, optional): List of columns to leave out of the new nested
            dictionary. Defaults to None, i.e. no columns are dropped; None is used
            instead of a mutable default argument.

    Returns:
        pd.DataFrame: New 2 column DataFrame with group and nested dictionaries
    """
    return (
        df.groupby(grouping)
        .apply(
            lambda row: row.replace({np.nan: None})
            .drop(columns=drop_columns or [])
            .to_dict("records")
        )
        .reset_index()
        .rename(columns={0: new_column})
    )
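
# Example usage (editor's sketch with hypothetical rows): each group collapses to a
# list of record dicts, with NaN converted to None so the output is JSON-friendly.
# >>> df = pd.DataFrame({
# ...     "ensembl_gene_id": ["ENSG1", "ENSG1", "ENSG2"],
# ...     "biodomain": ["Synapse", "Immune Response", "Synapse"],
# ...     "score": [0.5, np.nan, 0.7],
# ... })
# >>> nest_fields(df, grouping="ensembl_gene_id", new_column="nested").loc[1, "nested"]
# [{'ensembl_gene_id': 'ENSG2', 'biodomain': 'Synapse', 'score': 0.7}]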


def calculate_distribution(
    df: pd.DataFrame, col: str, is_scored: str, upper_bound: float
) -> dict:
    """Calculates a binned distribution and summary statistics for a score column.

    Args:
        df (pd.DataFrame): DataFrame containing the score column
        col (str): name of the score column to summarize
        is_scored (str): name of the column flagging scored rows with "Y";
            falsy if the DataFrame has no such column
        upper_bound (float): theoretical maximum value of the score

    Returns:
        dict: distribution counts, bins, min, max, mean, and quartiles
    """
    if is_scored:
        df = df[df[is_scored] == "Y"]
    else:  # if the df does not have an is_scored column, fall back to any "Y" value
        df = df[df.isin(["Y"]).any(axis=1)]

    if df[col].dtype == object:
        df = df.copy()  # Necessary to prevent SettingWithCopy warning
        df[col] = df[col].astype(float)

    obj = {}

    # In order to smooth out the bins and make sure the entire range from 0
    # to the theoretical maximum value has been found, we create a copy of the
    # column with both 0 and that maximum value added to it. We use the copy to
    # calculate distributions and bins, and subtract the values at the end.
    distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True)

    obj["distribution"] = list(
        pd.cut(
            distribution, bins=10, precision=3, include_lowest=True, right=True
        ).value_counts(sort=False)
    )
    obj["distribution"][0] -= 1  # subtract the artificial 0 value from the first bin
    obj["distribution"][-1] -= 1  # subtract the artificial upper_bound from the last bin

    _, obj["bins"] = pd.cut(distribution, bins=10, precision=3, retbins=True)
    obj["bins"] = np.around(obj["bins"].tolist()[1:], 2)
    base = [0, *obj["bins"][:-1]]
    obj["bins"] = list(zip(base, obj["bins"]))

    obj["min"] = np.around(df[col].min(), 4)
    obj["max"] = np.around(df[col].max(), 4)
    obj["mean"] = np.around(df[col].mean(), 4)
    # round the quartiles to 4 decimals to match min/max/mean
    obj["first_quartile"] = np.around(
        df[col].quantile(q=0.25, interpolation="midpoint"), 4
    )
    obj["third_quartile"] = np.around(
        df[col].quantile(q=0.75, interpolation="midpoint"), 4
    )

    return obj
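
# Worked example (editor's sketch, hypothetical scores with a theoretical maximum
# of 2): the padded 0 and upper_bound force the 10 bins to span the full [0, 2]
# range before being subtracted back out of the first and last counts.
# >>> scores = pd.DataFrame({"overall": [0.2, 0.5, 0.9], "isscored": ["Y", "Y", "Y"]})
# >>> dist = calculate_distribution(scores, "overall", is_scored="isscored", upper_bound=2)
# >>> sum(dist["distribution"])
# 3
# dist["bins"] then holds the bin edges as pairs: (0, 0.2), (0.2, 0.4), ..., (1.8, 2.0).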


def count_grouped_total(
    df: pd.DataFrame,
    grouping: Union[str, list],
    input_colname: str,
    output_colname: str,
) -> pd.DataFrame:
    """For each unique item/combination in the column(s) specified by grouping,
    counts the number of unique items in the column [input_colname] that
    correspond to that grouping. The calculated counts are put in a new
    column named [output_colname].

    Args:
        df (pd.DataFrame): contains columns listed in grouping and
            input_colname. May contain other columns as well, but these
            will be dropped from the returned data frame.
        grouping (str or list): a string with a single column name, or a list of
            strings for multiple column names
        input_colname (str): the name of the column to count
        output_colname (str): the name of the new column with calculated counts

    Returns:
        pd.DataFrame: a data frame containing the grouping column(s) and a
            new column for output_colname, which contains the count of
            unique items in input_colname for each grouping item.
    """
    df = (
        df.groupby(grouping)[input_colname]
        .nunique()
        .reset_index()
        .rename(columns={input_colname: output_colname})
    )
    return df
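
# Example usage (editor's sketch with hypothetical rows): ENSG1 maps to two distinct
# GO terms and ENSG2 maps to one.
# >>> df = pd.DataFrame({
# ...     "ensembl_gene_id": ["ENSG1", "ENSG1", "ENSG2"],
# ...     "go_terms": ["GO:1", "GO:2", "GO:1"],
# ... })
# >>> count_grouped_total(df, "ensembl_gene_id", "go_terms", "n_gene_total_terms")
#   ensembl_gene_id  n_gene_total_terms
# 0           ENSG1                   2
# 1           ENSG2                   1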


def transform_genes_biodomains(datasets: dict) -> pd.DataFrame:
    """Takes dictionary of dataset DataFrames, extracts the genes_biodomains
    DataFrame, calculates some metrics on GO terms per gene / biodomain, and
    performs nest_fields on the final DataFrame. This results in a 2 column
    DataFrame grouped by "ensembl_gene_id" that includes a collapsed nested
    dictionary field "gene_biodomains".

    Args:
        datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame

    Returns:
        pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including
            a collapsed nested dictionary field "gene_biodomains"
    """
    genes_biodomains = datasets["genes_biodomains"]
    interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"]
    genes_biodomains = genes_biodomains[interesting_columns].dropna()

    # Count the number of go_terms associated with each biodomain
    n_biodomain_terms = count_grouped_total(
        genes_biodomains, "biodomain", "go_terms", "n_biodomain_terms"
    )

    # Count the number of go_terms associated with each gene, ignoring biodomain
    n_gene_total_terms = count_grouped_total(
        genes_biodomains, "ensembl_gene_id", "go_terms", "n_gene_total_terms"
    )

    # Count the number of go_terms associated with each gene / biodomain combo
    n_gene_biodomain_terms = count_grouped_total(
        genes_biodomains,
        ["ensembl_gene_id", "biodomain"],
        "go_terms",
        "n_gene_biodomain_terms",
    )

    # Group rows by ensembl_gene_id and biodomain to produce nested lists of
    # go_terms per gene/biodomain
    genes_biodomains = (
        genes_biodomains.groupby(["ensembl_gene_id", "biodomain"])["go_terms"]
        .apply(list)
        .reset_index()
    )

    # Merge all the different count metrics into the main data frame so each
    # ensembl_gene_id / biodomain combo has an entry for each count
    genes_biodomains = (
        genes_biodomains.merge(n_gene_total_terms, on="ensembl_gene_id", how="left")
        .merge(n_biodomain_terms, on="biodomain", how="left")
        .merge(n_gene_biodomain_terms, on=["ensembl_gene_id", "biodomain"], how="left")
    )

    # Calculate percent linking terms:
    # n_gene_biodomain_terms / n_gene_total_terms * 100
    genes_biodomains["pct_linking_terms"] = (
        genes_biodomains["n_gene_biodomain_terms"]
        / genes_biodomains["n_gene_total_terms"]
        * 100
    ).round(decimals=2)

    # Remove n_gene_total_terms column
    genes_biodomains = genes_biodomains.drop(columns="n_gene_total_terms")

    genes_biodomains = nest_fields(
        df=genes_biodomains,
        grouping="ensembl_gene_id",
        new_column="gene_biodomains",
        drop_columns=["ensembl_gene_id"],
    )

    return genes_biodomains
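
# Worked example of pct_linking_terms (editor's sketch, hypothetical gene): if ENSG1
# has 4 unique GO terms overall and 2 of them fall under the "Synapse" biodomain, the
# nested record for that gene/biodomain pair gets pct_linking_terms = 2 / 4 * 100 = 50.0.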


def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Masks score fields that were not scored and subsets the overall scores
    DataFrame to the columns of interest.

    Args:
        df (pd.DataFrame): DataFrame of overall scores

    Returns:
        pd.DataFrame: de-duplicated DataFrame with masked scores
    """
    interesting_columns = [
        "ensg",
        "hgnc_gene_id",
        "overall",
        "geneticsscore",
        "omicsscore",
        "literaturescore",
    ]

    # create a mapping to deal with missing values, as they take different shapes across the fields
    scored = ["isscored_genetics", "isscored_omics", "isscored_lit"]
    mapping = dict(zip(interesting_columns[3:], scored))

    for field, is_scored in mapping.items():
        df.loc[lambda row: row[is_scored] == "N", field] = np.nan

    # LiteratureScore is a string in the source file, so convert to numeric
    df["literaturescore"] = pd.to_numeric(df["literaturescore"])

    # Remove identical rows (see AG-826)
    return df[interesting_columns].drop_duplicates()
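
# Example usage (editor's sketch with a hypothetical single-row frame): omicsscore
# is masked to NaN because isscored_omics == "N", and literaturescore is converted
# from the string "0.7" to the float 0.7.
# >>> scores = pd.DataFrame({
# ...     "ensg": ["ENSG1"], "hgnc_gene_id": ["GENE1"], "overall": [0.8],
# ...     "geneticsscore": [0.5], "omicsscore": [0.3], "literaturescore": ["0.7"],
# ...     "isscored_genetics": ["Y"], "isscored_omics": ["N"], "isscored_lit": ["Y"],
# ... })
# >>> transform_overall_scores(scores)[["omicsscore", "literaturescore"]].iloc[0].tolist()
# [nan, 0.7]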


def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str) -> pd.DataFrame:
    return pd.merge(left=left, right=right, how=how, on=on)
from agoradatatools.etl.transform.utils import *
from agoradatatools.etl.transform.transform_genes_biodomains import (
    transform_genes_biodomains,
)


def transform_team_info(datasets: dict):
@@ -440,15 +170,13 @@ def transform_gene_info(

gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1
gene_info["rna_in_ad_brain_change"] = (
(gene_info["adj_p_val"] <= adjusted_p_value_threshold) &
gene_info["rna_brain_change_studied"]
)
gene_info["adj_p_val"] <= adjusted_p_value_threshold
) & gene_info["rna_brain_change_studied"]

gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1
gene_info["protein_in_ad_brain_change"] = (
(gene_info["cor_pval"] <= protein_level_threshold) &
gene_info["protein_brain_change_studied"]
)
gene_info["cor_pval"] <= protein_level_threshold
) & gene_info["protein_brain_change_studied"]

# create 'nominations' field
gene_info["nominations"] = gene_info.apply(
@@ -492,7 +220,6 @@ def transform_distribution_data(
    omics_max_score,
    lit_max_score,
):

    overall_scores = datasets["overall_scores"]
    interesting_columns = [
        "ensg",
@@ -544,7 +271,7 @@ def transform_distribution_data(

def transform_rna_distribution_data(datasets: dict):
    # "datasets" contains the unprocessed RNA-seq data, which needs to go
    # through the same processing as before in order to use it here.
    rna_df = transform_rna_seq_data(datasets)
    rna_df = rna_df[["tissue", "model", "logfc"]]

@@ -611,7 +338,6 @@ def transform_proteomics_distribution_data(


def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame:

    transformed = []
    for name, dataset in datasets.items():
        if name == "proteomics":
@@ -631,7 +357,6 @@ def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame:


def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):

    if type(datasets) is not dict or type(dataset_name) is not str:
        return None
