[AG-838] Support independent transforms #70

Merged
merged 25 commits into from May 5, 2023
Changes from 2 commits
@@ -1,280 +1,10 @@
from typing import Optional, Union

import numpy as np
import pandas as pd


def standardize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Takes in a dataframe, replaces problematic characters in column names,
    and makes all column names lowercase

    Args:
        df (pd.DataFrame): DataFrame with columns to be standardized

    Returns:
        pd.DataFrame: New dataframe with cleaned column names
    """

    df.columns = df.columns.str.replace(
        "[#@&*^?()%$#!/]", "", regex=True
    )  # the commas were unnecessary and were breaking the replacement of '-' characters
    # escape the '-' so it is a literal character rather than a character range
    df.columns = df.columns.str.replace(r"[ \-.]", "_", regex=True)
    df.columns = df.columns.str.lower()

    return df
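
# Example usage (editor's sketch, hypothetical column names):
# >>> df = pd.DataFrame(columns=["Ensembl Gene ID", "Adj.P-Val", "GO#Terms"])
# >>> standardize_column_names(df).columns.tolist()
# ['ensembl_gene_id', 'adj_p_val', 'goterms']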


def standardize_values(df: pd.DataFrame) -> pd.DataFrame:
    """Finds non-compliant values and corrects them.
    Note: if more data cleaning options need to be added,
    this should be refactored into separate functions.

    Args:
        df (pd.DataFrame): DataFrame with values to be standardized

    Returns:
        pd.DataFrame: Resulting DataFrame with standardized values
    """
    try:
        df.replace(["n/a", "N/A", "n/A", "N/a"], np.nan, regex=True, inplace=True)
    except TypeError:  # this could not be triggered without mocking replace
        print("Error comparing types.")

    return df
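
# Example (editor's sketch): any casing of "n/a" collapses to NaN, other values pass through.
# >>> df = pd.DataFrame({"score": ["n/a", "N/A", "1.2"]})
# >>> standardize_values(df)["score"].tolist()
# [nan, nan, '1.2']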


def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    """Takes in a dataframe and renames columns according to the mapping provided

    Args:
        df (pd.DataFrame): DataFrame with columns to be renamed
        column_map (dict): Dictionary mapping original column names to new names

    Returns:
        pd.DataFrame: DataFrame with renamed columns
    """
    try:
        df.rename(columns=column_map, inplace=True)
    except TypeError:
        print("Column mapping must be a dictionary")

    return df

"""
This will create a dictionary object with the result of the grouping provided
:param df: a dataframe
:param grouping: a string containing the column to group by
:param new_column: a string with the name of the new column that will contain
the nested field
:param drop_columns: a list of column names to drop (remove) from the
nested dictionary. Optional argument, defaults to empty list.
:return: a dataframe
"""


def nest_fields(
    df: pd.DataFrame, grouping: str, new_column: str, drop_columns: Optional[list] = None
) -> pd.DataFrame:
    """Collapses the provided DataFrame into 2 columns:
    1. The grouping column
    2. A column containing a nested dictionary with the data from the rest of the DataFrame

    Args:
        df (pd.DataFrame): DataFrame to be collapsed
        grouping (str): The column that you want to group by
        new_column (str): the new column created to contain the nested dictionaries created
        drop_columns (list, optional): List of columns to leave out of the new nested
            dictionary. Defaults to None, i.e. no columns are dropped; None is used
            instead of a mutable default argument.

    Returns:
        pd.DataFrame: New 2 column DataFrame with group and nested dictionaries
    """
    return (
        df.groupby(grouping)
        .apply(
            lambda row: row.replace({np.nan: None})
            .drop(columns=drop_columns or [])
            .to_dict("records")
        )
        .reset_index()
        .rename(columns={0: new_column})
    )
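
# Example usage (editor's sketch with hypothetical rows): each group collapses to a
# list of record dicts, with NaN converted to None so the output is JSON-friendly.
# >>> df = pd.DataFrame({
# ...     "ensembl_gene_id": ["ENSG1", "ENSG1", "ENSG2"],
# ...     "biodomain": ["Synapse", "Immune Response", "Synapse"],
# ...     "score": [0.5, np.nan, 0.7],
# ... })
# >>> nest_fields(df, grouping="ensembl_gene_id", new_column="nested").loc[1, "nested"]
# [{'ensembl_gene_id': 'ENSG2', 'biodomain': 'Synapse', 'score': 0.7}]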


def calculate_distribution(
    df: pd.DataFrame, col: str, is_scored: str, upper_bound: float
) -> dict:
    """Calculates a binned distribution and summary statistics for a score column.

    Args:
        df (pd.DataFrame): DataFrame containing the score column
        col (str): name of the score column to summarize
        is_scored (str): name of the column flagging scored rows with "Y";
            falsy if the DataFrame has no such column
        upper_bound (float): theoretical maximum value of the score

    Returns:
        dict: distribution counts, bins, min, max, mean, and quartiles
    """
    if is_scored:
        df = df[df[is_scored] == "Y"]
    else:  # if the df does not have an is_scored column, fall back to any "Y" value
        df = df[df.isin(["Y"]).any(axis=1)]

    if df[col].dtype == object:
        df = df.copy()  # Necessary to prevent SettingWithCopy warning
        df[col] = df[col].astype(float)

    obj = {}

    # In order to smooth out the bins and make sure the entire range from 0
    # to the theoretical maximum value has been found, we create a copy of the
    # column with both 0 and that maximum value added to it. We use the copy to
    # calculate distributions and bins, and subtract the values at the end.
    distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True)

    obj["distribution"] = list(
        pd.cut(
            distribution, bins=10, precision=3, include_lowest=True, right=True
        ).value_counts(sort=False)
    )
    obj["distribution"][0] -= 1  # subtract the artificial 0 value from the first bin
    obj["distribution"][-1] -= 1  # subtract the artificial upper_bound from the last bin

    _, obj["bins"] = pd.cut(distribution, bins=10, precision=3, retbins=True)
    obj["bins"] = np.around(obj["bins"].tolist()[1:], 2)
    base = [0, *obj["bins"][:-1]]
    obj["bins"] = list(zip(base, obj["bins"]))

    obj["min"] = np.around(df[col].min(), 4)
    obj["max"] = np.around(df[col].max(), 4)
    obj["mean"] = np.around(df[col].mean(), 4)
    # round the quartiles to 4 decimals to match min/max/mean
    obj["first_quartile"] = np.around(
        df[col].quantile(q=0.25, interpolation="midpoint"), 4
    )
    obj["third_quartile"] = np.around(
        df[col].quantile(q=0.75, interpolation="midpoint"), 4
    )

    return obj
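
# Worked example (editor's sketch, hypothetical scores with a theoretical maximum
# of 2): the padded 0 and upper_bound force the 10 bins to span the full [0, 2]
# range before being subtracted back out of the first and last counts.
# >>> scores = pd.DataFrame({"overall": [0.2, 0.5, 0.9], "isscored": ["Y", "Y", "Y"]})
# >>> dist = calculate_distribution(scores, "overall", is_scored="isscored", upper_bound=2)
# >>> sum(dist["distribution"])
# 3
# dist["bins"] then holds the bin edges as pairs: (0, 0.2), (0.2, 0.4), ..., (1.8, 2.0).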


def count_grouped_total(
    df: pd.DataFrame,
    grouping: Union[str, list],
    input_colname: str,
    output_colname: str,
) -> pd.DataFrame:
    """For each unique item/combination in the column(s) specified by grouping,
    counts the number of unique items in the column [input_colname] that
    correspond to that grouping. The calculated counts are put in a new
    column named [output_colname].

    Args:
        df (pd.DataFrame): contains columns listed in grouping and
            input_colname. May contain other columns as well, but these
            will be dropped from the returned data frame.
        grouping (str or list): a string with a single column name, or a list of
            strings for multiple column names
        input_colname (str): the name of the column to count
        output_colname (str): the name of the new column with calculated counts

    Returns:
        pd.DataFrame: a data frame containing the grouping column(s) and a
            new column for output_colname, which contains the count of
            unique items in input_colname for each grouping item.
    """
    df = (
        df.groupby(grouping)[input_colname]
        .nunique()
        .reset_index()
        .rename(columns={input_colname: output_colname})
    )
    return df
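
# Example usage (editor's sketch with hypothetical rows): ENSG1 maps to two distinct
# GO terms and ENSG2 maps to one.
# >>> df = pd.DataFrame({
# ...     "ensembl_gene_id": ["ENSG1", "ENSG1", "ENSG2"],
# ...     "go_terms": ["GO:1", "GO:2", "GO:1"],
# ... })
# >>> count_grouped_total(df, "ensembl_gene_id", "go_terms", "n_gene_total_terms")
#   ensembl_gene_id  n_gene_total_terms
# 0           ENSG1                   2
# 1           ENSG2                   1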


def transform_genes_biodomains(datasets: dict) -> pd.DataFrame:
    """Takes dictionary of dataset DataFrames, extracts the genes_biodomains
    DataFrame, calculates some metrics on GO terms per gene / biodomain, and
    performs nest_fields on the final DataFrame. This results in a 2 column
    DataFrame grouped by "ensembl_gene_id" that includes a collapsed nested
    dictionary field "gene_biodomains".

    Args:
        datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame

    Returns:
        pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including
            a collapsed nested dictionary field "gene_biodomains"
    """
    genes_biodomains = datasets["genes_biodomains"]
    interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"]
    genes_biodomains = genes_biodomains[interesting_columns].dropna()

    # Count the number of go_terms associated with each biodomain
    n_biodomain_terms = count_grouped_total(
        genes_biodomains, "biodomain", "go_terms", "n_biodomain_terms"
    )

    # Count the number of go_terms associated with each gene, ignoring biodomain
    n_gene_total_terms = count_grouped_total(
        genes_biodomains, "ensembl_gene_id", "go_terms", "n_gene_total_terms"
    )

    # Count the number of go_terms associated with each gene / biodomain combo
    n_gene_biodomain_terms = count_grouped_total(
        genes_biodomains,
        ["ensembl_gene_id", "biodomain"],
        "go_terms",
        "n_gene_biodomain_terms",
    )

    # Group rows by ensembl_gene_id and biodomain to produce nested lists of
    # go_terms per gene/biodomain
    genes_biodomains = (
        genes_biodomains.groupby(["ensembl_gene_id", "biodomain"])["go_terms"]
        .apply(list)
        .reset_index()
    )

    # Merge all the different count metrics into the main data frame so each
    # ensembl_gene_id / biodomain combo has an entry for each count
    genes_biodomains = (
        genes_biodomains.merge(n_gene_total_terms, on="ensembl_gene_id", how="left")
        .merge(n_biodomain_terms, on="biodomain", how="left")
        .merge(n_gene_biodomain_terms, on=["ensembl_gene_id", "biodomain"], how="left")
    )

    # Calculate percent linking terms:
    # n_gene_biodomain_terms / n_gene_total_terms * 100
    genes_biodomains["pct_linking_terms"] = (
        genes_biodomains["n_gene_biodomain_terms"]
        / genes_biodomains["n_gene_total_terms"]
        * 100
    ).round(decimals=2)

    # Remove n_gene_total_terms column
    genes_biodomains = genes_biodomains.drop(columns="n_gene_total_terms")

    genes_biodomains = nest_fields(
        df=genes_biodomains,
        grouping="ensembl_gene_id",
        new_column="gene_biodomains",
        drop_columns=["ensembl_gene_id"],
    )

    return genes_biodomains
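
# Worked example of pct_linking_terms (editor's sketch, hypothetical gene): if ENSG1
# has 4 unique GO terms overall and 2 of them fall under the "Synapse" biodomain, the
# nested record for that gene/biodomain pair gets pct_linking_terms = 2 / 4 * 100 = 50.0.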


def transform_overall_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Masks score fields that were not scored and subsets the overall scores
    DataFrame to the columns of interest.

    Args:
        df (pd.DataFrame): DataFrame of overall scores

    Returns:
        pd.DataFrame: de-duplicated DataFrame with masked scores
    """
    interesting_columns = [
        "ensg",
        "hgnc_gene_id",
        "overall",
        "geneticsscore",
        "omicsscore",
        "literaturescore",
    ]

    # create a mapping to deal with missing values, as they take different shapes across the fields
    scored = ["isscored_genetics", "isscored_omics", "isscored_lit"]
    mapping = dict(zip(interesting_columns[3:], scored))

    for field, is_scored in mapping.items():
        df.loc[lambda row: row[is_scored] == "N", field] = np.nan

    # LiteratureScore is a string in the source file, so convert to numeric
    df["literaturescore"] = pd.to_numeric(df["literaturescore"])

    # Remove identical rows (see AG-826)
    return df[interesting_columns].drop_duplicates()
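
# Example usage (editor's sketch with a hypothetical single-row frame): omicsscore
# is masked to NaN because isscored_omics == "N", and literaturescore is converted
# from the string "0.7" to the float 0.7.
# >>> scores = pd.DataFrame({
# ...     "ensg": ["ENSG1"], "hgnc_gene_id": ["GENE1"], "overall": [0.8],
# ...     "geneticsscore": [0.5], "omicsscore": [0.3], "literaturescore": ["0.7"],
# ...     "isscored_genetics": ["Y"], "isscored_omics": ["N"], "isscored_lit": ["Y"],
# ... })
# >>> transform_overall_scores(scores)[["omicsscore", "literaturescore"]].iloc[0].tolist()
# [nan, 0.7]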


def join_datasets(left: pd.DataFrame, right: pd.DataFrame, how: str, on: str) -> pd.DataFrame:
    return pd.merge(left=left, right=right, how=how, on=on)
from agoradatatools.etl.transform.utils import *
from agoradatatools.etl.transform.transform_genes_biodomains import (
    transform_genes_biodomains,
)


def transform_team_info(datasets: dict):
@@ -440,15 +170,13 @@ def transform_gene_info(

gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1
gene_info["rna_in_ad_brain_change"] = (
(gene_info["adj_p_val"] <= adjusted_p_value_threshold) &
gene_info["rna_brain_change_studied"]
)
gene_info["adj_p_val"] <= adjusted_p_value_threshold
) & gene_info["rna_brain_change_studied"]

gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1
gene_info["protein_in_ad_brain_change"] = (
(gene_info["cor_pval"] <= protein_level_threshold) &
gene_info["protein_brain_change_studied"]
)
gene_info["cor_pval"] <= protein_level_threshold
) & gene_info["protein_brain_change_studied"]

# create 'nominations' field
gene_info["nominations"] = gene_info.apply(
@@ -492,7 +220,6 @@ def transform_distribution_data(
    omics_max_score,
    lit_max_score,
):

    overall_scores = datasets["overall_scores"]
    interesting_columns = [
        "ensg",
@@ -544,7 +271,7 @@ def transform_distribution_data(

def transform_rna_distribution_data(datasets: dict):
    # "datasets" contains the unprocessed RNA-seq data, which needs to go
    # through the same processing as before in order to use it here.
    rna_df = transform_rna_seq_data(datasets)
    rna_df = rna_df[["tissue", "model", "logfc"]]

@@ -611,7 +338,6 @@ def transform_proteomics_distribution_data(


def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame:

    transformed = []
    for name, dataset in datasets.items():
        if name == "proteomics":
@@ -631,7 +357,6 @@ def create_proteomics_distribution_data(datasets: dict) -> pd.DataFrame:


def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):

    if type(datasets) is not dict or type(dataset_name) is not str:
        return None
