Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AG-838] Support independent transforms #70

Merged
merged 25 commits into from
May 5, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
674 changes: 0 additions & 674 deletions src/agoradatatools/etl/transform.py

This file was deleted.

Empty file.
44 changes: 44 additions & 0 deletions src/agoradatatools/etl/transform/apply.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from agoradatatools.etl.transform.custom import *
BWMac marked this conversation as resolved.
Show resolved Hide resolved


def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):
    """Run the custom transformation registered for ``dataset_name``.

    Args:
        datasets: Mapping of dataset name to the already-loaded data.
        dataset_name: Name of the dataset whose custom transformation to run.
        dataset_obj: Configuration entry for the dataset. Its
            ``"custom_transformations"`` mapping is read only by the
            transformations that take thresholds or maximum scores.

    Returns:
        Whatever the selected transformation returns, or ``None`` when
        ``datasets`` is not a dict, ``dataset_name`` is not a str, or no
        transformation is registered under that name.
    """
    if not (isinstance(datasets, dict) and isinstance(dataset_name, str)):
        return None

    def _distribution_data():
        return transform_distribution_data(
            datasets=datasets,
            overall_max_score=dataset_obj["custom_transformations"][
                "overall_max_score"
            ],
            genetics_max_score=dataset_obj["custom_transformations"][
                "genetics_max_score"
            ],
            omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"],
            lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"],
        )

    def _gene_info():
        return transform_gene_info(
            datasets=datasets,
            adjusted_p_value_threshold=dataset_obj["custom_transformations"][
                "adjusted_p_value_threshold"
            ],
            protein_level_threshold=dataset_obj["custom_transformations"][
                "protein_level_threshold"
            ],
        )

    # Every entry is a zero-argument callable so that nothing is looked up or
    # computed until the matching handler has been selected.
    handlers = {
        "genes_biodomains": lambda: transform_genes_biodomains(datasets=datasets),
        "overall_scores": lambda: transform_overall_scores(
            df=datasets["overall_scores"]
        ),
        "distribution_data": _distribution_data,
        "team_info": lambda: transform_team_info(datasets=datasets),
        "rnaseq_differential_expression": lambda: transform_rna_seq_data(
            datasets=datasets
        ),
        "gene_info": _gene_info,
        "rna_distribution_data": lambda: transform_rna_distribution_data(
            datasets=datasets
        ),
        "proteomics_distribution_data": lambda: create_proteomics_distribution_data(
            datasets=datasets
        ),
    }

    handler = handlers.get(dataset_name)
    if handler is None:
        return None
    return handler()
31 changes: 31 additions & 0 deletions src/agoradatatools/etl/transform/custom/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Submodule for Agora Data Tools Custom Transformations"""
BWMac marked this conversation as resolved.
Show resolved Hide resolved

from agoradatatools.etl.transform.custom.distribution_data import (
transform_distribution_data,
)
from agoradatatools.etl.transform.custom.gene_info import transform_gene_info
from agoradatatools.etl.transform.custom.genes_biodomains import (
transform_genes_biodomains,
)
from agoradatatools.etl.transform.custom.overall_scores import (
transform_overall_scores,
)
from agoradatatools.etl.transform.custom.proteomics_distribution import (
create_proteomics_distribution_data,
)
from agoradatatools.etl.transform.custom.rna_distribution import (
transform_rna_distribution_data,
transform_rna_seq_data,
)
from agoradatatools.etl.transform.custom.team_info import transform_team_info

# Names exposed by a star-import of this package; each entry is one of the
# transformation functions imported above.
__all__ = [
"transform_distribution_data",
"transform_gene_info",
"transform_genes_biodomains",
"transform_overall_scores",
"create_proteomics_distribution_data",
"transform_rna_distribution_data",
"transform_rna_seq_data",
"transform_team_info",
]
110 changes: 110 additions & 0 deletions src/agoradatatools/etl/transform/custom/distribution_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pandas as pd
import numpy as np


def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict:
    """Summarize ``col`` as a 10-bin histogram plus basic statistics.

    Args:
        df: Table containing ``col`` and, when ``is_scored`` is given, that
            flag column.
        col: Name of the column to summarize. An object-dtype column is cast
            to float first (on a copy, so the caller's frame is untouched).
        is_scored: Name of a column that must equal ``"Y"`` for a row to be
            kept. When falsy, a row is kept if any of its columns holds ``"Y"``.
        upper_bound: Theoretical maximum of ``col``. Together with 0 it
            anchors the range that the bins span.

    Returns:
        dict with the keys ``distribution`` (per-bin counts), ``bins`` (list of
        ``(lower, upper)`` pairs, the first lower edge being 0), ``min``,
        ``max``, ``mean``, ``first_quartile`` and ``third_quartile``.
    """
    if is_scored:
        df = df[df[is_scored] == "Y"]
    else:
        df = df[df.isin(["Y"]).any(axis=1)]

    if df[col].dtype == object:
        df = df.copy()  # Necessary to prevent SettingWithCopy warning
        df[col] = df[col].astype(float)

    # To make the bins always cover the full range from 0 to the theoretical
    # maximum, bin a copy of the column with both of those values appended,
    # then remove the two artificial observations from the counts afterwards.
    distribution = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True)

    # One pass yields both the per-value bin assignment and the bin edges.
    binned, bin_edges = pd.cut(
        distribution,
        bins=10,
        precision=3,
        include_lowest=True,
        right=True,
        retbins=True,
    )

    counts = list(binned.value_counts(sort=False))
    counts[0] -= 1  # drop the artificial 0 value
    counts[-1] -= 1  # drop the artificial upper_bound value

    # Skip the first edge (it sits slightly below 0) and start the first bin
    # at exactly 0 instead.
    upper_edges = np.around(bin_edges.tolist()[1:], 2)
    lower_edges = [0, *upper_edges[:-1]]

    obj = {}
    obj["distribution"] = counts
    obj["bins"] = list(zip(lower_edges, upper_edges))

    obj["min"] = np.around(df[col].min(), 4)
    obj["max"] = np.around(df[col].max(), 4)
    obj["mean"] = np.around(df[col].mean(), 4)
    # NOTE(review): the quartiles are rounded to whole numbers (no decimals
    # argument) whereas min/max/mean keep 4 decimals - confirm this is intended.
    obj["first_quartile"] = np.around(
        df[col].quantile(q=0.25, interpolation="midpoint")
    )
    obj["third_quartile"] = np.around(
        df[col].quantile(q=0.75, interpolation="midpoint")
    )

    return obj


def transform_distribution_data(
    datasets: dict,
    overall_max_score,
    genetics_max_score,
    omics_max_score,
    lit_max_score,
):
    """Build distribution summaries for the four score columns of ``overall_scores``.

    Args:
        datasets: Mapping that must contain an ``"overall_scores"`` table with
            the ``ensg`` column, the four score columns and the three
            ``isscored_*`` flag columns.
        overall_max_score: Upper bound used when binning ``overall``.
        genetics_max_score: Upper bound used when binning ``geneticsscore``.
        omics_max_score: Upper bound used when binning ``omicsscore``.
        lit_max_score: Upper bound used when binning ``literaturescore``.

    Returns:
        dict keyed by ``target_risk_score``, ``genetics_score``,
        ``multi_omics_score`` and ``literature_score`` (in that order). Each
        value is the result of ``calculate_distribution`` for that score with
        ``name``, ``syn_id`` and ``wiki_id`` entries added.
    """
    syn_id = "syn25913473"

    # One row per score:
    # (source column, "is scored" flag column, max score, output key, name, wiki id)
    # The overall score has no flag column of its own, hence None.
    score_specs = [
        (
            "overall",
            None,
            overall_max_score,
            "target_risk_score",
            "Target Risk Score",
            "621071",
        ),
        (
            "geneticsscore",
            "isscored_genetics",
            genetics_max_score,
            "genetics_score",
            "Genetic Risk Score",
            "621069",
        ),
        (
            "omicsscore",
            "isscored_omics",
            omics_max_score,
            "multi_omics_score",
            "Multi-omic Risk Score",
            "621070",
        ),
        (
            "literaturescore",
            "isscored_lit",
            lit_max_score,
            "literature_score",
            "Literature Score",
            "613105",
        ),
    ]

    score_columns = [spec[0] for spec in score_specs]
    flag_columns = [spec[1] for spec in score_specs if spec[1] is not None]
    scores = datasets["overall_scores"][["ensg", *score_columns, *flag_columns]]

    neo_matrix = {}
    for source_col, flag_col, max_score, _, _, _ in score_specs:
        neo_matrix[source_col] = calculate_distribution(
            scores, source_col, flag_col, max_score
        )

    result = {}
    for source_col, _, _, output_key, display_name, wiki_id in score_specs:
        summary = neo_matrix[source_col]
        summary["name"] = display_name
        summary["syn_id"] = syn_id
        summary["wiki_id"] = wiki_id
        result[output_key] = summary

    return result
151 changes: 151 additions & 0 deletions src/agoradatatools/etl/transform/custom/gene_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import pandas as pd
import numpy as np

from agoradatatools.etl.transform.utils import nest_fields


def transform_gene_info(
    datasets: dict, adjusted_p_value_threshold, protein_level_threshold
):
    """Incrementally build the ``gene_info`` dataset.

    Starting from ``gene_metadata``, every other input is outer-merged onto the
    running result on ``ensembl_gene_id`` (each merge is validated as
    one-to-one). Derived flag columns are then added and the result is trimmed
    to the output columns.

    Args:
        datasets: Mapping that must contain ``gene_metadata``, ``igap``,
            ``eqtl``, ``proteomics``, ``rna_expression_change``,
            ``agora_proteomics_tmt``, ``target_list``, ``median_expression``
            and ``druggability``. None of these inputs is modified.
        adjusted_p_value_threshold: A gene is flagged
            ``rna_in_ad_brain_change`` when its smallest ``adj_p_val`` is less
            than or equal to this value.
        protein_level_threshold: A gene is flagged
            ``protein_in_ad_brain_change`` when its smallest ``cor_pval`` is
            less than or equal to this value.

    Returns:
        The merged ``gene_info`` DataFrame, without rows whose
        ``ensembl_gene_id`` is missing.
    """
    gene_metadata = datasets["gene_metadata"]
    igap = datasets["igap"]
    eqtl = datasets["eqtl"]
    proteomics = datasets["proteomics"]
    rna_change = datasets["rna_expression_change"]
    proteomics_tmt = datasets["agora_proteomics_tmt"]
    target_list = datasets["target_list"]
    median_expression = datasets["median_expression"]
    druggability = datasets["druggability"]

    # Modify the data before merging

    # All genes in this list should have 'is_igap' = True when added to gene_info.
    # Creating the column here automatically adds the column in to gene_info
    # during merge, with True values correctly populated. assign() works on a
    # copy so the caller's igap table in `datasets` is left untouched.
    igap = igap.assign(is_igap=True)

    # Get the smallest adj_p_val for each gene, to determine significance
    rna_change = (
        rna_change.groupby("ensembl_gene_id")["adj_p_val"].agg("min").reset_index()
    )

    # Get the smallest cor_pval for each protein, to determine significance
    proteomics_concat = pd.concat([proteomics, proteomics_tmt])
    proteomics_concat = proteomics_concat.dropna(
        subset=["log2_fc", "cor_pval", "ci_lwr", "ci_upr"]
    )
    proteomics_concat = (
        proteomics_concat.groupby("ensembl_gene_id")["cor_pval"]
        .agg("min")
        .reset_index()
    )

    # these are the interesting columns of the druggability dataset
    useful_columns = [
        "geneid",
        "sm_druggability_bucket",
        "safety_bucket",
        "abability_bucket",
        "pharos_class",
        "classification",
        "safety_bucket_definition",
        "abability_bucket_definition",
    ]
    druggability = druggability[useful_columns]

    target_list = nest_fields(
        df=target_list, grouping="ensembl_gene_id", new_column="nominated_target"
    )

    median_expression = nest_fields(
        df=median_expression, grouping="ensembl_gene_id", new_column="median_expression"
    )

    druggability = nest_fields(
        df=druggability, grouping="geneid", new_column="druggability"
    )
    druggability = druggability.rename(columns={"geneid": "ensembl_gene_id"})

    # Merge all the datasets

    gene_info = gene_metadata

    for dataset in [
        igap,
        eqtl,
        rna_change,
        proteomics_concat,
        target_list,
        median_expression,
        druggability,
    ]:
        gene_info = pd.merge(
            left=gene_info,
            right=dataset,
            on="ensembl_gene_id",
            how="outer",
            validate="one_to_one",
        )

    # Populate values for rows that didn't exist in the individual datasets.
    # -1 is the sentinel for "no p-value available" and is tested below.

    gene_info.fillna(
        {"is_igap": False, "has_eqtl": False, "adj_p_val": -1, "cor_pval": -1},
        inplace=True,
    )

    # fillna doesn't work for creating an empty array, need this function instead
    gene_info["alias"] = gene_info.apply(
        lambda row: row["alias"]
        if isinstance(row["alias"], np.ndarray)
        else np.empty(0, dtype=object),
        axis=1,
    )

    gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1
    gene_info["rna_in_ad_brain_change"] = (
        gene_info["adj_p_val"] <= adjusted_p_value_threshold
    ) & gene_info["rna_brain_change_studied"]

    gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1
    gene_info["protein_in_ad_brain_change"] = (
        gene_info["cor_pval"] <= protein_level_threshold
    ) & gene_info["protein_brain_change_studied"]

    # create 'nominations' field: number of nested nominated_target entries,
    # or NaN when the gene has none. np.nan is used because the np.NaN alias
    # no longer exists in NumPy 2.
    gene_info["nominations"] = gene_info.apply(
        lambda row: len(row["nominated_target"])
        if isinstance(row["nominated_target"], list)
        else np.nan,
        axis=1,
    )

    # Remove some extra columns that got added during merges
    gene_info = gene_info[
        [
            "ensembl_gene_id",
            "name",
            "summary",
            "symbol",
            "alias",
            "is_igap",
            "has_eqtl",
            "rna_in_ad_brain_change",
            "rna_brain_change_studied",
            "protein_in_ad_brain_change",
            "protein_brain_change_studied",
            "nominated_target",
            "median_expression",
            "druggability",
            "nominations",
        ]
    ]

    # Make sure there are no N/A Ensembl IDs
    gene_info = gene_info.dropna(subset=["ensembl_gene_id"])

    return gene_info
Loading