Skip to content

Commit

Permalink
Improve default generation and id creation
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Feb 20, 2024
1 parent 1d13d36 commit 1c77b1a
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 87 deletions.
116 changes: 51 additions & 65 deletions containers/crosswalking/context/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import argparse
import csv
import re
from pathlib import Path
import typing as t
from pathlib import Path

import anndata
import pandas as pd

from src.util.ids import create_temp_asctb_id


def filter_crosswalk_table(
table: pd.DataFrame,
Expand All @@ -18,7 +19,7 @@ def filter_crosswalk_table(
) -> pd.DataFrame:
"""Filter the crosswalk table to only include rows with organ id and level.
Also removes empty rows and cast values to string.
Also removes empty rows.
Args:
table (pd.DataFrame): Original full crosswalk table
Expand All @@ -29,26 +30,11 @@ def filter_crosswalk_table(
organ_id_rows = table[organ_id_column].str.lower() == organ_id.lower()
organ_level_rows = table[organ_level_column].str.lower() == organ_level.lower()
filtered_table = table[organ_id_rows & organ_level_rows]
normalized_table = filtered_table.dropna().astype(str)
normalized_table = filtered_table.dropna(how="all")
unique_table = normalized_table.drop_duplicates(table_label_column)
return unique_table


def generate_iri(label: str) -> str:
    """Build a temporary IRI from a row label.

    The label is lower-cased and trimmed, every run of non-word characters
    is collapsed into a single hyphen, and whatever is left that is not a
    lowercase ASCII letter, a digit or a hyphen is dropped.

    Args:
        label (str): Label for the row

    Returns:
        str: Temporary IRI, i.e. the slug prefixed with "ASCTB-TEMP:"
    """
    normalized = label.lower().strip()
    hyphenated = re.sub(r"\W+", "-", normalized)
    slug = re.sub(r"[^a-z0-9-]+", "", hyphenated)
    return f"ASCTB-TEMP:{slug}"


def crosswalk(
matrix: anndata.AnnData,
organ_id: str,
Expand All @@ -75,10 +61,10 @@ def crosswalk(
data_match_column (str): Column to store match type in
table (pd.DataFrame): Crosswalk table
table_organ_id_column (str): Column storing organ uberon ids
table_organ_lavel_column (str): Column storing organ levels
table_organ_level_column (str): Column storing organ levels
table_label_column (str): Column used to match against the data
table_clid_column (str): Column storing CLIDs
table_clid_column (str): Column storing CL labels
table_clid_label_column (str): Column storing CL labels
table_match_column (str): Column storing match type
Returns:
Expand All @@ -105,9 +91,10 @@ def crosswalk(
)
merged_obs.index = matrix.obs.index

_set_default_clid(merged_obs, data_clid_column, data_label_column)
_set_default_match(merged_obs, data_match_column)
_set_default_clid(merged_obs, table_clid_label_column, data_label_column)
default_clids = merged_obs[data_label_column].map(create_temp_asctb_id)
_set_defaults(merged_obs, data_clid_column, default_clids)
_set_defaults(merged_obs, table_clid_label_column, merged_obs[data_label_column])
_set_defaults(merged_obs, data_match_column, "skos:exactMatch")

result = matrix.copy()
result.obs = merged_obs
Expand All @@ -116,37 +103,17 @@ def crosswalk(
return result


def _set_default_clid(obs: pd.DataFrame, clid_column: str, label_column: str) -> None:
"""Adds default CLIDs to rows that did not match against the crosswalk table.
def _set_defaults(
obs: pd.DataFrame, column: str, defaults: t.Union[pd.Series, str]
) -> None:
"""Replace nan values with defaults in a column.
Args:
obs (pd.DataFrame): Data rows
clid_column (str): Column to check and update with default CLIDs
label_column (str): Column used when generating default CLIDs
obs (pd.DataFrame): Data frame
column (str): Column to update
defaults (t.Union[pd.Series, str]): Default values
"""
defaults = obs.apply(lambda row: generate_iri(row[label_column]), axis=1)
obs.loc[obs[clid_column].isna(), clid_column] = defaults


def _set_default_clid(obs: pd.DataFrame, clid_label_column: str, label_column: str) -> None:
"""Adds default CL labels to rows that did not match against the crosswalk table.
Args:
obs (pd.DataFrame): Data rows
clid_label_column (str): Column to check and update with default CL labels
label_column (str): Column with defaults
"""
obs.loc[obs[clid_label_column].isna(), clid_label_column] = obs[label_column]


def _set_default_match(obs: pd.DataFrame, column: str) -> None:
"""Adds default match type to rows that did not match against the crosswalk table.
Args:
obs (pd.DataFrame): Data rows
column (str): Column to check and update with default match type
"""
obs.loc[obs[column].isna(), column] = "skos:exactMatch"
obs.loc[obs[column].isna(), column] = defaults


def _fix_obs_columns_dtype(matrix: anndata.AnnData):
Expand Down Expand Up @@ -182,20 +149,43 @@ def _get_empty_table(args: argparse.Namespace) -> pd.DataFrame:
)


def _read_table(path: str) -> t.Optional[pd.DataFrame]:
def _is_header_row(row: t.List[str], args: argparse.ArgumentParser) -> bool:
"""Tests whether a row is the header row.
Args:
row (t.List[str]): Row to test
args (argparse.ArgumentParser): Same arguments as provided to `main`
Returns:
bool: True if it is the header row
"""
to_lower_set = lambda items: set(map(str.lower, items))
columns = [
args.crosswalk_table_organ_id_column,
args.crosswalk_table_organ_level_column,
args.crosswalk_table_label_column,
args.crosswalk_table_clid_column,
args.crosswalk_table_clid_label_column,
args.crosswalk_table_match_column,
]

return to_lower_set(columns).issubset(to_lower_set(row))


def _read_table(args: argparse.Namespace) -> t.Optional[pd.DataFrame]:
"""Read a crosswalking table. Metadata rows before the header are skipped.
Args:
path (str): Path to the csv file
args (argparse.Namespace): Same arguments as provided to `main`
Returns:
pd.DataFrame: A data frame with the table data
"""
with open(path) as file:
with open(args.crosswalk_table) as file:
for row in csv.reader(file):
if row[0].lower() == "organ_level":
if _is_header_row(row, args):
return pd.read_csv(file, names=row)
return None
return _get_empty_table(args)


def main(args: argparse.Namespace):
Expand All @@ -213,16 +203,12 @@ def main(args: argparse.Namespace):
metadata = args.matrix.uns["hra_crosswalking"]
matrix = crosswalk(
args.matrix,
metadata["organ_id"],
metadata["organ_level"],
str(metadata["organ_id"]),
str(metadata["organ_level"]),
args.annotation_column,
args.clid_column,
args.match_column,
(
args.crosswalk_table
if args.crosswalk_table is not None
else _get_empty_table(args)
),
_read_table(args),
args.crosswalk_table_organ_id_column,
args.crosswalk_table_organ_level_column,
args.crosswalk_table_label_column,
Expand All @@ -247,7 +233,7 @@ def _get_arg_parser() -> argparse.ArgumentParser:
)
parser.add_argument(
"--crosswalk-table",
type=_read_table,
required=True,
help="crosswalking csv file path",
)
parser.add_argument(
Expand Down
3 changes: 3 additions & 0 deletions containers/extract-summary/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import anndata
import pandas as pd

from src.util.ids import create_cell_id


def get_unique_rows_with_counts(
matrix: anndata.AnnData, clid_column: str
Expand Down Expand Up @@ -54,6 +56,7 @@ def unique_rows_to_summary_rows(
)

df["@type"] = "CellSummaryRow"
df["cell_id"] = df["cell_id"].map(create_cell_id)
df["percentage"] = df["count"] / df["count"].sum()
df["gene_expr"] = (
df["gene_expr"]
Expand Down
41 changes: 19 additions & 22 deletions src/util/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import anndata
import pandas as pd

from .ids import create_gene_id

_ENSEMBLE_COLUMN = "ensemble"
_GENE_COLUMN = "hgnc"
Expand All @@ -14,12 +15,29 @@


def strip_version(value: str) -> str:
    """Drop the version specifier from an ensembl id.

    Every match of the module-level version pattern is replaced with an
    empty string; a value without a match is returned unchanged.

    Args:
        value (str): Id with optional version specified

    Returns:
        str: Ensembl id without version
    """
    unversioned = re.sub(_ENSEMBLE_VERSION_REPLACE_REGEX, "", value)
    return unversioned


def add_ensemble_data(
matrix: anndata.AnnData, ensemble: t.Union[str, bytes, os.PathLike, pd.DataFrame]
) -> anndata.AnnData:
"""Add ensembl and gene information to `var` from a lookup file.
Args:
matrix (anndata.AnnData): Original matrix
ensemble (t.Union[str, bytes, os.PathLike, pd.DataFrame]): Ensemble lookup or path to csv file
Returns:
anndata.AnnData: Matrix with ensembl information
"""
if not isinstance(ensemble, pd.DataFrame):
ensemble = pd.read_csv(ensemble, dtype=str)
ensemble = ensemble.drop_duplicates(_ENSEMBLE_COLUMN)
Expand All @@ -30,30 +48,9 @@ def add_ensemble_data(
ensemble, how="left", left_on=keys, right_on=_ENSEMBLE_COLUMN
)
merged_var.index = index
merged_var[_GENE_COLUMN].fillna(index.to_series().map(_create_default_gene_id), inplace=True)
merged_var[_GENE_COLUMN].fillna(index.to_series().map(create_gene_id), inplace=True)
merged_var[_GENE_NAME_COLUMN].fillna(index.to_series(), inplace=True)

result = matrix.copy()
result.var = merged_var
return result


def _create_temp_asctb_id(value: str) -> str:
"""Create a temporary IRI based on a label.
Args:
value (str): Label for the row
Returns:
str: Temporary IRI
"""
suffix = value.lower().strip()
suffix = re.sub(r"\W+", "-", suffix)
suffix = re.sub(r"[^a-z0-9-]+", "", suffix)
return "ASCTB-TEMP:" + suffix


def _create_default_gene_id(value: str) -> str:
    """Create a fallback gene id for a value.

    Values starting with "ens" (compared case-insensitively) get the
    "ensembl:" prefix; everything else becomes a temporary asctb id.

    Args:
        value (str): Original identifier

    Returns:
        str: Prefixed ensembl id or temporary asctb id
    """
    looks_like_ensembl = value.lower().startswith('ens')
    return 'ensembl:' + value if looks_like_ensembl else _create_temp_asctb_id(value)
55 changes: 55 additions & 0 deletions src/util/ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import re

# Runs of non-word characters (anything `\W` matches).
_NON_WORD_REGEX = re.compile(r"\W+")
# Runs of characters other than lowercase a-z, digits and "-".
_NON_ALPHANUM_HYPHEN_REGEX = re.compile(r"[^a-z0-9-]+")
# Values that begin with "ASCTB", compared case-insensitively.
_ASCTB_ID_REGEX = re.compile(r"^ASCTB", re.IGNORECASE)
# Values that begin with "ens", compared case-insensitively.
_ENSEMBL_ID_REGEX = re.compile(r"^ens", re.IGNORECASE)
# Values that begin with a "CL:" or "PCL:" prefix, compared case-insensitively.
_VALID_CELL_ID_REGEX = re.compile(r"^(CL|PCL):", re.IGNORECASE)


def create_temp_asctb_id(value: str) -> str:
    """Generate a temporary asctb id from a value.

    A value that already starts with "ASCTB" (case-insensitive) is returned
    with surrounding whitespace removed. Any other value is trimmed,
    lower-cased, has each run of non-word characters replaced by a hyphen
    and every remaining character outside a-z, 0-9 and "-" removed, and is
    then prefixed with "ASCTB-TEMP:".

    Args:
        value (str): The value to create an id for

    Returns:
        str: A temp id
    """
    # Trim before the prefix check: otherwise a padded id such as
    # " ASCTB-TEMP:x" misses the check and is slugified a second time
    # into "ASCTB-TEMP:asctb-temp-x".
    value = value.strip()
    if _ASCTB_ID_REGEX.match(value):
        return value

    slug = _NON_WORD_REGEX.sub("-", value.lower())
    slug = _NON_ALPHANUM_HYPHEN_REGEX.sub("", slug)
    return f"ASCTB-TEMP:{slug}"


def create_cell_id(id: str) -> str:
    """Turn an id into a cell id.

    An id that starts with "CL:" or "PCL:" (case-insensitive) is already a
    cell id and is returned untouched. Any other id is converted into a
    temporary asctb id.

    Args:
        id (str): Original id

    Returns:
        str: Valid cell id
    """
    already_valid = _VALID_CELL_ID_REGEX.match(id) is not None
    return id if already_valid else create_temp_asctb_id(id)


def create_gene_id(id: str) -> str:
    """Turn an id into a gene id.

    An id that already carries the "ensembl:" prefix is returned unchanged,
    an id starting with "ens" (case-insensitive) gets the "ensembl:" prefix,
    and anything else is converted into a temporary asctb id.

    Args:
        id (str): Original id

    Returns:
        str: Valid gene id
    """
    # An already-prefixed id also starts with "ens"; without this guard a
    # second pass would produce "ensembl:ensembl:...".
    if id.lower().startswith("ensembl:"):
        return id

    # NOTE(review): the prefix test is loose - any value starting with "ens"
    # is treated as an ensembl id, which presumably also catches gene symbols
    # such as "ENSA". Confirm whether a stricter pattern is wanted.
    if _ENSEMBL_ID_REGEX.match(id):
        return f"ensembl:{id}"
    return create_temp_asctb_id(id)

0 comments on commit 1c77b1a

Please sign in to comment.