Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce ModifiedSequenceReader #253

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ spectronaut_report:
'charge': 'charge'
mod_seq_columns:
- 'ModifiedSequence'
precursor_id_columns:
- "EG.PrecursorId"
modification_mapping_type: 'maxquant'

spectronaut:
Expand All @@ -215,6 +217,8 @@ spectronaut:
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
precursor_id_columns:
- "EG.PrecursorId"
modification_mapping_type: 'maxquant'

library_reader_base:
Expand Down
2 changes: 2 additions & 0 deletions alphabase/psm_reader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"MSFragger_PSM_TSV_Reader",
"pFindReader",
"MSFraggerPepXML",
"MSFraggerPepXMLReader",
"SageReaderTSV",
"SageReaderParquet",
]
Expand Down Expand Up @@ -39,6 +40,7 @@
from alphabase.psm_reader.msfragger_reader import (
MSFragger_PSM_TSV_Reader,
MSFraggerPepXML,
MSFraggerPepXMLReader,
)
from alphabase.psm_reader.msfragger_reader import (
register_readers as register_fragger_readers,
Expand Down
12 changes: 3 additions & 9 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,15 @@ def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)
return df

def _load_modifications(self, df: pd.DataFrame) -> None:
if len(df) == 0:
self._psm_df[PsmDfCols.SEQUENCE] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.DECOY] = 0
return

def _load_modifications(self, origin_df: pd.DataFrame) -> None:
(
self._psm_df[PsmDfCols.SEQUENCE],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
_charges,
self._psm_df[PsmDfCols.DECOY],
) = zip(*df["precursor"].apply(parse_ap))
) = zip(*origin_df["precursor"].apply(parse_ap))

self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8)


Expand Down
33 changes: 9 additions & 24 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,12 @@
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.maxquant_reader import ModifiedSequenceReader
from alphabase.psm_reader.psm_reader import psm_reader_provider


class SpectronautReader(MaxQuantReader):
"""Reader for Spectronaut's output library TSV/CSV.

Other parameters, please see `MaxQuantReader`
in `alphabase.psm_reader.maxquant_reader`
"""
class SpectronautReader(ModifiedSequenceReader):
"""Reader for Spectronaut's output library TSV/CSV."""

_reader_type = "spectronaut"
_add_unimod_to_mod_mapping = True
Expand All @@ -39,34 +35,23 @@ class SwathReader(SpectronautReader):
_add_unimod_to_mod_mapping = True


class DiannReader(MaxQuantReader):
class DiannReader(ModifiedSequenceReader):
"""Reader for DIANN data."""

_reader_type = "diann"
_add_unimod_to_mod_mapping = True
_min_max_rt_norm = False

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""DIANN-specific preprocessing of output data.

Nothing to do for DIANN, still method of superclass needs to be overwritten.
TODO disentangle the inheritance structure.
"""
return df

def _post_process(self) -> None:
super()._post_process()
def _post_process(self, origin_df: pd.DataFrame) -> None:
self._psm_df.rename(
columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True
)

super()._post_process(origin_df)

class SpectronautReportReader(MaxQuantReader):
"""Reader for Spectronaut's report TSV/CSV.

Other parameters, please see `MaxQuantReader`
in `alphabase.psm_reader.maxquant_reader`
"""
class SpectronautReportReader(ModifiedSequenceReader):
"""Reader for Spectronaut's report TSV/CSV."""

_reader_type = "spectronaut_report"
_add_unimod_to_mod_mapping = True
Expand All @@ -75,7 +60,7 @@ class SpectronautReportReader(MaxQuantReader):
def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut report-specific preprocessing of output data."""
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
"EG.PrecursorId" # TODO: move to yaml
self._precursor_id_column
].str.split(".", expand=True, n=2)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8)
return df
Expand Down
57 changes: 33 additions & 24 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Reader for MaxQuant data."""

import warnings
from abc import ABC
from typing import List, Optional

import numba
Expand Down Expand Up @@ -123,10 +124,9 @@ def parse_mod_seq(
)


class MaxQuantReader(PSMReaderBase):
"""Reader for MaxQuant data."""
class ModifiedSequenceReader(PSMReaderBase, ABC):
"""Reader for MaxQuant-like data."""

_reader_type = "maxquant"
_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
Expand All @@ -142,7 +142,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
**kwargs,
):
"""Reader for MaxQuant msms.txt and evidence.txt.
"""Reader for MaxQuant-like data (in terms of modification loading and decoy translation).

See documentation of `PSMReaderBase` for more information.

Expand Down Expand Up @@ -178,6 +178,35 @@ def _translate_decoy(self) -> None:
self._psm_df[PsmDfCols.DECOY] == "-"
).astype(np.int8)

# Parse the modified-sequence column of `origin_df` and store the resulting
# modifications, modification sites and (if not yet present) bare sequences
# in `self._psm_df`.
# NOTE(review): indentation is not visible in this excerpt, so the exact nesting of
# the `if`/`else` below (which branch sets which `mod_sep`) should be confirmed
# against the real file.
def _load_modifications(self, origin_df: pd.DataFrame) -> None:
# Literal (non-regex) substring checks: does any entry contain "[" / "("?
if origin_df[self.mod_seq_column].str.contains("[", regex=False).any():
if origin_df[self.mod_seq_column].str.contains("(", regex=False).any():
# Overwrites the column of the caller's dataframe in place.
origin_df[self.mod_seq_column] = origin_df[self.mod_seq_column].apply(
replace_parentheses_with_brackets
)
mod_sep = "[]"
else:
mod_sep = "()"

# `parse_mod_seq` is applied per entry; zip(*...) transposes the per-row results
# into three parallel tuples.
# NOTE(review): for an empty dataframe zip(*...) yields nothing and this 3-way
# unpacking would raise ValueError - confirm that the caller never passes an
# empty frame.
seqs, mods, mod_sites = zip(
*origin_df[self.mod_seq_column].apply(
parse_mod_seq,
mod_sep=mod_sep,
fixed_C57=self.fixed_C57,
)
)

self._psm_df[PsmDfCols.MODS] = mods
self._psm_df[PsmDfCols.MOD_SITES] = mod_sites
# An already existing sequence column is kept; the parsed sequences only fill it
# in when it is missing.
if PsmDfCols.SEQUENCE not in self._psm_df.columns:
self._psm_df[PsmDfCols.SEQUENCE] = seqs


class MaxQuantReader(ModifiedSequenceReader):
"""Reader for MaxQuant data."""

_reader_type = "maxquant"

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""MaxQuant-specific preprocessing of output data."""
df = df[~pd.isna(df["Retention time"])]
Expand All @@ -201,26 +230,6 @@ def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
# min_rt = df['Retention time'].min()
return df

def _load_modifications(self, origin_df: pd.DataFrame) -> None:
if origin_df[self.mod_seq_column].str.contains("[", regex=False).any():
if origin_df[self.mod_seq_column].str.contains("(", regex=False).any():
origin_df[self.mod_seq_column] = origin_df[self.mod_seq_column].apply(
replace_parentheses_with_brackets
)
mod_sep = "[]"
else:
mod_sep = "()"

(seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*origin_df[self.mod_seq_column].apply(
parse_mod_seq,
mod_sep=mod_sep,
fixed_C57=self.fixed_C57,
)
)
if PsmDfCols.SEQUENCE not in self._psm_df.columns:
self._psm_df[PsmDfCols.SEQUENCE] = seqs


def register_readers() -> None:
"""Register MaxQuant reader."""
Expand Down
31 changes: 19 additions & 12 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""MSFragger reader."""

import warnings
from typing import List, Optional, Tuple

import numpy as np
Expand Down Expand Up @@ -92,7 +93,7 @@ def __init__(
raise NotImplementedError("MSFragger_PSM_TSV_Reader for psm.tsv")


class MSFraggerPepXML(PSMReaderBase):
class MSFraggerPepXMLReader(PSMReaderBase):
"""Reader for MSFragger's pep.xml file."""

_reader_type = "msfragger_pepxml"
Expand Down Expand Up @@ -167,17 +168,10 @@ def _translate_decoy(self) -> None:
self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0

def _translate_score(self) -> None:
# evalue score
"""Translate MSFragger evalue to AlphaBase score: the larger the better."""
self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100)

def _load_modifications(self, origin_df: pd.DataFrame) -> None:
if len(origin_df) == 0:
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFFS] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = ""
return

(
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
Expand All @@ -203,17 +197,30 @@ def _load_modifications(self, origin_df: pd.DataFrame) -> None:
inplace=True,
)

def _post_process(self) -> None:
super()._post_process()
def _post_process(self, origin_df: pd.DataFrame) -> None:
self._psm_df = (
self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0")
.drop(columns=PsmDfCols.TO_REMOVE)
.reset_index(drop=True)
)
super()._post_process(origin_df)


class MSFraggerPepXML(MSFraggerPepXMLReader):
    """Deprecated alias of `MSFraggerPepXMLReader`.

    Kept only for backward compatibility. Instantiating it emits a
    `FutureWarning` and otherwise behaves exactly like `MSFraggerPepXMLReader`.
    """

    def __init__(self, *args, **kwargs):
        """Emit a deprecation warning, then defer to `MSFraggerPepXMLReader.__init__`.

        All positional and keyword arguments are forwarded unchanged.
        """
        # Both sentences must form ONE message string (implicit literal
        # concatenation, no comma in between). With a comma the second sentence
        # is passed as `category` and `warnings.warn` raises
        # "TypeError: category must be a Warning subclass" on every instantiation.
        warnings.warn(
            "MSFraggerPepXML is deprecated and will be removed in alphabase>1.5.0. "
            "Please use the equivalent MSFraggerPepXMLReader instead.",
            FutureWarning,
            stacklevel=2,  # attribute the warning to the code instantiating the class
        )
        super().__init__(*args, **kwargs)


def register_readers() -> None:
"""Register MSFragger readers."""
psm_reader_provider.register_reader("msfragger_psm_tsv", MSFragger_PSM_TSV_Reader)
psm_reader_provider.register_reader("msfragger", MSFragger_PSM_TSV_Reader)
psm_reader_provider.register_reader("msfragger_pepxml", MSFraggerPepXML)
psm_reader_provider.register_reader("msfragger_pepxml", MSFraggerPepXMLReader)
17 changes: 5 additions & 12 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _load_file(self, filename: str) -> pd.DataFrame:
def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""pFind-specific preprocessing of output data."""
df.fillna("", inplace=True)
df = df[df.Sequence != ""]
df = df[df["Sequence"] != ""]
df[PsmDfCols.RAW_NAME] = df["File_Name"].str.split(".").apply(lambda x: x[0])
df["Proteins"] = df["Proteins"].apply(parse_pfind_protein)
return df
Expand All @@ -119,23 +119,16 @@ def _translate_decoy(self) -> None:
).astype(np.int8)

def _translate_score(self) -> None:
"""Translate pFind pvalue to AlphaBase score: the larger the better."""
self._psm_df[PsmDfCols.SCORE] = -np.log(
self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100
)

def _load_modifications(self, origin_df: pd.DataFrame) -> None:
if len(origin_df) == 0:
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
return
mods, mod_sites = zip(*origin_df["Modification"].apply(get_pFind_mods))

(self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*origin_df["Modification"].apply(get_pFind_mods)
)

self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply(
translate_pFind_mod
)
self._psm_df[PsmDfCols.MODS] = [translate_pFind_mod(mod) for mod in mods]
self._psm_df[PsmDfCols.MOD_SITES] = mod_sites


def register_readers() -> None:
Expand Down
Loading
Loading