Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spectrum feature generator #178

Draft
wants to merge 32 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
fdceeba
initial commit
ArthurDeclercq Feb 24, 2024
5374ed8
finalize ms2 feature generation
ArthurDeclercq Feb 25, 2024
60207a3
add rustyms
ArthurDeclercq Feb 25, 2024
ae39844
remove exit statement fixed IM required value
ArthurDeclercq Feb 26, 2024
9b98c4d
change logger.info to debug
ArthurDeclercq Feb 26, 2024
5e45756
added profile decorator to get timings for functions
ArthurDeclercq Feb 26, 2024
304777c
removed profile as standard rescore debug statement
ArthurDeclercq Feb 26, 2024
95ee475
added new basic features
ArthurDeclercq Feb 26, 2024
73f4573
fixes for ms2 feature generator, removed multiprocessing
ArthurDeclercq Feb 26, 2024
947233e
return empty list on parsing error with rustyms, removed multiprocessing
ArthurDeclercq Feb 28, 2024
24ce565
add deeplc_calibration psm set
ArthurDeclercq Mar 15, 2024
114b006
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 17, 2024
33c38b0
remove unused import
ArthurDeclercq Apr 17, 2024
40425c7
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 19, 2024
b810b8c
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 19, 2024
69b5d1a
Merge tag 'main' of https://github.com/compomics/ms2rescore into spec…
ArthurDeclercq Aug 16, 2024
6e2d102
Merge pull request #177 from compomics/main
ArthurDeclercq Aug 16, 2024
11fdc51
integrate mumble into ms2branch
ArthurDeclercq Aug 21, 2024
3140c44
Merge remote-tracking branch 'origin/main' into spectrum-feature-gene…
ArthurDeclercq Sep 23, 2024
883169a
temp removal of sage features before rescoring
ArthurDeclercq Sep 27, 2024
97865e7
Merge branch 'main' of https://github.com/compomics/ms2rescore into s…
ArthurDeclercq Sep 27, 2024
da39ae8
remove psm_file features when rescoring with mumble
ArthurDeclercq Nov 8, 2024
37fff28
linting
SamvPy Nov 19, 2024
e8b59f3
add hyperscore calculation
SamvPy Nov 19, 2024
c51cd34
calibration fixes
ArthurDeclercq Nov 21, 2024
295e37f
changes for mumble implementation
ArthurDeclercq Nov 21, 2024
909860d
change openms peptide formatting
SamvPy Nov 22, 2024
c5902c2
add mumble psm filtering functionality
ArthurDeclercq Nov 22, 2024
6eaceb2
Merge branch 'spectrum-feature-generator' of https://github.com/compo…
ArthurDeclercq Nov 22, 2024
5ce55f5
remove pyopenms dependency for hyperscore calculation
SamvPy Nov 22, 2024
986c5f6
fix spectrum_id accession
ArthurDeclercq Nov 22, 2024
bbecf6a
Merge branch 'spectrum-feature-generator' of https://github.com/compo…
ArthurDeclercq Nov 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
from ms2rescore.rescoring_engines.mokapot import (
add_peptide_confidence,
add_psm_confidence,
)
from ms2rescore.utils import filter_mumble_psms

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -99,6 +103,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
)
psm_list = psm_list[psms_with_features]

if "mumble" in config["psm_generator"]:
# Remove PSMs that have fewer matched ions than the original hit
psm_list = filter_mumble_psms(psm_list)

# Write feature names to file
_write_feature_names(feature_names, output_file_root)

Expand Down Expand Up @@ -248,7 +256,10 @@ def _write_feature_names(feature_names, output_file_root):
def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
"""Log #PSMs identified before rescoring."""
id_psms_before = (
(psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
(psm_list["qvalue"] <= 0.01)
& (psm_list["rank"] <= max_rank)
& (~psm_list["is_decoy"])
& ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it might be quite inefficient; however, I'm not sure it can be improved significantly, given that original_psm is stored in the metadata dict. Keeping it as a Series instead of a list might be better, or adding it as a column to the dataframe.

).sum()
logger.info(
f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
Expand Down
6 changes: 6 additions & 0 deletions ms2rescore/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError):
"""Error while rescoring PSMs."""

pass


class ParseSpectrumError(MS2RescoreError):
    """Error while parsing spectrum data."""

    pass
4 changes: 3 additions & 1 deletion ms2rescore/feature_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

from ms2rescore.feature_generators.basic import BasicFeatureGenerator
from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator

FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
Expand All @@ -16,4 +17,5 @@
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
"im2deep": IM2DeepFeatureGenerator,
"ms2": MS2FeatureGenerator,
}
13 changes: 13 additions & 0 deletions ms2rescore/feature_generators/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None:
charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list])
precursor_mzs = psm_list["precursor_mz"]
scores = psm_list["score"]
peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list])

has_charge = None not in charge_states
has_mz = None not in precursor_mzs and has_charge
Expand All @@ -74,13 +75,25 @@ def add_features(self, psm_list: PSMList) -> None:
if has_score:
self._feature_names.append("search_engine_score")

if has_mz and has_charge:
experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812)
theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812)
mass_error = experimental_mass - theoretical_mass
self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"])

self._feature_names.append("pep_len")

for i, psm in enumerate(psm_list):
psm.rescoring_features.update(
dict(
**{"charge_n": charge_n[i]} if has_charge else {},
**charge_one_hot[i] if has_charge else {},
**{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {},
**{"search_engine_score": scores[i]} if has_score else {},
**{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {},
**{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {},
**{"mass_error": mass_error[i]} if has_mz and has_charge else {},
**{"pep_len": peptide_lengths[i]},
)
)

Expand Down
18 changes: 12 additions & 6 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import numpy as np
from psm_utils import PSMList
from psm_utils.io import read_file

from ms2rescore.feature_generators.base import FeatureGeneratorBase

Expand All @@ -40,6 +41,7 @@ def __init__(
*args,
lower_score_is_better: bool = False,
calibration_set_size: Union[int, float, None] = None,
calibration_set: Union[str, None] = None,
processes: int = 1,
**kwargs,
) -> None:
Expand Down Expand Up @@ -71,6 +73,7 @@ def __init__(

self.lower_psm_score_better = lower_score_is_better
self.calibration_set_size = calibration_set_size
self.calibration_set = calibration_set
self.processes = processes
self.deeplc_kwargs = kwargs or {}

Expand Down Expand Up @@ -120,7 +123,6 @@ def add_features(self, psm_list: PSMList) -> None:
# Run DeepLC for each spectrum file
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

for runs in psm_dict.values():
# Reset DeepLC predictor for each collection of runs
self.deeplc_predictor = None
Expand All @@ -138,12 +140,13 @@ def add_features(self, psm_list: PSMList) -> None:
)

# Disable wild logging to stdout by Tensorflow, unless in debug mode
with contextlib.redirect_stdout(
open(os.devnull, "w", encoding="utf-8")
) if not self._verbose else contextlib.nullcontext():
with (
contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8"))
if not self._verbose
else contextlib.nullcontext()
):
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

psm_list_calibration = self._get_calibration_psms(psm_list_run)
logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
self.deeplc_predictor = self.DeepLC(
Expand Down Expand Up @@ -193,7 +196,10 @@ def add_features(self, psm_list: PSMList) -> None:

def _get_calibration_psms(self, psm_list: PSMList):
"""Get N best scoring target PSMs for calibration."""
psm_list_targets = psm_list[~psm_list["is_decoy"]]
psm_list_targets = psm_list[
~psm_list["is_decoy"]
& [metadata.get("original_psm", True) for metadata in psm_list["metadata"]]
]
if self.calibration_set_size:
n_psms = self._get_number_of_calibration_psms(psm_list_targets)
indices = np.argsort(psm_list_targets["score"])
Expand Down
3 changes: 2 additions & 1 deletion ms2rescore/feature_generators/im2deep.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p
identified_psms = psm_list_df[
(psm_list_df["qvalue"] < 0.01)
& (~psm_list_df["is_decoy"])
& (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep
& (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep
& ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]])
]
calibration_psms = identified_psms[
identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)
Expand Down
11 changes: 9 additions & 2 deletions ms2rescore/feature_generators/ionmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,19 @@
import tensorflow as tf
from psm_utils import Peptidoform, PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
from ms2rescore.feature_generators.base import (
FeatureGeneratorBase,
FeatureGeneratorException,
)

try:
from ionmob import __file__ as ionmob_file
from ionmob.preprocess.data import to_tf_dataset_inference
from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs
from ionmob.utilities.chemistry import (
VARIANT_DICT,
calculate_mz,
reduced_mobility_to_ccs,
)
from ionmob.utilities.tokenization import tokenizer_from_json
from ionmob.utilities.utility import get_ccs_shift
except ImportError:
Expand Down
Loading