From a0928fc72bee00515341df8a9eb7a54a2a98fcff Mon Sep 17 00:00:00 2001 From: SamvPy Date: Thu, 3 Oct 2024 13:19:57 +0200 Subject: [PATCH 1/4] Add mokapot model saving functionality --- ms2rescore/package_data/config_default.json | 3 ++- ms2rescore/package_data/config_default_tims.json | 3 ++- ms2rescore/package_data/config_schema.json | 5 +++++ ms2rescore/rescoring_engines/mokapot.py | 8 +++++++- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 805be2c..5775b89 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -16,7 +16,8 @@ "mokapot": { "train_fdr": 0.01, "write_weights": true, - "write_txt": true + "write_txt": true, + "save_models": true } }, "config_file": null, diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json index 89913cc..6bd78c3 100644 --- a/ms2rescore/package_data/config_default_tims.json +++ b/ms2rescore/package_data/config_default_tims.json @@ -16,7 +16,8 @@ "rescoring_engine": { "mokapot": { "write_weights": true, - "write_txt": true + "write_txt": true, + "save_models": true } }, "psm_file": null diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index e9b0379..31010b8 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -300,6 +300,11 @@ "description": "Write Mokapot results to a text file", "type": "boolean", "default": false + }, + "save_models": { + "description": "Save Mokapot models to a pickle file", + "type": "boolean", + "default": false } } }, diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index 967c40b..6860a8c 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -19,6 +19,7 @@ """ +import os import logging import re from typing import Any, Dict, List, Optional, Tuple @@ -29,7 +30,7 @@ import psm_utils from mokapot.brew import brew from mokapot.dataset import LinearPsmDataset -from mokapot.model import PercolatorModel +from mokapot.model import Model, PercolatorModel, load_model from pyteomics.mass import nist_mass from ms2rescore.exceptions import RescoringError @@ -45,6 +46,7 @@ def rescore( train_fdr: float = 0.01, write_weights: bool = False, write_txt: bool = False, + save_models: bool = False, protein_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> None: @@ -123,6 +125,10 @@ def rescore( if write_txt: confidence_results.to_txt(file_root=output_file_root, decoys=True) + if save_models: + for i, model in enumerate(models): + model.save(output_file_root + f".mokapot.model_{i}.pkl") + def convert_psm_list( psm_list: psm_utils.PSMList, From c6ef3d4a4db4a96602c8be6f60b6939c1b7c21ac Mon Sep 17 00:00:00 2001 From: SamvPy Date: Thu, 3 Oct 2024 13:26:44 +0200 Subject: [PATCH 2/4] delete unused imports --- ms2rescore/rescoring_engines/mokapot.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index 6860a8c..36a3223 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -19,7 +19,6 @@ """ -import os import logging import re from typing import Any, Dict, List, Optional, Tuple @@ -30,7 +29,7 @@ import psm_utils from mokapot.brew import brew from mokapot.dataset import LinearPsmDataset -from mokapot.model import Model, PercolatorModel, load_model +from mokapot.model import PercolatorModel from pyteomics.mass import nist_mass from ms2rescore.exceptions import RescoringError From f72210eafaf083297cf80f2459afaa880db1917f Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 22 Oct 2024 09:35:16 +0200 Subject: [PATCH 3/4] add fgen to config schema --- ms2rescore/package_data/config_schema.json | 67 ++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index 31010b8..19e1f31 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -183,6 +183,11 @@ "description": "Write a txt report using cProfile for profiling", "type": "boolean", "default": false + }, + "rescoring_features": { + "description": "Features to include for rescoring", + "type": "array", + "default": ["all"] } } } @@ -265,6 +270,68 @@ } } }, + "peak_fgen": { + "$ref": "#/definitions/feature_generator", + "description": "Peak feature generator", + "type": "object", + "additionalProperties": true, + "properties": { + "ion_types": { + "description": "Type of ions to try and annotate", + "type": "array", + "default": [ + "a1", + "a2", + "b1", + "b2", + "c1", + "c2", + "y1", + "y2", + "z1", + "z2", + "x1", + "x2", + "p1", + "p2", + "p3" + ] + }, + "neutral_losses": { + "description": "Neutral losses to include for annotation", + "type": "array", + "default": [ + "", + "-H2O1" + ] + }, + "max_workers": { + "description": "Parallel processes to use for annotation", + "type": "integer", + "default":-1 + }, + "ion_types_evidence": { + "description": "Ion types to consider for fragmentation site evidence", + "type": "array", + "default": ["b1", "y1", "b2", "y2"] + }, + "neutral_losses_evidence": { + "description": "Neutral losses to consider for fragmentation site evidence", + "type": "array", + "default": ["", "-H2O1"] + }, + "fragment_tol_mass": { + "description": "Fragmentation tolerance mass for hyperscore calculation", + "type": "integer", + "default": 20 + }, + "fragment_tol_mode": { + "description": "Fragmentation tolerance mode", + "type": "string", + "default": "ppm" + } + } + }, "im2deep": { "$ref": "#/definitions/feature_generator", "description": "Ion mobility feature generator configuration using IM2Deep", From 53b050d4a69318342b31b7f33d04fb35d30d9ecb Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 22 Oct 2024 09:44:04 +0200 Subject: [PATCH 4/4] generalize so it works on denovo --- ms2rescore/feature_generators/__init__.py | 2 + ms2rescore/feature_generators/deeplc.py | 51 ++++++++++++++++----- ms2rescore/package_data/config_default.json | 39 ++++++++++++++-- ms2rescore/parse_psms.py | 31 ++++++++----- ms2rescore/utils.py | 2 +- 5 files changed, 98 insertions(+), 27 deletions(-) diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 52440a5..b934be4 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -8,6 +8,7 @@ from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator +from peak_pack.feature_generators import PeakFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -16,4 +17,5 @@ "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, + "peak_fgen": PeakFeatureGenerator } diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index bea210c..f17ef46 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -109,10 +109,8 @@ def feature_names(self) -> List[str]: "rt_diff_best", ] - def add_features(self, psm_list: PSMList) -> None: - """Add DeepLC-derived features to PSMs.""" - - logger.info("Adding DeepLC-derived features to PSMs.") + def retrain_deeplc(self, psm_list: PSMList) -> None: + logger.info("Transfer learning of DeepLC model") # Get easy-access nested version of PSMList psm_dict = psm_list.get_psm_dict() @@ -120,19 +118,15 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) + assert total_runs == 1 + # Only one iteration for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None self.selected_model = None for run, psms in runs.items(): - peptide_rt_diff_dict = defaultdict( - lambda: { - "observed_retention_time_best": np.Inf, - "predicted_retention_time_best": np.Inf, - "rt_diff_best": np.Inf, - } - ) + logger.info( f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." ) @@ -163,8 +157,41 @@ def add_features(self, psm_list: PSMList) -> None: "calibration of first run. Using this model (after new " "calibrations) for the remaining runs." ) + + def add_features(self, psm_list: PSMList) -> None: + """Add DeepLC-derived features to PSMs.""" + + logger.info("Adding DeepLC-derived features to PSMs.") + + # Get easy-access nested version of PSMList + psm_dict = psm_list.get_psm_dict() + + # Run DeepLC for each spectrum file + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + assert total_runs == 1 + assert self.deeplc_predictor is not None + + for runs in psm_dict.values(): + for run, psms in runs.items(): + peptide_rt_diff_dict = defaultdict( + lambda: { + "observed_retention_time_best": np.Inf, + "predicted_retention_time_best": np.Inf, + "rt_diff_best": np.Inf, + } + ) + logger.info( + f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." + ) + + # Disable wild logging to stdout by Tensorflow, unless in debug mode + with contextlib.redirect_stdout( + open(os.devnull, "w", encoding="utf-8") + ) if not self._verbose else contextlib.nullcontext(): + # Make new PSM list for this run (chain PSMs per spectrum to flat list) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - logger.debug("Predicting retention times...") predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run)) observations = psm_list_run["retention_time"] rt_diffs_run = np.abs(predictions - observations) diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 5775b89..c3deb35 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -8,9 +8,37 @@ "ms2_tolerance": 0.02 }, "deeplc": { - "deeplc_retrain": false + "deeplc_retrain": true }, - "maxquant": {} + "maxquant": {}, + "peak_fgen": { + "ion_types": [ + "a1", + "a2", + "b1", + "b2", + "c1", + "c2", + "y1", + "y2", + "z1", + "z2", + "x1", + "x2", + "p1", + "p2", + "p3" + ], + "neutral_losses": [ + "", + "-H2O1" + ], + "max_workers": -1, + "ion_types_evidence": ["b1", "y1", "b2", "y2"], + "neutral_losses_evidence": ["", "-H2O1"], + "fragment_tol_mass": 20, + "fragment_tol_mode": "ppm" + } }, "rescoring_engine": { "mokapot": { @@ -41,6 +69,11 @@ "rename_to_usi": false, "fasta_file": null, "write_flashlfq": false, - "write_report": false + "write_report": false, + "rescoring_features": [ + "hyperscore", + "", + "" + ] } } diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 6a3c606..a6b941a 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: +def parse_psms(config: Dict, psm_list: Union[PSMList, None], recalculate_qvalues=True) -> PSMList: """ Parse PSMs and prepare for rescoring. @@ -43,8 +43,10 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: # Remove invalid AAs, find decoys, calculate q-values psm_list = _remove_invalid_aa(psm_list) - _find_decoys(psm_list, config["id_decoy_pattern"]) - _calculate_qvalues(psm_list, config["lower_score_is_better"]) + + if recalculate_qvalues: + _find_decoys(psm_list, config["id_decoy_pattern"]) + _calculate_qvalues(psm_list, config["lower_score_is_better"]) if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]: logger.debug("Parsing retention time and/or ion mobility from PSM identifier...") _parse_values_from_spectrum_id( @@ -53,14 +55,21 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: # Store scoring values for comparison later for psm in psm_list: - psm.provenance_data.update( - { - "before_rescoring_score": psm.score, - "before_rescoring_qvalue": psm.qvalue, - "before_rescoring_pep": psm.pep, - "before_rescoring_rank": psm.rank, - } - ) + if recalculate_qvalues: + psm.provenance_data.update( + { + "before_rescoring_score": psm.score, + "before_rescoring_qvalue": psm.qvalue, + "before_rescoring_pep": psm.pep, + "before_rescoring_rank": psm.rank, + } + ) + else: + psm.provenance_data.update( + { + "before_rescoring_score": psm.score, + } + ) logger.debug("Parsing modifications...") modifications_found = set( diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 3515893..a29416c 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -66,7 +66,7 @@ def infer_spectrum_path( ) # Match with file extension if not in resolved_path yet - if not _is_minitdf(resolved_path) and not re.match( + if not _is_minitdf(resolved_path) and not re.search( r"\.mgf$|\.mzml$|\.d$", resolved_path, flags=re.IGNORECASE ): for filename in glob(resolved_path + "*"):