From 0c9350998226ac973343ab5af1c670d0b948c313 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 31 Jul 2024 16:14:06 +0200 Subject: [PATCH 1/3] =?UTF-8?q?Refactor=20parsing=20of=20spectrum=20data:?= =?UTF-8?q?=20-=20Clearer=20logging=20when=20parsing=20precursor=20info=20?= =?UTF-8?q?from=20spectrum=20files=20-=20Always=20check=20if=20PSMs=20matc?= =?UTF-8?q?h=20with=20spectra=20based=20on=20observed=20precursor=20m/z=20?= =?UTF-8?q?(if=20available=20in=20PSM=20list)=20-=20Always=20raise=20error?= =?UTF-8?q?=20if=20not=20all=20PSMs=20can=20be=20found=20in=20spectrum=20f?= =?UTF-8?q?ile(s),=20before=20MS=C2=B2PIP=20-=20Provide=20example=20PSM=20?= =?UTF-8?q?IDs=20from=20both=20PSM=20and=20spectrum=20file=20when=20matchi?= =?UTF-8?q?ng=20fails.=20-=20Move=20all=20code=20to=20parse=5Fspectra?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ms2rescore/core.py | 73 +++-------- ms2rescore/feature_generators/base.py | 6 + ms2rescore/feature_generators/deeplc.py | 11 +- ms2rescore/feature_generators/im2deep.py | 3 + ms2rescore/feature_generators/ionmob.py | 3 + ms2rescore/feature_generators/maxquant.py | 37 ++++-- ms2rescore/feature_generators/ms2pip.py | 3 + ms2rescore/parse_spectra.py | 142 ++++++++++++++++++--- tests/test_parse_spectra.py | 148 ++++++++++++++++++++-- 9 files changed, 322 insertions(+), 104 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 170f1038..d45c993e 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -3,7 +3,6 @@ from multiprocessing import cpu_count from typing import Dict, Optional -import numpy as np import psm_utils.io from mokapot.dataset import LinearPsmDataset from psm_utils import PSMList @@ -11,7 +10,7 @@ from ms2rescore import exceptions from ms2rescore.feature_generators import FEATURE_GENERATORS from ms2rescore.parse_psms import parse_psms -from ms2rescore.parse_spectra import get_missing_values +from ms2rescore.parse_spectra import add_precursor_values from 
ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence @@ -62,20 +61,28 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) # Add missing precursor info from spectrum file if needed - psm_list = _fill_missing_precursor_info(psm_list, config) + available_ms_data = add_precursor_values( + psm_list, config["spectrum_path"], config["spectrum_id_pattern"] + ) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): - # TODO: Handle this somewhere else, more generally? - if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all(): - logger.warning( - "MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping " - "this feature generator." - ) - continue + # Compile configuration conf = config.copy() conf.update(fgen_config) fgen = FEATURE_GENERATORS[fgen_name](**conf) + + # Check if required MS data is available + missing_ms_data = fgen.required_ms_data - available_ms_data + if missing_ms_data: + logger.warning( + f"Skipping feature generator {fgen_name} because required MS data is missing: " + f"{missing_ms_data}. Ensure that the required MS data is present in the input " + "files or disable the feature generator." 
+ ) + continue + + # Add features fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) @@ -102,6 +109,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: # Write feature names to file _write_feature_names(feature_names, output_file_root) + # Rename PSMs to USIs if requested if config["rename_to_usi"]: logging.debug(f"Creating USIs for {len(psm_list)} PSMs") psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list] @@ -173,51 +181,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.exception(e) -def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList: - """Fill missing precursor info from spectrum file if needed.""" - # Check if required - # TODO: avoid hard coding feature generators in some way - rt_required = ("deeplc" in config["feature_generators"]) and any( - v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"] - ) - im_required = ( - "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] - ) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"]) - logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - # Add missing values - if rt_required or im_required: - logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required) - - # Check if values are now present - for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]: - if required and ( - 0.0 in psm_list[value_name] - or None in psm_list[value_name] - or np.isnan(psm_list[value_name]).any() - ): - if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]): - raise exceptions.MissingValuesError( - f"Could not find any '{value_name}' values in PSM or 
spectrum files. Disable " - f"feature generators that require '{value_name}' or ensure that the values are " - "present in the input files." - ) - else: - missing_value_psms = psm_list[ - [v is None or np.isnan(v) for v in psm_list[value_name]] - ] - logger.warning( - f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. " - "These PSMs will be removed." - ) - psm_list = psm_list[ - [v is not None and not np.isnan(v) for v in psm_list[value_name]] - ] - - return psm_list - - def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList: """Filter PSMs by rank.""" psm_list.set_ranks(lower_score_better=lower_score_better) diff --git a/ms2rescore/feature_generators/base.py b/ms2rescore/feature_generators/base.py index d76c3be1..cf63ad2c 100644 --- a/ms2rescore/feature_generators/base.py +++ b/ms2rescore/feature_generators/base.py @@ -1,11 +1,17 @@ from abc import ABC, abstractmethod +from typing import Set from psm_utils import PSMList +from ms2rescore.parse_spectra import MSDataType + class FeatureGeneratorBase(ABC): """Base class from which all feature generators must inherit.""" + # List of required MS data types for feature generation + required_ms_data: Set[MSDataType] = set() + def __init__(self, *args, **kwargs) -> None: super().__init__() diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 30bea716..6171135e 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -27,6 +27,7 @@ from psm_utils import PSMList from ms2rescore.feature_generators.base import FeatureGeneratorBase +from ms2rescore.parse_spectra import MSDataType os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) @@ -35,6 +36,8 @@ class DeepLCFeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" + required_ms_data = {MSDataType.retention_time} + def __init__( self, *args, @@ -138,9 +141,11 @@ 
def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 552b7280..c693d0f9 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -22,6 +22,7 @@ from psm_utils import PSMList from ms2rescore.feature_generators.base import FeatureGeneratorBase +from ms2rescore.parse_spectra import MSDataType os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) @@ -30,6 +31,8 @@ class IM2DeepFeatureGenerator(FeatureGeneratorBase): """IM2Deep collision cross section feature generator.""" + required_ms_data = {MSDataType.ion_mobility} + def __init__( self, *args, diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 7fa0c0a1..d6b4a882 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -24,6 +24,7 @@ from psm_utils import Peptidoform, PSMList from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.parse_spectra import MSDataType try: from ionmob import __file__ as ionmob_file @@ -55,6 +56,8 @@ class IonMobFeatureGenerator(FeatureGeneratorBase): """Ionmob collisional cross section (CCS)-based feature generator.""" + required_ms_data = {MSDataType.ion_mobility} + def __init__( self, *args, diff --git a/ms2rescore/feature_generators/maxquant.py b/ms2rescore/feature_generators/maxquant.py index 51105645..abc86705 100644 --- 
a/ms2rescore/feature_generators/maxquant.py +++ b/ms2rescore/feature_generators/maxquant.py @@ -23,6 +23,16 @@ class MaxQuantFeatureGenerator(FeatureGeneratorBase): """Generate MaxQuant-derived features.""" + available_features = [ + "mean_error_top7", + "sq_mean_error_top7", + "stdev_error_top7", + "ln_explained_ion_current", + "ln_nterm_ion_current_ratio", + "ln_cterm_ion_current_ratio", + "ln_ms2_ion_current", + ] + def __init__(self, *args, **kwargs) -> None: """ Generate MaxQuant-derived features. @@ -39,22 +49,15 @@ def __init__(self, *args, **kwargs) -> None: """ super().__init__(*args, **kwargs) + self._feature_names = self.available_features[:] # Copy list @property def feature_names(self) -> List[str]: - return [ - "mean_error_top7", - "sq_mean_error_top7", - "stdev_error_top7", - "ln_explained_ion_current", - "ln_nterm_ion_current_ratio", - "ln_cterm_ion_current_ratio", - "ln_ms2_ion_current", - ] + return self._feature_names def add_features(self, psm_list: PSMList): """ - Add MS²PIP-derived features to PSMs. + Add MaxQuant-derived features to PSMs. Parameters ---------- @@ -62,7 +65,14 @@ def add_features(self, psm_list: PSMList): PSMs to add features to. """ - logger.info("Adding MaxQuant-derived features to PSMs.") + # Check if all PSMs are from MaxQuant + if not self._all_psms_from_maxquant(psm_list): + self._feature_names = [] # Set feature names to empty list to indicate none added + logger.warning("Not all PSMs are from MaxQuant. 
Skipping MaxQuant feature generation.") + return + else: + self._feature_names = self.available_features # Reset feature names + logger.info("Adding MaxQuant-derived features to PSMs.") # Infer mass deviations column name for column_name in [ @@ -90,6 +100,11 @@ def add_features(self, psm_list: PSMList): for psm in psm_list: psm["rescoring_features"].update(self._compute_features(psm["metadata"])) + @staticmethod + def _all_psms_from_maxquant(psm_list): + """Check if the PSMs are from MaxQuant.""" + return (psm_list["source"] == "msms").all() + def _compute_features(self, psm_metadata): """Compute features from derived from intensities and mass errors.""" features = {} diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 3882b3fc..fefad5e3 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -38,6 +38,7 @@ from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.parse_spectra import MSDataType from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -46,6 +47,8 @@ class MS2PIPFeatureGenerator(FeatureGeneratorBase): """Generate MS²PIP-based features.""" + required_ms_data = {MSDataType.ms2_spectra} + def __init__( self, *args, diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 2b2c1b5f..171c4045 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -2,48 +2,148 @@ import logging import re +from enum import Enum from itertools import chain +from typing import Optional, Set, Tuple +import numpy as np from ms2rescore_rs import get_precursor_info from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreError from ms2rescore.utils import infer_spectrum_path -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) -def get_missing_values( - psm_list: PSMList, config: dict, 
rt_required: bool = False, im_required: bool = False -): - """Get missing RT/IM features from spectrum file.""" +class MSDataType(str, Enum): + """Enum for MS data types required for feature generation.""" + + retention_time = "retention time" + ion_mobility = "ion mobility" + precursor_mz = "precursor m/z" + ms2_spectra = "MS2 spectra" + + # Mimic behavior of StrEnum (Python >=3.11) + def __str__(self): + return self.value + + +def add_precursor_values( + psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: Optional[str] = None +) -> Set[MSDataType]: + """ + Add precursor m/z, retention time, and ion mobility values to a PSM list. + + Parameters + ---------- + psm_list + PSM list to add precursor values to. + spectrum_path + Path to the spectrum files. + spectrum_id_pattern + Regular expression pattern to extract spectrum IDs from file names. If provided, the + pattern must contain a single capturing group that matches the spectrum ID. Default is + None. + + Returns + ------- + available_ms_data + Set of available MS data types in the PSM list. + + """ + # Check if precursor values are missing in PSM list + rt_missing = any(v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]) + im_missing = any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"]) + mz_missing = any(v is None or v == 0 or np.isnan(v) for v in psm_list["precursor_mz"]) + + # Get precursor values from spectrum files + LOGGER.info("Parsing precursor info from spectrum files...") + mz, rt, im = _get_precursor_values(psm_list, spectrum_path, spectrum_id_pattern) + mz_found, rt_found, im_found = np.all(mz != 0.0), np.all(rt != 0.0), np.all(im != 0.0) + # ms2rescore_rs always returns 0.0 for missing values + + # Update PSM list with missing precursor values + if rt_missing and rt_found: + LOGGER.debug("Missing retention time values in PSM list. 
Updating from spectrum files.") + psm_list["retention_time"] = rt + if im_missing and im_found: + LOGGER.debug("Missing ion mobility values in PSM list. Updating from spectrum files.") + psm_list["ion_mobility"] = im + if mz_missing and mz_found: + LOGGER.debug("Missing precursor m/z values in PSM list. Updating from spectrum files.") + psm_list["precursor_mz"] = mz + else: + # Check if precursor m/z values are consistent between PSMs and spectrum files + mz_diff = np.abs(psm_list["precursor_mz"] - mz) + if np.mean(mz_diff) > 1e-2: + LOGGER.warning( + "Mismatch between precursor m/z values in PSM list and spectrum files (mean " + "difference exceeds 0.01 Da). Please ensure that the correct spectrum files are " + "provided and that the `spectrum_id_pattern` and `psm_id_pattern` options are " + "configured correctly. See " + "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " + "for more information." + ) + + # Return available MS data types + available_ms_data = { + MSDataType.ms2_spectra, # Assume MS2 spectra are always present + MSDataType.retention_time if not rt_missing or rt_found else None, + MSDataType.ion_mobility if not im_missing or im_found else None, + MSDataType.precursor_mz if not mz_missing or mz_found else None, + } + available_ms_data.discard(None) + + return available_ms_data + + +def _get_precursor_values( + psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: str +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Get precursor m/z, RT, and IM from spectrum files.""" + # Iterate over different runs in PSM list + precursor_dict = dict() psm_dict = psm_list.get_psm_dict() for runs in psm_dict.values(): - for run, psms in runs.items(): + for run_name, psms in runs.items(): psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_file = infer_spectrum_path(config["spectrum_path"], run) - logger.debug("Reading spectrum file: '%s'", spectrum_file) + spectrum_file = 
infer_spectrum_path(spectrum_path, run_name) + + LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) precursors = get_precursor_info(str(spectrum_file)) - if config["spectrum_id_pattern"]: - spectrum_id_pattern = re.compile(config["spectrum_id_pattern"]) + # Parse spectrum IDs with regex pattern if provided + if spectrum_id_pattern: + compiled_pattern = re.compile(spectrum_id_pattern) precursors = { - spectrum_id_pattern.search(spectrum_id).group(1): precursor + compiled_pattern.search(spectrum_id).group(1): precursor for spectrum_id, precursor in precursors.items() } + # Ensure all PSMs have precursor values for psm in psm_list_run: - try: - if rt_required: - psm.retention_time = precursors[psm.spectrum_id].rt - if im_required: - psm.ion_mobility = precursors[psm.spectrum_id].im - if not psm.precursor_mz: - psm.precursor_mz = precursors[psm.spectrum_id].mz - except KeyError as e: + if psm.spectrum_id not in precursors: raise SpectrumParsingError( - f"Could not extract missing RT/IM values from spectrum file for run {run}." - ) from e + "Mismatch between PSM and spectrum file IDs. Could not find precursor values " + f"for PSM with ID {psm.spectrum_id} in run {run_name}.\n" + "Please check that the `spectrum_id_pattern` and `psm_id_pattern` options " + "are configured correctly. 
See " + "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra" + " for more information.\n" + f"Example ID from PSM file: {psm.spectrum_id}\n" + f"Example ID from spectrum file: {list(precursors.keys())[0]}" + ) + + # Store precursor values in dictionary + precursor_dict[run_name] = precursors + + # Reshape precursor values into arrays matching PSM list + mzs = np.array([precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list]) + rts = np.array([precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list]) + ims = np.array([precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list]) + + return mzs, rts, ims class SpectrumParsingError(MS2RescoreError): diff --git a/tests/test_parse_spectra.py b/tests/test_parse_spectra.py index 4dffc9dc..e37f33ae 100644 --- a/tests/test_parse_spectra.py +++ b/tests/test_parse_spectra.py @@ -1,23 +1,143 @@ +from unittest.mock import MagicMock, patch + +import numpy as np import pytest from psm_utils import PSM, PSMList -from ms2rescore.parse_spectra import get_missing_values +from ms2rescore.feature_generators.base import MSDataType +from ms2rescore.parse_spectra import ( + SpectrumParsingError, + _get_precursor_values, + add_precursor_values, +) -def test_get_missing_values(): +@pytest.fixture +def mock_psm_list(): psm_list = PSMList( psm_list=[ - PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"), + PSM( + peptidoform="PEPTIDE/2", + run="run1", + spectrum_id="spectrum1", + retention_time=None, + ion_mobility=None, + precursor_mz=None, + ), + PSM( + peptidoform="PEPTIDE/2", + run="run1", + spectrum_id="spectrum2", + retention_time=None, + ion_mobility=None, + precursor_mz=None, + ), ] ) - get_missing_values( - psm_list, - config={ - "spectrum_path": "tests/test_data/test.mgf", - "spectrum_id_pattern": "peptide: (.*)", - }, - rt_required=True, - im_required=True, - ) - assert psm_list[0].retention_time == pytest.approx(0.853, 0.001) - assert psm_list[0].ion_mobility == 
pytest.approx(42.42, 0.01) + return psm_list + + +@pytest.fixture +def mock_precursor_info(): + return { + "spectrum1": MagicMock(mz=529.7935187324, rt=10.5, im=1.0), + "spectrum2": MagicMock(mz=651.83, rt=12.3, im=1.2), + } + + +@pytest.fixture +def mock_precursor_info_missing_im(): + return { + "spectrum1": MagicMock(mz=529.7935187324, rt=10.5, im=0.0), + "spectrum2": MagicMock(mz=651.83, rt=12.3, im=0.0), + } + + +@pytest.fixture +def mock_precursor_info_incomplete(): + return { + "spectrum1": MagicMock(mz=529.7935187324, rt=10.5, im=1.0), + # "spectrum2" is missing + } + + +@patch("ms2rescore.parse_spectra.get_precursor_info") +@patch("ms2rescore.parse_spectra.infer_spectrum_path") +def test_add_precursor_values( + mock_infer_spectrum_path, mock_get_precursor_info, mock_psm_list, mock_precursor_info +): + mock_infer_spectrum_path.return_value = "test_data/test_spectrum_file.mgf" + mock_get_precursor_info.return_value = mock_precursor_info + + available_ms_data = add_precursor_values(mock_psm_list, "test_data") + + assert MSDataType.retention_time in available_ms_data + assert MSDataType.ion_mobility in available_ms_data + assert MSDataType.precursor_mz in available_ms_data + + for psm in mock_psm_list: + assert psm.retention_time is not None + assert psm.ion_mobility is not None + assert psm.precursor_mz is not None + + +@patch("ms2rescore.parse_spectra.get_precursor_info") +@patch("ms2rescore.parse_spectra.infer_spectrum_path") +def test_add_precursor_values_missing_im( + mock_infer_spectrum_path, + mock_get_precursor_info, + mock_psm_list, + mock_precursor_info_missing_im, +): + mock_infer_spectrum_path.return_value = "test_data/test_spectrum_file.mgf" + mock_get_precursor_info.return_value = mock_precursor_info_missing_im + + available_ms_data = add_precursor_values(mock_psm_list, "test_data") + + assert MSDataType.retention_time in available_ms_data + assert MSDataType.ion_mobility not in available_ms_data + assert MSDataType.precursor_mz in 
available_ms_data + + for psm in mock_psm_list: + assert psm.retention_time is not None + assert psm.ion_mobility is None + assert psm.precursor_mz is not None + + +@patch("ms2rescore.parse_spectra.get_precursor_info") +@patch("ms2rescore.parse_spectra.infer_spectrum_path") +def test_get_precursor_values( + mock_infer_spectrum_path, mock_get_precursor_info, mock_psm_list, mock_precursor_info +): + mock_infer_spectrum_path.return_value = "test_data/test_spectrum_file.mgf" + mock_get_precursor_info.return_value = mock_precursor_info + + mzs, rts, ims = _get_precursor_values(mock_psm_list, "test_data", None) + + expected_mzs = np.array([529.7935187324, 651.83]) + expected_rts = np.array([10.5, 12.3]) + expected_ims = np.array([1.0, 1.2]) + + np.testing.assert_array_equal(mzs, expected_mzs) + np.testing.assert_array_equal(rts, expected_rts) + np.testing.assert_array_equal(ims, expected_ims) + + +@patch("ms2rescore.parse_spectra.get_precursor_info") +@patch("ms2rescore.parse_spectra.infer_spectrum_path") +def test_get_precursor_values_missing_spectrum_id( + mock_infer_spectrum_path, + mock_get_precursor_info, + mock_psm_list, + mock_precursor_info_incomplete, +): + mock_infer_spectrum_path.return_value = "test_data/test_spectrum_file.mgf" + mock_get_precursor_info.return_value = mock_precursor_info_incomplete + + with pytest.raises(SpectrumParsingError): + _get_precursor_values(mock_psm_list, "test_data", None) + + +def test_spectrum_parsing_error(): + with pytest.raises(SpectrumParsingError): + raise SpectrumParsingError("Test error message") From 41ea6c802607ce1663190efc8cce84526cd73ed8 Mon Sep 17 00:00:00 2001 From: Ralf Gabriels Date: Thu, 5 Dec 2024 09:05:34 +0000 Subject: [PATCH 2/3] Implement requested changes (.copy; use generators) --- ms2rescore/feature_generators/maxquant.py | 2 +- ms2rescore/parse_spectra.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ms2rescore/feature_generators/maxquant.py 
b/ms2rescore/feature_generators/maxquant.py index abc86705..45205c2a 100644 --- a/ms2rescore/feature_generators/maxquant.py +++ b/ms2rescore/feature_generators/maxquant.py @@ -49,7 +49,7 @@ def __init__(self, *args, **kwargs) -> None: """ super().__init__(*args, **kwargs) - self._feature_names = self.available_features[:] # Copy list + self._feature_names = self.available_features.copy() @property def feature_names(self) -> List[str]: diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 171c4045..8626a0eb 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -139,9 +139,9 @@ def _get_precursor_values( precursor_dict[run_name] = precursors # Reshape precursor values into arrays matching PSM list - mzs = np.array([precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list]) - rts = np.array([precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list]) - ims = np.array([precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list]) + mzs = np.array(precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list) + rts = np.array(precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list) + ims = np.array(precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list) return mzs, rts, ims From b417a6c6320b69b97e96658c7d6bf40987c2d180 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 5 Dec 2024 12:02:52 +0100 Subject: [PATCH 3/3] Use np.fromiter for generators --- ms2rescore/parse_spectra.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 8626a0eb..7794a33e 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -139,9 +139,9 @@ def _get_precursor_values( precursor_dict[run_name] = precursors # Reshape precursor values into arrays matching PSM list - mzs = np.array(precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list) - rts = np.array(precursor_dict[psm.run][psm.spectrum_id].rt for psm 
in psm_list) - ims = np.array(precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list) + mzs = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].mz for psm in psm_list), float) + rts = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].rt for psm in psm_list), float) + ims = np.fromiter((precursor_dict[psm.run][psm.spectrum_id].im for psm in psm_list), float) return mzs, rts, ims