From 6826a1ca58064c8e88930f4ab1bfaa26a6fb15f2 Mon Sep 17 00:00:00 2001 From: Daniela Klaproth-Andrade Date: Mon, 1 Jul 2024 20:47:12 +0200 Subject: [PATCH 01/51] migration to depthcharge v0.4.8 --- casanovo/casanovo.py | 8 +- casanovo/config.py | 12 + casanovo/config.yaml | 40 ++- casanovo/data/ms_io.py | 4 +- casanovo/denovo/dataloaders.py | 352 ++++++++++++------------ casanovo/denovo/model.py | 463 +++++++++++++++++++++----------- casanovo/denovo/model_runner.py | 175 +++++++----- casanovo/denovo/transformers.py | 173 ++++++++++++ pyproject.toml | 2 +- 9 files changed, 807 insertions(+), 422 deletions(-) create mode 100644 casanovo/denovo/transformers.py diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index fef73a9b..f3c9f19b 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -139,7 +139,7 @@ def main() -> None: "peak_path", required=True, nargs=-1, - type=click.Path(exists=True, dir_okay=False), + type=click.Path(exists=True, dir_okay=True), ) @click.option( "--evaluate", @@ -206,7 +206,7 @@ def sequence( "peak_path", required=True, nargs=-1, - type=click.Path(exists=True, dir_okay=False), + type=click.Path(exists=True, dir_okay=True), ) @click.argument( "fasta_path", @@ -266,7 +266,7 @@ def db_search( "train_peak_path", required=True, nargs=-1, - type=click.Path(exists=True, dir_okay=False), + type=click.Path(exists=True, dir_okay=True), ) @click.option( "-p", @@ -277,7 +277,7 @@ def db_search( """, required=False, multiple=True, - type=click.Path(exists=True, dir_okay=False), + type=click.Path(exists=True, dir_okay=True), ) def train( train_peak_path: Tuple[str], diff --git a/casanovo/config.py b/casanovo/config.py index e276e12d..f802a292 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -83,6 +83,18 @@ class Config: calculate_precision=bool, accelerator=str, devices=int, + lance_dir=str, + shuffle=bool, + buffer_size=int, + reverse_peptides=bool, + replace_isoleucine_with_leucine=bool, + accumulate_grad_batches=int, + gradient_clip_val=float, + gradient_clip_algorithm=str, + precision=str, + early_stopping_patience=int, + resume_training_from=str, + mskb_tokenizer=bool, ) def __init__(self, config_file: Optional[str] = None): diff --git a/casanovo/config.yaml b/casanovo/config.yaml index b7179347..5df107e7 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -84,6 +84,8 @@ tb_summarywriter: false log_metrics: false # How often to log optimizer parameters in steps log_every_n_steps: 50 +# Path to save lance instances +lance_dir: # Model validation and checkpointing frequency in training steps. val_check_interval: 50_000 @@ -125,6 +127,10 @@ learning_rate: 5e-4 weight_decay: 1e-5 # Amount of label smoothing when computing the training loss. train_label_smoothing: 0.01 +# Shuffle dataset during training. +# A buffer of size buffer_size is filled and examples from this buffer are randomly sampled. +shuffle: +buffer_size: 100_000 # TRAINING/INFERENCE OPTIONS # Number of spectra in one training batch. @@ -137,6 +143,23 @@ num_sanity_val_steps: 0 # This is expensive, so we recommend against it. 
calculate_precision: False +# Additional Pytorch lightning trainer flags +accumulate_grad_batches: 1 +gradient_clip_val: +gradient_clip_algorithm: +precision: "32-true" # '16-true', '16-mixed', 'bf16-true', 'bf16-mixed', '32-true', '64-true', '64', '32', '16', 'bf16' + +# Resume training and early stopping +resume_training_from : #'last', 'best', 'path' +early_stopping_patience: + +# Replace I by L in peptide sequences +replace_isoleucine_with_leucine: True +# Reverse peptide sequences +reverse_peptides: True +# mskb tokenizer, otherwise proforma syntax +mskb_tokenizer: True + # AMINO ACID AND MODIFICATION VOCABULARY residues: "G": 57.021464 @@ -145,8 +168,7 @@ residues: "P": 97.052764 "V": 99.068414 "T": 101.047670 - "C+57.021": 160.030649 # 103.009185 + 57.021464 - "L": 113.084064 + "C[Carbamidomethyl]": 160.030649 # 103.009185 + 57.021464 "L": 113.084064 "I": 113.084064 "N": 114.042927 "D": 115.026943 @@ -160,11 +182,11 @@ residues: "Y": 163.063329 "W": 186.079313 # Amino acid modifications. - "M+15.995": 147.035400 # Met oxidation: 131.040485 + 15.994915 - "N+0.984": 115.026943 # Asn deamidation: 114.042927 + 0.984016 - "Q+0.984": 129.042594 # Gln deamidation: 128.058578 + 0.984016 + "M[Oxidation]": 147.035400 # Met oxidation: 131.040485 + 15.994915 + "N[Deamidated]": 115.026943 # Asn deamidation: 114.042927 + 0.984016 + "Q[Deamidated]": 129.042594 # Gln deamidation: 128.058578 + 0.984016 # N-terminal modifications. - "+42.011": 42.010565 # Acetylation - "+43.006": 43.005814 # Carbamylation - "-17.027": -17.026549 # NH3 loss - "+43.006-17.027": 25.980265 # Carbamylation and NH3 loss + "[Acetyl]-": 42.010565 # Acetylation + "[Carbamyl]-": 43.005814 # Carbamylation "+43.006" + "[Ammonia-loss]-": -17.026549 # NH3 loss + "[+25.980265]-": 25.980265 # Carbamylation and NH3 loss diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index bb9a8a3e..62d7a905 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -7,7 +7,7 @@ import re from pathlib import Path from typing import List - +import pprint import natsort from .. import __version__ @@ -142,7 +142,7 @@ def set_ms_run(self, peak_filenames: List[str]) -> None: self.metadata.append( (f"ms_run[{i}]-location", Path(filename).as_uri()), ) - self._run_map[filename] = i + self._run_map[os.path.basename(filename)] = i def save(self) -> None: """ diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index cdbf71bf..9a271816 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -3,15 +3,24 @@ import functools import logging import os -from typing import List, Optional, Tuple - +from typing import Optional, Iterable +from pathlib import Path import lightning.pytorch as pl import numpy as np import torch -from depthcharge.data import AnnotatedSpectrumIndex +from torch.utils.data import DataLoader +import tempfile +import pyarrow as pa +from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe + -from ..data import db_utils -from ..data.datasets import AnnotatedSpectrumDataset, SpectrumDataset +from depthcharge.tokenizers import PeptideTokenizer +from depthcharge.data import ( + AnnotatedSpectrumDataset, + CustomField, + SpectrumDataset, + preprocessing +) logger = logging.getLogger("casanovo") @@ -23,12 +32,12 @@ class DeNovoDataModule(pl.LightningDataModule): Parameters ---------- - train_index : Optional[AnnotatedSpectrumIndex] - The spectrum index file corresponding to the training data. 
- valid_index : Optional[AnnotatedSpectrumIndex] - The spectrum index file corresponding to the validation data. - test_index : Optional[AnnotatedSpectrumIndex] - The spectrum index file corresponding to the testing data. + train_paths : str, optional + A spectrum lance path for model training. + valid_pathas : str, optional + A spectrum lance path for validation. + test_paths : str, optional + A spectrum lance path for evaluation or inference. train_batch_size : int The batch size to use for training. eval_batch_size : int @@ -48,18 +57,27 @@ class DeNovoDataModule(pl.LightningDataModule): Remove peaks within the given mass tolerance in Dalton around the precursor mass. n_workers : int, optional - The number of workers to use for data loading. By default, the - number of available CPU cores on the current machine is used. + The number of workers to use for data loading. By default, the number of + available CPU cores on the current machine is used. + max_charge: int + Remove PSMs which precursor charge higher than specified max_charge + tokenizer: Optional[PeptideTokenizer] + Peptide tokenizer for tokenizing sequences random_state : Optional[int] - The NumPy random state. ``None`` leaves mass spectra in the - order they were parsed. + The NumPy random state. ``None`` leaves mass spectra in the order they + were parsed. + shuffle: Optional[bool] + Should the training dataset be shuffled? Suffling based on specified buffer_size + buffer_size: Optional[int] + See more here: + https://huggingface.co/docs/datasets/v1.11.0/dataset_streaming.html#shuffling-the-dataset-shuffle """ def __init__( self, - train_index: Optional[AnnotatedSpectrumIndex] = None, - valid_index: Optional[AnnotatedSpectrumIndex] = None, - test_index: Optional[AnnotatedSpectrumIndex] = None, + train_paths: Optional[Iterable[str]] = None, + valid_paths: Optional[Iterable[str]] = None, + test_paths: Optional[str] = None, train_batch_size: int = 128, eval_batch_size: int = 1028, n_peaks: Optional[int] = 150, @@ -69,25 +87,124 @@ def __init__( remove_precursor_tol: float = 2.0, n_workers: Optional[int] = None, random_state: Optional[int] = None, + max_charge: Optional[int] = 10, + tokenizer: Optional[PeptideTokenizer] = None, + lance_dir: Optional[str] = None, + shuffle: Optional[bool] = True, + buffer_size: Optional[int] = 100_000, ): super().__init__() - self.train_index: Optional[AnnotatedSpectrumIndex] = train_index - self.valid_index: Optional[AnnotatedSpectrumIndex] = valid_index - self.test_index: Optional[AnnotatedSpectrumIndex] = test_index + self.train_paths = train_paths + self.valid_paths = valid_paths + self.test_paths = test_paths self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size - self.n_peaks: Optional[int] = n_peaks - self.min_mz = min_mz - self.max_mz = max_mz - self.min_intensity = min_intensity - self.remove_precursor_tol = remove_precursor_tol - self.n_workers = n_workers if n_workers is not None else os.cpu_count() - self.rng = np.random.default_rng(random_state) + + self.tokenizer = tokenizer if tokenizer is not None else PeptideTokenizer() + self.lance_dir = lance_dir if lance_dir is not None else tempfile.TemporaryDirectory(suffix='.lance').name + + self.train_dataset = None self.valid_dataset = None self.test_dataset = None self.protein_database = None + self.n_workers = n_workers if n_workers is not None else os.cpu_count() + self.shuffle = shuffle if shuffle else None # set to None if not wanted. 
Otherwise torch throws and error + self.buffer_size = buffer_size + + self.valid_charge = np.arange(1, max_charge+1) + self.preprocessing_fn = [ + preprocessing.set_mz_range(min_mz=min_mz, max_mz=max_mz), + preprocessing.remove_precursor_peak(remove_precursor_tol, "Da"), + preprocessing.filter_intensity(min_intensity, n_peaks), + preprocessing.scale_intensity("root", 1), + scale_to_unit_norm + ] + self.custom_field_test_mgf = [ + CustomField("scans", + lambda x: x["params"]["scans"] if 'scans' in x["params"] else x["params"]["title"], + pa.string()), + CustomField("title", + lambda x: x["params"]["title"], + pa.string()) + ] + self.custom_field_test_mzml = [ + CustomField("scans", lambda x: x["id"], pa.string()), + CustomField("title", lambda x: x["id"], pa.string()), + ] + + self.custom_field_anno = [CustomField("seq", lambda x: x["params"]["seq"], pa.string())] + + def make_dataset(self, paths, annotated, mode, shuffle): + """ + Make spectrum datasets + Parameters + ---------- + paths : Iterable[str] + Paths to input datasets + annotated: bool + True if peptide sequence annotations are available for the test + data. + mode: str {"train", "valid", "test"} + The mode indicating name of lance instance + shuffle: bool + Indicates whether to shuffle training data based on buffer_size + """ + custom_fields = self.custom_field_anno if annotated else [] + + if mode=="test": + if all([Path(f).suffix in ('.mgf') for f in paths]): + custom_fields = custom_fields + self.custom_field_test_mgf + if all([Path(f).suffix in (".mzml", ".mzxml", '.mzML') for f in paths]): + custom_fields = custom_fields + self.custom_field_test_mzml + + lance_path = f'{self.lance_dir}/{mode}.lance' + + parse_kwargs = dict( + preprocessing_fn=self.preprocessing_fn, + custom_fields=custom_fields, + valid_charge=self.valid_charge, + + ) + + dataset_params = dict( + batch_size=self.train_batch_size if mode=="train" else self.eval_batch_size + ) + anno_dataset_params = dataset_params | dict( + tokenizer=self.tokenizer, + annotations='seq', + ) + + if any([Path(f).suffix in (".lance") for f in paths]): + if annotated: + dataset = AnnotatedSpectrumDataset.from_lance(paths[0], **anno_dataset_params) + else: + dataset = SpectrumDataset.from_lance(paths[0], **dataset_params) + else: + if annotated: + dataset = AnnotatedSpectrumDataset( + spectra=paths, + path=lance_path, + parse_kwargs=parse_kwargs, + **anno_dataset_params, + ) + else: + dataset = SpectrumDataset( + spectra=paths, + path=lance_path, + parse_kwargs=parse_kwargs, + **dataset_params, + ) + + if shuffle: + dataset = ShufflerIterDataPipe( + dataset, + buffer_size=self.buffer_size + ) + + return dataset + def setup(self, stage: str = None, annotated: bool = True) -> None: """ Set up the PyTorch Datasets. @@ -102,43 +219,32 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: test data. 
""" if stage in (None, "fit", "validate"): - make_dataset = functools.partial( - AnnotatedSpectrumDataset, - n_peaks=self.n_peaks, - min_mz=self.min_mz, - max_mz=self.max_mz, - min_intensity=self.min_intensity, - remove_precursor_tol=self.remove_precursor_tol, - ) - if self.train_index is not None: - self.train_dataset = make_dataset( - self.train_index, - random_state=self.rng, + if self.train_paths is not None: + self.train_dataset = self.make_dataset( + self.train_paths, annotated=True, + mode='train', shuffle=self.shuffle + ) + if self.valid_paths is not None: + self.valid_dataset = self.make_dataset( + self.valid_paths, annotated=True, + mode='valid', shuffle=False ) - if self.valid_index is not None: - self.valid_dataset = make_dataset(self.valid_index) if stage in (None, "test"): - make_dataset = functools.partial( - AnnotatedSpectrumDataset if annotated else SpectrumDataset, - n_peaks=self.n_peaks, - min_mz=self.min_mz, - max_mz=self.max_mz, - min_intensity=self.min_intensity, - remove_precursor_tol=self.remove_precursor_tol, - ) - if self.test_index is not None: - self.test_dataset = make_dataset(self.test_index) + if self.test_paths is not None: + self.test_dataset = self.make_dataset( + self.test_paths, + annotated=annotated, + mode='test', + shuffle=False + ) def _make_loader( self, dataset: torch.utils.data.Dataset, - batch_size: int, - shuffle: bool = False, - collate_fn: Optional[callable] = None, + shuffle: Optional[bool] = None, ) -> torch.utils.data.DataLoader: """ - Create a PyTorch DataLoader. - + Create a PyTorch DataLoader. Parameters ---------- dataset : torch.utils.data.Dataset @@ -155,32 +261,29 @@ def _make_loader( torch.utils.data.DataLoader A PyTorch DataLoader. """ - return torch.utils.data.DataLoader( + return DataLoader( dataset, - batch_size=batch_size, - collate_fn=prepare_batch if collate_fn is None else collate_fn, - pin_memory=True, - num_workers=self.n_workers, shuffle=shuffle, + num_workers=0, # self.n_workers, + #precision=torch.float32, + pin_memory=True, ) def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" - return self._make_loader( - self.train_dataset, self.train_batch_size, shuffle=True - ) + return self._make_loader(self.train_dataset, self.shuffle ) def val_dataloader(self) -> torch.utils.data.DataLoader: """Get the validation DataLoader.""" - return self._make_loader(self.valid_dataset, self.eval_batch_size) + return self._make_loader(self.valid_dataset) def test_dataloader(self) -> torch.utils.data.DataLoader: """Get the test DataLoader.""" - return self._make_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader(self.test_dataset) def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" - return self._make_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader(self.test_dataset) def db_dataloader(self) -> torch.utils.data.DataLoader: """Get a special dataloader for DB search.""" @@ -193,114 +296,13 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: ) -def prepare_batch( - batch: List[Tuple[torch.Tensor, float, int, str]] -) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray]: +def scale_to_unit_norm(spectrum): """ - Collate MS/MS spectra into a batch. - - The MS/MS spectra will be padded so that they fit nicely as a - tensor. However, the padded elements are ignored during the - subsequent steps. 
- - Parameters - ---------- - batch : List[Tuple[torch.Tensor, float, int, str]] - A batch of data from an AnnotatedSpectrumDataset, consisting of - for each spectrum (i) a tensor with the m/z and intensity peak - values, (ii), the precursor m/z, (iii) the precursor charge, - (iv) the spectrum identifier. - - Returns - ------- - spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak - values for each spectrum. - precursors : torch.Tensor of shape (batch_size, 3) - A tensor with the precursor neutral mass, precursor charge, and - precursor m/z. - spectrum_ids : np.ndarray - The spectrum identifiers (during de novo sequencing) or peptide - sequences (during training). + Scaling function used in Casanovo + slightly differing from the depthcharge implementation """ - spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) - spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) - precursor_mzs = torch.tensor(precursor_mzs) - precursor_charges = torch.tensor(precursor_charges) - precursor_masses = (precursor_mzs - 1.007276) * precursor_charges - precursors = torch.vstack( - [precursor_masses, precursor_charges, precursor_mzs] - ).T.float() - return spectra, precursors, np.asarray(spectrum_ids) - - -def prepare_psm_batch( - batch: List[Tuple[torch.Tensor, float, int, str]], - protein_database: db_utils.ProteinDatabase, -) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray]: - """ - Collate MS/MS spectra into a batch for DB search. - - The MS/MS spectra will be padded so that they fit nicely as a - tensor. However, the padded elements are ignored during the - subsequent steps. - - Parameters - ---------- - batch : List[Tuple[torch.Tensor, float, int, str]] - A batch of data from an AnnotatedSpectrumDataset, consisting of - for each spectrum (i) a tensor with the m/z and intensity peak - values, (ii), the precursor m/z, (iii) the precursor charge, - (iv) the spectrum identifier. - protein_database : db_utils.ProteinDatabase - The protein database to use for candidate peptide retrieval. - - Returns - ------- - batch_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak - values for each spectrum. - batch_precursors : torch.Tensor of shape (batch_size, 3) - A tensor with the precursor neutral mass, precursor charge, and - precursor m/z. - batch_spectrum_ids : np.ndarray - The spectrum identifiers. - batch_peptides : np.ndarray - The candidate peptides for each spectrum. - """ - spectra, precursors, spectrum_ids = prepare_batch(batch) - - batch_spectra = [] - batch_precursors = [] - batch_spectrum_ids = [] - batch_peptides = [] - # FIXME: This can be optimized by using a sliding window instead of - # retrieving candidates for each spectrum independently. 
- for i in range(len(batch)): - candidate_pep = protein_database.get_candidates( - precursors[i][2], precursors[i][1] - ) - if len(candidate_pep) == 0: - logger.debug( - "No candidate peptides found for spectrum %s with precursor " - "charge %d and precursor m/z %f", - spectrum_ids[i], - precursors[i][1], - precursors[i][2], - ) - else: - batch_spectra.append( - spectra[i].unsqueeze(0).repeat(len(candidate_pep), 1, 1) - ) - batch_precursors.append( - precursors[i].unsqueeze(0).repeat(len(candidate_pep), 1) + spectrum._inner._intensity = spectrum.intensity / np.linalg.norm( + spectrum.intensity ) - batch_spectrum_ids.extend([spectrum_ids[i]] * len(candidate_pep)) - batch_peptides.extend(candidate_pep) + return spectrum - return ( - torch.cat(batch_spectra, dim=0), - torch.cat(batch_precursors, dim=0), - np.asarray(batch_spectrum_ids), - np.asarray(batch_peptides), - ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index f350f3b3..04c3d0a5 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -2,28 +2,25 @@ import collections import heapq -import itertools import logging import warnings -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union -import depthcharge.masses import einops import torch import numpy as np import lightning.pytorch as pl -from torch.utils.tensorboard import SummaryWriter -from depthcharge.components import ModelMixin, PeptideDecoder, SpectrumEncoder + +from depthcharge.tokenizers import PeptideTokenizer from . import evaluate from .. import config from ..data import ms_io +from ..denovo.transformers import SpectrumEncoder, PeptideDecoder logger = logging.getLogger("casanovo") - -class Spec2Pep(pl.LightningModule, ModelMixin): +class Spec2Pep(pl.LightningModule): """ A Transformer model for de novo peptide sequencing. @@ -93,6 +90,8 @@ class Spec2Pep(pl.LightningModule, ModelMixin): calculate_precision : bool Calculate the validation set precision during training. This is expensive. + tokenizer: Optional[PeptideTokenizer] + Tokenizer object to tokenize and detokenize peptide sequences. **kwargs : Dict Additional keyword arguments passed to the Adam optimizer. """ @@ -114,40 +113,42 @@ def __init__( n_beams: int = 1, top_match: int = 1, n_log: int = 10, - tb_summarywriter: Optional[Path] = None, train_label_smoothing: float = 0.01, warmup_iters: int = 100_000, cosine_schedule_period_iters: int = 600_000, out_writer: Optional[ms_io.MztabWriter] = None, calculate_precision: bool = False, + tokenizer: Optional[PeptideTokenizer] = None, **kwargs: Dict, ): super().__init__() self.save_hyperparameters() + self.tokenizer = tokenizer if tokenizer is not None else PeptideTokenizer() + self.vocab_size = len(self.tokenizer) + 1 # Build the model. 
self.encoder = SpectrumEncoder( - dim_model=dim_model, + d_model=dim_model, n_head=n_head, dim_feedforward=dim_feedforward, n_layers=n_layers, dropout=dropout, - dim_intensity=dim_intensity, ) self.decoder = PeptideDecoder( - dim_model=dim_model, + d_model=dim_model, + n_tokens=self.tokenizer, n_head=n_head, dim_feedforward=dim_feedforward, n_layers=n_layers, dropout=dropout, - residues=residues, max_charge=max_charge, ) self.softmax = torch.nn.Softmax(2) + ignore_index = 0 self.celoss = torch.nn.CrossEntropyLoss( - ignore_index=0, label_smoothing=train_label_smoothing + ignore_index=ignore_index, label_smoothing=train_label_smoothing ) - self.val_celoss = torch.nn.CrossEntropyLoss(ignore_index=0) + self.val_celoss = torch.nn.CrossEntropyLoss(ignore_index=ignore_index) # Optimizer settings. self.warmup_iters = warmup_iters self.cosine_schedule_period_iters = cosine_schedule_period_iters @@ -170,41 +171,40 @@ def __init__( self.min_peptide_len = min_peptide_len self.n_beams = n_beams self.top_match = top_match - self.peptide_mass_calculator = depthcharge.masses.PeptideMass( - self.residues - ) - self.stop_token = self.decoder._aa2idx["$"] + + self.stop_token = self.tokenizer.stop_int # Logging. self.calculate_precision = calculate_precision self.n_log = n_log self._history = [] - if tb_summarywriter is not None: - self.tb_summarywriter = SummaryWriter(str(tb_summarywriter)) - else: - self.tb_summarywriter = None # Output writer during predicting. self.out_writer: ms_io.MztabWriter = out_writer + @property + def device(self) -> torch.device: + """The current device for first parameter of the model.""" + return next(self.parameters()).device + + @property + def n_parameters(self): + """The number of learnable parameters.""" + return sum(p.numel() for p in self.parameters() if p.requires_grad) + def forward( - self, spectra: torch.Tensor, precursors: torch.Tensor + self, batch: dict ) -> List[List[Tuple[float, np.ndarray, str]]]: """ Predict peptide sequences for a batch of MS/MS spectra. Parameters ---------- - spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) - The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the - peaks in the MS/MS spectrum, and axis 2 is essentially a - 2-tuple specifying the m/z-intensity pair for each peak. - These should be zero-padded, such that all the spectra in - the batch are the same length. - precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge - (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. + batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] + A batch of (i) m/z values of MS/MS spectra, + (ii) intensity values of MS/MS spectra, + (iii) precursor information, + (iv) peptide sequences as torch Tensors. Returns ------- @@ -214,26 +214,27 @@ def forward( score, the amino acid scores, and the predicted peptide sequence. """ - return self.beam_search_decode( - spectra.to(self.encoder.device), - precursors.to(self.decoder.device), - ) + mzs, ints, precursors, _ = self._process_batch(batch) + return self.beam_search_decode(mzs, ints, precursors) def beam_search_decode( - self, spectra: torch.Tensor, precursors: torch.Tensor + self, mzs: torch.Tensor, ints: torch.Tensor, precursors: torch.Tensor ) -> List[List[Tuple[float, np.ndarray, str]]]: """ Beam search decoding of the spectrum predictions. 
Parameters ---------- - spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) - The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the - peaks in the MS/MS spectrum, and axis 2 is essentially a - 2-tuple specifying the m/z-intensity pair for each peak. - These should be zero-padded, such that all the spectra in - the batch are the same length. + mzs : torch.Tensor of shape (n_spectra, n_peaks) + The m/z axis of spectra for which to predict peptide sequences. + Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in + the MS/MS spectrum. These should be zero-padded, + such that all the spectra in the batch are the same length. + ints: torch.Tensor of shape (n_spectra, n_peaks) + The m/z axis of spectra for which to predict peptide sequences. + Axis 0 represents an MS/MS spectrum, axis 1 specifies + the m/z-intensity pair for each peak. These should be zero-padded, + such that all the spectra in the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) The measured precursor mass (axis 0), precursor charge (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. @@ -246,28 +247,36 @@ def beam_search_decode( with the peptide score, the amino acid scores, and the predicted peptide sequence. """ - memories, mem_masks = self.encoder(spectra) + memories, mem_masks = self.encoder(mzs, ints) # Sizes. - batch = spectra.shape[0] # B - length = self.max_peptide_len + 1 # L - vocab = self.decoder.vocab_size + 1 # V + batch = mzs.shape[0] # B + length = self.max_length + 1 # L + vocab = self.vocab_size # V beam = self.n_beams # S # Initialize scores and tokens. scores = torch.full( size=(batch, length, vocab, beam), fill_value=torch.nan - ) - scores = scores.type_as(spectra) - tokens = torch.zeros(batch, length, beam, dtype=torch.int64) - tokens = tokens.to(self.encoder.device) - + ).type_as(mzs) + + tokens = torch.zeros(batch, length, beam, + dtype=torch.int64, + device=self.encoder.device) + # Create cache for decoded beams. pred_cache = collections.OrderedDict((i, []) for i in range(batch)) # Get the first prediction. - pred, _ = self.decoder(None, precursors, memories, mem_masks) - tokens[:, 0, :] = torch.topk(pred[:, 0, :], beam, dim=1)[1] + pred = self.decoder( + tokens=torch.zeros(batch, 0, + dtype=torch.int64, + device=self.encoder.device), + memory=memories, + memory_key_padding_mask=mem_masks, + precursors=precursors + ) + tokens[:, 0, :] = torch.topk(pred[:, 0, :], beam, dim=1)[1] scores[:, :1, :, :] = einops.repeat(pred, "B L V -> B L V S", S=beam) # Make all tensors the right shape for decoding. @@ -305,20 +314,21 @@ def beam_search_decode( if finished_beams.all(): break # Update the scores. - scores[~finished_beams, : step + 2, :], _ = self.decoder( - tokens[~finished_beams, : step + 1], - precursors[~finished_beams, :], - memories[~finished_beams, :, :], - mem_masks[~finished_beams, :], + scores[~finished_beams, : step + 2, :]= self.decoder( + tokens=tokens[~finished_beams, : step + 1], + precursors=precursors[~finished_beams, :], + memory=memories[~finished_beams, :, :], + memory_key_padding_mask=mem_masks[~finished_beams, :], ) # Find the top-k beams with the highest scores and continue # decoding those. tokens, scores = self._get_topk_beams( tokens, scores, finished_beams, batch, step + 1 ) - - # Return the peptide with the highest confidence score, within - # the precursor m/z tolerance if possible. 
+ tokens = tokens + + # Return the peptide with the highest confidence score, within the + # precursor m/z tolerance if possible. return list(self._get_top_peptide(pred_cache)) def _finish_beams( @@ -357,19 +367,21 @@ def _finish_beams( violate the minimum peptide length). """ # Check for tokens with a negative mass (i.e. neutral loss). - aa_neg_mass = [None] - for aa, mass in self.peptide_mass_calculator.masses.items(): + aa_neg_mass_idx = [] + for aa, mass in self.tokenizer.residues.items(): if mass < 0: - aa_neg_mass.append(aa) + # aa_neg_mass.append(aa) + aa_neg_mass_idx.append(self.tokenizer.index[aa]) + # Find N-terminal residues. n_term = torch.Tensor( [ - self.decoder._aa2idx[aa] - for aa in self.peptide_mass_calculator.masses - if aa.startswith(("+", "-")) + self.tokenizer.index[aa] + for aa in self.tokenizer.index + if aa.startswith(("+", "-",'[+', '[-')) ] ).to(self.decoder.device) - + beam_fits_precursor = torch.zeros( tokens.shape[0], dtype=torch.bool ).to(self.encoder.device) @@ -382,9 +394,10 @@ def _finish_beams( finished_beams[ends_stop_token] = True # Beams with a dummy token predicted in the current step can be # discarded. - discarded_beams = torch.zeros(tokens.shape[0], dtype=torch.bool).to( - self.encoder.device - ) + discarded_beams = torch.zeros( + tokens.shape[0], dtype=torch.bool + ).to(self.encoder.device) + discarded_beams[tokens[:, step] == 0] = True # Discard beams with invalid modification combinations (i.e. # N-terminal modifications occur multiple times or in internal @@ -413,13 +426,13 @@ def _finish_beams( continue pred_tokens = tokens[i][: step + 1] peptide_len = len(pred_tokens) - peptide = self.decoder.detokenize(pred_tokens) + # Omit stop token. - if self.decoder.reverse and peptide[0] == "$": - peptide = peptide[1:] + if self.tokenizer.reverse and pred_tokens[0] == self.stop_token: + pred_tokens = pred_tokens[1:] peptide_len -= 1 - elif not self.decoder.reverse and peptide[-1] == "$": - peptide = peptide[:-1] + elif not self.tokenizer.reverse and pred_tokens[-1] == self.stop_token: + pred_tokens = pred_tokens[:-1] peptide_len -= 1 # Discard beams that were predicted to end but don't fit the # minimum peptide length. @@ -433,16 +446,27 @@ def _finish_beams( precursor_charge = precursors[i, 1] precursor_mz = precursors[i, 2] matches_precursor_mz = exceeds_precursor_mz = False - for aa in [None] if finished_beams[i] else aa_neg_mass: + + # Send tokenizer masses to correct device for calculate_precursor_ions() + self.tokenizer.masses = self.tokenizer.masses.type_as(precursor_mz) + + for aa in [None] if finished_beams[i] else aa_neg_mass_idx: if aa is None: - calc_peptide = peptide + calc_peptide = pred_tokens else: - calc_peptide = peptide.copy() - calc_peptide.append(aa) - try: - calc_mz = self.peptide_mass_calculator.mass( - seq=calc_peptide, charge=precursor_charge + calc_peptide = pred_tokens.detach().clone() + calc_peptide = torch.cat( + (calc_peptide, + torch.tensor([aa]).type_as(calc_peptide) + ) ) + try: + + calc_mz = self.tokenizer.calculate_precursor_ions( + calc_peptide.unsqueeze(0), + precursor_charge.unsqueeze(0) + )[0] + delta_mass_ppm = [ _calc_mass_error( calc_mz, @@ -615,7 +639,7 @@ def _get_topk_beams( all spectra. """ beam = self.n_beams # S - vocab = self.decoder.vocab_size + 1 # V + vocab = self.vocab_size # V # Reshape to group by spectrum (B for "batch"). 
tokens = einops.rearrange(tokens, "(B S) L -> B L S", S=beam) @@ -702,7 +726,7 @@ def _get_top_peptide( ( pep_score, aa_scores, - "".join(self.decoder.detokenize(pred_tokens)), + pred_tokens, ) for pep_score, _, aa_scores, pred_tokens in heapq.nlargest( self.top_match, peptides @@ -711,29 +735,61 @@ def _get_top_peptide( else: yield [] + def _process_batch(self, batch): + """ Prepare batch returned from AnnotatedSpectrumDataset of the + latest depthcharge version + + Each batch is a dict and contains these keys: + ['peak_file', 'scan_id', 'ms_level', 'precursor_mz', + 'precursor_charge', 'mz_array', 'intensity_array', + 'seq'] + Returns + ------- + spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak values + for each spectrum. + precursors : torch.Tensor of shape (batch_size, 3) + A tensor with the precursor neutral mass, precursor charge, and + precursor m/z. + seqs : np.ndarray + The spectrum identifiers (during de novo sequencing) or peptide + sequences (during training). + + """ + # Squeeze torch tensors in first dimension + for k in batch.keys(): + try: + batch[k]= batch[k].squeeze(0) + except: + continue + + precursor_mzs = batch["precursor_mz"] + precursor_charges = batch["precursor_charge"] + precursor_masses = (precursor_mzs - 1.007276) * precursor_charges + precursors = torch.vstack([precursor_masses, + precursor_charges, precursor_mzs] ).T #.float() + + mzs, ints = batch['mz_array'], batch['intensity_array'] + #spectra = torch.stack([mzs, ints], dim=2) + + seqs = batch['seq'] if "seq" in batch else None + + return mzs, ints, precursors, seqs + def _forward_step( self, - spectra: torch.Tensor, - precursors: torch.Tensor, - sequences: List[str], + batch, ) -> Tuple[torch.Tensor, torch.Tensor]: """ The forward learning step. Parameters ---------- - spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) - The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the - peaks in the MS/MS spectrum, and axis 2 is essentially a - 2-tuple specifying the m/z-intensity pair for each peak. - These should be zero-padded, such that all the spectra in - the batch are the same length. - precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge - (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. - sequences : List[str] of length n_spectra - The partial peptide sequences to predict. + batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] + A batch of (i) m/z values of MS/MS spectra, + (ii) intensity values of MS/MS spectra, + (iii) precursor information, + (iv) peptide sequences as torch Tensors. Returns ------- @@ -742,11 +798,19 @@ def _forward_step( tokens : torch.Tensor of shape (n_spectra, length) The predicted tokens for each spectrum. 
""" - return self.decoder(sequences, precursors, *self.encoder(spectra)) + mzs, ints, precursors, tokens = self._process_batch(batch) + memories, mem_masks = self.encoder(mzs, ints) + decoded = self.decoder( + tokens=tokens, + memory=memories, + memory_key_padding_mask=mem_masks, + precursors=precursors + ) + return decoded, tokens def training_step( self, - batch: Tuple[torch.Tensor, torch.Tensor, List[str]], + batch: dict, *args, mode: str = "train", ) -> torch.Tensor: @@ -755,9 +819,11 @@ def training_step( Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, - (iii) peptide sequences as torch Tensors. + batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] + A batch of (i) m/z values of MS/MS spectra, + (ii) intensity values of MS/MS spectra, + (iii) precursor information, + (iv) peptide sequences as torch Tensors. mode : str Logging key to describe the current stage. @@ -766,8 +832,9 @@ def training_step( torch.Tensor The loss of the training step. """ - pred, truth = self._forward_step(*batch) - pred = pred[:, :-1, :].reshape(-1, self.decoder.vocab_size + 1) + pred, truth = self._forward_step(batch) + pred = pred[:, :-1, :].reshape(-1, self.vocab_size) + if mode == "train": loss = self.celoss(pred, truth.flatten()) else: @@ -778,6 +845,7 @@ def training_step( on_step=False, on_epoch=True, sync_dist=True, + batch_size=pred.shape[0] ) return loss @@ -789,9 +857,11 @@ def validation_step( Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, - (iii) peptide sequences. + batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] + A batch of (i) m/z values of MS/MS spectra, + (ii) intensity values of MS/MS spectra, + (iii) precursor information, + (iv) peptide sequences as torch Tensors. Returns ------- @@ -803,23 +873,39 @@ def validation_step( if not self.calculate_precision: return loss - # Calculate and log amino acid and peptide match evaluation - # metrics from the predicted peptides. - peptides_pred, peptides_true = [], batch[2] - for spectrum_preds in self.forward(batch[0], batch[1]): + # Calculate and log amino acid and peptide match evaluation metrics from + # the predicted peptides. 
+ peptides_true = [''.join(p) for p in self.tokenizer.detokenize(batch['seq'], join=False)] + peptides_pred = [] + for spectrum_preds in self.forward(batch): for _, _, pred in spectrum_preds: peptides_pred.append(pred) - + peptides_pred = [''.join(p) for p in self.tokenizer.detokenize(peptides_pred, join=False)] + batch_size = len(peptides_true) aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_true, peptides_pred, self.decoder._peptide_mass.masses + peptides_true, + peptides_pred, + self.tokenizer.residues, ) ) + log_args = dict(on_step=False, on_epoch=True, sync_dist=True) - self.log("Peptide precision at coverage=1", pep_precision, **log_args) - self.log("AA precision at coverage=1", aa_precision, **log_args) + self.log( + "pep_precision", + pep_precision, + **log_args, + batch_size=batch_size + ) + self.log( + "aa_precision", + aa_precision, + **log_args, + batch_size=batch_size + ) return loss + def predict_step( self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], *args ) -> List[ms_io.PepSpecMatch]: @@ -828,39 +914,57 @@ def predict_step( Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] - A batch of (i) MS/MS spectra, (ii) precursor information, - (iii) spectrum identifiers as torch Tensors. + batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] + A batch of (i) m/z values of MS/MS spectra, + (ii) intensity values of MS/MS spectra, + (iii) precursor information, + (iv) peptide sequences as torch Tensors. Returns ------- predictions: List[ms_io.PepSpecMatch] Predicted PSMs for the given batch of spectra. """ + + _, _, precursors, true_seqs = self._process_batch(batch) + true_seqs = ( + [''.join(p) for p in self.tokenizer.detokenize(true_seqs, join=False)] + if true_seqs is not None else ['']*precursors.shape[0] + ) + + prec_charges = precursors[:, 1].cpu().detach().numpy() + prec_mzs = precursors[:, 2].cpu().detach().numpy() + predictions = [] for ( precursor_charge, precursor_mz, - spectrum_i, + scan, + title, + file_name, + true_seq, spectrum_preds, ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], - self.forward(batch[0], batch[1]), + prec_charges, + prec_mzs, + batch["scans"], + batch["title"], + batch["peak_file"], + true_seqs, + self.forward(batch) ): for peptide_score, aa_scores, peptide in spectrum_preds: predictions.append( - ms_io.PepSpecMatch( - sequence=peptide, - spectrum_id=tuple(spectrum_i), - peptide_score=peptide_score, - charge=int(precursor_charge), - calc_mz=self.peptide_mass_calculator.mass( - peptide, precursor_charge - ), - exp_mz=precursor_mz, - aa_scores=aa_scores, + ( + scan, + precursor_charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, + file_name, + true_seq, + title ) ) @@ -870,10 +974,13 @@ def on_train_epoch_end(self) -> None: """ Log the training loss at the end of each epoch. 
""" - train_loss = self.trainer.callback_metrics["train_CELoss"].detach() + if "train_CELoss" in self.trainer.callback_metrics: + train_loss = self.trainer.callback_metrics["train_CELoss"].detach().item() + else: + train_loss = np.nan metrics = { "step": self.trainer.global_step, - "train": train_loss.item(), + "train": train_loss, } self._history.append(metrics) self._log_history() @@ -890,10 +997,10 @@ def on_validation_epoch_end(self) -> None: if self.calculate_precision: metrics["valid_aa_precision"] = ( - callback_metrics["AA precision at coverage=1"].detach().item() + callback_metrics["aa_precision"].detach().item() ) metrics["valid_pep_precision"] = ( - callback_metrics["Peptide precision at coverage=1"] + callback_metrics["pep_precision"] .detach() .item() ) @@ -909,9 +1016,49 @@ def on_predict_batch_end( """ if self.out_writer is None: return - for pred in outputs: - if len(pred.sequence) > 0: - self.out_writer.psms.append(pred) + # Triply nested lists: results -> batch -> step -> spectrum. + for ( + scan, + charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, + file_name, + true_seq, + title + ) in outputs: + if len(peptide) == 0: + continue + + # Compute mass and detokenize + calc_mass = self.tokenizer.calculate_precursor_ions( + peptide.unsqueeze(0), + torch.tensor([charge]).type_as(peptide) + )[0] + peptide = ''.join( + self.tokenizer.detokenize(peptide.unsqueeze(0), join=False)[0] + ) + + self.out_writer.psms.append( + ( + peptide, + scan, + peptide_score, + charge, + precursor_mz, + calc_mass, + ",".join(list(map("{:.5f}".format, aa_scores))), + file_name, + true_seq, + title + ), + ) + + def on_train_start(self): + """Log optimizer settings.""" + self.log("hp/optimizer_warmup_iters", self.warmup_iters) + self.log("hp/optimizer_cosine_schedule_period_iters", self.cosine_schedule_period_iters) def _log_history(self) -> None: """ @@ -943,18 +1090,6 @@ def _log_history(self) -> None: ] logger.info(msg, *vals) - if self.tb_summarywriter is not None: - for descr, key in [ - ("loss/train_crossentropy_loss", "train"), - ("loss/val_crossentropy_loss", "valid"), - ("eval/val_pep_precision", "valid_pep_precision"), - ("eval/val_aa_precision", "valid_aa_precision"), - ]: - metric_value = metrics.get(key, np.nan) - if not np.isnan(metric_value): - self.tb_summarywriter.add_scalar( - descr, metric_value, metrics["step"] - ) def configure_optimizers( self, @@ -1235,3 +1370,13 @@ def _aa_pep_score( if not fits_precursor_mz: peptide_score -= 1 return aa_scores, peptide_score + +def generate_tgt_mask(sz: int) -> torch.Tensor: + """Generate a square mask for the sequence. + + Parameters + ---------- + sz : int + The length of the target sequence. 
+ """ + return ~torch.triu(torch.ones(sz, sz, dtype=torch.bool)).transpose(0, 1) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 30f86f24..3c06b477 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -5,19 +5,21 @@ import logging import os import tempfile -import uuid import warnings from pathlib import Path from typing import Iterable, List, Optional, Union +from datetime import datetime -import depthcharge.masses import lightning.pytorch as pl import lightning.pytorch.loggers -import numpy as np import torch -from depthcharge.data import AnnotatedSpectrumIndex, SpectrumIndex + from lightning.pytorch.strategies import DDPStrategy -from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor +from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping +from lightning.pytorch.loggers import TensorBoardLogger + +from depthcharge.tokenizers import PeptideTokenizer +from depthcharge.tokenizers.peptides import MskbPeptideTokenizer from .. import utils from ..config import Config @@ -187,17 +189,20 @@ def train( The path to the MS data files for validation. """ self.initialize_trainer(train=True) + self.initialize_tokenizer() self.initialize_model(train=True) - train_index = self._get_index(train_peak_path, True, "training") - valid_index = self._get_index(valid_peak_path, True, "validation") - self.initialize_data_module(train_index, valid_index) + train_paths = self._get_input_paths(train_peak_path, True, "train") + valid_paths = self._get_input_paths(valid_peak_path, True, "valid") + self.initialize_data_module(train_paths, valid_paths) self.loaders.setup() + #logger.info(f'TRAIN PSMs: {self.loaders.train_dataset.n_spectra}') + #logger.info(f'VAL PSMs: {self.loaders.valid_dataset.n_spectra}') self.trainer.fit( self.model, self.loaders.train_dataloader(), - self.loaders.val_dataloader(), + self.loaders.val_dataloader() ) def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: @@ -226,6 +231,13 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: pred_idx += 1 else: seq_pred.append(None) + self.initialize_trainer(train=False) + self.initialize_tokenizer() + self.initialize_model(train=False) + + test_paths = self._get_input_paths(peak_path, True, "test") + self.initialize_data_module(test_paths=test_paths) + self.loaders.setup(stage="test", annotated=True) aa_precision, aa_recall, pep_precision = aa_match_metrics( *aa_match_batch( @@ -278,12 +290,13 @@ def predict( ) self.initialize_trainer(train=False) + self.initialize_tokenizer() self.initialize_model(train=False) self.model.out_writer = self.writer - test_index = self._get_index(peak_path, evaluate, "") - self.writer.set_ms_run(test_index.ms_files) - self.initialize_data_module(test_index=test_index) + test_paths = self._get_input_paths(peak_path, False, "test") + self.writer.set_ms_run(test_paths) + self.initialize_data_module(test_paths=test_paths) self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.test_dataloader()) @@ -303,6 +316,8 @@ def initialize_trainer(self, train: bool) -> None: accelerator=self.config.accelerator, devices=1, enable_checkpointing=False, + precision=self.config.precision, + logger=False ) if train: @@ -311,6 +326,16 @@ def initialize_trainer(self, train: bool) -> None: else: devices = self.config.devices + if self.config.tb_summarywriter is not None: + logger = TensorBoardLogger( + self.config.tb_summarywriter, + version=None, 
+ name=f'model_{datetime.now().strftime("%Y%m%d_%H%M")}', + default_hp_metric=False + ) + else: + logger = False + additional_cfg = dict( devices=devices, callbacks=self.callbacks, @@ -320,7 +345,10 @@ def initialize_trainer(self, train: bool) -> None: strategy=self._get_strategy(), val_check_interval=self.config.val_check_interval, check_val_every_n_epoch=None, - log_every_n_steps=self.config.log_every_n_steps, + logger=logger, + accumulate_grad_batches=self.config.accumulate_grad_batches, + gradient_clip_val=self.config.gradient_clip_val, + gradient_clip_algorithm=self.config.gradient_clip_algorithm, ) if self.config.log_metrics: @@ -372,6 +400,10 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: ) else: tb_summarywriter = self.output_dir / "tensorboard" + try: + tokenizer = self.tokenizer + except AttributeError: + raise RuntimeError("Please use `initialize_tokenizer()` first.") model_params = dict( dim_model=self.config.dim_model, @@ -380,8 +412,7 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: n_layers=self.config.n_layers, dropout=self.config.dropout, dim_intensity=self.config.dim_intensity, - max_peptide_len=self.config.max_peptide_len, - residues=self.config.residues, + max_length=self.config.max_length, max_charge=self.config.max_charge, precursor_mass_tol=self.config.precursor_mass_tol, isotope_error_range=self.config.isotope_error_range, @@ -397,6 +428,7 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: weight_decay=self.config.weight_decay, out_writer=self.writer, calculate_precision=self.config.calculate_precision, + tokenizer=tokenizer ) # Reconfigurable non-architecture related parameters for a @@ -476,24 +508,38 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: "Casanovo." ) + def initialize_tokenizer( + self, + ) -> None : + """Initialize the peptide tokenizer""" + if self.config.mskb_tokenizer: + tokenizer_cs = MskbPeptideTokenizer + else: + tokenizer_cs = PeptideTokenizer + + self.tokenizer = tokenizer_cs( + residues=self.config.residues, + replace_isoleucine_with_leucine=self.config.replace_isoleucine_with_leucine, + reverse=self.config.reverse_peptides, + start_token=None, stop_token='$' + ) + def initialize_data_module( self, - train_index: Optional[AnnotatedSpectrumIndex] = None, - valid_index: Optional[AnnotatedSpectrumIndex] = None, - test_index: Optional[ - Union[AnnotatedSpectrumIndex, SpectrumIndex] - ] = None, + train_paths: Optional[str] = None, + valid_paths: Optional[str] = None, + test_paths: Optional[str] = None, ) -> None: """Initialize the data module. Parameters ---------- - train_index : AnnotatedSpectrumIndex, optional - A spectrum index for model training. - valid_index : AnnotatedSpectrumIndex, optional - A spectrum index for validation. - test_index : AnnotatedSpectrumIndex or SpectrumIndex, optional - A spectrum index for evaluation or inference. + train_paths : str, optional + A spectrum path for model training. + valid_paths : str, optional + A spectrum path for validation. + test_paths : str, optional + A spectrum path for evaluation or inference. 
""" try: n_devices = self.trainer.num_devices @@ -502,10 +548,16 @@ def initialize_data_module( except AttributeError: raise RuntimeError("Please use `initialize_trainer()` first.") + try: + tokenizer = self.tokenizer + except AttributeError: + raise RuntimeError("Please use `initialize_tokenizer()` first.") + + lance_dir = Path(self.tmp_dir.name) if self.config.lance_dir is None else self.config.lance_dir self.loaders = DeNovoDataModule( - train_index=train_index, - valid_index=valid_index, - test_index=test_index, + train_paths=train_paths, + valid_paths=valid_paths, + test_paths=test_paths, min_mz=self.config.min_mz, max_mz=self.config.max_mz, min_intensity=self.config.min_intensity, @@ -513,18 +565,21 @@ def initialize_data_module( n_workers=self.config.n_workers, train_batch_size=train_bs, eval_batch_size=eval_bs, + n_peaks=self.config.n_peaks, + max_charge=self.config.max_charge, + tokenizer=tokenizer, + lance_dir=lance_dir, + shuffle=self.config.shuffle, + buffer_size=self.config.buffer_size, ) - def _get_index( + def _get_input_paths( self, peak_path: Iterable[str], annotated: bool, - msg: str = "", - ) -> Union[SpectrumIndex, AnnotatedSpectrumIndex]: - """Get the spectrum index. - - If the file is a SpectrumIndex, only one is allowed. Otherwise - multiple may be specified. + mode: str, + ) -> str: + """Get the spectrum input paths. Parameters ---------- @@ -532,54 +587,30 @@ def _get_index( The peak files/directories to check. annotated : bool Are the spectra expected to be annotated? - msg : str, optional - A string to insert into the error message. - + mode : str + Either train, valid or test to specify lance file name Returns ------- - SpectrumIndex or AnnotatedSpectrumIndex - The spectrum index for training, evaluation, or inference. + The spectrum paths for training, evaluation, or inference. """ - ext = (".mgf", ".h5", ".hdf5") + ext = (".mgf", ".lance") if not annotated: - ext += (".mzml", ".mzxml") + ext += (".mzML", ".mzml", ".mzxml") # FIXME: Check if these work - msg = msg.strip() filenames = _get_peak_filenames(peak_path, ext) if not filenames: - not_found_err = f"Cound not find {msg} peak files" + not_found_err = f"Cound not find {mode} peak files" logger.error(not_found_err + " from %s", peak_path) raise FileNotFoundError(not_found_err) - is_index = any([Path(f).suffix in (".h5", ".hdf5") for f in filenames]) - if is_index: + is_lance = any([Path(f).suffix in (".lance") for f in filenames]) + if is_lance: if len(filenames) > 1: - h5_err = f"Multiple {msg} HDF5 spectrum indexes specified" - logger.error(h5_err) - raise ValueError(h5_err) - - index_fname, filenames = filenames[0], None - else: - index_fname = Path(self.tmp_dir.name) / f"{uuid.uuid4().hex}.hdf5" - - Index = AnnotatedSpectrumIndex if annotated else SpectrumIndex - valid_charge = np.arange(1, self.config.max_charge + 1) - - try: - return Index(index_fname, filenames, valid_charge=valid_charge) - except TypeError as e: - if Index == AnnotatedSpectrumIndex: - error_msg = ( - "Error creating annotated spectrum index. " - "This may be the result of having an unannotated MGF file " - "present in the validation peak file path list.\n" - f"Original error message: {e}" - ) - - logger.error(error_msg) - raise TypeError(error_msg) + lance_err = f"Multiple {mode} spectrum lance files specified" + logger.error(lance_err) + raise ValueError(lance_err) - raise e + return filenames def _get_strategy(self) -> Union[str, DDPStrategy]: """Get the strategy for the Trainer. 
diff --git a/casanovo/denovo/transformers.py b/casanovo/denovo/transformers.py new file mode 100644 index 00000000..2e93cc8b --- /dev/null +++ b/casanovo/denovo/transformers.py @@ -0,0 +1,173 @@ +"""Transformer encoder and decoder for the de novo sequencing task.""" +import torch +from collections.abc import Callable + +from depthcharge.tokenizers import Tokenizer +from depthcharge.encoders import PeakEncoder, FloatEncoder, PositionalEncoder +from depthcharge.transformers import SpectrumTransformerEncoder, AnalyteTransformerDecoder + + +class PeptideDecoder(AnalyteTransformerDecoder): + """A transformer decoder for peptide sequences + + Parameters + ---------- + n_tokens : int + The number of tokens used to tokenize peptide sequences. + d_model : int, optional + The latent dimensionality to represent peaks in the mass spectrum. + nhead : int, optional + The number of attention heads in each layer. ``d_model`` must be + divisible by ``nhead``. + dim_feedforward : int, optional + The dimensionality of the fully connected layers in the Transformer + layers of the model. + n_layers : int, optional + The number of Transformer layers. + dropout : float, optional + The dropout probability for all layers. + pos_encoder : PositionalEncoder or bool, optional + The positional encodings to use for the amino acid sequence. If + ``True``, the default positional encoder is used. ``False`` disables + positional encodings, typically only for ablation tests. + max_charge : int, optional + The maximum charge state for peptide sequences. + """ + + def __init__( + self, + n_tokens: int | Tokenizer, + d_model: int = 128, + n_head: int = 8, + dim_feedforward: int = 1024, + n_layers: int = 1, + dropout: float = 0, + positional_encoder: PositionalEncoder | bool = True, + padding_int: int | None = None, + max_charge: int = 10, + ) -> None: + """Initialize a PeptideDecoder.""" + + super().__init__( + n_tokens=n_tokens, + d_model=d_model, + nhead=n_head, + dim_feedforward=dim_feedforward, + n_layers=n_layers, + dropout=dropout, + positional_encoder=positional_encoder, + padding_int=padding_int, + ) + + self.charge_encoder = torch.nn.Embedding(max_charge, d_model) + self.mass_encoder = FloatEncoder(d_model) + + # override final layer: + # +1 in comparison to version in depthcharge to second dimension + # This includes padding (=0) as a possible class + # and avoids problems during beam search decoding + self.final = torch.nn.Linear( + d_model, + self.token_encoder.num_embeddings, + ) + + def global_token_hook( + self, + tokens: torch.Tensor, + precursors: torch.Tensor, + **kwargs: dict, + ) -> torch.Tensor: + """ + Override global_token_hook to include precursor information. + + Parameters + ---------- + tokens : list of str, torch.Tensor, or None + The partial molecular sequences for which to predict the next + token. Optionally, these may be the token indices instead + of a string. + precursors : torch.Tensor + Precursor information. + **kwargs : dict + Additional data passed with the batch. + + Returns + ------- + torch.Tensor of shape (batch_size, d_model) + The global token representations. + + """ + masses = self.mass_encoder(precursors[:, None, 0]).squeeze(1) + charges = self.charge_encoder(precursors[:, 1].int() - 1) + precursors = masses + charges + return precursors + + +class SpectrumEncoder(SpectrumTransformerEncoder): + """A Transformer encoder for input mass spectra. + + Parameters + ---------- + d_model : int, optional + The latent dimensionality to represent peaks in the mass spectrum. 
+    n_head : int, optional
+        The number of attention heads in each layer. ``d_model`` must be
+        divisible by ``n_head``.
+    dim_feedforward : int, optional
+        The dimensionality of the fully connected layers in the Transformer
+        layers of the model.
+    n_layers : int, optional
+        The number of Transformer layers.
+    dropout : float, optional
+        The dropout probability for all layers.
+    peak_encoder : bool, optional
+        Use positional encodings for the m/z values of each peak.
+    dim_intensity : int or None, optional
+        The number of features to use for encoding peak intensity.
+        The remaining (``d_model - dim_intensity``) are reserved for
+        encoding the m/z value.
+    """
+
+    def __init__(
+        self,
+        d_model: int = 128,
+        n_head: int = 8,
+        dim_feedforward: int = 1024,
+        n_layers: int = 1,
+        dropout: float = 0,
+        peak_encoder: PeakEncoder | Callable | bool = True,
+    ):
+        """Initialize a SpectrumEncoder"""
+        super().__init__(d_model, n_head, dim_feedforward,
+            n_layers, dropout, peak_encoder)
+
+        self.latent_spectrum = torch.nn.Parameter(torch.randn(1, 1, d_model))
+
+    def global_token_hook(
+        self,
+        mz_array: torch.Tensor,
+        intensity_array: torch.Tensor,
+        *args: torch.Tensor,
+        **kwargs: dict,
+    ) -> torch.Tensor:
+        """Override global_token_hook to include the
+        latent_spectrum parameter.
+
+        Parameters
+        ----------
+        mz_array : torch.Tensor of shape (n_spectra, n_peaks)
+            The zero-padded m/z dimension for a batch of mass spectra.
+        intensity_array : torch.Tensor of shape (n_spectra, n_peaks)
+            The zero-padded intensity dimension for a batch of mass spectra.
+        *args : torch.Tensor
+            Additional data passed with the batch.
+        **kwargs : dict
+            Additional data passed with the batch.
+
+        Returns
+        -------
+        torch.Tensor of shape (batch_size, d_model)
+            The global token representations.
+
+        """
+        return self.latent_spectrum.squeeze(0).expand(mz_array.shape[0], -1)
diff --git a/pyproject.toml b/pyproject.toml
index 3967bf05..5f6b8ae9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "appdirs",
     "lightning>=2.1",
     "click",
-    "depthcharge-ms>=0.2.3,<0.3.0",
+    "depthcharge-ms>=0.4.8 ",
     "natsort",
     "numpy<2.0",
     "pandas",

From 8c8dc619cccaa7c311fd95f7e4d5c173b5df31f3 Mon Sep 17 00:00:00 2001
From: Daniela Klaproth-Andrade
Date: Mon, 1 Jul 2024 20:58:21 +0200
Subject: [PATCH 02/51] shuffling training set by default

---
 casanovo/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
index 5df107e7..d3aaa064 100644
--- a/casanovo/config.yaml
+++ b/casanovo/config.yaml
@@ -129,7 +129,7 @@ weight_decay: 1e-5
 train_label_smoothing: 0.01
 # Shuffle dataset during training.
 # A buffer of size buffer_size is filled and examples from this buffer are randomly sampled.
-shuffle: +shuffle: True buffer_size: 100_000 # TRAINING/INFERENCE OPTIONS From 70cdea6a2937fc0922dcbd686ac3d27673b3688c Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Fri, 26 Jul 2024 23:34:06 -0700 Subject: [PATCH 03/51] Reformat with Black --- casanovo/denovo/dataloaders.py | 125 ++++++++++++--------- casanovo/denovo/model.py | 188 +++++++++++++++++--------------- casanovo/denovo/model_runner.py | 73 ++++++++++--- casanovo/denovo/transformers.py | 13 ++- 4 files changed, 245 insertions(+), 154 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 9a271816..1cf088f9 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -1,7 +1,10 @@ """Data loaders for the de novo sequencing task.""" +<<<<<<< HEAD import functools import logging +======= +>>>>>>> c21c899 (Reformat with Black) import os from typing import Optional, Iterable from pathlib import Path @@ -16,10 +19,10 @@ from depthcharge.tokenizers import PeptideTokenizer from depthcharge.data import ( - AnnotatedSpectrumDataset, - CustomField, - SpectrumDataset, - preprocessing + AnnotatedSpectrumDataset, + CustomField, + SpectrumDataset, + preprocessing, ) @@ -61,7 +64,7 @@ class DeNovoDataModule(pl.LightningDataModule): available CPU cores on the current machine is used. max_charge: int Remove PSMs which precursor charge higher than specified max_charge - tokenizer: Optional[PeptideTokenizer] + tokenizer: Optional[PeptideTokenizer] Peptide tokenizer for tokenizing sequences random_state : Optional[int] The NumPy random state. ``None`` leaves mass spectra in the order they @@ -69,7 +72,7 @@ class DeNovoDataModule(pl.LightningDataModule): shuffle: Optional[bool] Should the training dataset be shuffled? Suffling based on specified buffer_size buffer_size: Optional[int] - See more here: + See more here: https://huggingface.co/docs/datasets/v1.11.0/dataset_streaming.html#shuffling-the-dataset-shuffle """ @@ -100,9 +103,14 @@ def __init__( self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size - self.tokenizer = tokenizer if tokenizer is not None else PeptideTokenizer() - self.lance_dir = lance_dir if lance_dir is not None else tempfile.TemporaryDirectory(suffix='.lance').name - + self.tokenizer = ( + tokenizer if tokenizer is not None else PeptideTokenizer() + ) + self.lance_dir = ( + lance_dir + if lance_dir is not None + else tempfile.TemporaryDirectory(suffix=".lance").name + ) self.train_dataset = None self.valid_dataset = None @@ -110,31 +118,39 @@ def __init__( self.protein_database = None self.n_workers = n_workers if n_workers is not None else os.cpu_count() - self.shuffle = shuffle if shuffle else None # set to None if not wanted. Otherwise torch throws and error + self.shuffle = ( + shuffle if shuffle else None + ) # set to None if not wanted. 
Otherwise torch throws and error self.buffer_size = buffer_size - self.valid_charge = np.arange(1, max_charge+1) + self.valid_charge = np.arange(1, max_charge + 1) self.preprocessing_fn = [ preprocessing.set_mz_range(min_mz=min_mz, max_mz=max_mz), preprocessing.remove_precursor_peak(remove_precursor_tol, "Da"), preprocessing.filter_intensity(min_intensity, n_peaks), preprocessing.scale_intensity("root", 1), - scale_to_unit_norm - ] + scale_to_unit_norm, + ] self.custom_field_test_mgf = [ - CustomField("scans", - lambda x: x["params"]["scans"] if 'scans' in x["params"] else x["params"]["title"], - pa.string()), - CustomField("title", - lambda x: x["params"]["title"], - pa.string()) + CustomField( + "scans", + lambda x: ( + x["params"]["scans"] + if "scans" in x["params"] + else x["params"]["title"] + ), + pa.string(), + ), + CustomField("title", lambda x: x["params"]["title"], pa.string()), ] self.custom_field_test_mzml = [ CustomField("scans", lambda x: x["id"], pa.string()), CustomField("title", lambda x: x["id"], pa.string()), ] - - self.custom_field_anno = [CustomField("seq", lambda x: x["params"]["seq"], pa.string())] + + self.custom_field_anno = [ + CustomField("seq", lambda x: x["params"]["seq"], pa.string()) + ] def make_dataset(self, paths, annotated, mode, shuffle): """ @@ -147,40 +163,49 @@ def make_dataset(self, paths, annotated, mode, shuffle): True if peptide sequence annotations are available for the test data. mode: str {"train", "valid", "test"} - The mode indicating name of lance instance + The mode indicating name of lance instance shuffle: bool Indicates whether to shuffle training data based on buffer_size """ custom_fields = self.custom_field_anno if annotated else [] - - if mode=="test": - if all([Path(f).suffix in ('.mgf') for f in paths]): + + if mode == "test": + if all([Path(f).suffix in (".mgf") for f in paths]): custom_fields = custom_fields + self.custom_field_test_mgf - if all([Path(f).suffix in (".mzml", ".mzxml", '.mzML') for f in paths]): + if all( + [Path(f).suffix in (".mzml", ".mzxml", ".mzML") for f in paths] + ): custom_fields = custom_fields + self.custom_field_test_mzml - - lance_path = f'{self.lance_dir}/{mode}.lance' - + + lance_path = f"{self.lance_dir}/{mode}.lance" + parse_kwargs = dict( preprocessing_fn=self.preprocessing_fn, custom_fields=custom_fields, valid_charge=self.valid_charge, - ) dataset_params = dict( - batch_size=self.train_batch_size if mode=="train" else self.eval_batch_size + batch_size=( + self.train_batch_size + if mode == "train" + else self.eval_batch_size + ) ) anno_dataset_params = dataset_params | dict( tokenizer=self.tokenizer, - annotations='seq', + annotations="seq", ) if any([Path(f).suffix in (".lance") for f in paths]): if annotated: - dataset = AnnotatedSpectrumDataset.from_lance(paths[0], **anno_dataset_params) + dataset = AnnotatedSpectrumDataset.from_lance( + paths[0], **anno_dataset_params + ) else: - dataset = SpectrumDataset.from_lance(paths[0], **dataset_params) + dataset = SpectrumDataset.from_lance( + paths[0], **dataset_params + ) else: if annotated: dataset = AnnotatedSpectrumDataset( @@ -196,11 +221,10 @@ def make_dataset(self, paths, annotated, mode, shuffle): parse_kwargs=parse_kwargs, **dataset_params, ) - + if shuffle: dataset = ShufflerIterDataPipe( - dataset, - buffer_size=self.buffer_size + dataset, buffer_size=self.buffer_size ) return dataset @@ -221,21 +245,25 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: if stage in (None, "fit", "validate"): if self.train_paths is not 
None: self.train_dataset = self.make_dataset( - self.train_paths, annotated=True, - mode='train', shuffle=self.shuffle + self.train_paths, + annotated=True, + mode="train", + shuffle=self.shuffle, ) if self.valid_paths is not None: self.valid_dataset = self.make_dataset( - self.valid_paths, annotated=True, - mode='valid', shuffle=False + self.valid_paths, + annotated=True, + mode="valid", + shuffle=False, ) if stage in (None, "test"): if self.test_paths is not None: self.test_dataset = self.make_dataset( self.test_paths, annotated=annotated, - mode='test', - shuffle=False + mode="test", + shuffle=False, ) def _make_loader( @@ -244,7 +272,7 @@ def _make_loader( shuffle: Optional[bool] = None, ) -> torch.utils.data.DataLoader: """ - Create a PyTorch DataLoader. + Create a PyTorch DataLoader. Parameters ---------- dataset : torch.utils.data.Dataset @@ -265,13 +293,13 @@ def _make_loader( dataset, shuffle=shuffle, num_workers=0, # self.n_workers, - #precision=torch.float32, + # precision=torch.float32, pin_memory=True, ) def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" - return self._make_loader(self.train_dataset, self.shuffle ) + return self._make_loader(self.train_dataset, self.shuffle) def val_dataloader(self) -> torch.utils.data.DataLoader: """Get the validation DataLoader.""" @@ -302,7 +330,6 @@ def scale_to_unit_norm(spectrum): slightly differing from the depthcharge implementation """ spectrum._inner._intensity = spectrum.intensity / np.linalg.norm( - spectrum.intensity - ) + spectrum.intensity + ) return spectrum - diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 04c3d0a5..9f0084bc 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -20,6 +20,7 @@ logger = logging.getLogger("casanovo") + class Spec2Pep(pl.LightningModule): """ A Transformer model for de novo peptide sequencing. @@ -124,8 +125,10 @@ def __init__( super().__init__() self.save_hyperparameters() - self.tokenizer = tokenizer if tokenizer is not None else PeptideTokenizer() - self.vocab_size = len(self.tokenizer) + 1 + self.tokenizer = ( + tokenizer if tokenizer is not None else PeptideTokenizer() + ) + self.vocab_size = len(self.tokenizer) + 1 # Build the model. self.encoder = SpectrumEncoder( d_model=dim_model, @@ -144,7 +147,7 @@ def __init__( max_charge=max_charge, ) self.softmax = torch.nn.Softmax(2) - ignore_index = 0 + ignore_index = 0 self.celoss = torch.nn.CrossEntropyLoss( ignore_index=ignore_index, label_smoothing=train_label_smoothing ) @@ -171,7 +174,7 @@ def __init__( self.min_peptide_len = min_peptide_len self.n_beams = n_beams self.top_match = top_match - + self.stop_token = self.tokenizer.stop_int # Logging. @@ -201,9 +204,9 @@ def forward( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] - A batch of (i) m/z values of MS/MS spectra, + A batch of (i) m/z values of MS/MS spectra, (ii) intensity values of MS/MS spectra, - (iii) precursor information, + (iii) precursor information, (iv) peptide sequences as torch Tensors. Returns @@ -215,7 +218,7 @@ def forward( sequence. """ mzs, ints, precursors, _ = self._process_batch(batch) - return self.beam_search_decode(mzs, ints, precursors) + return self.beam_search_decode(mzs, ints, precursors) def beam_search_decode( self, mzs: torch.Tensor, ints: torch.Tensor, precursors: torch.Tensor @@ -252,31 +255,31 @@ def beam_search_decode( # Sizes. 
batch = mzs.shape[0] # B length = self.max_length + 1 # L - vocab = self.vocab_size # V + vocab = self.vocab_size # V beam = self.n_beams # S # Initialize scores and tokens. scores = torch.full( size=(batch, length, vocab, beam), fill_value=torch.nan ).type_as(mzs) - - tokens = torch.zeros(batch, length, beam, - dtype=torch.int64, - device=self.encoder.device) - + + tokens = torch.zeros( + batch, length, beam, dtype=torch.int64, device=self.encoder.device + ) + # Create cache for decoded beams. pred_cache = collections.OrderedDict((i, []) for i in range(batch)) # Get the first prediction. pred = self.decoder( - tokens=torch.zeros(batch, 0, - dtype=torch.int64, - device=self.encoder.device), - memory=memories, - memory_key_padding_mask=mem_masks, - precursors=precursors + tokens=torch.zeros( + batch, 0, dtype=torch.int64, device=self.encoder.device + ), + memory=memories, + memory_key_padding_mask=mem_masks, + precursors=precursors, ) - tokens[:, 0, :] = torch.topk(pred[:, 0, :], beam, dim=1)[1] + tokens[:, 0, :] = torch.topk(pred[:, 0, :], beam, dim=1)[1] scores[:, :1, :, :] = einops.repeat(pred, "B L V -> B L V S", S=beam) # Make all tensors the right shape for decoding. @@ -314,7 +317,7 @@ def beam_search_decode( if finished_beams.all(): break # Update the scores. - scores[~finished_beams, : step + 2, :]= self.decoder( + scores[~finished_beams, : step + 2, :] = self.decoder( tokens=tokens[~finished_beams, : step + 1], precursors=precursors[~finished_beams, :], memory=memories[~finished_beams, :, :], @@ -326,7 +329,7 @@ def beam_search_decode( tokens, scores, finished_beams, batch, step + 1 ) tokens = tokens - + # Return the peptide with the highest confidence score, within the # precursor m/z tolerance if possible. return list(self._get_top_peptide(pred_cache)) @@ -372,16 +375,16 @@ def _finish_beams( if mass < 0: # aa_neg_mass.append(aa) aa_neg_mass_idx.append(self.tokenizer.index[aa]) - + # Find N-terminal residues. n_term = torch.Tensor( [ self.tokenizer.index[aa] for aa in self.tokenizer.index - if aa.startswith(("+", "-",'[+', '[-')) + if aa.startswith(("+", "-", "[+", "[-")) ] ).to(self.decoder.device) - + beam_fits_precursor = torch.zeros( tokens.shape[0], dtype=torch.bool ).to(self.encoder.device) @@ -394,10 +397,10 @@ def _finish_beams( finished_beams[ends_stop_token] = True # Beams with a dummy token predicted in the current step can be # discarded. - discarded_beams = torch.zeros( - tokens.shape[0], dtype=torch.bool - ).to(self.encoder.device) - + discarded_beams = torch.zeros(tokens.shape[0], dtype=torch.bool).to( + self.encoder.device + ) + discarded_beams[tokens[:, step] == 0] = True # Discard beams with invalid modification combinations (i.e. # N-terminal modifications occur multiple times or in internal @@ -426,12 +429,15 @@ def _finish_beams( continue pred_tokens = tokens[i][: step + 1] peptide_len = len(pred_tokens) - + # Omit stop token. 
if self.tokenizer.reverse and pred_tokens[0] == self.stop_token: pred_tokens = pred_tokens[1:] peptide_len -= 1 - elif not self.tokenizer.reverse and pred_tokens[-1] == self.stop_token: + elif ( + not self.tokenizer.reverse + and pred_tokens[-1] == self.stop_token + ): pred_tokens = pred_tokens[:-1] peptide_len -= 1 # Discard beams that were predicted to end but don't fit the @@ -446,27 +452,28 @@ def _finish_beams( precursor_charge = precursors[i, 1] precursor_mz = precursors[i, 2] matches_precursor_mz = exceeds_precursor_mz = False - + # Send tokenizer masses to correct device for calculate_precursor_ions() self.tokenizer.masses = self.tokenizer.masses.type_as(precursor_mz) - + for aa in [None] if finished_beams[i] else aa_neg_mass_idx: if aa is None: calc_peptide = pred_tokens else: calc_peptide = pred_tokens.detach().clone() calc_peptide = torch.cat( - (calc_peptide, - torch.tensor([aa]).type_as(calc_peptide) + ( + calc_peptide, + torch.tensor([aa]).type_as(calc_peptide), ) ) try: - + calc_mz = self.tokenizer.calculate_precursor_ions( calc_peptide.unsqueeze(0), - precursor_charge.unsqueeze(0) + precursor_charge.unsqueeze(0), )[0] - + delta_mass_ppm = [ _calc_mass_error( calc_mz, @@ -639,7 +646,7 @@ def _get_topk_beams( all spectra. """ beam = self.n_beams # S - vocab = self.vocab_size # V + vocab = self.vocab_size # V # Reshape to group by spectrum (B for "batch"). tokens = einops.rearrange(tokens, "(B S) L -> B L S", S=beam) @@ -736,10 +743,10 @@ def _get_top_peptide( yield [] def _process_batch(self, batch): - """ Prepare batch returned from AnnotatedSpectrumDataset of the + """Prepare batch returned from AnnotatedSpectrumDataset of the latest depthcharge version - Each batch is a dict and contains these keys: + Each batch is a dict and contains these keys: ['peak_file', 'scan_id', 'ms_level', 'precursor_mz', 'precursor_charge', 'mz_array', 'intensity_array', 'seq'] @@ -759,20 +766,21 @@ def _process_batch(self, batch): # Squeeze torch tensors in first dimension for k in batch.keys(): try: - batch[k]= batch[k].squeeze(0) + batch[k] = batch[k].squeeze(0) except: continue precursor_mzs = batch["precursor_mz"] precursor_charges = batch["precursor_charge"] precursor_masses = (precursor_mzs - 1.007276) * precursor_charges - precursors = torch.vstack([precursor_masses, - precursor_charges, precursor_mzs] ).T #.float() + precursors = torch.vstack( + [precursor_masses, precursor_charges, precursor_mzs] + ).T # .float() + + mzs, ints = batch["mz_array"], batch["intensity_array"] + # spectra = torch.stack([mzs, ints], dim=2) - mzs, ints = batch['mz_array'], batch['intensity_array'] - #spectra = torch.stack([mzs, ints], dim=2) - - seqs = batch['seq'] if "seq" in batch else None + seqs = batch["seq"] if "seq" in batch else None return mzs, ints, precursors, seqs @@ -786,9 +794,9 @@ def _forward_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] - A batch of (i) m/z values of MS/MS spectra, + A batch of (i) m/z values of MS/MS spectra, (ii) intensity values of MS/MS spectra, - (iii) precursor information, + (iii) precursor information, (iv) peptide sequences as torch Tensors. 
Returns @@ -802,9 +810,9 @@ def _forward_step( memories, mem_masks = self.encoder(mzs, ints) decoded = self.decoder( tokens=tokens, - memory=memories, - memory_key_padding_mask=mem_masks, - precursors=precursors + memory=memories, + memory_key_padding_mask=mem_masks, + precursors=precursors, ) return decoded, tokens @@ -820,9 +828,9 @@ def training_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] - A batch of (i) m/z values of MS/MS spectra, + A batch of (i) m/z values of MS/MS spectra, (ii) intensity values of MS/MS spectra, - (iii) precursor information, + (iii) precursor information, (iv) peptide sequences as torch Tensors. mode : str Logging key to describe the current stage. @@ -834,7 +842,7 @@ def training_step( """ pred, truth = self._forward_step(batch) pred = pred[:, :-1, :].reshape(-1, self.vocab_size) - + if mode == "train": loss = self.celoss(pred, truth.flatten()) else: @@ -845,7 +853,7 @@ def training_step( on_step=False, on_epoch=True, sync_dist=True, - batch_size=pred.shape[0] + batch_size=pred.shape[0], ) return loss @@ -858,9 +866,9 @@ def validation_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] - A batch of (i) m/z values of MS/MS spectra, + A batch of (i) m/z values of MS/MS spectra, (ii) intensity values of MS/MS spectra, - (iii) precursor information, + (iii) precursor information, (iv) peptide sequences as torch Tensors. Returns @@ -875,12 +883,18 @@ def validation_step( # Calculate and log amino acid and peptide match evaluation metrics from # the predicted peptides. - peptides_true = [''.join(p) for p in self.tokenizer.detokenize(batch['seq'], join=False)] + peptides_true = [ + "".join(p) + for p in self.tokenizer.detokenize(batch["seq"], join=False) + ] peptides_pred = [] for spectrum_preds in self.forward(batch): for _, _, pred in spectrum_preds: peptides_pred.append(pred) - peptides_pred = [''.join(p) for p in self.tokenizer.detokenize(peptides_pred, join=False)] + peptides_pred = [ + "".join(p) + for p in self.tokenizer.detokenize(peptides_pred, join=False) + ] batch_size = len(peptides_true) aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( @@ -889,23 +903,16 @@ def validation_step( self.tokenizer.residues, ) ) - + log_args = dict(on_step=False, on_epoch=True, sync_dist=True) self.log( - "pep_precision", - pep_precision, - **log_args, - batch_size=batch_size + "pep_precision", pep_precision, **log_args, batch_size=batch_size ) self.log( - "aa_precision", - aa_precision, - **log_args, - batch_size=batch_size + "aa_precision", aa_precision, **log_args, batch_size=batch_size ) return loss - def predict_step( self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], *args ) -> List[ms_io.PepSpecMatch]: @@ -915,9 +922,9 @@ def predict_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[str]] - A batch of (i) m/z values of MS/MS spectra, + A batch of (i) m/z values of MS/MS spectra, (ii) intensity values of MS/MS spectra, - (iii) precursor information, + (iii) precursor information, (iv) peptide sequences as torch Tensors. 
Returns @@ -928,8 +935,12 @@ def predict_step( _, _, precursors, true_seqs = self._process_batch(batch) true_seqs = ( - [''.join(p) for p in self.tokenizer.detokenize(true_seqs, join=False)] - if true_seqs is not None else ['']*precursors.shape[0] + [ + "".join(p) + for p in self.tokenizer.detokenize(true_seqs, join=False) + ] + if true_seqs is not None + else [""] * precursors.shape[0] ) prec_charges = precursors[:, 1].cpu().detach().numpy() @@ -951,7 +962,7 @@ def predict_step( batch["title"], batch["peak_file"], true_seqs, - self.forward(batch) + self.forward(batch), ): for peptide_score, aa_scores, peptide in spectrum_preds: predictions.append( @@ -964,7 +975,7 @@ def predict_step( aa_scores, file_name, true_seq, - title + title, ) ) @@ -975,7 +986,9 @@ def on_train_epoch_end(self) -> None: Log the training loss at the end of each epoch. """ if "train_CELoss" in self.trainer.callback_metrics: - train_loss = self.trainer.callback_metrics["train_CELoss"].detach().item() + train_loss = ( + self.trainer.callback_metrics["train_CELoss"].detach().item() + ) else: train_loss = np.nan metrics = { @@ -1000,9 +1013,7 @@ def on_validation_epoch_end(self) -> None: callback_metrics["aa_precision"].detach().item() ) metrics["valid_pep_precision"] = ( - callback_metrics["pep_precision"] - .detach() - .item() + callback_metrics["pep_precision"].detach().item() ) self._history.append(metrics) self._log_history() @@ -1026,17 +1037,16 @@ def on_predict_batch_end( aa_scores, file_name, true_seq, - title + title, ) in outputs: if len(peptide) == 0: continue # Compute mass and detokenize calc_mass = self.tokenizer.calculate_precursor_ions( - peptide.unsqueeze(0), - torch.tensor([charge]).type_as(peptide) + peptide.unsqueeze(0), torch.tensor([charge]).type_as(peptide) )[0] - peptide = ''.join( + peptide = "".join( self.tokenizer.detokenize(peptide.unsqueeze(0), join=False)[0] ) @@ -1051,14 +1061,17 @@ def on_predict_batch_end( ",".join(list(map("{:.5f}".format, aa_scores))), file_name, true_seq, - title + title, ), ) def on_train_start(self): """Log optimizer settings.""" self.log("hp/optimizer_warmup_iters", self.warmup_iters) - self.log("hp/optimizer_cosine_schedule_period_iters", self.cosine_schedule_period_iters) + self.log( + "hp/optimizer_cosine_schedule_period_iters", + self.cosine_schedule_period_iters, + ) def _log_history(self) -> None: """ @@ -1371,6 +1384,7 @@ def _aa_pep_score( peptide_score -= 1 return aa_scores, peptide_score + def generate_tgt_mask(sz: int) -> torch.Tensor: """Generate a square mask for the sequence. diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 3c06b477..d8abcb3b 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -15,7 +15,11 @@ import torch from lightning.pytorch.strategies import DDPStrategy -from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping +from lightning.pytorch.callbacks import ( + ModelCheckpoint, + LearningRateMonitor, + EarlyStopping, +) from lightning.pytorch.loggers import TensorBoardLogger from depthcharge.tokenizers import PeptideTokenizer @@ -97,6 +101,7 @@ def __init__( ) # Configure checkpoints. 
+<<<<<<< HEAD self.callbacks = [ ModelCheckpoint( dirpath=output_dir, @@ -112,6 +117,37 @@ def __init__( ), LearningRateMonitor(log_momentum=True, log_weight_decay=True), ] +======= + if config.save_top_k is not None: + self.callbacks = [ + ModelCheckpoint( + dirpath=config.model_save_folder_path, + monitor="valid_CELoss", + mode="min", + save_top_k=config.save_top_k, + auto_insert_metric_name=True, + filename="{epoch}-{step}-{train_CELoss:.3f}-{valid_CELoss:.3f}", + save_last=True, + ) + ] + # Configure early stopping + if config.early_stopping_patience is not None: + self.callbacks.append( + EarlyStopping( + monitor="valid_CELoss", + min_delta=0.00, + patience=self.config.early_stopping_patience, + verbose=True, + check_finite=True, + mode="min", + ) + ) + # Configure learning rate monitor + if config.tb_summarywriter is not None: + self.callbacks.append( + LearningRateMonitor(logging_interval="step", log_momentum=True) + ) +>>>>>>> c21c899 (Reformat with Black) def __enter__(self): """Enter the context manager""" @@ -196,13 +232,13 @@ def train( valid_paths = self._get_input_paths(valid_peak_path, True, "valid") self.initialize_data_module(train_paths, valid_paths) self.loaders.setup() - #logger.info(f'TRAIN PSMs: {self.loaders.train_dataset.n_spectra}') - #logger.info(f'VAL PSMs: {self.loaders.valid_dataset.n_spectra}') + # logger.info(f'TRAIN PSMs: {self.loaders.train_dataset.n_spectra}') + # logger.info(f'VAL PSMs: {self.loaders.valid_dataset.n_spectra}') self.trainer.fit( self.model, self.loaders.train_dataloader(), - self.loaders.val_dataloader() + self.loaders.val_dataloader(), ) def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: @@ -282,7 +318,11 @@ def predict( running model evaluation. Files that are not an annotated peak file format will be ignored if evaluate is set to true. 
""" +<<<<<<< HEAD self.writer = ms_io.MztabWriter(results_path) +======= + self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) +>>>>>>> c21c899 (Reformat with Black) self.writer.set_metadata( self.config, model=str(self.model_filename), @@ -317,7 +357,7 @@ def initialize_trainer(self, train: bool) -> None: devices=1, enable_checkpointing=False, precision=self.config.precision, - logger=False + logger=False, ) if train: @@ -328,14 +368,14 @@ def initialize_trainer(self, train: bool) -> None: if self.config.tb_summarywriter is not None: logger = TensorBoardLogger( - self.config.tb_summarywriter, + self.config.tb_summarywriter, version=None, name=f'model_{datetime.now().strftime("%Y%m%d_%H%M")}', - default_hp_metric=False + default_hp_metric=False, ) else: logger = False - + additional_cfg = dict( devices=devices, callbacks=self.callbacks, @@ -428,7 +468,7 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: weight_decay=self.config.weight_decay, out_writer=self.writer, calculate_precision=self.config.calculate_precision, - tokenizer=tokenizer + tokenizer=tokenizer, ) # Reconfigurable non-architecture related parameters for a @@ -510,18 +550,19 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: def initialize_tokenizer( self, - ) -> None : + ) -> None: """Initialize the peptide tokenizer""" if self.config.mskb_tokenizer: tokenizer_cs = MskbPeptideTokenizer else: tokenizer_cs = PeptideTokenizer - + self.tokenizer = tokenizer_cs( residues=self.config.residues, replace_isoleucine_with_leucine=self.config.replace_isoleucine_with_leucine, reverse=self.config.reverse_peptides, - start_token=None, stop_token='$' + start_token=None, + stop_token="$", ) def initialize_data_module( @@ -553,7 +594,11 @@ def initialize_data_module( except AttributeError: raise RuntimeError("Please use `initialize_tokenizer()` first.") - lance_dir = Path(self.tmp_dir.name) if self.config.lance_dir is None else self.config.lance_dir + lance_dir = ( + Path(self.tmp_dir.name) + if self.config.lance_dir is None + else self.config.lance_dir + ) self.loaders = DeNovoDataModule( train_paths=train_paths, valid_paths=valid_paths, @@ -595,7 +640,7 @@ def _get_input_paths( """ ext = (".mgf", ".lance") if not annotated: - ext += (".mzML", ".mzml", ".mzxml") # FIXME: Check if these work + ext += (".mzML", ".mzml", ".mzxml") # FIXME: Check if these work filenames = _get_peak_filenames(peak_path, ext) if not filenames: diff --git a/casanovo/denovo/transformers.py b/casanovo/denovo/transformers.py index 2e93cc8b..d0216b63 100644 --- a/casanovo/denovo/transformers.py +++ b/casanovo/denovo/transformers.py @@ -1,10 +1,14 @@ """Transformer encoder and decoder for the de novo sequencing task.""" + import torch from collections.abc import Callable from depthcharge.tokenizers import Tokenizer from depthcharge.encoders import PeakEncoder, FloatEncoder, PositionalEncoder -from depthcharge.transformers import SpectrumTransformerEncoder, AnalyteTransformerDecoder +from depthcharge.transformers import ( + SpectrumTransformerEncoder, + AnalyteTransformerDecoder, +) class PeptideDecoder(AnalyteTransformerDecoder): @@ -62,7 +66,7 @@ def __init__( self.charge_encoder = torch.nn.Embedding(max_charge, d_model) self.mass_encoder = FloatEncoder(d_model) - # override final layer: + # override final layer: # +1 in comparison to version in depthcharge to second dimension # This includes padding (=0) as a possible class # and avoids problems during beam search decoding @@ -138,8 +142,9 @@ def 
__init__( peak_encoder: PeakEncoder | Callable | bool = True, ): """Initialize a SpectrumEncoder""" - super().__init__(d_model, n_head, dim_feedforward, - n_layers, dropout, peak_encoder) + super().__init__( + d_model, n_head, dim_feedforward, n_layers, dropout, peak_encoder + ) self.latent_spectrum = torch.nn.Parameter(torch.randn(1, 1, d_model)) From 8771d786d458f28eb78d4a528855419baba46516 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sat, 27 Jul 2024 00:05:13 -0700 Subject: [PATCH 04/51] Fix formatting again after merge --- casanovo/denovo/model_runner.py | 36 +-------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index d8abcb3b..f7491cdf 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -101,7 +101,6 @@ def __init__( ) # Configure checkpoints. -<<<<<<< HEAD self.callbacks = [ ModelCheckpoint( dirpath=output_dir, @@ -117,37 +116,11 @@ def __init__( ), LearningRateMonitor(log_momentum=True, log_weight_decay=True), ] -======= - if config.save_top_k is not None: - self.callbacks = [ - ModelCheckpoint( - dirpath=config.model_save_folder_path, - monitor="valid_CELoss", - mode="min", - save_top_k=config.save_top_k, - auto_insert_metric_name=True, - filename="{epoch}-{step}-{train_CELoss:.3f}-{valid_CELoss:.3f}", - save_last=True, - ) - ] - # Configure early stopping - if config.early_stopping_patience is not None: - self.callbacks.append( - EarlyStopping( - monitor="valid_CELoss", - min_delta=0.00, - patience=self.config.early_stopping_patience, - verbose=True, - check_finite=True, - mode="min", - ) - ) - # Configure learning rate monitor + if config.tb_summarywriter is not None: self.callbacks.append( LearningRateMonitor(logging_interval="step", log_momentum=True) ) ->>>>>>> c21c899 (Reformat with Black) def __enter__(self): """Enter the context manager""" @@ -318,11 +291,7 @@ def predict( running model evaluation. Files that are not an annotated peak file format will be ignored if evaluate is set to true. """ -<<<<<<< HEAD self.writer = ms_io.MztabWriter(results_path) -======= - self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) ->>>>>>> c21c899 (Reformat with Black) self.writer.set_metadata( self.config, model=str(self.model_filename), @@ -340,9 +309,6 @@ def predict( self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.test_dataloader()) - if evaluate: - self.log_metrics(test_index) - def initialize_trainer(self, train: bool) -> None: """Initialize the lightning Trainer. From 7984bdc2446f488102b7978f4e74d409c71d8436 Mon Sep 17 00:00:00 2001 From: Daniela Klaproth-Andrade Date: Mon, 29 Jul 2024 18:09:21 +0200 Subject: [PATCH 05/51] Resolve requested changes --- casanovo/denovo/dataloaders.py | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 1cf088f9..c7e0e6dd 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -37,7 +37,7 @@ class DeNovoDataModule(pl.LightningDataModule): ---------- train_paths : str, optional A spectrum lance path for model training. - valid_pathas : str, optional + valid_paths : str, optional A spectrum lance path for validation. test_paths : str, optional A spectrum lance path for evaluation or inference. 
@@ -153,8 +153,8 @@ def __init__( ] def make_dataset(self, paths, annotated, mode, shuffle): - """ - Make spectrum datasets + """Make spectrum datasets. + Parameters ---------- paths : Iterable[str] diff --git a/pyproject.toml b/pyproject.toml index 5f6b8ae9..c8c29e0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "appdirs", "lightning>=2.1", "click", - "depthcharge-ms>=0.4.8 ", + "depthcharge-ms>=0.4.8,<0.5.0", "natsort", "numpy<2.0", "pandas", From f4b6ec6df9920373656138f03aa1347d239e3b33 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Mon, 29 Jul 2024 09:26:05 -0700 Subject: [PATCH 06/51] Reformat with Black --- casanovo/denovo/dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index c7e0e6dd..619d1c44 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -154,7 +154,7 @@ def __init__( def make_dataset(self, paths, annotated, mode, shuffle): """Make spectrum datasets. - + Parameters ---------- paths : Iterable[str] From 4ec36b3b525b8b4a26f42777e52c7f564b2c2e0d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 10 Sep 2024 09:15:29 -0700 Subject: [PATCH 07/51] removed invalid imports --- tests/unit_tests/test_unit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 00617457..a2372bb8 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -14,7 +14,6 @@ import unittest import unittest.mock -import depthcharge.masses import einops import github import numpy as np @@ -28,7 +27,9 @@ from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score -from depthcharge.data import SpectrumIndex, AnnotatedSpectrumIndex +from casanovo.data import ms_io +from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics +from casanovo.denovo.model import Spec2Pep, _aa_pep_score def test_version(): From 355edc652d2539e28dd84fa15aaed5fffbecd279 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 18 Sep 2024 14:45:29 -0700 Subject: [PATCH 08/51] removed to be added functionality (for now) --- casanovo/config.py | 2 -- casanovo/config.yaml | 4 ---- casanovo/denovo/dataloaders.py | 3 --- casanovo/denovo/model_runner.py | 1 - 4 files changed, 10 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index f802a292..69de80d1 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -92,8 +92,6 @@ class Config: gradient_clip_val=float, gradient_clip_algorithm=str, precision=str, - early_stopping_patience=int, - resume_training_from=str, mskb_tokenizer=bool, ) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index d3aaa064..196d6071 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -149,10 +149,6 @@ gradient_clip_val: gradient_clip_algorithm: precision: "32-true" # '16-true', '16-mixed', 'bf16-true', 'bf16-mixed', '32-true', '64-true', '64', '32', '16', 'bf16' -# Resume training and early stopping -resume_training_from : #'last', 'best', 'path' -early_stopping_patience: - # Replace I by L in peptide sequences replace_isoleucine_with_leucine: True # Reverse peptide sequences diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 619d1c44..f4d00470 100644 --- a/casanovo/denovo/dataloaders.py +++ 
b/casanovo/denovo/dataloaders.py @@ -1,10 +1,7 @@ """Data loaders for the de novo sequencing task.""" -<<<<<<< HEAD import functools import logging -======= ->>>>>>> c21c899 (Reformat with Black) import os from typing import Optional, Iterable from pathlib import Path diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index f7491cdf..6259e802 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -114,7 +114,6 @@ def __init__( filename=best_filename, enable_version_counter=False, ), - LearningRateMonitor(log_momentum=True, log_weight_decay=True), ] if config.tb_summarywriter is not None: From d224011f2f3bf7bc0c29e750136f0c060c39b7bd Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 18 Sep 2024 14:58:45 -0700 Subject: [PATCH 09/51] tensorboard logger --- casanovo/denovo/model_runner.py | 60 +++++++++++++-------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 6259e802..2228fa62 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -13,13 +13,10 @@ import lightning.pytorch as pl import lightning.pytorch.loggers import torch +import torch.utils.data from lightning.pytorch.strategies import DDPStrategy -from lightning.pytorch.callbacks import ( - ModelCheckpoint, - LearningRateMonitor, - EarlyStopping, -) +from lightning.pytorch.callbacks import ModelCheckpoint from lightning.pytorch.loggers import TensorBoardLogger from depthcharge.tokenizers import PeptideTokenizer @@ -213,8 +210,10 @@ def train( self.loaders.val_dataloader(), ) - def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: - """Log peptide precision and amino acid precision. + def log_metrics( + self, test_dataloader: torch.utils.data.DataLoader + ) -> None: + """Log peptide precision and amino acid precision Calculate and log peptide precision and amino acid precision based off of model predictions and spectrum annotations. @@ -222,32 +221,14 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Parameters ---------- test_index : AnnotatedSpectrumIndex - Index containing the annotated spectra used to generate - model predictions. 
- """ - seq_pred = [] - seq_true = [] - pred_idx = 0 - - with test_index as t_ind: - for true_idx in range(t_ind.n_spectra): - seq_true.append(t_ind[true_idx][4]) - if pred_idx < len(self.writer.psms) and self.writer.psms[ - pred_idx - ].spectrum_id == t_ind.get_spectrum_id(true_idx): - seq_pred.append(self.writer.psms[pred_idx].sequence) - pred_idx += 1 - else: - seq_pred.append(None) - self.initialize_trainer(train=False) - self.initialize_tokenizer() - self.initialize_model(train=False) + Index containing the annotated spectra used to generate model + predictions - test_paths = self._get_input_paths(peak_path, True, "test") - self.initialize_data_module(test_paths=test_paths) - self.loaders.setup(stage="test", annotated=True) - - aa_precision, aa_recall, pep_precision = aa_match_metrics( + model_output = [psm.sequence for psm in self.writer.psms] + spectrum_annotations = [ + test_index[i][4] for i in range(test_index.n_spectra) + ] + aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( seq_true, seq_pred, @@ -264,7 +245,9 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) - logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall) + """ + # TODO: Fix log_metrics, wait for eval bug fix to be merged in + return def predict( self, @@ -308,6 +291,9 @@ def predict( self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.test_dataloader()) + if evaluate: + self.log_metrics(self.loaders.test_dataloader()) + def initialize_trainer(self, train: bool) -> None: """Initialize the lightning Trainer. @@ -331,11 +317,11 @@ def initialize_trainer(self, train: bool) -> None: else: devices = self.config.devices - if self.config.tb_summarywriter is not None: + # TODO: CSV logger + if self.config.tb_summarywriter: logger = TensorBoardLogger( - self.config.tb_summarywriter, - version=None, - name=f'model_{datetime.now().strftime("%Y%m%d_%H%M")}', + self.output_dir, + version="tensorboard", default_hp_metric=False, ) else: From e6ac94e16a810282faa40528cc085b436a453592 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 18 Sep 2024 15:16:04 -0700 Subject: [PATCH 10/51] circular import bug --- casanovo/data/pep_spec_match.py | 41 +++++++++++++++++++++++++++++++++ casanovo/denovo/model.py | 14 ++++++++++- casanovo/utils.py | 4 ++++ 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 casanovo/data/pep_spec_match.py diff --git a/casanovo/data/pep_spec_match.py b/casanovo/data/pep_spec_match.py new file mode 100644 index 00000000..0dc3c48b --- /dev/null +++ b/casanovo/data/pep_spec_match.py @@ -0,0 +1,41 @@ +"""Peptide spectrum match dataclass""" + +import dataclasses +from typing import Tuple, Iterable + + +@dataclasses.dataclass +class PepSpecMatch: + """ + Peptide Spectrum Match (PSM) dataclass + + Parameters + ---------- + sequence : str + The amino acid sequence of the peptide. + spectrum_id : Tuple[str, str] + A tuple containing the spectrum identifier in the form + (spectrum file name, spectrum file idx) + peptide_score : float + Score of the match between the full peptide sequence and the + spectrum. + charge : int + The precursor charge state of the peptide ion observed in the spectrum. + calc_mz : float + The calculated mass-to-charge ratio (m/z) of the peptide based on its + sequence and charge state. 
+ exp_mz : float + The observed (experimental) precursor mass-to-charge ratio (m/z) of the + peptide as detected in the spectrum. + aa_scores : Iterable[float] + A list of scores for individual amino acids in the peptide + sequence, where len(aa_scores) == len(sequence) + """ + + sequence: str + spectrum_id: Tuple[str, str] + peptide_score: float + charge: int + calc_mz: float + exp_mz: float + aa_scores: Iterable[float] diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 9f0084bc..51b55efe 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -15,7 +15,7 @@ from . import evaluate from .. import config -from ..data import ms_io +from ..data import ms_io, pep_spec_match from ..denovo.transformers import SpectrumEncoder, PeptideDecoder logger = logging.getLogger("casanovo") @@ -1051,6 +1051,7 @@ def on_predict_batch_end( ) self.out_writer.psms.append( +<<<<<<< HEAD ( peptide, scan, @@ -1063,6 +1064,17 @@ def on_predict_batch_end( true_seq, title, ), +======= + pep_spec_match.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(charge), + calc_mz=precursor_mz, + exp_mz=self.peptide_mass_calculator.mass(peptide, charge), + aa_scores=aa_scores, + ) +>>>>>>> 5719cdc (circular import bug) ) def on_train_start(self): diff --git a/casanovo/utils.py b/casanovo/utils.py index 86e0748f..3be1b12e 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -15,7 +15,11 @@ import psutil import torch +<<<<<<< HEAD from .data.psm import PepSpecMatch +======= +from .data.pep_spec_match import PepSpecMatch +>>>>>>> 5719cdc (circular import bug) SCORE_BINS = (0.0, 0.5, 0.9, 0.95, 0.99) From 39de09825debef8c4727e2e51b19b8c45b95d266 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 18 Sep 2024 15:24:52 -0700 Subject: [PATCH 11/51] removed tensorboard unit tests --- tests/unit_tests/test_unit.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index a2372bb8..89c1234f 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -421,18 +421,6 @@ def test_is_valid_url(): assert not casanovo._is_valid_url("foobar") -def test_tensorboard(): - """ - Test that the tensorboard.SummaryWriter object is only created when a folder - path is passed. 
- """ - model = Spec2Pep(tb_summarywriter="test_path") - assert model.tb_summarywriter is not None - - model = Spec2Pep() - assert model.tb_summarywriter is None - - def test_aa_pep_score(): """ Test the calculation of amino acid and peptide scores from the raw amino From 97b8de74b027bf59f5d8a268f8a435f97c718fb0 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 18 Sep 2024 16:38:05 -0700 Subject: [PATCH 12/51] beam search decode unit tests (IP) --- casanovo/denovo/model.py | 1 - tests/unit_tests/test_unit.py | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 51b55efe..468e184d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -468,7 +468,6 @@ def _finish_beams( ) ) try: - calc_mz = self.tokenizer.calculate_precursor_ions( calc_peptide.unsqueeze(0), precursor_charge.unsqueeze(0), diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 89c1234f..8cc9eba4 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1218,12 +1218,11 @@ def test_beam_search_decode(): """ model = Spec2Pep(n_beams=4, residues="massivekb", min_peptide_len=4) model.decoder.reverse = False # For simplicity. - aa2idx = model.decoder._aa2idx # Sizes. batch = 1 # B - length = model.max_peptide_len + 1 # L - vocab = model.decoder.vocab_size + 1 # V + length = model.max_length + 1 # L + vocab = len(model.tokenizer) + 1 # V beam = model.n_beams # S step = 3 @@ -1244,7 +1243,9 @@ def test_beam_search_decode(): # Fill scores and tokens with relevant predictions. scores[:, : step + 1, :] = 0 for i, peptide in enumerate(["PEPK", "PEPR", "PEPG", "PEP$"]): - tokens[i, : step + 1] = torch.tensor([aa2idx[aa] for aa in peptide]) + tokens[i, : step + 1] = model.decoder.token_encoder( + [aa for aa in peptide] + ) for j in range(step + 1): scores[i, j, tokens[1, j]] = 1 From 2ee2845a426f2323d519953b10e0392127e6f999 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 19 Sep 2024 13:23:25 -0700 Subject: [PATCH 13/51] teast_beam_search decode test update --- tests/unit_tests/test_unit.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 8cc9eba4..1d983924 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1243,9 +1243,7 @@ def test_beam_search_decode(): # Fill scores and tokens with relevant predictions. 
scores[:, : step + 1, :] = 0 for i, peptide in enumerate(["PEPK", "PEPR", "PEPG", "PEP$"]): - tokens[i, : step + 1] = model.decoder.token_encoder( - [aa for aa in peptide] - ) + tokens[i, : step + 1] = model.tokenizer.tokenize(peptide)[0] for j in range(step + 1): scores[i, j, tokens[1, j]] = 1 From 9b9349da16232c246ba2bb7a5fcccb4e6051607c Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 19 Sep 2024 15:29:18 -0700 Subject: [PATCH 14/51] test_eval_metrics test update --- tests/unit_tests/test_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 1d983924..bc76f2cf 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1596,7 +1596,7 @@ def test_eval_metrics(): aa_matches, n_pred_aa, n_gt_aa = aa_match_batch( peptides1=preds, peptides2=gt, - aa_dict=model.decoder._peptide_mass.masses, + aa_dict=model.tokenizer.residues, mode="best", ) From 0295493cfb255b70c3f5d330aec9fe41cb68b57c Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 20 Sep 2024 13:21:33 -0700 Subject: [PATCH 15/51] unit tests updates --- tests/unit_tests/test_unit.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index bc76f2cf..3bfe8867 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -4,6 +4,7 @@ import hashlib import heapq import io +import itertools import os import pathlib import platform @@ -14,6 +15,7 @@ import unittest import unittest.mock +import depthcharge import einops import github import numpy as np @@ -1631,24 +1633,21 @@ def test_spectrum_id_mgf(mgf_small, tmp_path): mgf_small2 = tmp_path / "mgf_small2.mgf" shutil.copy(mgf_small, mgf_small2) - for index_func, dataset_func in [ - (SpectrumIndex, SpectrumDataset), - (AnnotatedSpectrumIndex, AnnotatedSpectrumDataset), + for dataset_func in [ + depthcharge.data.SpectrumDataset, + depthcharge.data.AnnotatedSpectrumDataset, ]: - index = index_func( - tmp_path / "index.hdf5", [mgf_small, mgf_small2], overwrite=True - ) - dataset = dataset_func(index) - for i, (filename, mgf_i) in enumerate( + dataset = dataset_func([mgf_small, mgf_small2], 1) + for i, (filename, scan_id) in enumerate( [ - (mgf_small, 0), - (mgf_small, 1), - (mgf_small2, 0), - (mgf_small2, 1), + (mgf_small, "0"), + (mgf_small, "1"), + (mgf_small2, "0"), + (mgf_small2, "1"), ] ): - spectrum_id = str(filename), f"index={mgf_i}" - assert dataset.get_spectrum_id(i) == spectrum_id + assert dataset[i]["peak_file"][0] == filename.name + assert dataset[i]["scan_id"][0] == scan_id def test_spectrum_id_mzml(mzml_small, tmp_path): From 3d1c20f7c811ed710cef70959a373f9aefe3e4fc Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 24 Sep 2024 09:19:26 -0700 Subject: [PATCH 16/51] spectrum id unit tests --- casanovo/denovo/model.py | 2 ++ tests/unit_tests/test_unit.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 468e184d..6f31ea49 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -10,6 +10,7 @@ import torch import numpy as np import lightning.pytorch as pl +from torch.utils.tensorboard import SummaryWriter from depthcharge.tokenizers import PeptideTokenizer @@ -120,6 +121,7 @@ def __init__( out_writer: Optional[ms_io.MztabWriter] = None, calculate_precision: bool = False, tokenizer: Optional[PeptideTokenizer] = None, + tb_summarywriter: 
Optional[SummaryWriter] = None, # TODO **kwargs: Dict, ): super().__init__() diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 3bfe8867..7f9c0b12 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -30,6 +30,7 @@ from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score from casanovo.data import ms_io +from casanovo.denovo.dataloaders import DeNovoDataModule from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score @@ -1632,12 +1633,18 @@ def test_spectrum_id_mgf(mgf_small, tmp_path): """Test that spectra from MGF files are specified by their index.""" mgf_small2 = tmp_path / "mgf_small2.mgf" shutil.copy(mgf_small, mgf_small2) + data_module = DeNovoDataModule( + train_paths=[mgf_small, mgf_small2], + valid_paths=[mgf_small, mgf_small2], + test_paths=[mgf_small, mgf_small2], + ) + data_module.setup() - for dataset_func in [ - depthcharge.data.SpectrumDataset, - depthcharge.data.AnnotatedSpectrumDataset, + for dataset in [ + data_module.train_dataset, + data_module.valid_dataset, + data_module.test_dataset, ]: - dataset = dataset_func([mgf_small, mgf_small2], 1) for i, (filename, scan_id) in enumerate( [ (mgf_small, "0"), From 3ec8d7c371018086ae5a116c185ef3874621a487 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 24 Sep 2024 14:46:30 -0700 Subject: [PATCH 17/51] integration test fix --- tests/conftest.py | 10 ++++++++++ tests/test_integration.py | 2 -- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a35c5834..c671c83e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -291,6 +291,16 @@ def tiny_config(tmp_path): "train_batch_size": 32, "num_sanity_val_steps": 0, "calculate_precision": False, + "lance_dir": None, + "shuffle": False, + "buffer_size": 64, + "accumulate_grad_batches": 1, + "gradient_clip_val": None, + "gradient_clip_algorithm": None, + "precision": "32-true", + "replace_isoleucine_with_leucine": False, + "reverse_peptides": False, + "mskb_tokenizer": True, "residues": { "G": 57.021464, "A": 71.037114, diff --git a/tests/test_integration.py b/tests/test_integration.py index 7dab1b5b..a0ab75eb 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -92,7 +92,6 @@ def test_train_and_run( # Train a tiny model: train_args = [ "train", - "--validation_peak_path", str(mgf_small), "--config", tiny_config, @@ -100,7 +99,6 @@ def test_train_and_run( str(tmp_path), "--output_root", "train", - str(mgf_small), # The training files. ] result = run(train_args) From 9b8efeaf385b7f94602fe7d2e32e7f348895e5cc Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 25 Sep 2024 15:54:25 -0700 Subject: [PATCH 18/51] model prediction io flow fixes --- casanovo/denovo/model.py | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 6f31ea49..128c0186 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -934,16 +934,7 @@ def predict_step( Predicted PSMs for the given batch of spectra. 
""" - _, _, precursors, true_seqs = self._process_batch(batch) - true_seqs = ( - [ - "".join(p) - for p in self.tokenizer.detokenize(true_seqs, join=False) - ] - if true_seqs is not None - else [""] * precursors.shape[0] - ) - + _, _, precursors, _ = self._process_batch(batch) prec_charges = precursors[:, 1].cpu().detach().numpy() prec_mzs = precursors[:, 2].cpu().detach().numpy() @@ -952,31 +943,25 @@ def predict_step( precursor_charge, precursor_mz, scan, - title, file_name, - true_seq, spectrum_preds, ) in zip( prec_charges, prec_mzs, - batch["scans"], - batch["title"], + batch["scan_id"], batch["peak_file"], - true_seqs, self.forward(batch), ): for peptide_score, aa_scores, peptide in spectrum_preds: predictions.append( ( - scan, + scan[0], precursor_charge, precursor_mz, peptide, peptide_score, aa_scores, - file_name, - true_seq, - title, + file_name[0], ) ) @@ -1037,8 +1022,6 @@ def on_predict_batch_end( peptide_score, aa_scores, file_name, - true_seq, - title, ) in outputs: if len(peptide) == 0: continue @@ -1068,11 +1051,11 @@ def on_predict_batch_end( ======= pep_spec_match.PepSpecMatch( sequence=peptide, - spectrum_id=tuple(spectrum_i), + spectrum_id=(file_name, scan), peptide_score=peptide_score, charge=int(charge), calc_mz=precursor_mz, - exp_mz=self.peptide_mass_calculator.mass(peptide, charge), + exp_mz=calc_mass, aa_scores=aa_scores, ) >>>>>>> 5719cdc (circular import bug) From 47df27ede5bb127fb8e4ce11d2c267764abcdd38 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 25 Sep 2024 16:47:13 -0700 Subject: [PATCH 19/51] PyLightning logging refactor --- casanovo/denovo/model_runner.py | 83 +++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 2228fa62..7f4d634b 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -317,15 +317,55 @@ def initialize_trainer(self, train: bool) -> None: else: devices = self.config.devices - # TODO: CSV logger - if self.config.tb_summarywriter: - logger = TensorBoardLogger( - self.output_dir, - version="tensorboard", - default_hp_metric=False, - ) - else: - logger = False + # Configure loggers + logger = False + if self.config.log_metrics or self.config.tb_summarywriter: + if not self.output_dir: + logger.warning( + "Output directory not set in model runner. " + "No loss file or tensorboard will be created." + ) + else: + logger = [] + csv_log_dir = "csv_logs" + tb_log_dir = "tensorboard" + + if self.config.log_metrics: + if self.overwrite_ckpt_check: + utils.check_dir_file_exists( + self.output_dir, + csv_log_dir, + ) + + logger.append( + lightning.pytorch.loggers.CSVLogger( + self.output_dir, + version=csv_log_dir, + name=None, + ) + ) + + if self.config.tb_summarywriter: + if self.overwrite_ckpt_check: + utils.check_dir_file_exists( + self.output_dir, + tb_log_dir, + ) + + logger.append( + lightning.pytorch.loggers.TensorBoardLogger( + self.output_dir, + version=tb_log_dir, + name=None, + ) + ) + + if len(logger) > 0: + self.callbacks.append( + LearningRateMonitor( + log_momentum=True, log_weight_decay=True + ), + ) additional_cfg = dict( devices=devices, @@ -342,31 +382,6 @@ def initialize_trainer(self, train: bool) -> None: gradient_clip_algorithm=self.config.gradient_clip_algorithm, ) - if self.config.log_metrics: - if not self.output_dir: - logger.warning( - "Output directory not set in model runner. " - "No loss file will be created." 
- ) - else: - csv_log_dir = "csv_logs" - if self.overwrite_ckpt_check: - utils.check_dir_file_exists( - self.output_dir, - csv_log_dir, - ) - - additional_cfg.update( - { - "logger": lightning.pytorch.loggers.CSVLogger( - self.output_dir, - version=csv_log_dir, - name=None, - ), - "log_every_n_steps": self.config.log_every_n_steps, - } - ) - trainer_cfg.update(additional_cfg) self.trainer = pl.Trainer(**trainer_cfg) From 45b3e2660833531c7d63afd80e9ada9c3baee418 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 27 Sep 2024 11:02:09 -0700 Subject: [PATCH 20/51] mgf file reader title field formatting --- casanovo/denovo/dataloaders.py | 4 +++- casanovo/utils.py | 4 ---- tests/conftest.py | 7 +++---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index f4d00470..4f701838 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -138,7 +138,9 @@ def __init__( ), pa.string(), ), - CustomField("title", lambda x: x["params"]["title"], pa.string()), + CustomField( + "title", lambda x: f"index={x['params']['title']}", pa.string() + ), ] self.custom_field_test_mzml = [ CustomField("scans", lambda x: x["id"], pa.string()), diff --git a/casanovo/utils.py b/casanovo/utils.py index 3be1b12e..86e0748f 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -15,11 +15,7 @@ import psutil import torch -<<<<<<< HEAD from .data.psm import PepSpecMatch -======= -from .data.pep_spec_match import PepSpecMatch ->>>>>>> 5719cdc (circular import bug) SCORE_BINS = (0.0, 0.5, 0.9, 0.95, 0.99) diff --git a/tests/conftest.py b/tests/conftest.py index c671c83e..d3314396 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -122,15 +122,14 @@ def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None, annotate=True): mgf = [ "BEGIN IONS", + f"TITLE={title}", + f"SEQ={peptide}", f"PEPMASS={precursor_mz}", f"CHARGE={charge}+", + f"SCANS=F1:{2470 + title}", f"{frags}", "END IONS", ] - - if annotate: - mgf.insert(1, f"SEQ={peptide}") - return "\n".join(mgf) From a1b42af3ce072e9b38b55969c6d14e14102fbbac Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 30 Sep 2024 15:20:37 -0700 Subject: [PATCH 21/51] integration tests fix --- casanovo/data/ms_io.py | 4 ++-- casanovo/denovo/dataloaders.py | 2 +- tests/conftest.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 62d7a905..f419bdd4 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -142,7 +142,7 @@ def set_ms_run(self, peak_filenames: List[str]) -> None: self.metadata.append( (f"ms_run[{i}]-location", Path(filename).as_uri()), ) - self._run_map[os.path.basename(filename)] = i + self._run_map[Path(filename).name] = i def save(self) -> None: """ @@ -184,7 +184,7 @@ def save(self) -> None: ), 1, ): - filename = os.path.abspath(psm.spectrum_id[0]) + filename = psm.spectrum_id[0] idx = psm.spectrum_id[1] writer.writerow( [ diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4f701838..ed7ca5ba 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -134,7 +134,7 @@ def __init__( lambda x: ( x["params"]["scans"] if "scans" in x["params"] - else x["params"]["title"] + else ["params"]["title"] ), pa.string(), ), diff --git a/tests/conftest.py b/tests/conftest.py index d3314396..2a776c1c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -297,7 +297,7 @@ def tiny_config(tmp_path): "gradient_clip_val": None, 
"gradient_clip_algorithm": None, "precision": "32-true", - "replace_isoleucine_with_leucine": False, + "replace_isoleucine_with_leucine": True, "reverse_peptides": False, "mskb_tokenizer": True, "residues": { From 261f63ccf64b8403ac592192f4d8aa277c330ba4 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 30 Sep 2024 16:53:10 -0700 Subject: [PATCH 22/51] integration tests --- casanovo/data/ms_io.py | 3 +++ casanovo/denovo/dataloaders.py | 4 +--- casanovo/denovo/model.py | 2 +- casanovo/denovo/model_runner.py | 38 ++++++++++++++++++++++----------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index f419bdd4..7b954d71 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -186,6 +186,9 @@ def save(self) -> None: ): filename = psm.spectrum_id[0] idx = psm.spectrum_id[1] + if Path(filename).suffix == ".mgf" and idx.isnumeric(): + idx = f"index={idx}" + writer.writerow( [ "PSM", diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index ed7ca5ba..59e0cbf6 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -138,9 +138,7 @@ def __init__( ), pa.string(), ), - CustomField( - "title", lambda x: f"index={x['params']['title']}", pa.string() - ), + CustomField("title", lambda x: x["params"]["title"], pa.string()), ] self.custom_field_test_mzml = [ CustomField("scans", lambda x: x["id"], pa.string()), diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 128c0186..50536736 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1055,7 +1055,7 @@ def on_predict_batch_end( peptide_score=peptide_score, charge=int(charge), calc_mz=precursor_mz, - exp_mz=calc_mass, + exp_mz=calc_mass.item(), aa_scores=aa_scores, ) >>>>>>> 5719cdc (circular import bug) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 7f4d634b..6d203998 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -18,6 +18,7 @@ from lightning.pytorch.strategies import DDPStrategy from lightning.pytorch.callbacks import ModelCheckpoint from lightning.pytorch.loggers import TensorBoardLogger +from torch.utils.data import DataLoader from depthcharge.tokenizers import PeptideTokenizer from depthcharge.tokenizers.peptides import MskbPeptideTokenizer @@ -210,9 +211,7 @@ def train( self.loaders.val_dataloader(), ) - def log_metrics( - self, test_dataloader: torch.utils.data.DataLoader - ) -> None: + def log_metrics(self, test_dataloader: DataLoader) -> None: """Log peptide precision and amino acid precision Calculate and log peptide precision and amino acid precision @@ -224,15 +223,29 @@ def log_metrics( Index containing the annotated spectra used to generate model predictions - model_output = [psm.sequence for psm in self.writer.psms] - spectrum_annotations = [ - test_index[i][4] for i in range(test_index.n_spectra) - ] - aa_precision, _, pep_precision = aa_match_metrics( + for batch in test_dataloader: + for peak_file, scan_id, curr_seq_true in zip( + batch["peak_file"], + batch["scan_id"], + self.model.tokenizer.detokenize(batch["seq"][0]), + ): + spectrum_id_true = (peak_file, scan_id) + seq_true.append(curr_seq_true) + if ( + pred_idx < len(self.writer.psms) + and self.writer.psms[pred_idx].spectrum_id + == spectrum_id_true + ): + seq_pred.append(self.writer.psms[pred_idx].sequence) + pred_idx += 1 + else: + seq_pred.append(None) + + aa_precision, aa_recall, pep_precision = aa_match_metrics( *aa_match_batch( seq_true, 
seq_pred, - depthcharge.masses.PeptideMass().masses, + self.model.tokenizer.residues, ) ) @@ -288,11 +301,12 @@ def predict( test_paths = self._get_input_paths(peak_path, False, "test") self.writer.set_ms_run(test_paths) self.initialize_data_module(test_paths=test_paths) - self.loaders.setup(stage="test", annotated=False) - self.trainer.predict(self.model, self.loaders.test_dataloader()) + self.loaders.setup(stage="test", annotated=evaluate) + predict_dataloader = self.loaders.predict_dataloader() + self.trainer.predict(self.model, predict_dataloader) if evaluate: - self.log_metrics(self.loaders.test_dataloader()) + self.log_metrics(predict_dataloader) def initialize_trainer(self, train: bool) -> None: """Initialize the lightning Trainer. From e3e84567f21959c97f75d4877d1f11d8249d39e7 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 13:24:06 -0700 Subject: [PATCH 23/51] test_initialize_model fix --- tests/unit_tests/test_runner.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index cf04cf83..e406beaf 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -16,16 +16,25 @@ def test_initialize_model(tmp_path, mgf_small): """Test initializing a new or existing model.""" config = Config() config.model_save_folder_path = tmp_path + # Initializing model without initializing tokenizer raises an error + with pytest.raises(RuntimeError): + ModelRunner(config=config).initialize_model(train=True) + # No model filename given, so train from scratch. - ModelRunner(config=config).initialize_model(train=True) + runner = ModelRunner(config=config) + runner.initialize_tokenizer() + runner.initialize_model(train=True) # No model filename given during inference = error. with pytest.raises(ValueError): - ModelRunner(config=config).initialize_model(train=False) + runner = ModelRunner(config=config) + runner.initialize_tokenizer() + runner.initialize_model(train=False) # Non-existing model filename given during inference = error. with pytest.raises(FileNotFoundError): runner = ModelRunner(config=config, model_filename="blah") + runner.initialize_tokenizer() runner.initialize_model(train=False) # Train a quick model. @@ -38,10 +47,12 @@ def test_initialize_model(tmp_path, mgf_small): # Resume training from previous model. runner = ModelRunner(config=config, model_filename=str(ckpt)) + runner.initialize_tokenizer() runner.initialize_model(train=True) # Inference with previous model. 
runner = ModelRunner(config=config, model_filename=str(ckpt)) + runner.initialize_tokenizer() runner.initialize_model(train=False) # If the model initialization throws and EOFError, then the Spec2Pep model @@ -50,6 +61,7 @@ def test_initialize_model(tmp_path, mgf_small): weights.touch() with pytest.raises(EOFError): runner = ModelRunner(config=config, model_filename=str(weights)) + runner.initialize_tokenizer() runner.initialize_model(train=False) From 0fb66929c2cfae6b8eddc4ddb06fb37236a0446f Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 13:26:26 -0700 Subject: [PATCH 24/51] test_save_and_load_weights fix --- tests/unit_tests/test_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index e406beaf..2b5c879d 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -86,6 +86,7 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): # Now load the weights into a new model # The device should be meta for all the weights. runner = ModelRunner(config=other_config, model_filename=str(ckpt)) + runner.initialize_tokenizer() runner.initialize_model(train=False) obs_layers = runner.model.encoder.transformer_encoder.num_layers From 5594bf83d28c3bce619fe7229077024936ce551b Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 13:28:23 -0700 Subject: [PATCH 25/51] test_save_and_load_weights_deprecated fix --- tests/unit_tests/test_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 2b5c879d..7918af88 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -140,6 +140,7 @@ def test_save_and_load_weights_deprecated(tmp_path, mgf_small, tiny_config): with ModelRunner( config=config, model_filename=str(ckpt), overwrite_ckpt_check=False ) as runner: + runner.initialize_tokenizer() runner.initialize_model(train=False) assert runner.model.cosine_schedule_period_iters == 5 # Fine-tuning. From 7bd2b5e6f3f42a7356d608ee0d399f5800d0b3ab Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 15:23:13 -0700 Subject: [PATCH 26/51] test_evaluate fix, evaluate unnanotated peak file error handling --- casanovo/denovo/dataloaders.py | 2 +- casanovo/denovo/model_runner.py | 17 ++++++++++- tests/conftest.py | 6 ++-- tests/unit_tests/test_runner.py | 50 +++++++++++++++++++++++---------- 4 files changed, 56 insertions(+), 19 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 59e0cbf6..f4d00470 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -134,7 +134,7 @@ def __init__( lambda x: ( x["params"]["scans"] if "scans" in x["params"] - else ["params"]["title"] + else x["params"]["title"] ), pa.string(), ), diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 6d203998..9366d33f 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -301,7 +301,22 @@ def predict( test_paths = self._get_input_paths(peak_path, False, "test") self.writer.set_ms_run(test_paths) self.initialize_data_module(test_paths=test_paths) - self.loaders.setup(stage="test", annotated=evaluate) + + try: + self.loaders.setup(stage="test", annotated=evaluate) + except (KeyError, OSError) as e: + if evaluate: + error_message = ( + "Error creating annotated spectrum dataloaders. 
" + "This may be the result of having an unannotated peak file " + "present in the validation peak file path list.\n" + ) + + logger.error(error_message) + raise TypeError(error_message) from e + + raise + predict_dataloader = self.loaders.predict_dataloader() self.trainer.predict(self.model, predict_dataloader) diff --git a/tests/conftest.py b/tests/conftest.py index 2a776c1c..dfe6ef0a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -123,13 +123,15 @@ def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None, annotate=True): mgf = [ "BEGIN IONS", f"TITLE={title}", - f"SEQ={peptide}", f"PEPMASS={precursor_mz}", f"CHARGE={charge}+", - f"SCANS=F1:{2470 + title}", f"{frags}", "END IONS", ] + + if annotate: + mgf.insert(1, f"SEQ={peptide}") + return "\n".join(mgf) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 7918af88..b57e7296 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -155,7 +155,7 @@ def test_save_and_load_weights_deprecated(tmp_path, mgf_small, tiny_config): assert "max_iters" not in runner.model.opt_kwargs -def test_calculate_precision(tmp_path, mgf_small, tiny_config): +def test_calculate_precision(tmp_path, mgf_small, tiny_config, monkeypatch): """Test that this parameter is working correctly.""" config = Config(tiny_config) config.n_layers = 1 @@ -163,22 +163,42 @@ def test_calculate_precision(tmp_path, mgf_small, tiny_config): config.calculate_precision = False config.tb_summarywriter = str(tmp_path) - runner = ModelRunner(config=config, output_dir=tmp_path) - with runner: - runner.train([mgf_small], [mgf_small]) + with monkeypatch.context() as ctx: + mock_logger = unittest.mock.MagicMock() + ctx.setattr("casanovo.denovo.model.logger", mock_logger) + runner = ModelRunner(config=config, output_dir=tmp_path) + with runner: + runner.train([mgf_small], [mgf_small]) - assert "valid_aa_precision" not in runner.model.history.columns - assert "valid_pep_precision" not in runner.model.history.columns + logged_items = [ + item + for call in mock_logger.info.call_args_list + for arg in call.args + for item in (arg.split("\t") if isinstance(arg, str) else [arg]) + ] + + assert "AA precision" not in logged_items + assert "Peptide precision" not in logged_items config.calculate_precision = True - runner = ModelRunner( - config=config, output_dir=tmp_path, overwrite_ckpt_check=False - ) - with runner: - runner.train([mgf_small], [mgf_small]) + with monkeypatch.context() as ctx: + mock_logger = unittest.mock.MagicMock() + ctx.setattr("casanovo.denovo.model.logger", mock_logger) + runner = ModelRunner( + config=config, output_dir=tmp_path, overwrite_ckpt_check=False + ) + with runner: + runner.train([mgf_small], [mgf_small]) + + logged_items = [ + item + for call in mock_logger.info.call_args_list + for arg in call.args + for item in (arg.split("\t") if isinstance(arg, str) else [arg]) + ] - assert "valid_aa_precision" in runner.model.history.columns - assert "valid_pep_precision" in runner.model.history.columns + assert "AA precision" in logged_items + assert "Peptide precision" in logged_items def test_save_final_model(tmp_path, mgf_small, tiny_config): @@ -237,8 +257,8 @@ def test_evaluate( result_file.unlink() exception_string = ( - "Error creating annotated spectrum index. " - "This may be the result of having an unannotated MGF file " + "Error creating annotated spectrum dataloaders. 
" + "This may be the result of having an unannotated peak file " "present in the validation peak file path list.\n" ) From d17886090d3f64696bd88ca589dfe15a67a551b0 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 15:47:55 -0700 Subject: [PATCH 27/51] test_evaluate fix, evaluate unnanotated peak file error handling --- tests/unit_tests/test_runner.py | 40 +++++++++++++++++---------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index b57e7296..253b1d53 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -4,6 +4,7 @@ import unittest.mock from pathlib import Path +import depthcharge.tokenizers.peptides import pytest import torch @@ -360,19 +361,16 @@ def test_metrics_logging(tmp_path, mgf_small, tiny_config): def test_log_metrics(monkeypatch, tiny_config): - def get_mock_index(psm_list): - mock_test_index = unittest.mock.MagicMock() - mock_test_index.__enter__.return_value = mock_test_index - mock_test_index.__exit__.return_value = False - mock_test_index.n_spectra = len(psm_list) - mock_test_index.get_spectrum_id = lambda idx: psm_list[idx].spectrum_id - - mock_spectra = [ - (None, None, None, None, curr_psm.sequence) - for curr_psm in psm_list + def get_mock_loader(psm_list, tokenizer): + return [ + { + "peak_file": [psm.spectrum_id[0] for psm in psm_list], + "scan_id": [psm.spectrum_id[1] for psm in psm_list], + "seq": tokenizer.tokenize( + [psm.sequence for psm in psm_list] + ).unsqueeze(0), + } ] - mock_test_index.__getitem__.side_effect = lambda idx: mock_spectra[idx] - return mock_test_index def get_mock_psm(sequence, spectrum_id): return PepSpecMatch( @@ -391,6 +389,10 @@ def get_mock_psm(sequence, spectrum_id): with ModelRunner(Config(tiny_config)) as runner: runner.writer = unittest.mock.MagicMock() + runner.model = unittest.mock.MagicMock() + runner.model.tokenizer = ( + depthcharge.tokenizers.peptides.MskbPeptideTokenizer() + ) # Test 100% peptide precision infer_psms = [ @@ -404,7 +406,7 @@ def get_mock_psm(sequence, spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] @@ -426,7 +428,7 @@ def get_mock_psm(sequence, spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] @@ -453,7 +455,7 @@ def get_mock_psm(sequence, spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] @@ -471,7 +473,7 @@ def get_mock_psm(sequence, spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] @@ -487,7 +489,7 @@ def get_mock_psm(sequence, spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] @@ -503,7 +505,7 @@ def get_mock_psm(sequence, 
spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] @@ -530,7 +532,7 @@ def get_mock_psm(sequence, spectrum_id): ] runner.writer.psms = infer_psms - mock_index = get_mock_index(act_psms) + mock_index = get_mock_loader(act_psms, runner.model.tokenizer) runner.log_metrics(mock_index) pep_precision = mock_logger.info.call_args_list[-3][0][1] From 340695a905356f963225a1a7de49ee223484e0d5 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 15:52:04 -0700 Subject: [PATCH 28/51] test_eval_metrics fix --- tests/unit_tests/test_unit.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 7f9c0b12..f4429d50 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -16,6 +16,7 @@ import unittest.mock import depthcharge +import depthcharge.tokenizers.peptides import einops import github import numpy as np @@ -1582,7 +1583,7 @@ def test_eval_metrics(): the ground truth. A peptide prediction is correct if all its AA are correct matches. """ - model = Spec2Pep() + tokenizer = depthcharge.tokenizers.peptides.MskbPeptideTokenizer() preds = [ "SPEIK", @@ -1599,7 +1600,7 @@ def test_eval_metrics(): aa_matches, n_pred_aa, n_gt_aa = aa_match_batch( peptides1=preds, peptides2=gt, - aa_dict=model.tokenizer.residues, + aa_dict=tokenizer.residues, mode="best", ) @@ -1614,16 +1615,12 @@ def test_eval_metrics(): assert 26 / 40 == pytest.approx(aa_recall) assert 26 / 41 == pytest.approx(aa_precision) - aa_matches, pep_match = aa_match( - None, None, depthcharge.masses.PeptideMass().masses - ) + aa_matches, pep_match = aa_match(None, None, tokenizer.residues) assert aa_matches.shape == (0,) assert not pep_match - aa_matches, pep_match = aa_match( - "PEPTIDE", None, depthcharge.masses.PeptideMass().masses - ) + aa_matches, pep_match = aa_match("PEPTIDE", None, tokenizer.residues) assert np.array_equal(aa_matches, np.zeros(len("PEPTIDE"), dtype=bool)) assert not pep_match From e4d93f90a01ad0049868045e3877c1dbe6ed033c Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 16:23:15 -0700 Subject: [PATCH 29/51] test_spectrum_id tests fix --- tests/unit_tests/test_unit.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index f4429d50..28d739e6 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -16,6 +16,7 @@ import unittest.mock import depthcharge +import depthcharge.data import depthcharge.tokenizers.peptides import einops import github @@ -1634,6 +1635,7 @@ def test_spectrum_id_mgf(mgf_small, tmp_path): train_paths=[mgf_small, mgf_small2], valid_paths=[mgf_small, mgf_small2], test_paths=[mgf_small, mgf_small2], + shuffle=False, ) data_module.setup() @@ -1658,11 +1660,13 @@ def test_spectrum_id_mzml(mzml_small, tmp_path): """Test that spectra from mzML files are specified by their scan number.""" mzml_small2 = tmp_path / "mzml_small2.mzml" shutil.copy(mzml_small, mzml_small2) - - index = SpectrumIndex( - tmp_path / "index.hdf5", [mzml_small, mzml_small2], overwrite=True + data_module = DeNovoDataModule( + test_paths=[mzml_small, mzml_small2], + shuffle=False, ) - dataset = SpectrumDataset(index) + data_module.setup(stage="test", annotated=False) + + dataset = 
data_module.test_dataset for i, (filename, scan_nr) in enumerate( [ (mzml_small, 17), @@ -1671,8 +1675,8 @@ def test_spectrum_id_mzml(mzml_small, tmp_path): (mzml_small2, 111), ] ): - spectrum_id = str(filename), f"scan={scan_nr}" - assert dataset.get_spectrum_id(i) == spectrum_id + assert dataset[i]["peak_file"][0] == filename.name + assert dataset[i]["scan_id"][0] == f"scan={scan_nr}" def test_train_val_step_functions(): From eb4af71a9bf7d0561878541695a8a0ae453327e1 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 1 Oct 2024 17:07:28 -0700 Subject: [PATCH 30/51] unit tests fixes --- tests/unit_tests/test_unit.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 28d739e6..5983c0ed 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1,10 +1,10 @@ import collections +import copy import datetime import functools import hashlib import heapq import io -import itertools import os import pathlib import platform @@ -1644,6 +1644,9 @@ def test_spectrum_id_mgf(mgf_small, tmp_path): data_module.valid_dataset, data_module.test_dataset, ]: + for batch in dataset: + print(batch) + for i, (filename, scan_id) in enumerate( [ (mgf_small, "0"), @@ -1681,19 +1684,27 @@ def test_spectrum_id_mzml(mzml_small, tmp_path): def test_train_val_step_functions(): """Test train and validation step functions operating on batches.""" + tokenizer = depthcharge.tokenizers.peptides.MskbPeptideTokenizer() model = Spec2Pep( n_beams=1, residues="massivekb", min_peptide_len=4, train_label_smoothing=0.1, + tokenizer=tokenizer, ) - spectra = torch.zeros(1, 5, 2) - precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) - peptides = ["PEPK"] - batch = (spectra, precursors, peptides) - train_step_loss = model.training_step(batch) - val_step_loss = model.validation_step(batch) + batch = { + "mz_array": torch.zeros(1, 5), + "intensity_array": torch.zeros(1, 5), + "precursor_mz": torch.tensor(235.63410).unsqueeze(0), + "precursor_charge": torch.tensor(2.0).unsqueeze(0), + "seq": tokenizer.tokenize(["PEPK"]), + } + train_batch = {key: val.unsqueeze(0) for key, val in batch.items()} + val_batch = copy.deepcopy(train_batch) + + train_step_loss = model.training_step(train_batch) + val_step_loss = model.validation_step(val_batch) # Check if valid loss value returned assert train_step_loss > 0 @@ -1709,12 +1720,8 @@ def test_run_map(mgf_small): out_writer = ms_io.MztabWriter("dummy.mztab") # Set peak file by base file name only. out_writer.set_ms_run([os.path.basename(mgf_small.name)]) - assert os.path.basename(mgf_small.name) not in out_writer._run_map - assert os.path.abspath(mgf_small.name) in out_writer._run_map - # Set peak file by full path. 
-    out_writer.set_ms_run([os.path.abspath(mgf_small.name)])
-    assert os.path.basename(mgf_small.name) not in out_writer._run_map
-    assert os.path.abspath(mgf_small.name) in out_writer._run_map
+    assert mgf_small.name in out_writer._run_map
+    assert os.path.abspath(mgf_small.name) not in out_writer._run_map
 
 
 def test_check_dir(tmp_path):


From 2a946c2a6ce6ee343ac6ed15e59f65d78715dcfc Mon Sep 17 00:00:00 2001
From: Lilferrit
Date: Wed, 2 Oct 2024 12:34:35 -0700
Subject: [PATCH 31/51] test_beam_search_decode fix

---
 tests/unit_tests/test_unit.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index 5983c0ed..1b740ea0 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -1221,7 +1221,11 @@ def test_beam_search_decode():
     """
     Test beam search decoding and its sub-functions.
     """
-    model = Spec2Pep(n_beams=4, residues="massivekb", min_peptide_len=4)
+    model = Spec2Pep(
+        n_beams=4,
+        residues="massivekb",
+        min_peptide_len=4,
+    )
     model.decoder.reverse = False  # For simplicity.
 
     # Sizes.
@@ -1247,8 +1251,12 @@ def test_beam_search_decode():
     )
     # Fill scores and tokens with relevant predictions.
     scores[:, : step + 1, :] = 0
-    for i, peptide in enumerate(["PEPK", "PEPR", "PEPG", "PEP$"]):
-        tokens[i, : step + 1] = model.tokenizer.tokenize(peptide)[0]
+    for i, (peptide, add_stop) in enumerate(
+        [("PEPK", False), ("PEPR", False), ("PEPG", False), ("PEP", True)]
+    ):
+        tokens[i, : step + 1] = model.tokenizer.tokenize(
+            peptide, add_stop=add_stop
+        )[0]
     for j in range(step + 1):
         scores[i, j, tokens[1, j]] = 1
 
From 17bc3a20e86a426ca1ab96f5f3159241373e6f23 Mon Sep 17 00:00:00 2001
From: Lilferrit
Date: Wed, 2 Oct 2024 14:30:26 -0700
Subject: [PATCH 32/51] negative residue workaround

---
 tests/unit_tests/test_unit.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index 1b740ea0..b7206cb2 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -1217,14 +1217,18 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict):
     assert expected_isotope0123 == list(candidates)
 
 
-def test_beam_search_decode():
+def test_beam_search_decode(tiny_config):
     """
     Test beam search decoding and its sub-functions.
     """
+    config = casanovo.Config(tiny_config)
     model = Spec2Pep(
         n_beams=4,
         residues="massivekb",
         min_peptide_len=4,
+        tokenizer=depthcharge.tokenizers.peptides.PeptideTokenizer(
+            residues=config.residues
+        ),
     )
     model.decoder.reverse = False  # For simplicity.
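
The two patches above rework test_beam_search_decode around the depthcharge v0.4 pattern that the remaining test fixes keep using: build a tokenizer from the residue dictionary in the Casanovo config, hand it to Spec2Pep, and go through tokenizer.tokenize / tokenizer.detokenize instead of the old aa2idx lookups. The snippet below is a minimal sketch of that pattern, not part of any patch in this series; it only reuses calls that appear in these diffs, and the config path "tiny_config.yaml" is a placeholder.

# Illustrative sketch (not patch content). Assumes casanovo and a
# depthcharge v0.4.x install, plus a config file defining a `residues`
# block; the path below is a placeholder.
from casanovo.config import Config
from casanovo.denovo.model import Spec2Pep
from depthcharge.tokenizers.peptides import PeptideTokenizer

config = Config("tiny_config.yaml")
tokenizer = PeptideTokenizer(residues=config.residues)

tokens = tokenizer.tokenize(["PEPK"])    # 2-D tensor of integer token ids
peptides = tokenizer.detokenize(tokens)  # back to peptide strings
vocab = len(tokenizer) + 1               # vocabulary size, as used for the scores tensors
# tokenize(..., add_stop=True) appends the stop token, as the beam-search tests do.

model = Spec2Pep(n_beams=4, min_peptide_len=4, tokenizer=tokenizer)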
From 7d789a7827bffc25cfded3629dac254e79d35264 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 7 Oct 2024 16:23:26 -0700 Subject: [PATCH 33/51] depthcharge upgrade - all unit tests pass --- casanovo/config.yaml | 3 +- casanovo/denovo/model.py | 4 +- tests/conftest.py | 18 ++-- tests/unit_tests/test_unit.py | 155 +++++++++++++++++++++------------- 4 files changed, 109 insertions(+), 71 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 196d6071..ffb9bf45 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -164,7 +164,8 @@ residues: "P": 97.052764 "V": 99.068414 "T": 101.047670 - "C[Carbamidomethyl]": 160.030649 # 103.009185 + 57.021464 "L": 113.084064 + "C[Carbamidomethyl]": 160.030649 # 103.009185 + 57.021464 + "L": 113.084064 "I": 113.084064 "N": 114.042927 "D": 115.026943 diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 50536736..a63a5263 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -372,7 +372,7 @@ def _finish_beams( violate the minimum peptide length). """ # Check for tokens with a negative mass (i.e. neutral loss). - aa_neg_mass_idx = [] + aa_neg_mass_idx = [None] for aa, mass in self.tokenizer.residues.items(): if mass < 0: # aa_neg_mass.append(aa) @@ -383,7 +383,7 @@ def _finish_beams( [ self.tokenizer.index[aa] for aa in self.tokenizer.index - if aa.startswith(("+", "-", "[+", "[-")) + if aa.startswith("[") and aa.endswith("]-") ] ).to(self.decoder.device) diff --git a/tests/conftest.py b/tests/conftest.py index dfe6ef0a..84051d85 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -309,7 +309,7 @@ def tiny_config(tmp_path): "P": 97.052764, "V": 99.068414, "T": 101.047670, - "C+57.021": 160.030649, + "C[Carbamidomethyl]": 160.030649, # 103.009185 + 57.021464 "L": 113.084064, "I": 113.084064, "N": 114.042927, @@ -323,13 +323,15 @@ def tiny_config(tmp_path): "R": 156.101111, "Y": 163.063329, "W": 186.079313, - "M+15.995": 147.035400, - "N+0.984": 115.026943, - "Q+0.984": 129.042594, - "+42.011": 42.010565, - "+43.006": 43.005814, - "-17.027": -17.026549, - "+43.006-17.027": 25.980265, + # Amino acid modifications. + "M[Oxidation]": 147.035400, # Met oxidation: 131.040485 + 15.994915 + "N[Deamidated]": 115.026943, # Asn deamidation: 114.042927 + 0.984016 + "Q[Deamidated]": 129.042594, # Gln deamidation: 128.058578 + 0.984016 + # N-terminal modifications. + "[Acetyl]-": 42.010565, # Acetylation + "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" + "[Ammonia-loss]-": -17.026549, # NH3 loss + "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss }, "allowed_fixed_mods": "C:C+57.021", "allowed_var_mods": ( diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index b7206cb2..3e276f01 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1290,14 +1290,15 @@ def test_beam_search_decode(tiny_config): beam_fits_precursor, pred_cache, ) + # Verify that the correct peptides have been cached. 
correct_cached = 0 for _, _, _, pep in pred_cache[0]: - if torch.equal(pep, torch.tensor([4, 14, 4, 13])): + if torch.equal(pep, model.tokenizer.tokenize("PEPK")[0]): correct_cached += 1 - elif torch.equal(pep, torch.tensor([4, 14, 4, 18])): + elif torch.equal(pep, model.tokenizer.tokenize("PEPR")[0]): correct_cached += 1 - elif torch.equal(pep, torch.tensor([4, 14, 4])): + elif torch.equal(pep, model.tokenizer.tokenize("PEP")[0]): correct_cached += 1 else: pytest.fail( @@ -1309,16 +1310,22 @@ def test_beam_search_decode(tiny_config): # Return the candidate peptide with the highest score test_cache = collections.OrderedDict((i, []) for i in range(batch)) heapq.heappush( - test_cache[0], (0.93, 0.1, 4 * [0.93], torch.tensor([4, 14, 4, 19])) + test_cache[0], + (0.93, 0.1, 4 * [0.93], model.tokenizer.tokenize("PEPY")[0]), ) heapq.heappush( - test_cache[0], (0.95, 0.2, 4 * [0.95], torch.tensor([4, 14, 4, 13])) + test_cache[0], + (0.95, 0.2, 4 * [0.95], model.tokenizer.tokenize("PEPK")[0]), ) heapq.heappush( - test_cache[0], (0.94, 0.3, 4 * [0.94], torch.tensor([4, 14, 4, 4])) + test_cache[0], + (0.94, 0.3, 4 * [0.94], model.tokenizer.tokenize("PEPP")[0]), ) - assert list(model._get_top_peptide(test_cache))[0][0][-1] == "PEPK" + assert torch.equal( + next(model._get_top_peptide(test_cache))[0][-1], + model.tokenizer.tokenize(["PEPK"])[0], + ) # Test that an empty predictions is returned when no beams have been # finished. empty_cache = collections.OrderedDict((i, []) for i in range(batch)) @@ -1326,30 +1333,30 @@ def test_beam_search_decode(tiny_config): # Test multiple PSM per spectrum and if it's highest scoring peptides model.top_match = 2 assert set( - [pep[-1] for pep in list(model._get_top_peptide(test_cache))[0]] + [ + model.tokenizer.detokenize(pep[-1].unsqueeze(0))[0] + for pep in list(model._get_top_peptide(test_cache))[0] + ] ) == {"PEPK", "PEPP"} # Test _get_topk_beams(). # Set scores to proceed generating the unfinished beam. step = 4 scores[2, step, :] = 0 - scores[2, step, range(1, 5)] = torch.tensor([1.0, 2.0, 3.0, 4.0]) + next_tokens = model.tokenizer.tokenize(["P", "S", "A", "G"]).flatten() + scores[2, step, next_tokens] = torch.tensor([4.0, 3.0, 2.0, 1.0]) # Modify finished beams array to allow decoding from only one beam test_finished_beams = torch.tensor([True, True, False, True]) new_tokens, new_scores = model._get_topk_beams( tokens, scores, test_finished_beams, batch, step ) - expected_tokens = torch.tensor( - [ - [4, 14, 4, 1, 4], - [4, 14, 4, 1, 3], - [4, 14, 4, 1, 2], - [4, 14, 4, 1, 1], - ] + expected_tokens = model.tokenizer.tokenize( + ["PEPGP", "PEPGS", "PEPGA", "PEPGG"] ) + # Only the expected scores of the final step. expected_scores = torch.zeros(beam, vocab) - expected_scores[:, range(1, 5)] = torch.tensor([1.0, 2.0, 3.0, 4.0]) + expected_scores[:, next_tokens] = torch.tensor([4.0, 3.0, 2.0, 1.0]) assert torch.equal(new_tokens[:, : step + 1], expected_tokens) assert torch.equal(new_scores[:, step, :], expected_scores) @@ -1357,10 +1364,10 @@ def test_beam_search_decode(tiny_config): # Test output if decoding loop isn't stopped with termination of all beams. model.max_peptide_len = 0 # 1 spectrum with 5 peaks (2 values: m/z and intensity). 
- spectra = torch.zeros(1, 5, 2) + mzs = ints = torch.zeros(1, 5) precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) - assert len(list(model.beam_search_decode(spectra, precursors))[0]) == 0 - model.max_peptide_len = 100 + assert len(list(model.beam_search_decode(mzs, ints, precursors))[0]) == 0 + model.max_length = 100 # Re-initialize scores and tokens to further test caching functionality. scores = torch.full( @@ -1370,8 +1377,9 @@ def test_beam_search_decode(tiny_config): tokens = torch.zeros(batch * beam, length, dtype=torch.int64) scores[:, : step + 1, :] = 0 - for i, peptide in enumerate(["PKKP$", "EPPK$", "PEPK$", "PMKP$"]): - tokens[i, : step + 1] = torch.tensor([aa2idx[aa] for aa in peptide]) + tokens[:, : step + 1] = model.tokenizer.tokenize( + ["PKKP", "EPPK", "PEPK", "PMKP"], add_stop=True + ) i, j, s = np.arange(step), np.arange(4), torch.Tensor([4, 0.5, 3, 0.4]) scores[:, i, :] = 1 scores[j, i, tokens[j, i]] = s @@ -1392,10 +1400,16 @@ def test_beam_search_decode(tiny_config): assert negative_score == 2 # Test using a single beam only. - model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=2) + model = Spec2Pep( + n_beams=1, + min_peptide_len=2, + tokenizer=depthcharge.tokenizers.peptides.MskbPeptideTokenizer( + residues=config.residues + ), + ) + vocab = len(model.tokenizer) + 1 beam = model.n_beams # S model.decoder.reverse = False # For simplicity. - aa2idx = model.decoder._aa2idx step = 4 # Initialize scores and tokens. @@ -1408,12 +1422,14 @@ def test_beam_search_decode(tiny_config): pred_cache = collections.OrderedDict((i, []) for i in range(batch)) # Ground truth peptide is "PEPK". - true_peptide = "PEPK$" + true_peptide = "PEPK" precursors = torch.tensor([469.25364, 2.0, 235.63410]).repeat( beam * batch, 1 ) scores[:, range(step), :] = 1 - tokens[0, : step + 1] = torch.tensor([aa2idx[aa] for aa in true_peptide]) + tokens[0, : step + 1] = model.tokenizer.tokenize( + true_peptide, add_stop=True + )[0] # Test _finish_beams(). finished_beams, beam_fits_precursor, discarded_beams = model._finish_beams( @@ -1429,7 +1445,9 @@ def test_beam_search_decode(tiny_config): tokens, scores, step, finished_beams, beam_fits_precursor, pred_cache ) - assert torch.equal(pred_cache[0][0][-1], torch.tensor([4, 14, 4, 13])) + assert torch.equal( + pred_cache[0][0][-1], model.tokenizer.tokenize(true_peptide)[0] + ) # Test _get_topk_beams(). step = 1 @@ -1460,9 +1478,13 @@ def test_beam_search_decode(tiny_config): assert torch.equal(new_tokens[:, : step + 1], expected_tokens) # Test _finish_beams() for tokens with a negative mass. - model = Spec2Pep(n_beams=2, residues="massivekb") + model = Spec2Pep( + n_beams=2, + tokenizer=depthcharge.tokenizers.peptides.MskbPeptideTokenizer( + residues=config.residues + ), + ) beam = model.n_beams # S - aa2idx = model.decoder._aa2idx step = 1 # Ground truth peptide is "-17.027GK". @@ -1470,8 +1492,7 @@ def test_beam_search_decode(tiny_config): beam * batch, 1 ) tokens = torch.zeros(batch * beam, length, dtype=torch.int64) - for i, peptide in enumerate(["GK", "AK"]): - tokens[i, : step + 1] = torch.tensor([aa2idx[aa] for aa in peptide]) + tokens[:, : step + 1] = model.tokenizer.tokenize(["GK", "AK"]) # Test _finish_beams(). finished_beams, beam_fits_precursor, discarded_beams = model._finish_beams( @@ -1482,26 +1503,34 @@ def test_beam_search_decode(tiny_config): assert torch.equal(discarded_beams, torch.tensor([False, False])) # Test _finish_beams() for multiple/internal N-mods and dummy predictions. 
- model = Spec2Pep(n_beams=3, residues="massivekb", min_peptide_len=3) + model = Spec2Pep( + n_beams=3, + min_peptide_len=3, + tokenizer=depthcharge.tokenizers.peptides.PeptideTokenizer( + residues=config.residues + ), + ) beam = model.n_beams # S - model.decoder.reverse = True - aa2idx = model.decoder._aa2idx step = 4 # Ground truth peptide is irrelevant for this test. precursors = torch.tensor([1861.0044, 2.0, 940.5750]).repeat( beam * batch, 1 ) + + # sequences with invalid mass modifications will raise an exception if + # tokenized using tokenizer.tokenize tokens = torch.zeros(batch * beam, length, dtype=torch.int64) - # Reverse decoding - for i, peptide in enumerate( - [ - ["K", "A", "A", "A", "+43.006-17.027"], - ["K", "A", "A", "+42.011", "A"], - ["K", "A", "A", "+43.006", "+42.011"], - ] - ): - tokens[i, : step + 1] = torch.tensor([aa2idx[aa] for aa in peptide]) + sequences = [ + ["K", "A", "A", "A", "[+25.980265]-"], + ["K", "A", "A", "[Acetyl]-", "A"], + ["K", "A", "A", "[Carbamyl]-", "[Ammonia-loss]-"], + ] + + for i, seq in enumerate(sequences): + tokens[i, : step + 1] = torch.tensor( + [model.tokenizer.index[aa] for aa in seq] + ) # Test _finish_beams(). All should be discarded finished_beams, beam_fits_precursor, discarded_beams = model._finish_beams( @@ -1514,14 +1543,19 @@ def test_beam_search_decode(tiny_config): assert torch.equal(discarded_beams, torch.tensor([False, True, True])) # Test _get_topk_beams() with finished beams in the batch. - model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=3) + model = Spec2Pep( + n_beams=1, + min_peptide_len=3, + tokenizer=depthcharge.tokenizers.peptides.PeptideTokenizer( + residues=config.residues + ), + ) # Sizes and other variables. batch = 2 # B beam = model.n_beams # S - model.decoder.reverse = True - length = model.max_peptide_len + 1 # L - vocab = model.decoder.vocab_size + 1 # V + length = model.max_length + 1 # L + vocab = len(model.tokenizer) + 1 # V step = 4 # Initialize dummy scores and tokens. @@ -1536,8 +1570,8 @@ def test_beam_search_decode(tiny_config): scores[:, step, range(1, 4)] = torch.tensor([1.0, 2.0, 3.0]) # Simulate one finished and one unfinished beam in the same batch. - tokens[0, :step] = torch.tensor([4, 14, 4, 28]) - tokens[1, :step] = torch.tensor([4, 14, 4, 1]) + tokens[0, :step] = model.tokenizer.tokenize("PEP", add_stop=True)[0] + tokens[1, :step] = model.tokenizer.tokenize("PEPG")[0] # Set finished beams array to allow decoding from only one beam. test_finished_beams = torch.tensor([True, False]) @@ -1547,22 +1581,23 @@ def test_beam_search_decode(tiny_config): ) # Only the second peptide should have a new token predicted. - expected_tokens = torch.tensor( - [ - [4, 14, 4, 28, 0], - [4, 14, 4, 1, 3], - ] - ) + expected_tokens = tokens.clone() + expected_tokens[1, len("PEPG")] = 3 - assert torch.equal(new_tokens[:, : step + 1], expected_tokens) + assert torch.equal(new_tokens, expected_tokens) # Test that duplicate peptide scores don't lead to a conflict in the cache. 
- model = Spec2Pep(n_beams=5, residues="massivekb", min_peptide_len=3) + model = Spec2Pep( + n_beams=1, + min_peptide_len=3, + tokenizer=depthcharge.tokenizers.peptides.PeptideTokenizer( + residues=config.residues + ), + ) batch = 2 # B beam = model.n_beams # S - model.decoder.reverse = True - length = model.max_peptide_len + 1 # L - vocab = model.decoder.vocab_size + 1 # V + length = model.max_length + 1 # L + vocab = len(model.tokenizer) + 1 # V step = 4 # Simulate beams with identical amino acid scores but different tokens. From c1ca43615241618817035b1f70194b919db8ddaf Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 7 Oct 2024 19:27:37 -0700 Subject: [PATCH 34/51] pylance depthcharge compatability fix --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c8c29e0e..6d80ff83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pandas", "psutil", "PyGithub", + "pylance==0.15.0", "PyYAML", "requests", "rich-click>=1.6.1", From 2d539fdccc4f1ff0ae41a8dced40c17943dab78c Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 14 Oct 2024 13:47:00 -0700 Subject: [PATCH 35/51] removed scans field from dataloaders --- casanovo/denovo/dataloaders.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index f4d00470..95084206 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -129,19 +129,9 @@ def __init__( scale_to_unit_norm, ] self.custom_field_test_mgf = [ - CustomField( - "scans", - lambda x: ( - x["params"]["scans"] - if "scans" in x["params"] - else x["params"]["title"] - ), - pa.string(), - ), CustomField("title", lambda x: x["params"]["title"], pa.string()), ] self.custom_field_test_mzml = [ - CustomField("scans", lambda x: x["id"], pa.string()), CustomField("title", lambda x: x["id"], pa.string()), ] From 6ab33978c073baca38e3c53d2667d6f8f4c3e6e3 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 20 Nov 2024 17:07:46 -0800 Subject: [PATCH 36/51] non db functionality working --- casanovo/data/datasets.py | 269 -------------------------------- casanovo/data/db_utils.py | 13 +- casanovo/data/ms_io.py | 1 - casanovo/data/pep_spec_match.py | 41 ----- casanovo/denovo/dataloaders.py | 28 ++-- casanovo/denovo/model.py | 226 +++++++++++++-------------- casanovo/denovo/model_runner.py | 44 ++---- tests/conftest.py | 86 +++++----- tests/test_integration.py | 136 ++++++++-------- tests/unit_tests/test_unit.py | 11 +- 10 files changed, 255 insertions(+), 600 deletions(-) delete mode 100644 casanovo/data/datasets.py delete mode 100644 casanovo/data/pep_spec_match.py diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py deleted file mode 100644 index 3917a2c8..00000000 --- a/casanovo/data/datasets.py +++ /dev/null @@ -1,269 +0,0 @@ -"""A PyTorch Dataset class for annotated spectra.""" - -from typing import Optional, Tuple - -import depthcharge -import numpy as np -import spectrum_utils.spectrum as sus -import torch -from torch.utils.data import Dataset - - -class SpectrumDataset(Dataset): - """ - Parse and retrieve collections of MS/MS spectra. - - Parameters - ---------- - spectrum_index : depthcharge.data.SpectrumIndex - The MS/MS spectra to use as a dataset. - n_peaks : Optional[int] - The number of top-n most intense peaks to keep in each spectrum. `None` - retains all peaks. - min_mz : float - The minimum m/z to include. The default is 140 m/z, in order to exclude - TMT and iTRAQ reporter ions. 
- max_mz : float - The maximum m/z to include. - min_intensity : float - Remove peaks whose intensity is below `min_intensity` percentage of the - base peak intensity. - remove_precursor_tol : float - Remove peaks within the given mass tolerance in Dalton around the - precursor mass. - random_state : Optional[int] - The NumPy random state. ``None`` leaves mass spectra in the order they - were parsed. - """ - - def __init__( - self, - spectrum_index: depthcharge.data.SpectrumIndex, - n_peaks: int = 150, - min_mz: float = 140.0, - max_mz: float = 2500.0, - min_intensity: float = 0.01, - remove_precursor_tol: float = 2.0, - random_state: Optional[int] = None, - ): - """Initialize a SpectrumDataset""" - super().__init__() - self.n_peaks = n_peaks - self.min_mz = min_mz - self.max_mz = max_mz - self.min_intensity = min_intensity - self.remove_precursor_tol = remove_precursor_tol - self.rng = np.random.default_rng(random_state) - self._index = spectrum_index - - def __len__(self) -> int: - """The number of spectra.""" - return self.n_spectra - - def __getitem__( - self, idx - ) -> Tuple[torch.Tensor, float, int, Tuple[str, str]]: - """ - Return the MS/MS spectrum with the given index. - - Parameters - ---------- - idx : int - The index of the spectrum to return. - - Returns - ------- - spectrum : torch.Tensor of shape (n_peaks, 2) - A tensor of the spectrum with the m/z and intensity peak values. - precursor_mz : float - The precursor m/z. - precursor_charge : int - The precursor charge. - spectrum_id: Tuple[str, str] - The unique spectrum identifier, formed by its original peak file and - identifier (index or scan number) therein. - """ - mz_array, int_array, precursor_mz, precursor_charge = self.index[idx][ - :4 - ] - spectrum = self._process_peaks( - mz_array, int_array, precursor_mz, precursor_charge - ) - return ( - spectrum, - precursor_mz, - precursor_charge, - self.get_spectrum_id(idx), - ) - - def get_spectrum_id(self, idx: int) -> Tuple[str, str]: - """ - Return the identifier of the MS/MS spectrum with the given index. - - Parameters - ---------- - idx : int - The index of the MS/MS spectrum within the SpectrumIndex. - - Returns - ------- - ms_data_file : str - The peak file from which the MS/MS spectrum was originally parsed. - identifier : str - The MS/MS spectrum identifier, per PSI recommendations. - """ - with self.index: - return self.index.get_spectrum_id(idx) - - def _process_peaks( - self, - mz_array: np.ndarray, - int_array: np.ndarray, - precursor_mz: float, - precursor_charge: int, - ) -> torch.Tensor: - """ - Preprocess the spectrum by removing noise peaks and scaling the peak - intensities. - - Parameters - ---------- - mz_array : numpy.ndarray of shape (n_peaks,) - The spectrum peak m/z values. - int_array : numpy.ndarray of shape (n_peaks,) - The spectrum peak intensity values. - precursor_mz : float - The precursor m/z. - precursor_charge : int - The precursor charge. - - Returns - ------- - torch.Tensor of shape (n_peaks, 2) - A tensor of the spectrum with the m/z and intensity peak values. 
- """ - spectrum = sus.MsmsSpectrum( - "", - precursor_mz, - precursor_charge, - mz_array.astype(np.float64), - int_array.astype(np.float32), - ) - try: - spectrum.set_mz_range(self.min_mz, self.max_mz) - if len(spectrum.mz) == 0: - raise ValueError - spectrum.remove_precursor_peak(self.remove_precursor_tol, "Da") - if len(spectrum.mz) == 0: - raise ValueError - spectrum.filter_intensity(self.min_intensity, self.n_peaks) - if len(spectrum.mz) == 0: - raise ValueError - spectrum.scale_intensity("root", 1) - intensities = spectrum.intensity / np.linalg.norm( - spectrum.intensity - ) - return torch.tensor(np.array([spectrum.mz, intensities])).T.float() - except ValueError: - # Replace invalid spectra by a dummy spectrum. - return torch.tensor([[0, 1]]).float() - - @property - def n_spectra(self) -> int: - """The total number of spectra.""" - return self.index.n_spectra - - @property - def index(self) -> depthcharge.data.SpectrumIndex: - """The underlying SpectrumIndex.""" - return self._index - - @property - def rng(self): - """The NumPy random number generator.""" - return self._rng - - @rng.setter - def rng(self, seed): - """Set the NumPy random number generator.""" - self._rng = np.random.default_rng(seed) - - -class AnnotatedSpectrumDataset(SpectrumDataset): - """ - Parse and retrieve collections of annotated MS/MS spectra. - - Parameters - ---------- - annotated_spectrum_index : depthcharge.data.SpectrumIndex - The MS/MS spectra to use as a dataset. - n_peaks : Optional[int] - The number of top-n most intense peaks to keep in each spectrum. `None` - retains all peaks. - min_mz : float - The minimum m/z to include. The default is 140 m/z, in order to exclude - TMT and iTRAQ reporter ions. - max_mz : float - The maximum m/z to include. - min_intensity : float - Remove peaks whose intensity is below `min_intensity` percentage of the - base peak intensity. - remove_precursor_tol : float - Remove peaks within the given mass tolerance in Dalton around the - precursor mass. - random_state : Optional[int] - The NumPy random state. ``None`` leaves mass spectra in the order they - were parsed. - """ - - def __init__( - self, - annotated_spectrum_index: depthcharge.data.SpectrumIndex, - n_peaks: int = 150, - min_mz: float = 140.0, - max_mz: float = 2500.0, - min_intensity: float = 0.01, - remove_precursor_tol: float = 2.0, - random_state: Optional[int] = None, - ): - super().__init__( - annotated_spectrum_index, - n_peaks=n_peaks, - min_mz=min_mz, - max_mz=max_mz, - min_intensity=min_intensity, - remove_precursor_tol=remove_precursor_tol, - random_state=random_state, - ) - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: - """ - Return the annotated MS/MS spectrum with the given index. - - Parameters - ---------- - idx : int - The index of the spectrum to return. - - Returns - ------- - spectrum : torch.Tensor of shape (n_peaks, 2) - A tensor of the spectrum with the m/z and intensity peak values. - precursor_mz : float - The precursor m/z. - precursor_charge : int - The precursor charge. - annotation : str - The peptide annotation of the spectrum. 
- """ - ( - mz_array, - int_array, - precursor_mz, - precursor_charge, - peptide, - ) = self.index[idx] - spectrum = self._process_peaks( - mz_array, int_array, precursor_mz, precursor_charge - ) - return spectrum, precursor_mz, precursor_charge, peptide diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index d3670930..7d7b1ae9 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -7,7 +7,6 @@ import string from typing import Dict, Iterator, Pattern, Set, Tuple -import depthcharge.masses import numpy as np import pandas as pd import pyteomics.fasta @@ -53,8 +52,8 @@ class ProteinDatabase: A comma-separated string of fixed modifications to consider. allowed_var_mods : str A comma-separated string of variable modifications to consider. - residues : Dict[str, float] - A dictionary of amino acid masses. + tokenizer: depthcharge.tokenizers.PeptideTokenizer + Used to access residues. """ def __init__( @@ -95,13 +94,14 @@ def __init__( digestion, missed_cleavages, ) - self.db_peptides = self._digest_fasta(peptide_generator) + self.db_peptides = self._digest_fasta(peptide_generator, residues) self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], + residues: Dict[str, float], ) -> pd.DataFrame: """ Digests a FASTA file and returns the peptides, their masses, @@ -148,10 +148,7 @@ def _digest_fasta( .reset_index() ) # Calculate the mass of each peptide. - mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - peptides["calc_mass"] = ( - peptides["peptide"].apply(mass_calculator.mass).round(5) - ) + peptides["calc_mass"] = peptides["peptide"].map(residues).round(5) # Sort by peptide mass and index by peptide sequence. peptides.sort_values( by=["calc_mass", "peptide"], ascending=True, inplace=True diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 7b954d71..959c5bf7 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -7,7 +7,6 @@ import re from pathlib import Path from typing import List -import pprint import natsort from .. import __version__ diff --git a/casanovo/data/pep_spec_match.py b/casanovo/data/pep_spec_match.py deleted file mode 100644 index 0dc3c48b..00000000 --- a/casanovo/data/pep_spec_match.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Peptide spectrum match dataclass""" - -import dataclasses -from typing import Tuple, Iterable - - -@dataclasses.dataclass -class PepSpecMatch: - """ - Peptide Spectrum Match (PSM) dataclass - - Parameters - ---------- - sequence : str - The amino acid sequence of the peptide. - spectrum_id : Tuple[str, str] - A tuple containing the spectrum identifier in the form - (spectrum file name, spectrum file idx) - peptide_score : float - Score of the match between the full peptide sequence and the - spectrum. - charge : int - The precursor charge state of the peptide ion observed in the spectrum. - calc_mz : float - The calculated mass-to-charge ratio (m/z) of the peptide based on its - sequence and charge state. - exp_mz : float - The observed (experimental) precursor mass-to-charge ratio (m/z) of the - peptide as detected in the spectrum. 
- aa_scores : Iterable[float] - A list of scores for individual amino acids in the peptide - sequence, where len(aa_scores) == len(sequence) - """ - - sequence: str - spectrum_id: Tuple[str, str] - peptide_score: float - charge: int - calc_mz: float - exp_mz: float - aa_scores: Iterable[float] diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 95084206..74d3b7e3 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -256,10 +256,12 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: def _make_loader( self, dataset: torch.utils.data.Dataset, - shuffle: Optional[bool] = None, + batch_size: int, + shuffle: bool = False, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. + Parameters ---------- dataset : torch.utils.data.Dataset @@ -278,37 +280,33 @@ def _make_loader( """ return DataLoader( dataset, - shuffle=shuffle, - num_workers=0, # self.n_workers, - # precision=torch.float32, + batch_size=batch_size, pin_memory=True, + num_workers=self.n_workers, + shuffle=shuffle, ) def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" - return self._make_loader(self.train_dataset, self.shuffle) + return self._make_loader( + self.train_dataset, self.train_batch_size, shuffle=self.shuffle + ) def val_dataloader(self) -> torch.utils.data.DataLoader: """Get the validation DataLoader.""" - return self._make_loader(self.valid_dataset) + return self._make_loader(self.valid_dataset, self.eval_batch_size) def test_dataloader(self) -> torch.utils.data.DataLoader: """Get the test DataLoader.""" - return self._make_loader(self.test_dataset) + return self._make_loader(self.test_dataset, self.eval_batch_size) def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" - return self._make_loader(self.test_dataset) + return self._make_loader(self.test_dataset, self.eval_batch_size) def db_dataloader(self) -> torch.utils.data.DataLoader: """Get a special dataloader for DB search.""" - return self._make_loader( - self.test_dataset, - self.eval_batch_size, - collate_fn=functools.partial( - prepare_psm_batch, protein_database=self.protein_database - ), - ) + return self._make_loader(self.test_dataset, self.eval_batch_size) def scale_to_unit_norm(spectrum): diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index a63a5263..19ea7244 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -2,21 +2,21 @@ import collections import heapq +import itertools import logging import warnings -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import einops import torch import numpy as np import lightning.pytorch as pl -from torch.utils.tensorboard import SummaryWriter from depthcharge.tokenizers import PeptideTokenizer from . import evaluate from .. import config -from ..data import ms_io, pep_spec_match +from ..data import ms_io, psm from ..denovo.transformers import SpectrumEncoder, PeptideDecoder logger = logging.getLogger("casanovo") @@ -76,9 +76,6 @@ class Spec2Pep(pl.LightningModule): Number of PSMs to return for each spectrum. n_log : int The number of epochs to wait between logging messages. - tb_summarywriter : Optional[Path] - Folder path to record performance metrics during training. If - ``None``, don't use a ``SummaryWriter``. train_label_smoothing : float Smoothing factor when calculating the training loss. 
warmup_iters : int @@ -105,7 +102,6 @@ def __init__( dim_feedforward: int = 1024, n_layers: int = 9, dropout: float = 0.0, - dim_intensity: Optional[int] = None, max_peptide_len: int = 100, residues: Union[Dict[str, float], str] = "canonical", max_charge: int = 5, @@ -121,7 +117,6 @@ def __init__( out_writer: Optional[ms_io.MztabWriter] = None, calculate_precision: bool = False, tokenizer: Optional[PeptideTokenizer] = None, - tb_summarywriter: Optional[SummaryWriter] = None, # TODO **kwargs: Dict, ): super().__init__() @@ -241,22 +236,21 @@ def beam_search_decode( the m/z-intensity pair for each peak. These should be zero-padded, such that all the spectra in the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge - (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. + The measured precursor mass (axis 0), precursor charge (axis 1), and + precursor m/z (axis 2) of each MS/MS spectrum. Returns ------- pred_peptides : List[List[Tuple[float, np.ndarray, str]]] - For each spectrum, a list with the top peptide - prediction(s). A peptide predictions consists of a tuple - with the peptide score, the amino acid scores, and the - predicted peptide sequence. + For each spectrum, a list with the top peptide prediction(s). A + peptide predictions consists of a tuple with the peptide score, + the amino acid scores, and the predicted peptide sequence. """ memories, mem_masks = self.encoder(mzs, ints) # Sizes. batch = mzs.shape[0] # B - length = self.max_length + 1 # L + length = self.max_peptide_len + 1 # L vocab = self.vocab_size # V beam = self.n_beams # S @@ -293,16 +287,15 @@ def beam_search_decode( # The main decoding loop. for step in range(0, self.max_peptide_len): - # Terminate beams exceeding the precursor m/z tolerance and - # track all finished beams (either terminated or stop token - # predicted). + # Terminate beams exceeding the precursor m/z tolerance and track + # all finished beams (either terminated or stop token predicted). ( finished_beams, beam_fits_precursor, discarded_beams, ) = self._finish_beams(tokens, precursors, step) - # Cache peptide predictions from the finished beams (but not - # the discarded beams). + # Cache peptide predictions from the finished beams (but not the + # discarded beams). self._cache_finished_beams( tokens, scores, @@ -313,8 +306,7 @@ def beam_search_decode( ) # Stop decoding when all current beams have been finished. - # Continue with beams that have not been finished and not - # discarded. + # Continue with beams that have not been finished and not discarded. finished_beams |= discarded_beams if finished_beams.all(): break @@ -325,8 +317,8 @@ def beam_search_decode( memory=memories[~finished_beams, :, :], memory_key_padding_mask=mem_masks[~finished_beams, :], ) - # Find the top-k beams with the highest scores and continue - # decoding those. + # Find the top-k beams with the highest scores and continue decoding + # those. tokens, scores = self._get_topk_beams( tokens, scores, finished_beams, batch, step + 1 ) @@ -343,33 +335,33 @@ def _finish_beams( step: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Track all beams that have been finished, either by predicting - the stop token or because they were terminated due to exceeding - the precursor m/z tolerance. + Track all beams that have been finished, either by predicting the stop + token or because they were terminated due to exceeding the precursor + m/z tolerance. 
Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and - all spectra. + (n_spectra * n_beams, max_length, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and all + spectra. step : int Index of the current decoding step. Returns ------- finished_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams have - been finished. + Boolean tensor indicating whether the current beams have been + finished. beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating if current beams are within - precursor m/z tolerance. + Boolean tensor indicating if current beams are within precursor m/z + tolerance. discarded_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams should - be discarded (e.g. because they were predicted to end but - violate the minimum peptide length). + Boolean tensor indicating whether the current beams should be + discarded (e.g. because they were predicted to end but violate the + minimum peptide length). """ # Check for tokens with a negative mass (i.e. neutral loss). aa_neg_mass_idx = [None] @@ -390,8 +382,7 @@ def _finish_beams( beam_fits_precursor = torch.zeros( tokens.shape[0], dtype=torch.bool ).to(self.encoder.device) - # Beams with a stop token predicted in the current step can be - # finished. + # Beams with a stop token predicted in the current step can be finished. finished_beams = torch.zeros(tokens.shape[0], dtype=torch.bool).to( self.encoder.device ) @@ -404,9 +395,8 @@ def _finish_beams( ) discarded_beams[tokens[:, step] == 0] = True - # Discard beams with invalid modification combinations (i.e. - # N-terminal modifications occur multiple times or in internal - # positions). + # Discard beams with invalid modification combinations (i.e. N-terminal + # modifications occur multiple times or in internal positions). if step > 1: # Only relevant for longer predictions. dim0 = torch.arange(tokens.shape[0]) final_pos = torch.full((ends_stop_token.shape[0],), step) @@ -423,8 +413,8 @@ def _finish_beams( ).any(dim=1) discarded_beams[multiple_mods | internal_mods] = True - # Check which beams should be terminated or discarded based on - # the predicted peptide. + # Check which beams should be terminated or discarded based on the + # predicted peptide. for i in range(len(finished_beams)): # Skip already discarded beams. if discarded_beams[i]: @@ -442,15 +432,15 @@ def _finish_beams( ): pred_tokens = pred_tokens[:-1] peptide_len -= 1 - # Discard beams that were predicted to end but don't fit the - # minimum peptide length. + # Discard beams that were predicted to end but don't fit the minimum + # peptide length. if finished_beams[i] and peptide_len < self.min_peptide_len: discarded_beams[i] = True continue - # Terminate the beam if it has not been finished by the - # model but the peptide mass exceeds the precursor m/z to an - # extent that it cannot be corrected anymore by a - # subsequently predicted AA with negative mass. 
+ # Terminate the beam if it has not been finished by the model but + # the peptide mass exceeds the precursor m/z to an extent that it + # cannot be corrected anymore by a subsequently predicted AA with + # negative mass. precursor_charge = precursors[i, 1] precursor_mz = precursors[i, 2] matches_precursor_mz = exceeds_precursor_mz = False @@ -487,18 +477,16 @@ def _finish_beams( self.isotope_error_range[1] + 1, ) ] - # Terminate the beam if the calculated m/z for the - # predicted peptide (without potential additional - # AAs with negative mass) is within the precursor - # m/z tolerance. + # Terminate the beam if the calculated m/z for the predicted + # peptide (without potential additional AAs with negative + # mass) is within the precursor m/z tolerance. matches_precursor_mz = aa is None and any( abs(d) < self.precursor_mass_tol for d in delta_mass_ppm ) - # Terminate the beam if the calculated m/z exceeds - # the precursor m/z + tolerance and hasn't been - # corrected by a subsequently predicted AA with - # negative mass. + # Terminate the beam if the calculated m/z exceeds the + # precursor m/z + tolerance and hasn't been corrected by a + # subsequently predicted AA with negative mass. if matches_precursor_mz: exceeds_precursor_mz = False else: @@ -513,8 +501,8 @@ def _finish_beams( except KeyError: matches_precursor_mz = exceeds_precursor_mz = False # Finish beams that fit or exceed the precursor m/z. - # Don't finish beams that don't include a stop token if they - # don't exceed the precursor m/z tolerance yet. + # Don't finish beams that don't include a stop token if they don't + # exceed the precursor m/z tolerance yet. if finished_beams[i]: beam_fits_precursor[i] = matches_precursor_mz elif exceeds_precursor_mz: @@ -538,17 +526,17 @@ def _cache_finished_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and - all spectra. + (n_spectra * n_beams, max_length, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and all + spectra. step : int Index of the current decoding step. beams_to_cache : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams are - ready for caching. + Boolean tensor indicating whether the current beams are ready for + caching. beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams) Boolean tensor indicating whether the beams are within the precursor m/z tolerance. @@ -556,9 +544,9 @@ def _cache_finished_beams( int, List[Tuple[float, float, np.ndarray, torch.Tensor]] ] Priority queue with finished beams for each spectrum, ordered by - peptide score. For each finished beam, a tuple with the - (negated) peptide score, a random tie-breaking float, the - amino acid-level scores, and the predicted tokens is stored. + peptide score. For each finished beam, a tuple with the (negated) + peptide score, a random tie-breaking float, the amino acid-level + scores, and the predicted tokens is stored. 
""" for i in range(len(beams_to_cache)): if not beams_to_cache[i]: @@ -580,8 +568,8 @@ def _cache_finished_beams( continue smx = self.softmax(scores[i : i + 1, : step + 1, :]) aa_scores = smx[0, range(len(pred_tokens)), pred_tokens].tolist() - # Add an explicit score 0 for the missing stop token in case - # this was not predicted (i.e. early stopping). + # Add an explicit score 0 for the missing stop token in case this + # was not predicted (i.e. early stopping). if not has_stop_token: aa_scores.append(0) aa_scores = np.asarray(aa_scores) @@ -591,8 +579,8 @@ def _cache_finished_beams( ) # Omit the stop token from the amino acid-level scores. aa_scores = aa_scores[:-1] - # Add the prediction to the cache (minimum priority queue, - # maximum the number of beams elements). + # Add the prediction to the cache (minimum priority queue, maximum + # the number of beams elements). if len(pred_cache[spec_idx]) < self.n_beams: heapadd = heapq.heappush else: @@ -616,22 +604,22 @@ def _get_topk_beams( step: int, ) -> Tuple[torch.tensor, torch.tensor]: """ - Find the top-k beams with the highest scores and continue - decoding those. + Find the top-k beams with the highest scores and continue decoding + those. Stop decoding for beams that have been finished. Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and - all spectra. + (n_spectra * n_beams, max_length, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and all + spectra. finished_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams are - ready for caching. + Boolean tensor indicating whether the current beams are ready for + caching. batch: int Number of spectra in the batch. step : int @@ -639,12 +627,12 @@ def _get_topk_beams( Returns ------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and - all spectra. + (n_spectra * n_beams, max_length, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and all + spectra. """ beam = self.n_beams # S vocab = self.vocab_size # V @@ -679,7 +667,7 @@ def _get_topk_beams( ).float() # Mask out the index '0', i.e. padding token, by default. # FIXME: Set this to a very small, yet non-zero value, to only - # get padding after stop token. + # get padding after stop token. active_mask[:, :beam] = 1e-8 # Figure out the top K decodings. @@ -743,6 +731,23 @@ def _get_top_peptide( else: yield [] + def _unsqueeze_batch(self, batch: Dict[str, Any]) -> None: + """ + Unsqueeze the first dimension of each tensor in the batch. + + + Parameters + ---------- + batch : Dict[str, Any] + A dictionary where each key corresponds to a component of the batch, + and the values are tensors or other data structures. 
+ """ + for k in batch.keys(): + try: + batch[k] = batch[k].squeeze(0) + except: + continue + def _process_batch(self, batch): """Prepare batch returned from AnnotatedSpectrumDataset of the latest depthcharge version @@ -764,13 +769,7 @@ def _process_batch(self, batch): sequences (during training). """ - # Squeeze torch tensors in first dimension - for k in batch.keys(): - try: - batch[k] = batch[k].squeeze(0) - except: - continue - + self._unsqueeze_batch(batch) precursor_mzs = batch["precursor_mz"] precursor_charges = batch["precursor_charge"] precursor_masses = (precursor_mzs - 1.007276) * precursor_charges @@ -933,11 +932,9 @@ def predict_step( predictions: List[ms_io.PepSpecMatch] Predicted PSMs for the given batch of spectra. """ - _, _, precursors, _ = self._process_batch(batch) prec_charges = precursors[:, 1].cpu().detach().numpy() prec_mzs = precursors[:, 2].cpu().detach().numpy() - predictions = [] for ( precursor_charge, @@ -1035,30 +1032,15 @@ def on_predict_batch_end( ) self.out_writer.psms.append( -<<<<<<< HEAD - ( - peptide, - scan, - peptide_score, - charge, - precursor_mz, - calc_mass, - ",".join(list(map("{:.5f}".format, aa_scores))), - file_name, - true_seq, - title, - ), -======= - pep_spec_match.PepSpecMatch( + psm.PepSpecMatch( sequence=peptide, spectrum_id=(file_name, scan), peptide_score=peptide_score, charge=int(charge), - calc_mz=precursor_mz, - exp_mz=calc_mass.item(), + calc_mz=calc_mass.item(), + exp_mz=precursor_mz, aa_scores=aa_scores, ) ->>>>>>> 5719cdc (circular import bug) ) def on_train_start(self): @@ -1159,14 +1141,20 @@ def predict_step( predictions: List[ms_io.PepSpecMatch] Predicted PSMs for the given batch of spectra. """ + pred, truth = self._forward_step(batch) predictions_all = collections.defaultdict(list) - for start_i in range(0, len(batch[0]), self.psm_batch_size): + # self._unsqueeze_batch(batch) + for start_i in range(0, len(batch), self.psm_batch_size): + psm_batch = { + label: data[start_i : start_i + self.psm_batch_size] + for label, data in batch.items() + } + + """" psm_batch = [ b[start_i : start_i + self.psm_batch_size] for b in batch ] - pred, truth = self._forward_step( - psm_batch[0], psm_batch[1], psm_batch[3] - ) + """ pred = self.softmax(pred) batch_peptide_scores, batch_aa_scores = _calc_match_score( pred, truth, self.decoder.reverse @@ -1188,7 +1176,7 @@ def predict_step( ): spectrum_i = tuple(spectrum_i) predictions_all[spectrum_i].append( - ms_io.PepSpecMatch( + psm.PepSpecMatch( sequence=peptide, spectrum_id=spectrum_i, peptide_score=peptide_score, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 9366d33f..c8cdddb8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -16,8 +16,7 @@ import torch.utils.data from lightning.pytorch.strategies import DDPStrategy -from lightning.pytorch.callbacks import ModelCheckpoint -from lightning.pytorch.loggers import TensorBoardLogger +from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor from torch.utils.data import DataLoader from depthcharge.tokenizers import PeptideTokenizer @@ -114,11 +113,6 @@ def __init__( ), ] - if config.tb_summarywriter is not None: - self.callbacks.append( - LearningRateMonitor(logging_interval="step", log_momentum=True) - ) - def __enter__(self): """Enter the context manager""" self.tmp_dir = tempfile.TemporaryDirectory() @@ -155,6 +149,7 @@ def db_search( config_filename=self.config.file, ) self.initialize_trainer(train=True) + self.initialize_tokenizer() 
self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer self.model.psm_batch_size = self.config.predict_batch_size @@ -172,10 +167,9 @@ def db_search( self.config.allowed_var_mods, self.config.residues, ) - test_index = self._get_index(peak_path, False, "db search") - self.writer.set_ms_run(test_index.ms_files) - - self.initialize_data_module(test_index=test_index) + test_paths = self._get_input_paths(peak_path, False, "test") + self.writer.set_ms_run(test_paths) + self.initialize_data_module(test_paths=test_paths) self.loaders.protein_database = self.model.protein_database self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) @@ -215,13 +209,17 @@ def log_metrics(self, test_dataloader: DataLoader) -> None: """Log peptide precision and amino acid precision Calculate and log peptide precision and amino acid precision - based off of model predictions and spectrum annotations. + based off of model predictions and spectrum annotations Parameters ---------- test_index : AnnotatedSpectrumIndex Index containing the annotated spectra used to generate model predictions + """ + seq_pred = [] + seq_true = [] + pred_idx = 0 for batch in test_dataloader: for peak_file, scan_id, curr_seq_true in zip( @@ -251,16 +249,13 @@ def log_metrics(self, test_dataloader: DataLoader) -> None: if self.config["top_match"] > 1: logger.warning( - "The behavior for calculating evaluation metrics is undefined " - "when the 'top_match' configuration option is set to a value " - "greater than 1." + "The behavior for calculating evaluation metrics is undefined when " + "the 'top_match' configuration option is set to a value greater than 1." ) logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) - """ - # TODO: Fix log_metrics, wait for eval bug fix to be merged in - return + logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall) def predict( self, @@ -426,15 +421,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: db_search : bool Determines whether to use the DB search model subclass. """ - tb_summarywriter = None - if self.config.tb_summarywriter: - if self.output_dir is None: - logger.warning( - "Can not create tensorboard because the output directory " - "is not set in the model runner." 
- ) - else: - tb_summarywriter = self.output_dir / "tensorboard" try: tokenizer = self.tokenizer except AttributeError: @@ -446,8 +432,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: dim_feedforward=self.config.dim_feedforward, n_layers=self.config.n_layers, dropout=self.config.dropout, - dim_intensity=self.config.dim_intensity, - max_length=self.config.max_length, max_charge=self.config.max_charge, precursor_mass_tol=self.config.precursor_mass_tol, isotope_error_range=self.config.isotope_error_range, @@ -455,7 +439,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: n_beams=self.config.n_beams, top_match=self.config.top_match, n_log=self.config.n_log, - tb_summarywriter=tb_summarywriter, train_label_smoothing=self.config.train_label_smoothing, warmup_iters=self.config.warmup_iters, cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, @@ -476,7 +459,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: min_peptide_len=self.config.min_peptide_len, top_match=self.config.top_match, n_log=self.config.n_log, - tb_summarywriter=tb_summarywriter, train_label_smoothing=self.config.train_label_smoothing, warmup_iters=self.config.warmup_iters, cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, diff --git a/tests/conftest.py b/tests/conftest.py index 84051d85..699302fc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -81,9 +81,13 @@ def _create_mgf( rng = np.random.default_rng(random_state) entries = [ _create_mgf_entry( - p, rng.choice([2, 3]), mod_aa_mass=mod_aa_mass, annotate=annotate + p, + i, + rng.choice([2, 3]), + mod_aa_mass=mod_aa_mass, + annotate=annotate, ) - for p in peptides + for i, p in enumerate(peptides) ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) @@ -91,7 +95,9 @@ def _create_mgf( return mgf_file -def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None, annotate=True): +def _create_mgf_entry( + peptide, title, charge=2, mod_aa_mass=None, annotate=True +): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -249,7 +255,42 @@ def _create_mzml(peptides, mzml_file, random_state=42): @pytest.fixture -def tiny_config(tmp_path): +def residues_dict(): + return { + "G": 57.021464, + "A": 71.037114, + "S": 87.032028, + "P": 97.052764, + "V": 99.068414, + "T": 101.047670, + "C[Carbamidomethyl]": 160.030649, # 103.009185 + 57.021464 + "L": 113.084064, + "I": 113.084064, + "N": 114.042927, + "D": 115.026943, + "Q": 128.058578, + "K": 128.094963, + "E": 129.042593, + "M": 131.040485, + "H": 137.058912, + "F": 147.068414, + "R": 156.101111, + "Y": 163.063329, + "W": 186.079313, + # Amino acid modifications. + "M[Oxidation]": 147.035400, # Met oxidation: 131.040485 + 15.994915 + "N[Deamidated]": 115.026943, # Asn deamidation: 114.042927 + 0.984016 + "Q[Deamidated]": 129.042594, # Gln deamidation: 128.058578 + 0.984016 + # N-terminal modifications. 
+ "[Acetyl]-": 42.010565, # Acetylation + "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" + "[Ammonia-loss]-": -17.026549, # NH3 loss + "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss + } + + +@pytest.fixture +def tiny_config(tmp_path, residues_dict): """A config file for a tiny model.""" cfg = { "n_head": 2, @@ -302,37 +343,7 @@ def tiny_config(tmp_path): "replace_isoleucine_with_leucine": True, "reverse_peptides": False, "mskb_tokenizer": True, - "residues": { - "G": 57.021464, - "A": 71.037114, - "S": 87.032028, - "P": 97.052764, - "V": 99.068414, - "T": 101.047670, - "C[Carbamidomethyl]": 160.030649, # 103.009185 + 57.021464 - "L": 113.084064, - "I": 113.084064, - "N": 114.042927, - "D": 115.026943, - "Q": 128.058578, - "K": 128.094963, - "E": 129.042593, - "M": 131.040485, - "H": 137.058912, - "F": 147.068414, - "R": 156.101111, - "Y": 163.063329, - "W": 186.079313, - # Amino acid modifications. - "M[Oxidation]": 147.035400, # Met oxidation: 131.040485 + 15.994915 - "N[Deamidated]": 115.026943, # Asn deamidation: 114.042927 + 0.984016 - "Q[Deamidated]": 129.042594, # Gln deamidation: 128.058578 + 0.984016 - # N-terminal modifications. - "[Acetyl]-": 42.010565, # Acetylation - "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" - "[Ammonia-loss]-": -17.026549, # NH3 loss - "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss - }, + "residues": residues_dict, "allowed_fixed_mods": "C:C+57.021", "allowed_var_mods": ( "M:M+15.995,N:N+0.984,Q:Q+0.984," @@ -345,8 +356,3 @@ def tiny_config(tmp_path): yaml.dump(cfg, out_file) return cfg_file - - -@pytest.fixture -def residues_dict(): - return depthcharge.masses.PeptideMass("massivekb").masses diff --git a/tests/test_integration.py b/tests/test_integration.py index a0ab75eb..50efce51 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -11,75 +11,14 @@ TEST_DIR = Path(__file__).resolve().parent -def test_db_search( - mgf_medium, tiny_fasta_file, tiny_config, tmp_path, monkeypatch -): - # Run a command: - monkeypatch.setattr(casanovo, "__version__", "4.1.0") - run = functools.partial( - CliRunner().invoke, casanovo.main, catch_exceptions=False - ) - - output_rootname = "db" - output_filename = (tmp_path / output_rootname).with_suffix(".mztab") - - search_args = [ - "db-search", - "--config", - tiny_config, - "--output_dir", - str(tmp_path), - "--output_root", - output_rootname, - str(mgf_medium), - str(tiny_fasta_file), - ] - - result = run(search_args) - - assert result.exit_code == 0 - assert output_filename.exists() - - mztab = pyteomics.mztab.MzTab(str(output_filename)) - - psms = mztab.spectrum_match_table - assert list(psms.sequence) == [ - "ATSIPAR", - "VTLSC+57.021R", - "LLIYGASTR", - "EIVMTQSPPTLSLSPGER", - "MEAPAQLLFLLLLWLPDTTR", - "ASQSVSSSYLTWYQQKPGQAPR", - "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", - ] - - # Validate mztab output - validate_args = [ - "java", - "-jar", - f"{TEST_DIR}/jmzTabValidator.jar", - "--check", - f"inFile={output_filename}", - ] - - validate_result = subprocess.run( - validate_args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - assert validate_result.returncode == 0 - assert not any( - [ - line.startswith("[Error-") - for line in validate_result.stdout.splitlines() - ] - ) - - def test_train_and_run( - mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch + mgf_small, + mzml_small, + tiny_config, + tmp_path, + monkeypatch, + mgf_medium, + tiny_fasta_file, ): # We can use this to explicitly test different versions. 
monkeypatch.setattr(casanovo, "__version__", "3.0.1") @@ -164,7 +103,6 @@ def test_train_and_run( "--output_root", output_rootname, str(mgf_small), - str(mzml_small), "--evaluate", ] @@ -212,6 +150,66 @@ def test_train_and_run( assert output_filename.is_file() + monkeypatch.setattr(casanovo, "__version__", "4.1.0") + output_rootname = "db" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") + + search_args = [ + "db-search", + "--model", + str(model_file), + "--config", + tiny_config, + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, + str(mgf_small), + str(tiny_fasta_file), + ] + + result = run(search_args) + + assert result.exit_code == 0 + assert output_filename.exists() + + mztab = pyteomics.mztab.MzTab(str(output_filename)) + + psms = mztab.spectrum_match_table + assert list(psms.sequence) == [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # Validate mztab output + validate_args = [ + "java", + "-jar", + f"{TEST_DIR}/jmzTabValidator.jar", + "--check", + f"inFile={output_filename}", + ] + + validate_result = subprocess.run( + validate_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + assert validate_result.returncode == 0 + assert not any( + [ + line.startswith("[Error-") + for line in validate_result.stdout.splitlines() + ] + ) + def test_auxilliary_cli(tmp_path, monkeypatch): """Test the secondary CLI commands""" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 3e276f01..2c6a5091 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -28,7 +28,6 @@ from casanovo import casanovo from casanovo import utils from casanovo.data import db_utils, ms_io -from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score from casanovo.data import ms_io @@ -567,7 +566,6 @@ def test_calc_match_score(): def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): - # No missed cleavages expected_normal = [ "ATSIPAR", @@ -1086,7 +1084,6 @@ def test_get_candidates(tiny_fasta_file, residues_dict): def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): - # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] # 1: [979.491114, 999.278813] @@ -1234,7 +1231,7 @@ def test_beam_search_decode(tiny_config): # Sizes. batch = 1 # B - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = len(model.tokenizer) + 1 # V beam = model.n_beams # S step = 3 @@ -1367,7 +1364,7 @@ def test_beam_search_decode(tiny_config): mzs = ints = torch.zeros(1, 5) precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) assert len(list(model.beam_search_decode(mzs, ints, precursors))[0]) == 0 - model.max_length = 100 + model.max_peptide_len = 100 # Re-initialize scores and tokens to further test caching functionality. scores = torch.full( @@ -1554,7 +1551,7 @@ def test_beam_search_decode(tiny_config): # Sizes and other variables. 
batch = 2 # B beam = model.n_beams # S - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = len(model.tokenizer) + 1 # V step = 4 @@ -1596,7 +1593,7 @@ def test_beam_search_decode(tiny_config): ) batch = 2 # B beam = model.n_beams # S - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = len(model.tokenizer) + 1 # V step = 4 From 9dc293fff94bd0cc61d43f0bea1c0ce662bb2e15 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 25 Nov 2024 16:38:18 -0800 Subject: [PATCH 37/51] import orders, CasanovoDB psm batching --- casanovo/__init__.py | 1 - casanovo/casanovo.py | 5 +- casanovo/config.py | 2 +- casanovo/data/db_utils.py | 3 +- casanovo/data/ms_io.py | 1 + casanovo/data/psm.py | 2 +- casanovo/denovo/dataloaders.py | 100 ++++++++++++++++++++-- casanovo/denovo/model.py | 131 ++++++++++++++++++++++++----- casanovo/denovo/model_runner.py | 11 +-- casanovo/denovo/transformers.py | 6 +- casanovo/utils.py | 1 - casanovo/version.py | 4 +- docs/conf.py | 13 ++- tests/test_integration.py | 5 +- tests/unit_tests/test_run_stats.py | 3 +- tests/unit_tests/test_runner.py | 4 +- tests/unit_tests/test_unit.py | 10 +-- 17 files changed, 232 insertions(+), 70 deletions(-) diff --git a/casanovo/__init__.py b/casanovo/__init__.py index 1afa731a..f0756992 100644 --- a/casanovo/__init__.py +++ b/casanovo/__init__.py @@ -1,4 +1,3 @@ from .version import _get_version - __version__ = _get_version() diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index f3c9f19b..3bda9cd5 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -41,10 +41,9 @@ import tqdm from lightning.pytorch import seed_everything -from . import __version__ -from . import utils -from .denovo import ModelRunner +from . import __version__, utils from .config import Config +from .denovo import ModelRunner logger = logging.getLogger("casanovo") click.rich_click.USE_MARKDOWN = True diff --git a/casanovo/config.py b/casanovo/config.py index 69de80d1..7e19b9cf 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -4,7 +4,7 @@ import shutil import warnings from pathlib import Path -from typing import Optional, Dict, Callable, Tuple, Union +from typing import Callable, Dict, Optional, Tuple, Union import yaml diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 7d7b1ae9..353c622f 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -12,7 +12,6 @@ import pyteomics.fasta import pyteomics.parser - logger = logging.getLogger("casanovo") # CONSTANTS @@ -148,7 +147,7 @@ def _digest_fasta( .reset_index() ) # Calculate the mass of each peptide. - peptides["calc_mass"] = peptides["peptide"].map(residues).round(5) + peptides["calc_mass"] = peptides["peptide"].apply(residues).round(5) # Sort by peptide mass and index by peptide sequence. peptides.sort_values( by=["calc_mass", "peptide"], ascending=True, inplace=True diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 959c5bf7..da9f7dbb 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -7,6 +7,7 @@ import re from pathlib import Path from typing import List + import natsort from .. 
import __version__ diff --git a/casanovo/data/psm.py b/casanovo/data/psm.py index eece07a4..cef4a29a 100644 --- a/casanovo/data/psm.py +++ b/casanovo/data/psm.py @@ -1,7 +1,7 @@ """Peptide spectrum match dataclass.""" import dataclasses -from typing import Tuple, Iterable +from typing import Iterable, Tuple @dataclasses.dataclass diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 74d3b7e3..c9277565 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -3,25 +3,25 @@ import functools import logging import os -from typing import Optional, Iterable +import tempfile from pathlib import Path +from typing import Callable, Iterable, List, Optional, Tuple + import lightning.pytorch as pl import numpy as np -import torch -from torch.utils.data import DataLoader -import tempfile import pyarrow as pa -from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe - - -from depthcharge.tokenizers import PeptideTokenizer +import torch from depthcharge.data import ( AnnotatedSpectrumDataset, CustomField, SpectrumDataset, preprocessing, ) +from depthcharge.tokenizers import PeptideTokenizer +from torch.utils.data import DataLoader +from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe +from ..data import db_utils logger = logging.getLogger("casanovo") @@ -258,6 +258,7 @@ def _make_loader( dataset: torch.utils.data.Dataset, batch_size: int, shuffle: bool = False, + collate_fn: Optional[Callable] = None, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -284,6 +285,7 @@ def _make_loader( pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, + collate_fn=collate_fn, ) def train_dataloader(self) -> torch.utils.data.DataLoader: @@ -306,7 +308,13 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: def db_dataloader(self) -> torch.utils.data.DataLoader: """Get a special dataloader for DB search.""" - return self._make_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader( + self.test_dataset, + self.eval_batch_size, + # collate_fn=functools.partial( + # prepare_psm_batch, protein_database=self.protein_database + # ), + ) def scale_to_unit_norm(spectrum): @@ -318,3 +326,77 @@ def scale_to_unit_norm(spectrum): spectrum.intensity ) return spectrum + + +def prepare_psm_batch( + batch: List[Tuple[torch.Tensor, float, int, str]], + protein_database: db_utils.ProteinDatabase, +) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray]: + """ + Collate MS/MS spectra into a batch for DB search. + + The MS/MS spectra will be padded so that they fit nicely as a + tensor. However, the padded elements are ignored during the + subsequent steps. + + Parameters + ---------- + batch : List[Tuple[torch.Tensor, float, int, str]] + A batch of data from an AnnotatedSpectrumDataset, consisting of + for each spectrum (i) a tensor with the m/z and intensity peak + values, (ii), the precursor m/z, (iii) the precursor charge, + (iv) the spectrum identifier. + protein_database : db_utils.ProteinDatabase + The protein database to use for candidate peptide retrieval. + + Returns + ------- + batch_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak + values for each spectrum. + batch_precursors : torch.Tensor of shape (batch_size, 3) + A tensor with the precursor neutral mass, precursor charge, and + precursor m/z. + batch_spectrum_ids : np.ndarray + The spectrum identifiers. 
+ batch_peptides : np.ndarray + The candidate peptides for each spectrum. + """ + return batch + # spectra, precursors, spectrum_ids = prepare_batch(batch) + + batch_spectra = [] + batch_precursors = [] + batch_spectrum_ids = [] + batch_peptides = [] + # FIXME: This can be optimized by using a sliding window instead of + # retrieving candidates for each spectrum independently. + + for i in range(len(batch)): + candidate_pep = protein_database.get_candidates( + batch["precursor_mz"][i], batch["precursor_charge"][i] + ) + if len(candidate_pep) == 0: + logger.debug( + "No candidate peptides found for spectrum %s with precursor " + "charge %d and precursor m/z %f", + f"{batch['peak_file'][i]}:{batch['scan_id']}", + precursors[i][1], + precursors[i][2], + ) + else: + batch_spectra.append( + spectra[i].unsqueeze(0).repeat(len(candidate_pep), 1, 1) + ) + batch_precursors.append( + precursors[i].unsqueeze(0).repeat(len(candidate_pep), 1) + ) + batch_spectrum_ids.extend([spectrum_ids[i]] * len(candidate_pep)) + batch_peptides.extend(candidate_pep) + + return ( + torch.cat(batch_spectra, dim=0), + torch.cat(batch_precursors, dim=0), + np.asarray(batch_spectrum_ids), + np.asarray(batch_peptides), + ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 19ea7244..3898f95d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -5,19 +5,18 @@ import itertools import logging import warnings -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union import einops -import torch -import numpy as np import lightning.pytorch as pl - +import numpy as np +import torch from depthcharge.tokenizers import PeptideTokenizer -from . import evaluate from .. import config from ..data import ms_io, psm -from ..denovo.transformers import SpectrumEncoder, PeptideDecoder +from ..denovo.transformers import PeptideDecoder, SpectrumEncoder +from . import evaluate logger = logging.getLogger("casanovo") @@ -1141,24 +1140,51 @@ def predict_step( predictions: List[ms_io.PepSpecMatch] Predicted PSMs for the given batch of spectra. 
""" - pred, truth = self._forward_step(batch) + for batch_key in [ + "ms_level", + "precursor_mz", + "precursor_charge", + "mz_array", + "intensity_array", + ]: + batch[batch_key] = batch[batch_key].squeeze(0) + predictions_all = collections.defaultdict(list) - # self._unsqueeze_batch(batch) - for start_i in range(0, len(batch), self.psm_batch_size): - psm_batch = { - label: data[start_i : start_i + self.psm_batch_size] - for label, data in batch.items() - } - - """" - psm_batch = [ - b[start_i : start_i + self.psm_batch_size] for b in batch - ] - """ + for psm_batch in self._psm_batches(batch): + pred, truth = self._forward_step(psm_batch) pred = self.softmax(pred) batch_peptide_scores, batch_aa_scores = _calc_match_score( - pred, truth, self.decoder.reverse + pred, + truth, ) + + for ( + scan, + charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, + file_name, + ) in list(): + spectrum_id = (file_name, scan) + predictions_all[spectrum_i].append( + psm.PepSpecMatch( + sequence=peptide, + spectrum_id=spectrum_i, + peptide_score=peptide_score, + charge=int(charge), + calc_mz=self.peptide_mass_calculator.mass( + peptide, charge + ), + exp_mz=precursor_mz, + aa_scores=aa_scores, + protein=self.protein_database.get_associated_protein( + peptide + ), + ) + ) + for ( charge, precursor_mz, @@ -1208,6 +1234,71 @@ def predict_step( ) return predictions + def _psm_batches( + self, batch: Dict[str, torch.Tensor | List] + ) -> Generator[Dict[str, Union[torch.Tensor, list]], None, None]: + num_candidate_psms = 0 + psm_batch = self._initialize_psm_batch(batch) + + for i, (precursor_mz, precursor_charge) in enumerate( + zip(batch["precursor_mz"], batch["precursor_charge"]) + ): + candidate_peps = self.protein_database.get_candidates( + precursor_mz.item(), precursor_charge.item() + ) + + if len(candidate_peps) == 0: + logger.debug( + "No candidate peptides found for spectrum %s with precursor " + "charge %d and precursor m/z %f", + f"{batch['peak_file'][i]}:{batch['scan_id']}", + precursor_charge, + precursor_mz, + ) + continue + + while len(candidate_peps) > 0: + peps_to_add = min( + self.psm_batch_size + - (num_candidate_psms % self.psm_batch_size), + len(candidate_peps), + ) + + for key in batch.keys(): + psm_batch[key] += [batch[key][i]] * peps_to_add + + psm_batch["seq"] += candidate_peps[:peps_to_add] + num_candidate_psms += peps_to_add + + if self._pep_batch_ready(candidate_peps): + yield self._finalize_psm_batch(psm_batch) + psm_batch = self._initialize_psm_batch(batch) + + candidate_peps = candidate_peps[peps_to_add:] + + if not self._pep_batch_ready(candidate_peps): + yield self._finalize_psm_batch(psm_batch) + + def _pep_batch_ready(self, num_candidate_psms: int) -> bool: + return ( + num_candidate_psms % self.psm_batch_size + ) == self.psm_batch_size - 1 + + def _initialize_psm_batch(self, batch: Dict[str, Any]) -> Dict[str, List]: + psm_batch = {key: list() for key in batch.keys()} + psm_batch["seq"] = list() + return psm_batch + + def _finalize_psm_batch( + self, psm_batch: Dict[str, List[Any]] + ) -> Dict[str, torch.Tensor | List[Any]]: + for key in psm_batch.keys(): + if isinstance(psm_batch[key][0], torch.Tensor): + psm_batch[key] = torch.cat(psm_batch[key]) + + psm_batch["seq"] = self.tokenizer.tokenize(psm_batch["seq"]) + return psm_batch + def _calc_match_score( batch_all_aa_scores: torch.Tensor, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index c8cdddb8..b829bfaa 100644 --- a/casanovo/denovo/model_runner.py +++ 
b/casanovo/denovo/model_runner.py @@ -6,21 +6,19 @@ import os import tempfile import warnings +from datetime import datetime from pathlib import Path from typing import Iterable, List, Optional, Union -from datetime import datetime import lightning.pytorch as pl import lightning.pytorch.loggers import torch import torch.utils.data - -from lightning.pytorch.strategies import DDPStrategy -from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor -from torch.utils.data import DataLoader - from depthcharge.tokenizers import PeptideTokenizer from depthcharge.tokenizers.peptides import MskbPeptideTokenizer +from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint +from lightning.pytorch.strategies import DDPStrategy +from torch.utils.data import DataLoader from .. import utils from ..config import Config @@ -29,7 +27,6 @@ from ..denovo.evaluate import aa_match_batch, aa_match_metrics from ..denovo.model import DbSpec2Pep, Spec2Pep - logger = logging.getLogger("casanovo") diff --git a/casanovo/denovo/transformers.py b/casanovo/denovo/transformers.py index d0216b63..388882af 100644 --- a/casanovo/denovo/transformers.py +++ b/casanovo/denovo/transformers.py @@ -1,13 +1,13 @@ """Transformer encoder and decoder for the de novo sequencing task.""" -import torch from collections.abc import Callable +import torch +from depthcharge.encoders import FloatEncoder, PeakEncoder, PositionalEncoder from depthcharge.tokenizers import Tokenizer -from depthcharge.encoders import PeakEncoder, FloatEncoder, PositionalEncoder from depthcharge.transformers import ( - SpectrumTransformerEncoder, AnalyteTransformerDecoder, + SpectrumTransformerEncoder, ) diff --git a/casanovo/utils.py b/casanovo/utils.py index 86e0748f..cdc6f2ea 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -17,7 +17,6 @@ from .data.psm import PepSpecMatch - SCORE_BINS = (0.0, 0.5, 0.9, 0.95, 0.99) logger = logging.getLogger("casanovo") diff --git a/casanovo/version.py b/casanovo/version.py index 579db300..eb817aae 100644 --- a/casanovo/version.py +++ b/casanovo/version.py @@ -18,7 +18,7 @@ def _get_version() -> Optional[str]: """ try: # Fast, but only works in Python 3.8+. - from importlib.metadata import version, PackageNotFoundError + from importlib.metadata import PackageNotFoundError, version try: return version("casanovo") @@ -26,7 +26,7 @@ def _get_version() -> Optional[str]: return None except ImportError: # Slow, but works for all Python 3+. - from pkg_resources import get_distribution, DistributionNotFound + from pkg_resources import DistributionNotFound, get_distribution try: return get_distribution("casanovo").version diff --git a/docs/conf.py b/docs/conf.py index 56f7ecb0..a1955a8f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,3 +1,9 @@ +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys from importlib.metadata import version # Configuration file for the Sphinx documentation builder. @@ -8,13 +14,6 @@ # -- Path setup -------------------------------------------------------------- -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# -import os -import sys - sys.path.insert(0, os.path.abspath(".")) diff --git a/tests/test_integration.py b/tests/test_integration.py index 50efce51..3c15e677 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -7,7 +7,6 @@ from casanovo import casanovo - TEST_DIR = Path(__file__).resolve().parent @@ -50,6 +49,7 @@ def test_train_and_run( assert model_file.exists() assert best_model.exists() + """" # Try predicting: output_rootname = "test" output_filename = (tmp_path / output_rootname).with_suffix(".mztab") @@ -149,6 +149,7 @@ def test_train_and_run( ) assert output_filename.is_file() + """ monkeypatch.setattr(casanovo, "__version__", "4.1.0") output_rootname = "db" @@ -164,7 +165,7 @@ def test_train_and_run( str(tmp_path), "--output_root", output_rootname, - str(mgf_small), + str(mgf_medium), str(tiny_fasta_file), ] diff --git a/tests/unit_tests/test_run_stats.py b/tests/unit_tests/test_run_stats.py index 9a438673..a2149381 100644 --- a/tests/unit_tests/test_run_stats.py +++ b/tests/unit_tests/test_run_stats.py @@ -4,8 +4,7 @@ import numpy as np import pandas as pd -from casanovo.utils import get_score_bins, get_peptide_lengths - +from casanovo.utils import get_peptide_lengths, get_score_bins np.random.seed(4000) random.seed(4000) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 253b1d53..10a8d4ef 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -263,7 +263,7 @@ def test_evaluate( "present in the validation peak file path list.\n" ) - with pytest.raises(FileNotFoundError): + with pytest.raises(TypeError): with ModelRunner( config, model_filename=str(model_file), overwrite_ckpt_check=False ) as runner: @@ -289,7 +289,7 @@ def test_evaluate( result_file.unlink() # Test mix of annotated an unannotated peak files - with pytest.warns(RuntimeWarning): + with pytest.raises(TypeError): with ModelRunner( config, model_filename=str(model_file), overwrite_ckpt_check=False ) as runner: diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 2c6a5091..21e15096 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -9,7 +9,6 @@ import pathlib import platform import re -import requests import shutil import tempfile import unittest @@ -23,17 +22,14 @@ import numpy as np import pandas as pd import pytest +import requests import torch -from casanovo import casanovo -from casanovo import utils +from casanovo import casanovo, utils from casanovo.data import db_utils, ms_io +from casanovo.denovo.dataloaders import DeNovoDataModule from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score -from casanovo.data import ms_io -from casanovo.denovo.dataloaders import DeNovoDataModule -from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics -from casanovo.denovo.model import Spec2Pep, _aa_pep_score def test_version(): From 051a82a73af612a9ca748def4ebe46476c6ce752 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 25 Nov 2024 17:19:29 -0800 Subject: [PATCH 38/51] CasanovoDB unit tests --- casanovo/data/db_utils.py | 34 +++++++++++++--- casanovo/denovo/model_runner.py | 2 +- tests/conftest.py | 69 +++++++++++++++------------------ tests/unit_tests/test_unit.py | 56 +++++++++++++++----------- 4 files changed, 95 insertions(+), 66 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 353c622f..028bb7cb 100644 --- 
a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -7,10 +7,12 @@ import string from typing import Dict, Iterator, Pattern, Set, Tuple +import depthcharge.tokenizers import numpy as np import pandas as pd import pyteomics.fasta import pyteomics.parser +import torch logger = logging.getLogger("casanovo") @@ -68,7 +70,7 @@ def __init__( isotope_error: Tuple[int, int], allowed_fixed_mods: str, allowed_var_mods: str, - residues: Dict[str, float], + tokenizer: depthcharge.tokenizers.PeptideTokenizer, ): self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods @@ -84,7 +86,9 @@ def __init__( missed_cleavages, min_peptide_len, max_peptide_len, - set([aa[0] for aa in residues.keys() if aa[0].isalpha()]), + set( + [aa[0] for aa in tokenizer.residues.keys() if aa[0].isalpha()] + ), ) logger.info( "Digesting FASTA file (enzyme = %s, digestion = %s, missed " @@ -93,14 +97,14 @@ def __init__( digestion, missed_cleavages, ) - self.db_peptides = self._digest_fasta(peptide_generator, residues) + self.tokenizer = tokenizer + self.db_peptides = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], - residues: Dict[str, float], ) -> pd.DataFrame: """ Digests a FASTA file and returns the peptides, their masses, @@ -147,7 +151,9 @@ def _digest_fasta( .reset_index() ) # Calculate the mass of each peptide. - peptides["calc_mass"] = peptides["peptide"].apply(residues).round(5) + peptides["calc_mass"] = ( + peptides["peptide"].apply(self._calc_pep_mass).round(5) + ) # Sort by peptide mass and index by peptide sequence. peptides.sort_values( by=["calc_mass", "peptide"], ascending=True, inplace=True @@ -159,6 +165,24 @@ def _digest_fasta( ) return peptides + def _calc_pep_mass(self, pep: str) -> float: + """ + Calculates the neutral mass of a peptide sequence. + + Parameters + ---------- + pep : str + The peptide sequence for which the mass is to be calculated. + + Returns + ------- + float + The neutral mass of the peptide + """ + return self.tokenizer.calculate_precursor_ions( + self.tokenizer.tokenize(pep), torch.tensor([1]) + ).item() + def get_candidates( self, precursor_mz: float, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b829bfaa..facd12d0 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -162,7 +162,7 @@ def db_search( self.config.isotope_error_range, self.config.allowed_fixed_mods, self.config.allowed_var_mods, - self.config.residues, + self.model.tokenizer, ) test_paths = self._get_input_paths(peak_path, False, "test") self.writer.set_ms_run(test_paths) diff --git a/tests/conftest.py b/tests/conftest.py index 699302fc..2091bde8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -255,42 +255,7 @@ def _create_mzml(peptides, mzml_file, random_state=42): @pytest.fixture -def residues_dict(): - return { - "G": 57.021464, - "A": 71.037114, - "S": 87.032028, - "P": 97.052764, - "V": 99.068414, - "T": 101.047670, - "C[Carbamidomethyl]": 160.030649, # 103.009185 + 57.021464 - "L": 113.084064, - "I": 113.084064, - "N": 114.042927, - "D": 115.026943, - "Q": 128.058578, - "K": 128.094963, - "E": 129.042593, - "M": 131.040485, - "H": 137.058912, - "F": 147.068414, - "R": 156.101111, - "Y": 163.063329, - "W": 186.079313, - # Amino acid modifications. 
- "M[Oxidation]": 147.035400, # Met oxidation: 131.040485 + 15.994915 - "N[Deamidated]": 115.026943, # Asn deamidation: 114.042927 + 0.984016 - "Q[Deamidated]": 129.042594, # Gln deamidation: 128.058578 + 0.984016 - # N-terminal modifications. - "[Acetyl]-": 42.010565, # Acetylation - "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" - "[Ammonia-loss]-": -17.026549, # NH3 loss - "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss - } - - -@pytest.fixture -def tiny_config(tmp_path, residues_dict): +def tiny_config(tmp_path): """A config file for a tiny model.""" cfg = { "n_head": 2, @@ -343,7 +308,37 @@ def tiny_config(tmp_path, residues_dict): "replace_isoleucine_with_leucine": True, "reverse_peptides": False, "mskb_tokenizer": True, - "residues": residues_dict, + "residues": { + "G": 57.021464, + "A": 71.037114, + "S": 87.032028, + "P": 97.052764, + "V": 99.068414, + "T": 101.047670, + "C[Carbamidomethyl]": 160.030649, # 103.009185 + 57.021464 + "L": 113.084064, + "I": 113.084064, + "N": 114.042927, + "D": 115.026943, + "Q": 128.058578, + "K": 128.094963, + "E": 129.042593, + "M": 131.040485, + "H": 137.058912, + "F": 147.068414, + "R": 156.101111, + "Y": 163.063329, + "W": 186.079313, + # Amino acid modifications. + "M[Oxidation]": 147.035400, # Met oxidation: 131.040485 + 15.994915 + "N[Deamidated]": 115.026943, # Asn deamidation: 114.042927 + 0.984016 + "Q[Deamidated]": 129.042594, # Gln deamidation: 128.058578 + 0.984016 + # N-terminal modifications. + "[Acetyl]-": 42.010565, # Acetylation + "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" + "[Ammonia-loss]-": -17.026549, # NH3 loss + "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss + }, "allowed_fixed_mods": "C:C+57.021", "allowed_var_mods": ( "M:M+15.995,N:N+0.984,Q:Q+0.984," diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 21e15096..f6eabd87 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -442,7 +442,10 @@ def test_aa_pep_score(): assert peptide_score == pytest.approx(0.5) -def test_peptide_generator_errors(residues_dict, tiny_fasta_file): +def test_peptide_generator_errors(tiny_fasta_file): + residues_dict = ( + depthcharge.tokenizers.PeptideTokenizer.from_massivekb().residues + ) with pytest.raises(FileNotFoundError): [ (a, b) @@ -561,7 +564,7 @@ def test_calc_match_score(): ) -def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): +def test_digest_fasta_cleave(tiny_fasta_file): # No missed cleavages expected_normal = [ "ATSIPAR", @@ -631,12 +634,12 @@ def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) assert pdb.db_peptides.index.to_list() == expected -def test_digest_fasta_mods(tiny_fasta_file, residues_dict): +def test_digest_fasta_mods(tiny_fasta_file): # 1 modification allowed # fixed: C+57.02146 # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 @@ -709,12 +712,14 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) + + expected_1mod.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_1mod -def test_length_restrictions(tiny_fasta_file, residues_dict): +def 
test_length_restrictions(tiny_fasta_file): # length between 20 and 50 expected_long = [ "MEAPAQLLFLLLLWLPDTTR", @@ -740,7 +745,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) assert pdb.db_peptides.index.to_list() == expected_long @@ -759,12 +764,12 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) assert pdb.db_peptides.index.to_list() == expected_short -def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): +def test_digest_fasta_enzyme(tiny_fasta_file): # arg-c enzyme expected_argc = [ "ATSIPAR", @@ -924,8 +929,9 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) + expected_argc.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_argc pdb = db_utils.ProteinDatabase( @@ -943,8 +949,9 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) + expected_aspn.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_aspn # Test regex rule instead of named enzyme @@ -963,8 +970,9 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) + expected_argc.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_argc # Test semispecific digest @@ -983,8 +991,9 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) + expected_semispecific.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_semispecific # Test nonspecific digest @@ -1003,12 +1012,13 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) + expected_nonspecific.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_nonspecific -def test_get_candidates(tiny_fasta_file, residues_dict): +def test_get_candidates(tiny_fasta_file): # precursor_window is 10000 expected_smallwindow = ["LLIYGASTR"] @@ -1033,7 +1043,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert 
expected_smallwindow == list(candidates) @@ -1053,7 +1063,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_midwindow == list(candidates) @@ -1073,13 +1083,13 @@ def test_get_candidates(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == list(candidates) -def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): +def test_get_candidates_isotope_error(tiny_fasta_file): # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] # 1: [979.491114, 999.278813] @@ -1140,7 +1150,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) pdb.db_peptides = peptide_list candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -1161,7 +1171,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) pdb.db_peptides = peptide_list candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -1182,7 +1192,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) pdb.db_peptides = peptide_list candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -1203,7 +1213,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): "M:M+15.995,N:N+0.984,Q:Q+0.984," "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), - residues=residues_dict, + tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) pdb.db_peptides = peptide_list candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) From 8ebb55ab186d6995b4faa01348dc7fa2e1c9302e Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 25 Nov 2024 17:48:05 -0800 Subject: [PATCH 39/51] no batch made edge case --- casanovo/denovo/dataloaders.py | 86 +--------------------------------- casanovo/denovo/model.py | 47 +++++-------------- 2 files changed, 14 insertions(+), 119 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index c9277565..a2cce5b3 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -258,7 +258,6 @@ def _make_loader( dataset: torch.utils.data.Dataset, batch_size: int, shuffle: bool = False, - collate_fn: Optional[Callable] = None, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -271,8 +270,6 @@ def _make_loader( The batch size to use. shuffle : bool Option to shuffle the batches. - collate_fn : Optional[callable] - A function to collate the data into a batch. 
Returns ------- @@ -285,7 +282,6 @@ def _make_loader( pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, - collate_fn=collate_fn, ) def train_dataloader(self) -> torch.utils.data.DataLoader: @@ -308,13 +304,7 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: def db_dataloader(self) -> torch.utils.data.DataLoader: """Get a special dataloader for DB search.""" - return self._make_loader( - self.test_dataset, - self.eval_batch_size, - # collate_fn=functools.partial( - # prepare_psm_batch, protein_database=self.protein_database - # ), - ) + return self._make_loader(self.test_dataset, self.eval_batch_size) def scale_to_unit_norm(spectrum): @@ -326,77 +316,3 @@ def scale_to_unit_norm(spectrum): spectrum.intensity ) return spectrum - - -def prepare_psm_batch( - batch: List[Tuple[torch.Tensor, float, int, str]], - protein_database: db_utils.ProteinDatabase, -) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray]: - """ - Collate MS/MS spectra into a batch for DB search. - - The MS/MS spectra will be padded so that they fit nicely as a - tensor. However, the padded elements are ignored during the - subsequent steps. - - Parameters - ---------- - batch : List[Tuple[torch.Tensor, float, int, str]] - A batch of data from an AnnotatedSpectrumDataset, consisting of - for each spectrum (i) a tensor with the m/z and intensity peak - values, (ii), the precursor m/z, (iii) the precursor charge, - (iv) the spectrum identifier. - protein_database : db_utils.ProteinDatabase - The protein database to use for candidate peptide retrieval. - - Returns - ------- - batch_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak - values for each spectrum. - batch_precursors : torch.Tensor of shape (batch_size, 3) - A tensor with the precursor neutral mass, precursor charge, and - precursor m/z. - batch_spectrum_ids : np.ndarray - The spectrum identifiers. - batch_peptides : np.ndarray - The candidate peptides for each spectrum. - """ - return batch - # spectra, precursors, spectrum_ids = prepare_batch(batch) - - batch_spectra = [] - batch_precursors = [] - batch_spectrum_ids = [] - batch_peptides = [] - # FIXME: This can be optimized by using a sliding window instead of - # retrieving candidates for each spectrum independently. 
- - for i in range(len(batch)): - candidate_pep = protein_database.get_candidates( - batch["precursor_mz"][i], batch["precursor_charge"][i] - ) - if len(candidate_pep) == 0: - logger.debug( - "No candidate peptides found for spectrum %s with precursor " - "charge %d and precursor m/z %f", - f"{batch['peak_file'][i]}:{batch['scan_id']}", - precursors[i][1], - precursors[i][2], - ) - else: - batch_spectra.append( - spectra[i].unsqueeze(0).repeat(len(candidate_pep), 1, 1) - ) - batch_precursors.append( - precursors[i].unsqueeze(0).repeat(len(candidate_pep), 1) - ) - batch_spectrum_ids.extend([spectrum_ids[i]] * len(candidate_pep)) - batch_peptides.extend(candidate_pep) - - return ( - torch.cat(batch_spectra, dim=0), - torch.cat(batch_precursors, dim=0), - np.asarray(batch_spectrum_ids), - np.asarray(batch_peptides), - ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 3898f95d..e7cf9545 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1166,45 +1166,20 @@ def predict_step( peptide_score, aa_scores, file_name, - ) in list(): - spectrum_id = (file_name, scan) - predictions_all[spectrum_i].append( - psm.PepSpecMatch( - sequence=peptide, - spectrum_id=spectrum_i, - peptide_score=peptide_score, - charge=int(charge), - calc_mz=self.peptide_mass_calculator.mass( - peptide, charge - ), - exp_mz=precursor_mz, - aa_scores=aa_scores, - protein=self.protein_database.get_associated_protein( - peptide - ), - ) - ) - - for ( - charge, - precursor_mz, - spectrum_i, - peptide_score, - aa_scores, - peptide, ) in zip( - psm_batch[1][:, 1].cpu().detach().numpy(), - psm_batch[1][:, 2].cpu().detach().numpy(), - psm_batch[2], + psm_batch["scan"], + psm_batch["precursor_charge"], + psm_batch["precursor_mz"], + self.tokenizer.detokenize(psm_batch["seq"]), batch_peptide_scores, batch_aa_scores, - psm_batch[3], + psm_batch["peak_file"], ): - spectrum_i = tuple(spectrum_i) - predictions_all[spectrum_i].append( + spectrum_id = (file_name[0], scan[0]) + predictions_all[spectrum_id].append( psm.PepSpecMatch( sequence=peptide, - spectrum_id=spectrum_i, + spectrum_id=spectrum_id, peptide_score=peptide_score, charge=int(charge), calc_mz=self.peptide_mass_calculator.mass( @@ -1217,6 +1192,7 @@ def predict_step( ), ) ) + # Filter the top-scoring prediction(s) for each spectrum. 
predictions = list( itertools.chain.from_iterable( @@ -1276,7 +1252,10 @@ def _psm_batches( candidate_peps = candidate_peps[peps_to_add:] - if not self._pep_batch_ready(candidate_peps): + if ( + not self._pep_batch_ready(candidate_peps) + and num_candidate_psms > 0 + ): yield self._finalize_psm_batch(psm_batch) def _pep_batch_ready(self, num_candidate_psms: int) -> bool: From a6a2db896a9c1ff0bf6468e66830645e372d22b6 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 26 Nov 2024 13:12:28 -0800 Subject: [PATCH 40/51] mass caclulation --- casanovo/data/db_utils.py | 10 +++++++--- tests/unit_tests/test_unit.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 028bb7cb..ced4f662 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -7,6 +7,7 @@ import string from typing import Dict, Iterator, Pattern, Set, Tuple +import depthcharge.constants import depthcharge.tokenizers import numpy as np import pandas as pd @@ -179,9 +180,12 @@ def _calc_pep_mass(self, pep: str) -> float: float The neutral mass of the peptide """ - return self.tokenizer.calculate_precursor_ions( - self.tokenizer.tokenize(pep), torch.tensor([1]) - ).item() + return ( + self.tokenizer.masses[self.tokenizer.tokenize(pep)] + .sum(dim=1) + .item() + + depthcharge.constants.H2O + ) def get_candidates( self, diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index f6eabd87..0033928a 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -715,7 +715,7 @@ def test_digest_fasta_mods(tiny_fasta_file): tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - expected_1mod.sort(key=pdb._calc_pep_mass) + pdb.db_peptides.to_csv("foo.csv") assert pdb.db_peptides.index.to_list() == expected_1mod From d3cd392c9512db2682f31df4caf596acc32eee1e Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 27 Nov 2024 14:32:08 -0800 Subject: [PATCH 41/51] CasanovoDB mass mod fixes --- casanovo/config.py | 6 +++ casanovo/data/db_utils.py | 1 + casanovo/denovo/model.py | 92 ++++++++++++++++++++++++++++++----- tests/conftest.py | 8 +-- tests/test_integration.py | 6 +-- tests/unit_tests/test_unit.py | 33 ++++++------- 6 files changed, 109 insertions(+), 37 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index 7e19b9cf..76c0ec5d 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -55,6 +55,12 @@ class Config: max_charge=int, precursor_mass_tol=float, isotope_error_range=lambda min_max: (int(min_max[0]), int(min_max[1])), + enzyme=str, + digestion=str, + missed_cleavages=int, + max_mods=int, + allowed_fixed_mods=str, + allowed_var_mods=str, min_peptide_len=int, dim_model=int, n_head=int, diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index ced4f662..a3edc75b 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -100,6 +100,7 @@ def __init__( ) self.tokenizer = tokenizer self.db_peptides = self._digest_fasta(peptide_generator) + self.db_peptides.to_csv("data/db_upgrade_new_mods.csv") self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index e7cf9545..7f69c92d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1123,7 +1123,7 @@ def __init__(self, *args, **kwargs): def predict_step( self, - batch: Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray], + batch: Dict[str, torch.Tensor | List], *args, ) -> 
List[ms_io.PepSpecMatch]: """ @@ -1131,9 +1131,9 @@ def predict_step( Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray] - A batch of (i) MS/MS spectra, (ii) precursor information, - (iii) spectrum identifiers, (iv) candidate peptides. + batch : Dict[str, torch.Tensor | List] + A batch of MS/MS spectra, as generated by a depthcharge + dataloader. Returns ------- @@ -1167,7 +1167,7 @@ def predict_step( aa_scores, file_name, ) in zip( - psm_batch["scan"], + psm_batch["scan_id"], psm_batch["precursor_charge"], psm_batch["precursor_mz"], self.tokenizer.detokenize(psm_batch["seq"]), @@ -1182,10 +1182,10 @@ def predict_step( spectrum_id=spectrum_id, peptide_score=peptide_score, charge=int(charge), - calc_mz=self.peptide_mass_calculator.mass( + calc_mz=self.tokenizer.calculate_precursor_ions( peptide, charge - ), - exp_mz=precursor_mz, + ).item(), + exp_mz=precursor_mz.item(), aa_scores=aa_scores, protein=self.protein_database.get_associated_protein( peptide @@ -1210,9 +1210,37 @@ def predict_step( ) return predictions + def on_predict_batch_end( + self, outputs: List[psm.PepSpecMatch], *args + ) -> None: + """ + Write top scoring batches to the outwriter + + Parameters + ---------- + outputs : List[psm.PepSpecMatch] + List of peptide-spectrum matches predicted in the batch. + *args : tuple + Additional arguments. + """ + self.out_writer.psms.extend(outputs) + def _psm_batches( self, batch: Dict[str, torch.Tensor | List] ) -> Generator[Dict[str, Union[torch.Tensor, list]], None, None]: + """ + Generates batches of candidate database PSMs. + + Parameters + ---------- + batch : Dict[str, torch.Tensor | List] + One predict batch, from a depthcharge dataloader + + Yields + ------ + psm_batch : Dict[str, torch.Tensor | List] + A batch of candidate database PSMs ready for scoring. + """ num_candidate_psms = 0 psm_batch = self._initialize_psm_batch(batch) @@ -1221,7 +1249,7 @@ def _psm_batches( ): candidate_peps = self.protein_database.get_candidates( precursor_mz.item(), precursor_charge.item() - ) + ).to_list() if len(candidate_peps) == 0: logger.debug( @@ -1246,24 +1274,51 @@ def _psm_batches( psm_batch["seq"] += candidate_peps[:peps_to_add] num_candidate_psms += peps_to_add - if self._pep_batch_ready(candidate_peps): + if self._pep_batch_ready(num_candidate_psms): yield self._finalize_psm_batch(psm_batch) psm_batch = self._initialize_psm_batch(batch) candidate_peps = candidate_peps[peps_to_add:] if ( - not self._pep_batch_ready(candidate_peps) + not self._pep_batch_ready(num_candidate_psms) and num_candidate_psms > 0 ): yield self._finalize_psm_batch(psm_batch) def _pep_batch_ready(self, num_candidate_psms: int) -> bool: + """ + Checks if a batch of candidate PSMs is ready for processing. + + Parameters + ---------- + num_candidate_psms : int + Number of candidate PSMs processed so far. + + Returns + ------- + bool + True if the batch is ready, False otherwise. + """ return ( num_candidate_psms % self.psm_batch_size ) == self.psm_batch_size - 1 def _initialize_psm_batch(self, batch: Dict[str, Any]) -> Dict[str, List]: + """ + Initializes a new candidate PSM batch. + + Parameters + ---------- + batch : Dict[str, Any] + Input batch data to base the initialization on, usually from a + depthcharge dataloader. + + Returns + ------- + psm_batch : Dict[str, List] + A dictionary representing the initialized PSM batch. 
+ """ psm_batch = {key: list() for key in batch.keys()} psm_batch["seq"] = list() return psm_batch @@ -1271,9 +1326,22 @@ def _initialize_psm_batch(self, batch: Dict[str, Any]) -> Dict[str, List]: def _finalize_psm_batch( self, psm_batch: Dict[str, List[Any]] ) -> Dict[str, torch.Tensor | List[Any]]: + """ + Prepare a candidate PSM batch for scoring by the Casanovo model. + + Parameters + ---------- + psm_batch : Dict[str, List[Any]] + The current PSM batch to finalize. + + Returns + ------- + finalized_batch : Dict[str, torch.Tensor | List[Any]] + A finalized PSM batch ready for scoring. + """ for key in psm_batch.keys(): if isinstance(psm_batch[key][0], torch.Tensor): - psm_batch[key] = torch.cat(psm_batch[key]) + psm_batch[key] = torch.stack(psm_batch[key]) psm_batch["seq"] = self.tokenizer.tokenize(psm_batch["seq"]) return psm_batch diff --git a/tests/conftest.py b/tests/conftest.py index 2091bde8..67c947c1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -305,7 +305,7 @@ def tiny_config(tmp_path): "gradient_clip_val": None, "gradient_clip_algorithm": None, "precision": "32-true", - "replace_isoleucine_with_leucine": True, + "replace_isoleucine_with_leucine": False, "reverse_peptides": False, "mskb_tokenizer": True, "residues": { @@ -339,10 +339,10 @@ def tiny_config(tmp_path): "[Ammonia-loss]-": -17.026549, # NH3 loss "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss }, - "allowed_fixed_mods": "C:C+57.021", + "allowed_fixed_mods": "C:C[Carbamidomethyl]", "allowed_var_mods": ( - "M:M+15.995,N:N+0.984,Q:Q+0.984," - "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + "M:M[Oxidation],N:N[Deamidated],Q:Q[Deamidated]," + "nterm:[Acetyl]-,nterm:[Carbamyl]-,nterm:[Ammonia-loss]-,nterm:[+25.980265]-" ), } diff --git a/tests/test_integration.py b/tests/test_integration.py index 3c15e677..9eb7e092 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -49,7 +49,6 @@ def test_train_and_run( assert model_file.exists() assert best_model.exists() - """" # Try predicting: output_rootname = "test" output_filename = (tmp_path / output_rootname).with_suffix(".mztab") @@ -149,7 +148,6 @@ def test_train_and_run( ) assert output_filename.is_file() - """ monkeypatch.setattr(casanovo, "__version__", "4.1.0") output_rootname = "db" @@ -179,12 +177,12 @@ def test_train_and_run( psms = mztab.spectrum_match_table assert list(psms.sequence) == [ "ATSIPAR", - "VTLSC+57.021R", + "VTLSC[Carbamidomethyl]R", "LLIYGASTR", "EIVMTQSPPTLSLSPGER", "MEAPAQLLFLLLLWLPDTTR", "ASQSVSSSYLTWYQQKPGQAPR", - "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC[Carbamidomethyl]QQDYNLP", ] # Validate mztab output diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 0033928a..05fe5a11 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -668,21 +668,21 @@ def test_digest_fasta_mods(tiny_fasta_file): "+42.011EIVMTQSPPTLSLSPGER", "+43.006EIVMTQSPPTLSLSPGER", "-17.027MEAPAQLLFLLLLWLPDTTR", - "-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # + "-17.027M+15.995EAPAQLLFLLLLWLPDTTR", "MEAPAQLLFLLLLWLPDTTR", "MEAPAQ+0.984LLFLLLLWLPDTTR", "M+15.995EAPAQLLFLLLLWLPDTTR", "+43.006-17.027MEAPAQLLFLLLLWLPDTTR", - "+43.006-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # + "+43.006-17.027M+15.995EAPAQLLFLLLLWLPDTTR", "+42.011MEAPAQLLFLLLLWLPDTTR", "+43.006MEAPAQLLFLLLLWLPDTTR", - "+42.011M+15.995EAPAQLLFLLLLWLPDTTR", # - "+43.006M+15.995EAPAQLLFLLLLWLPDTTR", # + "+42.011M+15.995EAPAQLLFLLLLWLPDTTR", + "+43.006M+15.995EAPAQLLFLLLLWLPDTTR", 
"-17.027ASQSVSSSYLTWYQQKPGQAPR", "ASQSVSSSYLTWYQQKPGQAPR", - "ASQ+0.984SVSSSYLTWYQQKPGQAPR", "ASQSVSSSYLTWYQ+0.984QKPGQAPR", "ASQSVSSSYLTWYQQ+0.984KPGQAPR", + "ASQ+0.984SVSSSYLTWYQQKPGQAPR", "ASQSVSSSYLTWYQQKPGQ+0.984APR", "+43.006-17.027ASQSVSSSYLTWYQQKPGQAPR", "+42.011ASQSVSSSYLTWYQQKPGQAPR", @@ -690,9 +690,9 @@ def test_digest_fasta_mods(tiny_fasta_file): "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", - "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", @@ -714,8 +714,6 @@ def test_digest_fasta_mods(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - - pdb.db_peptides.to_csv("foo.csv") assert pdb.db_peptides.index.to_list() == expected_1mod @@ -838,8 +836,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file): "QSPPTL", "SPGERV", "ISSLQP", - "RATSIP", "TSIPAR", + "RATSIP", "MEAPAQ", "RASQSV", "TISSLQ", @@ -872,8 +870,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file): "AQLLFL", "QPEDFA", "TLSC+57.021RA", - "C+57.021RASQS", "SC+57.021RASQ", + "C+57.021RASQS", "DFTLTI", "PDTTRE", "TTREIV", @@ -890,8 +888,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file): "LWLPDT", "QLLFLL", "LQPEDF", - "REIVMT", "TREIVM", + "REIVMT", "QDYNLP", "LLLWLP", "SSYLTW", @@ -910,8 +908,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file): "TWYQQK", "VYYC+57.021QQ", "YLTWYQ", - "YC+57.021QQDY", "YYC+57.021QQD", + "YC+57.021QQDY", ] pdb = db_utils.ProteinDatabase( @@ -931,7 +929,6 @@ def test_digest_fasta_enzyme(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - expected_argc.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_argc pdb = db_utils.ProteinDatabase( @@ -951,7 +948,6 @@ def test_digest_fasta_enzyme(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - expected_aspn.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_aspn # Test regex rule instead of named enzyme @@ -972,7 +968,6 @@ def test_digest_fasta_enzyme(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - expected_argc.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_argc # Test semispecific digest @@ -993,7 +988,6 @@ def test_digest_fasta_enzyme(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - expected_semispecific.sort(key=pdb._calc_pep_mass) assert pdb.db_peptides.index.to_list() == expected_semispecific # Test nonspecific digest @@ -1014,7 +1008,12 @@ def test_digest_fasta_enzyme(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - expected_nonspecific.sort(key=pdb._calc_pep_mass) + peptide_list = pdb.db_peptides.index.to_list() + + first = peptide_list[:50] + second = peptide_list[50:100] + third = peptide_list[100:] + assert pdb.db_peptides.index.to_list() == expected_nonspecific From 113c8797ca5996eb2b22987b99de3af8ff47a704 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 27 Nov 2024 14:55:58 -0800 Subject: [PATCH 42/51] remove unsqueeze batch method --- casanovo/data/db_utils.py | 1 - 
casanovo/denovo/dataloaders.py | 5 +---- casanovo/denovo/model.py | 24 ++++++------------------ casanovo/denovo/model_runner.py | 1 - tests/conftest.py | 1 - tests/test_integration.py | 12 ++++++------ 6 files changed, 13 insertions(+), 31 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index a3edc75b..e6c039cb 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -13,7 +13,6 @@ import pandas as pd import pyteomics.fasta import pyteomics.parser -import torch logger = logging.getLogger("casanovo") diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index a2cce5b3..c22e7887 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -1,11 +1,10 @@ """Data loaders for the de novo sequencing task.""" -import functools import logging import os import tempfile from pathlib import Path -from typing import Callable, Iterable, List, Optional, Tuple +from typing import Iterable, Optional import lightning.pytorch as pl import numpy as np @@ -21,8 +20,6 @@ from torch.utils.data import DataLoader from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe -from ..data import db_utils - logger = logging.getLogger("casanovo") diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 7f69c92d..72574418 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -730,23 +730,6 @@ def _get_top_peptide( else: yield [] - def _unsqueeze_batch(self, batch: Dict[str, Any]) -> None: - """ - Unsqueeze the first dimension of each tensor in the batch. - - - Parameters - ---------- - batch : Dict[str, Any] - A dictionary where each key corresponds to a component of the batch, - and the values are tensors or other data structures. - """ - for k in batch.keys(): - try: - batch[k] = batch[k].squeeze(0) - except: - continue - def _process_batch(self, batch): """Prepare batch returned from AnnotatedSpectrumDataset of the latest depthcharge version @@ -768,7 +751,12 @@ def _process_batch(self, batch): sequences (during training). """ - self._unsqueeze_batch(batch) + for k in batch.keys(): + try: + batch[k] = batch[k].squeeze(0) + except: + continue + precursor_mzs = batch["precursor_mz"] precursor_charges = batch["precursor_charge"] precursor_masses = (precursor_mzs - 1.007276) * precursor_charges diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index facd12d0..10e15cdf 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -6,7 +6,6 @@ import os import tempfile import warnings -from datetime import datetime from pathlib import Path from typing import Iterable, List, Optional, Union diff --git a/tests/conftest.py b/tests/conftest.py index 67c947c1..0ced6ecf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ """Fixtures used for testing.""" -import depthcharge import numpy as np import psims import pytest diff --git a/tests/test_integration.py b/tests/test_integration.py index 9eb7e092..14f59bb3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -79,13 +79,13 @@ def test_train_and_run( # Verify that the spectrum predictions are correct # and indexed according to the peak input file type. 
psms = mztab.spectrum_match_table - assert psms.loc[1, "sequence"] == "LESLLEK" + assert psms.loc[1, "sequence"] == "LESLIEK" assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" - assert psms.loc[2, "sequence"] == "PEPTLDEK" + assert psms.loc[2, "sequence"] == "PEPTIDEK" assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" - assert psms.loc[3, "sequence"] == "LESLLEK" + assert psms.loc[3, "sequence"] == "LESLIEK" assert psms.loc[3, "spectra_ref"] == "ms_run[2]:scan=17" - assert psms.loc[4, "sequence"] == "PEPTLDEK" + assert psms.loc[4, "sequence"] == "PEPTIDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" # Finally, try evaluating: @@ -118,9 +118,9 @@ def test_train_and_run( # Verify that the spectrum predictions are correct # and indexed according to the peak input file type. psms = mztab.spectrum_match_table - assert psms.loc[1, "sequence"] == "LESLLEK" + assert psms.loc[1, "sequence"] == "LESLIEK" assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" - assert psms.loc[2, "sequence"] == "PEPTLDEK" + assert psms.loc[2, "sequence"] == "PEPTIDEK" assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" # Validate mztab output From 54366a50a8abc9bbb02138dbe3478dc81390c32d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 27 Nov 2024 16:04:26 -0800 Subject: [PATCH 43/51] reduced test epochs from 20 to 15 --- casanovo/denovo/model.py | 1 - tests/conftest.py | 2 +- tests/test_integration.py | 2 +- tests/unit_tests/test_runner.py | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 72574418..69730ed2 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -321,7 +321,6 @@ def beam_search_decode( tokens, scores = self._get_topk_beams( tokens, scores, finished_beams, batch, step + 1 ) - tokens = tokens # Return the peptide with the highest confidence score, within the # precursor m/z tolerance if possible. 
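
Editor's illustration (not part of these patches): the integration-test expectations in the surrounding commits alternate between "LESLIEK"/"PEPTIDEK" and "LESLLEK"/"PEPTLDEK" because the tiny test config toggles replace_isoleucine_with_leucine. Since I and L share the same residue mass (113.084064), the replacement only changes the reported sequence string, presumably along the lines of:

    # Assumed effect of the replace_isoleucine_with_leucine option (sketch only):
    peptide = "LESLIEK"
    replace_isoleucine_with_leucine = True
    if replace_isoleucine_with_leucine:
        peptide = peptide.replace("I", "L")  # reported as "LESLLEK"
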
diff --git a/tests/conftest.py b/tests/conftest.py index 0ced6ecf..0cbfcc06 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -263,7 +263,7 @@ def tiny_config(tmp_path): "train_label_smoothing": 0.01, "warmup_iters": 1, "cosine_schedule_period_iters": 1, - "max_epochs": 20, + "max_epochs": 15, "val_check_interval": 1, "accelerator": "cpu", "precursor_mass_tol": 5, diff --git a/tests/test_integration.py b/tests/test_integration.py index 14f59bb3..6e46f2a3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -40,7 +40,7 @@ def test_train_and_run( ] result = run(train_args) - model_file = tmp_path / "train.epoch=19-step=20.ckpt" + model_file = tmp_path / "train.epoch=14-step=15.ckpt" best_model = tmp_path / "train.best.ckpt" assert result.exit_code == 0 assert model_file.exists() diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 10a8d4ef..958f1984 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -207,7 +207,7 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): # Test checkpoint saving when val_check_interval is greater than training steps config = Config(tiny_config) config.val_check_interval = 50 - model_file = tmp_path / "epoch=19-step=20.ckpt" + model_file = tmp_path / "epoch=14-step=15.ckpt" with ModelRunner(config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) @@ -224,7 +224,7 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): # Test checkpoint saving when val_check_interval is not a factor of training steps config.val_check_interval = 15 validation_file = tmp_path / "foobar.best.ckpt" - model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" + model_file = tmp_path / "foobar.epoch=14-step=15.ckpt" with ModelRunner( config, output_dir=tmp_path, output_rootname="foobar" ) as runner: From 3028cd20b29fabc69987a3132dfa90e0e0f4a280 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 2 Dec 2024 10:43:17 -0800 Subject: [PATCH 44/51] integration test fix --- tests/conftest.py | 4 ++-- tests/test_integration.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0cbfcc06..e23e9d39 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -263,7 +263,7 @@ def tiny_config(tmp_path): "train_label_smoothing": 0.01, "warmup_iters": 1, "cosine_schedule_period_iters": 1, - "max_epochs": 15, + "max_epochs": 20, "val_check_interval": 1, "accelerator": "cpu", "precursor_mass_tol": 5, @@ -304,7 +304,7 @@ def tiny_config(tmp_path): "gradient_clip_val": None, "gradient_clip_algorithm": None, "precision": "32-true", - "replace_isoleucine_with_leucine": False, + "replace_isoleucine_with_leucine": True, "reverse_peptides": False, "mskb_tokenizer": True, "residues": { diff --git a/tests/test_integration.py b/tests/test_integration.py index 6e46f2a3..9eb7e092 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -40,7 +40,7 @@ def test_train_and_run( ] result = run(train_args) - model_file = tmp_path / "train.epoch=14-step=15.ckpt" + model_file = tmp_path / "train.epoch=19-step=20.ckpt" best_model = tmp_path / "train.best.ckpt" assert result.exit_code == 0 assert model_file.exists() @@ -79,13 +79,13 @@ def test_train_and_run( # Verify that the spectrum predictions are correct # and indexed according to the peak input file type. 
psms = mztab.spectrum_match_table - assert psms.loc[1, "sequence"] == "LESLIEK" + assert psms.loc[1, "sequence"] == "LESLLEK" assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" - assert psms.loc[2, "sequence"] == "PEPTIDEK" + assert psms.loc[2, "sequence"] == "PEPTLDEK" assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" - assert psms.loc[3, "sequence"] == "LESLIEK" + assert psms.loc[3, "sequence"] == "LESLLEK" assert psms.loc[3, "spectra_ref"] == "ms_run[2]:scan=17" - assert psms.loc[4, "sequence"] == "PEPTIDEK" + assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" # Finally, try evaluating: @@ -118,9 +118,9 @@ def test_train_and_run( # Verify that the spectrum predictions are correct # and indexed according to the peak input file type. psms = mztab.spectrum_match_table - assert psms.loc[1, "sequence"] == "LESLIEK" + assert psms.loc[1, "sequence"] == "LESLLEK" assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" - assert psms.loc[2, "sequence"] == "PEPTIDEK" + assert psms.loc[2, "sequence"] == "PEPTLDEK" assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" # Validate mztab output From ec20013dc51496b972f3c0d0edbac0209cc89d30 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 2 Dec 2024 13:15:15 -0800 Subject: [PATCH 45/51] integration test fix --- casanovo/denovo/model_runner.py | 5 ++++- tests/conftest.py | 26 ++++++++++++++++++++++---- tests/test_integration.py | 4 +++- tests/unit_tests/test_runner.py | 4 ++-- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 10e15cdf..c8fc7125 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -494,7 +494,9 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: self.model = Model.load_from_checkpoint( self.model_filename, map_location=device, **loaded_model_params ) - + # Use tokenizer initialized from config file instead of loaded + # from checkpoint file + self.model.tokenizer = tokenizer architecture_params = set(model_params.keys()) - set( loaded_model_params.keys() ) @@ -515,6 +517,7 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None: map_location=device, **model_params, ) + self.model.tokenizer = tokenizer except RuntimeError: raise RuntimeError( "Weights file incompatible with the current version of " diff --git a/tests/conftest.py b/tests/conftest.py index e23e9d39..4cc02aed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -253,9 +253,8 @@ def _create_mzml(peptides, mzml_file, random_state=42): return mzml_file -@pytest.fixture -def tiny_config(tmp_path): - """A config file for a tiny model.""" +def get_config_file(file_path, file_name, additional_cfg=None): + """Get Casanovo config yaml file""" cfg = { "n_head": 2, "dim_feedforward": 10, @@ -345,8 +344,27 @@ def tiny_config(tmp_path): ), } - cfg_file = tmp_path / "config.yml" + if additional_cfg is not None: + cfg.update(additional_cfg) + + cfg_file = file_path / file_name with cfg_file.open("w+") as out_file: yaml.dump(cfg, out_file) return cfg_file + + +@pytest.fixture +def tiny_config(tmp_path): + """A config file for a tiny model.""" + return get_config_file(tmp_path, "config.yml") + + +@pytest.fixture +def tiny_config_db(tmp_path): + """A config file for a db search.""" + return get_config_file( + tmp_path, + "config_db.yml", + additional_cfg={"replace_isoleucine_with_leucine": False}, + ) diff --git a/tests/test_integration.py 
b/tests/test_integration.py index 9eb7e092..b5adfa96 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -6,6 +6,7 @@ from click.testing import CliRunner from casanovo import casanovo +from casanovo.config import Config TEST_DIR = Path(__file__).resolve().parent @@ -14,6 +15,7 @@ def test_train_and_run( mgf_small, mzml_small, tiny_config, + tiny_config_db, tmp_path, monkeypatch, mgf_medium, @@ -158,7 +160,7 @@ def test_train_and_run( "--model", str(model_file), "--config", - tiny_config, + tiny_config_db, "--output_dir", str(tmp_path), "--output_root", diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 958f1984..10a8d4ef 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -207,7 +207,7 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): # Test checkpoint saving when val_check_interval is greater than training steps config = Config(tiny_config) config.val_check_interval = 50 - model_file = tmp_path / "epoch=14-step=15.ckpt" + model_file = tmp_path / "epoch=19-step=20.ckpt" with ModelRunner(config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) @@ -224,7 +224,7 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): # Test checkpoint saving when val_check_interval is not a factor of training steps config.val_check_interval = 15 validation_file = tmp_path / "foobar.best.ckpt" - model_file = tmp_path / "foobar.epoch=14-step=15.ckpt" + model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" with ModelRunner( config, output_dir=tmp_path, output_rootname="foobar" ) as runner: From 22338392d7b75e278be7442faf597826bbe4b57e Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 2 Dec 2024 14:51:14 -0800 Subject: [PATCH 46/51] psm batch generator unit test --- casanovo/config.yaml | 4 +- casanovo/denovo/model.py | 4 +- tests/test_integration.py | 1 - tests/unit_tests/test_unit.py | 110 ++++++++++++++++++++++++++++++++-- 4 files changed, 108 insertions(+), 11 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index ffb9bf45..74d6b782 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -63,8 +63,8 @@ max_mods: 1 # where aa is a standard amino acid (or "nterm" for an N-terminal mod) # and mod_residue is a key from the "residues" dictionary. # Example: "M:M+15.995,nterm:+43.006" -allowed_fixed_mods: "C:C+57.021" -allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" +allowed_fixed_mods: "C:C[Carbamidomethyl]" +allowed_var_mods: "M:M[Oxidation],N:N[Deamidated],Q:Q[Deamidated],nterm:[Acetyl]-,nterm:[Carbamyl]-,nterm:[Ammonia-loss]-,nterm:[+25.980265]-" ### diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 69730ed2..53c6a9a0 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1288,8 +1288,8 @@ def _pep_batch_ready(self, num_candidate_psms: int) -> bool: True if the batch is ready, False otherwise. 
""" return ( - num_candidate_psms % self.psm_batch_size - ) == self.psm_batch_size - 1 + num_candidate_psms % self.psm_batch_size == 0 + ) and num_candidate_psms != 0 def _initialize_psm_batch(self, batch: Dict[str, Any]) -> Dict[str, List]: """ diff --git a/tests/test_integration.py b/tests/test_integration.py index b5adfa96..948cff63 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -6,7 +6,6 @@ from click.testing import CliRunner from casanovo import casanovo -from casanovo.config import Config TEST_DIR = Path(__file__).resolve().parent diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 05fe5a11..d5458d84 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -5,6 +5,7 @@ import hashlib import heapq import io +import math import os import pathlib import platform @@ -26,10 +27,16 @@ import torch from casanovo import casanovo, utils +from casanovo.config import Config from casanovo.data import db_utils, ms_io from casanovo.denovo.dataloaders import DeNovoDataModule from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics -from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score +from casanovo.denovo.model import ( + DbSpec2Pep, + Spec2Pep, + _aa_pep_score, + _calc_match_score, +) def test_version(): @@ -1008,13 +1015,104 @@ def test_digest_fasta_enzyme(tiny_fasta_file): ), tokenizer=depthcharge.tokenizers.PeptideTokenizer.from_massivekb(), ) - peptide_list = pdb.db_peptides.index.to_list() + assert pdb.db_peptides.index.to_list() == expected_nonspecific - first = peptide_list[:50] - second = peptide_list[50:100] - third = peptide_list[100:] - assert pdb.db_peptides.index.to_list() == expected_nonspecific +def test_psm_batches(tiny_config): + peptides_one = [ + "SGSGSG", + "GSGSGT", + "SGSGTD", + "FSGSGS", + "ATSIPA", + "GASTRA", + "LSLSPG", + "ASQSVS", + "GSGTDF", + "SLSPGE", + "AQLLFL", + "QPEDFA", + ] + + peptides_two = [ + "SQSVSS", + "KPGQAP", + "SPPTLS", + "ASTRAT", + "RFSGSG", + "IYGAST", + "APAQLL", + "PTLSLS", + "TLSLSP", + "TLTISS", + "WYQQKP", + "TWYQQK", + ] + + def mock_get_candidates(precursor_mz, precorsor_charge): + if precorsor_charge == 1: + return pd.Series(peptides_one) + elif precorsor_charge == 2: + return pd.Series(peptides_two) + else: + return pd.Series() + + tokenizer = depthcharge.tokenizers.peptides.PeptideTokenizer( + residues=Config(tiny_config).residues + ) + db_model = DbSpec2Pep(tokenizer=tokenizer) + db_model.protein_database = unittest.mock.MagicMock() + db_model.protein_database.get_candidates = mock_get_candidates + + mock_batch = { + "precursor_mz": torch.Tensor([42.0, 84.0, 126.0]), + "precursor_charge": torch.Tensor([1, 2, 3]), + "peak_file": ["one.mgf", "two.mgf", "three.mgf"], + "scan_id": [1, 2, 3], + } + + expected_batch_all = { + "precursor_mz": torch.Tensor([42.0] * 12 + [84.0] * 12), + "precursor_charge": torch.Tensor([1] * 12 + [2] * 12), + "seq": tokenizer.tokenize(peptides_one + peptides_two), + "peak_file": ["one.mgf"] * 12 + ["two.mgf"] * 12, + "scan_id": [1] * 12 + [2] * 12, + } + + for psm_batch_size in [24, 12, 8, 10]: + db_model.psm_batch_size = psm_batch_size + psm_batches = list(db_model._psm_batches(mock_batch)) + assert len(psm_batches) == math.ceil(24 / psm_batch_size) + num_spectra = 0 + + for psm_batch in psm_batches: + end_idx = min( + num_spectra + psm_batch_size, + len(expected_batch_all["peak_file"]), + ) + assert torch.allclose( + psm_batch["precursor_mz"], + 
expected_batch_all["precursor_mz"][num_spectra:end_idx], + ) + assert torch.equal( + psm_batch["precursor_charge"], + expected_batch_all["precursor_charge"][num_spectra:end_idx], + ) + assert torch.equal( + psm_batch["seq"], + expected_batch_all["seq"][num_spectra:end_idx], + ) + assert ( + psm_batch["peak_file"] + == expected_batch_all["peak_file"][num_spectra:end_idx] + ) + assert ( + psm_batch["scan_id"] + == expected_batch_all["scan_id"][num_spectra:end_idx] + ) + num_spectra += len(psm_batch["peak_file"]) + + assert num_spectra == 24 def test_get_candidates(tiny_fasta_file): From c612785ab74b10edc9447c8e8cb67c6e6651cc85 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 2 Dec 2024 15:30:06 -0800 Subject: [PATCH 47/51] cleanup debug code --- casanovo/data/db_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index e6c039cb..6c5bc69a 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -99,7 +99,6 @@ def __init__( ) self.tokenizer = tokenizer self.db_peptides = self._digest_fasta(peptide_generator) - self.db_peptides.to_csv("data/db_upgrade_new_mods.csv") self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error From c43c5150df63de3654749b90ce7bb0065ed3a8c6 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 6 Dec 2024 11:58:02 -0800 Subject: [PATCH 48/51] disable multi threading on linux --- casanovo/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/casanovo/utils.py b/casanovo/utils.py index cdc6f2ea..aa0b1c64 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -39,7 +39,8 @@ def n_workers() -> int: The number of workers. """ # Windows or MacOS: no multiprocessing. - if platform.system() in ["Windows", "Darwin"]: + # FIXME: remove multi-threading issue workaround. + if platform.system() in ["Windows", "Darwin"] or True: logger.warning( "Dataloader multiprocessing is currently not supported on Windows " "or MacOS; using only a single thread." From 2123894ac0ed6e793944a219dc4e89ca6da3c860 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 6 Dec 2024 12:07:14 -0800 Subject: [PATCH 49/51] skip n_threads unit test --- casanovo/utils.py | 7 +++++-- tests/unit_tests/test_unit.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/casanovo/utils.py b/casanovo/utils.py index aa0b1c64..406e6874 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -38,9 +38,12 @@ def n_workers() -> int: int The number of workers. """ + # FIXME: remove multiprocessing Linux deadlock issue workaround when + # deadlock issue is resolved. + return 0 + # Windows or MacOS: no multiprocessing. - # FIXME: remove multi-threading issue workaround. - if platform.system() in ["Windows", "Darwin"] or True: + if platform.system() in ["Windows", "Darwin"]: logger.warning( "Dataloader multiprocessing is currently not supported on Windows " "or MacOS; using only a single thread." 
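
Editor's illustration (not part of this patch): as a stopgap for the Linux dataloader deadlock, n_workers() above now returns 0 before the platform check, so worker processes are disabled on every operating system. A minimal sketch of the effect, assuming casanovo.utils as patched here:

    from casanovo import utils

    assert utils.n_workers() == 0  # workaround: no dataloader worker processes
    # The data module's loaders are presumably built with this value, so all
    # batches are loaded in the main process until the deadlock is resolved.
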
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index d5458d84..2a701703 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -44,6 +44,7 @@ def test_version(): assert casanovo.__version__ is not None +@pytest.mark.skip(reason="Skipping due to Linux deadlock issue") def test_n_workers(monkeypatch): """Check that n_workers is correct without a GPU.""" monkeypatch.setattr("torch.cuda.is_available", lambda: False) From a49fc5cf648821daf24150a61324e9469689e5c0 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 18 Dec 2024 18:02:41 -0800 Subject: [PATCH 50/51] fixed double batching bug --- casanovo/denovo/dataloaders.py | 18 +++++++----------- casanovo/denovo/model.py | 6 +++--- casanovo/denovo/model_runner.py | 2 +- tests/unit_tests/test_runner.py | 4 +--- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index c22e7887..13a3b7a5 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -10,6 +10,7 @@ import numpy as np import pyarrow as pa import torch +import torch.utils.data._utils.collate from depthcharge.data import ( AnnotatedSpectrumDataset, CustomField, @@ -253,7 +254,6 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: def _make_loader( self, dataset: torch.utils.data.Dataset, - batch_size: int, shuffle: bool = False, ) -> torch.utils.data.DataLoader: """ @@ -263,8 +263,6 @@ def _make_loader( ---------- dataset : torch.utils.data.Dataset A PyTorch Dataset. - batch_size : int - The batch size to use. shuffle : bool Option to shuffle the batches. @@ -275,7 +273,7 @@ def _make_loader( """ return DataLoader( dataset, - batch_size=batch_size, + batch_size=None, pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, @@ -283,25 +281,23 @@ def _make_loader( def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" - return self._make_loader( - self.train_dataset, self.train_batch_size, shuffle=self.shuffle - ) + return self._make_loader(self.train_dataset, shuffle=self.shuffle) def val_dataloader(self) -> torch.utils.data.DataLoader: """Get the validation DataLoader.""" - return self._make_loader(self.valid_dataset, self.eval_batch_size) + return self._make_loader(self.valid_dataset) def test_dataloader(self) -> torch.utils.data.DataLoader: """Get the test DataLoader.""" - return self._make_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader(self.test_dataset) def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" - return self._make_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader(self.test_dataset) def db_dataloader(self) -> torch.utils.data.DataLoader: """Get a special dataloader for DB search.""" - return self._make_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader(self.test_dataset) def scale_to_unit_norm(spectrum): diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 53c6a9a0..5ac5b7ce 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -938,13 +938,13 @@ def predict_step( for peptide_score, aa_scores, peptide in spectrum_preds: predictions.append( ( - scan[0], + scan, precursor_charge, precursor_mz, peptide, peptide_score, aa_scores, - file_name[0], + file_name, ) ) @@ -1162,7 +1162,7 @@ def predict_step( batch_aa_scores, psm_batch["peak_file"], ): - spectrum_id = (file_name[0], scan[0]) + spectrum_id = (file_name, 
scan) predictions_all[spectrum_id].append( psm.PepSpecMatch( sequence=peptide, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index c8fc7125..07bccac7 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -221,7 +221,7 @@ def log_metrics(self, test_dataloader: DataLoader) -> None: for peak_file, scan_id, curr_seq_true in zip( batch["peak_file"], batch["scan_id"], - self.model.tokenizer.detokenize(batch["seq"][0]), + self.model.tokenizer.detokenize(batch["seq"]), ): spectrum_id_true = (peak_file, scan_id) seq_true.append(curr_seq_true) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 10a8d4ef..e9c9abd4 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -366,9 +366,7 @@ def get_mock_loader(psm_list, tokenizer): { "peak_file": [psm.spectrum_id[0] for psm in psm_list], "scan_id": [psm.spectrum_id[1] for psm in psm_list], - "seq": tokenizer.tokenize( - [psm.sequence for psm in psm_list] - ).unsqueeze(0), + "seq": tokenizer.tokenize([psm.sequence for psm in psm_list]), } ] From 759c02e6579892ae93b613e96fdabf4685b3eb7b Mon Sep 17 00:00:00 2001 From: Gwen Straub Date: Mon, 23 Dec 2024 16:35:42 -0800 Subject: [PATCH 51/51] use tokens to compare peptides --- casanovo/denovo/evaluate.py | 18 +++++++++--------- casanovo/denovo/model_runner.py | 16 ++++++++++++---- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index 6bc1ff2e..29c406db 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -8,9 +8,9 @@ def aa_match_prefix( - peptide1: List[str], - peptide2: List[str], - aa_dict: Dict[str, float], + peptide1: List[List[int]], + peptide2: List[List[int]], + aa_dict: Dict[int, float], cum_mass_threshold: float = 0.5, ind_mass_threshold: float = 0.1, ) -> Tuple[np.ndarray, bool]: @@ -64,9 +64,9 @@ def aa_match_prefix( def aa_match_prefix_suffix( - peptide1: List[str], - peptide2: List[str], - aa_dict: Dict[str, float], + peptide1: List[List[int]], + peptide2: List[List[int]], + aa_dict: Dict[int, float], cum_mass_threshold: float = 0.5, ind_mass_threshold: float = 0.1, ) -> Tuple[np.ndarray, bool]: @@ -127,9 +127,9 @@ def aa_match_prefix_suffix( def aa_match( - peptide1: List[str] | None, - peptide2: List[str] | None, - aa_dict: Dict[str, float], + peptide1: List[List[int]] | None, + peptide2: List[List[int]] | None, + aa_dict: Dict[int, float], cum_mass_threshold: float = 0.5, ind_mass_threshold: float = 0.1, mode: str = "best", diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 07bccac7..6ab50c89 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -221,25 +221,33 @@ def log_metrics(self, test_dataloader: DataLoader) -> None: for peak_file, scan_id, curr_seq_true in zip( batch["peak_file"], batch["scan_id"], - self.model.tokenizer.detokenize(batch["seq"]), + batch["seq"], ): spectrum_id_true = (peak_file, scan_id) - seq_true.append(curr_seq_true) + seq_true.append(curr_seq_true.tolist()) if ( pred_idx < len(self.writer.psms) and self.writer.psms[pred_idx].spectrum_id == spectrum_id_true ): - seq_pred.append(self.writer.psms[pred_idx].sequence) + next_pred_tokens = self.model.tokenizer.tokenize( + self.writer.psms[pred_idx].sequence + ).squeeze(0) + seq_pred.append(next_pred_tokens.tolist()) pred_idx += 1 else: seq_pred.append(None) + residue_dict = { + pep_idx: self.model.tokenizer.residues[pep_str] + for 
pep_str, pep_idx in self.model.tokenizer.index.items() + if pep_str in self.model.tokenizer.residues + } aa_precision, aa_recall, pep_precision = aa_match_metrics( *aa_match_batch( seq_true, seq_pred, - self.model.tokenizer.residues, + residue_dict, ) )
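
Editor's illustration (not part of this patch): with the final commit, aa_match_batch compares token indices rather than residue strings, using a dictionary that maps each tokenizer index to its residue mass. A minimal usage sketch, assuming the depthcharge PeptideTokenizer API referenced throughout this series:

    from depthcharge.tokenizers import PeptideTokenizer
    from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics

    tokenizer = PeptideTokenizer.from_massivekb()
    # Token index -> residue mass; special tokens without a mass are skipped.
    residue_dict = {
        idx: tokenizer.residues[res]
        for res, idx in tokenizer.index.items()
        if res in tokenizer.residues
    }
    seq_true = [tokens.tolist() for tokens in tokenizer.tokenize(["LESLIEK"])]
    seq_pred = [tokens.tolist() for tokens in tokenizer.tokenize(["LESLLEK"])]
    aa_precision, aa_recall, pep_precision = aa_match_metrics(
        *aa_match_batch(seq_true, seq_pred, residue_dict)
    )
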