Merge pull request #388 from MannLabs/development

release 1.9.0
MannLabs · Nov 26, 2024 · 9938206 · 9938206
2 parents 06439d2 + d05009a
commit 9938206
Show file tree

Hide file tree

Showing 64 changed files with 1,766 additions and 513 deletions.
diff --git a/alphadia/__init__.py b/alphadia/__init__.py
@@ -1,3 +1,3 @@
 #!python
 
-__version__ = "1.8.2"
+__version__ = "1.9.0"
diff --git a/alphadia/calibration/property.py b/alphadia/calibration/property.py
@@ -75,6 +75,7 @@ def __init__(
             float(transform_deviation) if transform_deviation is not None else None
         )
         self.is_fitted = False
+        self.metrics = None
 
     def __repr__(self) -> str:
         return f"<Calibration {self.name}, is_fitted: {self.is_fitted}>"
@@ -172,10 +173,12 @@ def fit(self, dataframe: pd.DataFrame, plot: bool = False, **kwargs):
             self.function.fit(input_values, target_value)
             self.is_fitted = True
         except Exception as e:
-            logging.error(f"Could not fit estimator {self.name}: {e}")
+            logging.exception(f"Could not fit estimator {self.name}: {e}")
             return
 
-        if plot is True:
+        self._save_metrics(dataframe)
+
+        if plot:
             self.plot(dataframe, **kwargs)
 
     def predict(self, dataframe, inplace=True):
@@ -200,13 +203,13 @@ def predict(self, dataframe, inplace=True):
             logging.warning(
                 f"{self.name} prediction was skipped as it has not been fitted yet"
             )
-            return
+            return None
 
         if not set(self.input_columns).issubset(dataframe.columns):
             logging.warning(
                 f"{self.name} calibration was skipped as input column {self.input_columns} not found in dataframe"
             )
-            return
+            return None
 
         input_values = dataframe[self.input_columns].values
 
@@ -297,6 +300,13 @@ def deviation(self, dataframe: pd.DataFrame):
             axis=1,
         )
 
+    def _save_metrics(self, dataframe):
+        deviation = self.deviation(dataframe)
+        self.metrics = {
+            "median_accuracy": np.median(np.abs(deviation[:, 1])),
+            "median_precision": np.median(np.abs(deviation[:, 2])),
+        }
+
     def ci(self, dataframe, ci: float = 0.95):
         """Calculate the residual deviation at the given confidence interval.
 

diff --git a/alphadia/cli.py b/alphadia/cli.py
@@ -74,7 +74,6 @@
     action="append",
     default=[],
 )
-
 parser.add_argument(
     "--config",
     "-c",
@@ -83,7 +82,6 @@
     nargs="?",
     default=None,
 )
-
 parser.add_argument(
     "--wsl",
     "-w",
@@ -97,6 +95,13 @@
     nargs="?",
     default="{}",
 )
+parser.add_argument(
+    "--quant-dir",
+    type=str,
+    help="Directory to save the quantification results (psm & frag parquet files) to be reused in a distributed search",
+    nargs="?",
+    default=None,
+)
 
 
 def parse_config(args: argparse.Namespace) -> dict:
@@ -167,6 +172,41 @@ def parse_output_directory(args: argparse.Namespace, config: dict) -> str:
     return output_directory
 
 
+def parse_quant_dir(args: argparse.Namespace, config: dict) -> str:
+    """Parse custom quant path.
+    1. Use custom quant path from config file if specified.
+    2. Use custom quant path from command line if specified.
+
+    Parameters
+    ----------
+
+    args : argparse.Namespace
+        Command line arguments.
+
+    config : dict
+        Config dictionary.
+
+    Returns
+    -------
+
+    quant_dir : str
+        path to quant directory.
+    """
+
+    quant_dir = None
+    if "quant_dir" in config:
+        quant_dir = (
+            utils.windows_to_wsl(config["quant_dir"])
+            if args.wsl
+            else config["quant_dir"]
+        )
+
+    if args.quant_dir is not None:
+        quant_dir = utils.windows_to_wsl(args.quant_dir) if args.wsl else args.quant_dir
+
+    return quant_dir
+
+
 def parse_raw_path_list(args: argparse.Namespace, config: dict) -> list:
     """Parse raw file list.
     1. Use raw file list from config file if specified.
@@ -305,6 +345,8 @@ def run(*args, **kwargs):
         print("No output directory specified.")
         return
 
+    quant_dir = parse_quant_dir(args, config)
+
     reporting.init_logging(output_directory)
     raw_path_list = parse_raw_path_list(args, config)
 
@@ -321,7 +363,10 @@ def run(*args, **kwargs):
     for f in fasta_path_list:
         logger.progress(f"  {f}")
 
+    # TODO rename all output_directory, output_folder => output_path, quant_dir->quant_path (except cli parameter)
     logger.progress(f"Saving output to: {output_directory}")
+    if quant_dir is not None:
+        logger.progress(f"Saving quantification output to {quant_dir=}")
 
     try:
         import matplotlib
@@ -337,6 +382,7 @@ def run(*args, **kwargs):
             library_path=library_path,
             fasta_path_list=fasta_path_list,
             config=config,
+            quant_path=quant_dir,
         )
 
         plan.run()

diff --git a/alphadia/data/alpharaw.py → alphadia/data/alpharaw_wrapper.py b/alphadia/data/alpharaw.py → alphadia/data/alpharaw_wrapper.py
@@ -1,26 +1,16 @@
 # native imports
 import logging
-import math
 import os
 
 import numba as nb
-
-# third party imports
 import numpy as np
 import pandas as pd
-from alpharaw import mzml as alpharawmzml
-
-# TODO fix: "import resolves to its containing file"
-from alpharaw import sciex as alpharawsciex
-
-# alpha family imports
-from alpharaw import (
-    thermo as alpharawthermo,
-)
+from alpharaw import ms_data_base as alpharaw_ms_data_base
+from alpharaw import mzml as alpharaw_mzml
+from alpharaw import sciex as alpharaw_sciex
+from alpharaw import thermo as alpharaw_thermo
 
-# alphadia imports
 from alphadia import utils
-from alphadia.data.stats import log_stats
 
 logger = logging.getLogger()
 
@@ -252,7 +242,7 @@ def calculate_valid_scans(quad_slices: np.ndarray, cycle: np.ndarray):
     return np.array(precursor_idx_list)
 
 
-class AlphaRaw(alpharawthermo.MSData_Base):
+class AlphaRaw(alpharaw_thermo.MSData_Base):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.has_mobility = False
@@ -318,8 +308,6 @@ def filter_spectra(self, **kwargs):
         This function is implemented in the sub-class.
         """
 
-        pass
-
     def jitclass(self):
         return AlphaRawJIT(
             self.cycle,
@@ -340,28 +328,32 @@ def jitclass(self):
         )
 
 
-class MzML(AlphaRaw, alpharawmzml.MzMLReader):
+class AlphaRawBase(AlphaRaw, alpharaw_ms_data_base.MSData_Base):
+    def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
+        super().__init__(process_count=process_count)
+        self.load_hdf(raw_file_path)
+        self.process_alpharaw(**kwargs)
+
+
+class MzML(AlphaRaw, alpharaw_mzml.MzMLReader):
     def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
         super().__init__(process_count=process_count)
         self.load_raw(raw_file_path)
         self.process_alpharaw(**kwargs)
-        log_stats(self.rt_values, self.cycle)
 
 
-class Sciex(AlphaRaw, alpharawsciex.SciexWiffData):
+class Sciex(AlphaRaw, alpharaw_sciex.SciexWiffData):
     def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
         super().__init__(process_count=process_count)
         self.load_raw(raw_file_path)
         self.process_alpharaw(**kwargs)
-        log_stats(self.rt_values, self.cycle)
 
 
-class Thermo(AlphaRaw, alpharawthermo.ThermoRawData):
+class Thermo(AlphaRaw, alpharaw_thermo.ThermoRawData):
     def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
         super().__init__(process_count=process_count)
         self.load_raw(raw_file_path)
         self.process_alpharaw(**kwargs)
-        log_stats(self.rt_values, self.cycle)
 
     def filter_spectra(self, cv: float = None, astral_ms1: bool = False, **kwargs):
         """
@@ -460,7 +452,9 @@ def __init__(
         self.scan_max_index = scan_max_index
         self.frame_max_index = frame_max_index
 
-    def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16):
+    def get_frame_indices(
+        self, rt_values: np.array, optimize_size: int = 16, min_size: int = 32
+    ):
         """
 
         Convert an interval of two rt values to a frame index interval.
@@ -474,54 +468,28 @@ def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16):
         optimize_size : int, default = 16
             To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16
 
+        min_size : int, default = 32
+            The minimum number of dia cycles to include
+
         Returns
         -------
         np.ndarray, shape = (2,), dtype = int64
             array of frame indices
 
         """
 
-        if rt_values.shape != (2,):
-            raise ValueError("rt_values must be a numpy array of shape (2,)")
-
-        frame_index = np.zeros(len(rt_values), dtype=np.int64)
-        for i, rt in enumerate(rt_values):
-            frame_index[i] = search_sorted_left(self.rt_values, rt)
-
-        precursor_cycle_limits = (frame_index + self.zeroth_frame) // self.cycle.shape[
-            1
-        ]
-        precursor_cycle_len = precursor_cycle_limits[1] - precursor_cycle_limits[0]
-
-        # round up to the next multiple of 16
-        optimal_len = int(
-            optimize_size * math.ceil(precursor_cycle_len / optimize_size)
+        return utils.get_frame_indices(
+            rt_values=rt_values,
+            rt_values_array=self.rt_values,
+            zeroth_frame=self.zeroth_frame,
+            cycle_len=self.cycle.shape[1],
+            precursor_cycle_max_index=self.precursor_cycle_max_index,
+            optimize_size=optimize_size,
+            min_size=min_size,
         )
 
-        # by default, we extend the precursor cycle to the right
-        optimal_cycle_limits = np.array(
-            [precursor_cycle_limits[0], precursor_cycle_limits[0] + optimal_len],
-            dtype=np.int64,
-        )
-
-        # if the cycle is too long, we extend it to the left
-        if optimal_cycle_limits[1] > self.precursor_cycle_max_index:
-            optimal_cycle_limits[1] = self.precursor_cycle_max_index
-            optimal_cycle_limits[0] = self.precursor_cycle_max_index - optimal_len
-
-            if optimal_cycle_limits[0] < 0:
-                optimal_cycle_limits[0] = (
-                    0 if self.precursor_cycle_max_index % 2 == 0 else 1
-                )
-
-        # second element is the index of the first whole cycle which should not be used
-        # precursor_cycle_limits[1] += 1
-        # convert back to frame indices
-        frame_limits = optimal_cycle_limits * self.cycle.shape[1] + self.zeroth_frame
-        return utils.make_slice_1d(frame_limits)
-
     def get_frame_indices_tolerance(
-        self, rt: float, tolerance: float, optimize_size: int = 16
+        self, rt: float, tolerance: float, optimize_size: int = 16, min_size: int = 32
     ):
         """
         Determine the frame indices for a given retention time and tolerance.
@@ -538,6 +506,9 @@ def get_frame_indices_tolerance(
         optimize_size : int, default = 16
             To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16
 
+        min_size : int, default = 32
+            The minimum number of dia cycles to include
+
         Returns
         -------
         np.ndarray, shape = (1, 3,), dtype = int64
@@ -547,7 +518,9 @@ def get_frame_indices_tolerance(
 
         rt_limits = np.array([rt - tolerance, rt + tolerance], dtype=np.float32)
 
-        return self.get_frame_indices(rt_limits, optimize_size=optimize_size)
+        return self.get_frame_indices(
+            rt_limits, optimize_size=optimize_size, min_size=min_size
+        )
 
     def get_scan_indices(self, mobility_values: np.array, optimize_size: int = 16):
         return np.array([[0, 2, 1]], dtype=np.int64)