Skip to content

Commit

Permalink
Merge pull request #388 from MannLabs/development
Browse files Browse the repository at this point in the history
release 1.9.0
  • Loading branch information
GeorgWa authored Nov 26, 2024
2 parents 06439d2 + d05009a commit 9938206
Show file tree
Hide file tree
Showing 64 changed files with 1,766 additions and 513 deletions.
2 changes: 1 addition & 1 deletion alphadia/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!python

__version__ = "1.8.2"
__version__ = "1.9.0"
18 changes: 14 additions & 4 deletions alphadia/calibration/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(
float(transform_deviation) if transform_deviation is not None else None
)
self.is_fitted = False
self.metrics = None

def __repr__(self) -> str:
return f"<Calibration {self.name}, is_fitted: {self.is_fitted}>"
Expand Down Expand Up @@ -172,10 +173,12 @@ def fit(self, dataframe: pd.DataFrame, plot: bool = False, **kwargs):
self.function.fit(input_values, target_value)
self.is_fitted = True
except Exception as e:
logging.error(f"Could not fit estimator {self.name}: {e}")
logging.exception(f"Could not fit estimator {self.name}: {e}")
return

if plot is True:
self._save_metrics(dataframe)

if plot:
self.plot(dataframe, **kwargs)

def predict(self, dataframe, inplace=True):
Expand All @@ -200,13 +203,13 @@ def predict(self, dataframe, inplace=True):
logging.warning(
f"{self.name} prediction was skipped as it has not been fitted yet"
)
return
return None

if not set(self.input_columns).issubset(dataframe.columns):
logging.warning(
f"{self.name} calibration was skipped as input column {self.input_columns} not found in dataframe"
)
return
return None

input_values = dataframe[self.input_columns].values

Expand Down Expand Up @@ -297,6 +300,13 @@ def deviation(self, dataframe: pd.DataFrame):
axis=1,
)

def _save_metrics(self, dataframe):
deviation = self.deviation(dataframe)
self.metrics = {
"median_accuracy": np.median(np.abs(deviation[:, 1])),
"median_precision": np.median(np.abs(deviation[:, 2])),
}

def ci(self, dataframe, ci: float = 0.95):
"""Calculate the residual deviation at the given confidence interval.
Expand Down
50 changes: 48 additions & 2 deletions alphadia/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@
action="append",
default=[],
)

parser.add_argument(
"--config",
"-c",
Expand All @@ -83,7 +82,6 @@
nargs="?",
default=None,
)

parser.add_argument(
"--wsl",
"-w",
Expand All @@ -97,6 +95,13 @@
nargs="?",
default="{}",
)
parser.add_argument(
"--quant-dir",
type=str,
help="Directory to save the quantification results (psm & frag parquet files) to be reused in a distributed search",
nargs="?",
default=None,
)


def parse_config(args: argparse.Namespace) -> dict:
Expand Down Expand Up @@ -167,6 +172,41 @@ def parse_output_directory(args: argparse.Namespace, config: dict) -> str:
return output_directory


def parse_quant_dir(args: argparse.Namespace, config: dict) -> str:
"""Parse custom quant path.
1. Use custom quant path from config file if specified.
2. Use custom quant path from command line if specified.
Parameters
----------
args : argparse.Namespace
Command line arguments.
config : dict
Config dictionary.
Returns
-------
quant_dir : str
path to quant directory.
"""

quant_dir = None
if "quant_dir" in config:
quant_dir = (
utils.windows_to_wsl(config["quant_dir"])
if args.wsl
else config["quant_dir"]
)

if args.quant_dir is not None:
quant_dir = utils.windows_to_wsl(args.quant_dir) if args.wsl else args.quant_dir

return quant_dir


def parse_raw_path_list(args: argparse.Namespace, config: dict) -> list:
"""Parse raw file list.
1. Use raw file list from config file if specified.
Expand Down Expand Up @@ -305,6 +345,8 @@ def run(*args, **kwargs):
print("No output directory specified.")
return

quant_dir = parse_quant_dir(args, config)

reporting.init_logging(output_directory)
raw_path_list = parse_raw_path_list(args, config)

Expand All @@ -321,7 +363,10 @@ def run(*args, **kwargs):
for f in fasta_path_list:
logger.progress(f" {f}")

# TODO rename all output_directory, output_folder => output_path, quant_dir->quant_path (except cli parameter)
logger.progress(f"Saving output to: {output_directory}")
if quant_dir is not None:
logger.progress(f"Saving quantification output to {quant_dir=}")

try:
import matplotlib
Expand All @@ -337,6 +382,7 @@ def run(*args, **kwargs):
library_path=library_path,
fasta_path_list=fasta_path_list,
config=config,
quant_path=quant_dir,
)

plan.run()
Expand Down
99 changes: 36 additions & 63 deletions alphadia/data/alpharaw.py → alphadia/data/alpharaw_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,16 @@
# native imports
import logging
import math
import os

import numba as nb

# third party imports
import numpy as np
import pandas as pd
from alpharaw import mzml as alpharawmzml

# TODO fix: "import resolves to its containing file"
from alpharaw import sciex as alpharawsciex

# alpha family imports
from alpharaw import (
thermo as alpharawthermo,
)
from alpharaw import ms_data_base as alpharaw_ms_data_base
from alpharaw import mzml as alpharaw_mzml
from alpharaw import sciex as alpharaw_sciex
from alpharaw import thermo as alpharaw_thermo

# alphadia imports
from alphadia import utils
from alphadia.data.stats import log_stats

logger = logging.getLogger()

Expand Down Expand Up @@ -252,7 +242,7 @@ def calculate_valid_scans(quad_slices: np.ndarray, cycle: np.ndarray):
return np.array(precursor_idx_list)


class AlphaRaw(alpharawthermo.MSData_Base):
class AlphaRaw(alpharaw_thermo.MSData_Base):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.has_mobility = False
Expand Down Expand Up @@ -318,8 +308,6 @@ def filter_spectra(self, **kwargs):
This function is implemented in the sub-class.
"""

pass

def jitclass(self):
return AlphaRawJIT(
self.cycle,
Expand All @@ -340,28 +328,32 @@ def jitclass(self):
)


class MzML(AlphaRaw, alpharawmzml.MzMLReader):
class AlphaRawBase(AlphaRaw, alpharaw_ms_data_base.MSData_Base):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_hdf(raw_file_path)
self.process_alpharaw(**kwargs)


class MzML(AlphaRaw, alpharaw_mzml.MzMLReader):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)


class Sciex(AlphaRaw, alpharawsciex.SciexWiffData):
class Sciex(AlphaRaw, alpharaw_sciex.SciexWiffData):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)


class Thermo(AlphaRaw, alpharawthermo.ThermoRawData):
class Thermo(AlphaRaw, alpharaw_thermo.ThermoRawData):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)

def filter_spectra(self, cv: float = None, astral_ms1: bool = False, **kwargs):
"""
Expand Down Expand Up @@ -460,7 +452,9 @@ def __init__(
self.scan_max_index = scan_max_index
self.frame_max_index = frame_max_index

def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16):
def get_frame_indices(
self, rt_values: np.array, optimize_size: int = 16, min_size: int = 32
):
"""
Convert an interval of two rt values to a frame index interval.
Expand All @@ -474,54 +468,28 @@ def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16):
optimize_size : int, default = 16
To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16
min_size : int, default = 32
The minimum number of dia cycles to include
Returns
-------
np.ndarray, shape = (2,), dtype = int64
array of frame indices
"""

if rt_values.shape != (2,):
raise ValueError("rt_values must be a numpy array of shape (2,)")

frame_index = np.zeros(len(rt_values), dtype=np.int64)
for i, rt in enumerate(rt_values):
frame_index[i] = search_sorted_left(self.rt_values, rt)

precursor_cycle_limits = (frame_index + self.zeroth_frame) // self.cycle.shape[
1
]
precursor_cycle_len = precursor_cycle_limits[1] - precursor_cycle_limits[0]

# round up to the next multiple of 16
optimal_len = int(
optimize_size * math.ceil(precursor_cycle_len / optimize_size)
return utils.get_frame_indices(
rt_values=rt_values,
rt_values_array=self.rt_values,
zeroth_frame=self.zeroth_frame,
cycle_len=self.cycle.shape[1],
precursor_cycle_max_index=self.precursor_cycle_max_index,
optimize_size=optimize_size,
min_size=min_size,
)

# by default, we extend the precursor cycle to the right
optimal_cycle_limits = np.array(
[precursor_cycle_limits[0], precursor_cycle_limits[0] + optimal_len],
dtype=np.int64,
)

# if the cycle is too long, we extend it to the left
if optimal_cycle_limits[1] > self.precursor_cycle_max_index:
optimal_cycle_limits[1] = self.precursor_cycle_max_index
optimal_cycle_limits[0] = self.precursor_cycle_max_index - optimal_len

if optimal_cycle_limits[0] < 0:
optimal_cycle_limits[0] = (
0 if self.precursor_cycle_max_index % 2 == 0 else 1
)

# second element is the index of the first whole cycle which should not be used
# precursor_cycle_limits[1] += 1
# convert back to frame indices
frame_limits = optimal_cycle_limits * self.cycle.shape[1] + self.zeroth_frame
return utils.make_slice_1d(frame_limits)

def get_frame_indices_tolerance(
self, rt: float, tolerance: float, optimize_size: int = 16
self, rt: float, tolerance: float, optimize_size: int = 16, min_size: int = 32
):
"""
Determine the frame indices for a given retention time and tolerance.
Expand All @@ -538,6 +506,9 @@ def get_frame_indices_tolerance(
optimize_size : int, default = 16
To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16
min_size : int, default = 32
The minimum number of dia cycles to include
Returns
-------
np.ndarray, shape = (1, 3,), dtype = int64
Expand All @@ -547,7 +518,9 @@ def get_frame_indices_tolerance(

rt_limits = np.array([rt - tolerance, rt + tolerance], dtype=np.float32)

return self.get_frame_indices(rt_limits, optimize_size=optimize_size)
return self.get_frame_indices(
rt_limits, optimize_size=optimize_size, min_size=min_size
)

def get_scan_indices(self, mobility_values: np.array, optimize_size: int = 16):
return np.array([[0, 2, 1]], dtype=np.int64)
Expand Down
Loading

0 comments on commit 9938206

Please sign in to comment.