Skip to content

Commit

Permalink
Merge pull request #373 from MannLabs/add_rt_length_to_output_II
Browse files Browse the repository at this point in the history
Add rt length to output ii
  • Loading branch information
mschwoer authored Nov 19, 2024
2 parents f777949 + 4a6de93 commit db55495
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 101 deletions.
120 changes: 83 additions & 37 deletions alphadia/outputtransform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# native imports
import logging
import os
from collections import defaultdict
from collections.abc import Iterator

import directlfq.config as lfqconfig
Expand All @@ -25,6 +26,7 @@
)
from alphadia.transferlearning.train import FinetuneManager
from alphadia.workflow import manager, peptidecentric
from alphadia.workflow.managers.raw_file_manager import RawFileManager

logger = logging.getLogger()

Expand Down Expand Up @@ -938,61 +940,59 @@ def _build_run_stat_df(
Dataframe containing the statistics
"""
optimization_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
)

calibration_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PATH
)

if channels is None:
channels = [0]
out_df = []
all_stats = []

for channel in channels:
channel_df = run_df[run_df["channel"] == channel]

base_dict = {
stats = {
"run": raw_name,
"channel": channel,
"precursors": len(channel_df),
"proteins": channel_df["pg"].nunique(),
}

stats["fwhm_rt"] = np.nan
if "cycle_fwhm" in channel_df.columns:
base_dict["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"])
stats["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"])

stats["fwhm_mobility"] = np.nan
if "mobility_fwhm" in channel_df.columns:
base_dict["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])
stats["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])

# collect optimization stats
base_dict["optimization.ms2_error"] = np.nan
base_dict["optimization.ms1_error"] = np.nan
base_dict["optimization.rt_error"] = np.nan
base_dict["optimization.mobility_error"] = np.nan

if os.path.exists(optimization_manager_path):
optimization_stats = defaultdict(lambda: np.nan)
if os.path.exists(
optimization_manager_path := os.path.join(
folder,
peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME,
)
):
optimization_manager = manager.OptimizationManager(
path=optimization_manager_path
)
base_dict["optimization.ms2_error"] = optimization_manager.ms2_error
base_dict["optimization.ms1_error"] = optimization_manager.ms1_error
base_dict["optimization.rt_error"] = optimization_manager.rt_error
base_dict["optimization.mobility_error"] = (
optimization_manager.mobility_error
)

optimization_stats["ms2_error"] = optimization_manager.ms2_error
optimization_stats["ms1_error"] = optimization_manager.ms1_error
optimization_stats["rt_error"] = optimization_manager.rt_error
optimization_stats["mobility_error"] = optimization_manager.mobility_error
else:
logger.warning(f"Error reading optimization manager for {raw_name}")

# collect calibration stats
base_dict["calibration.ms2_median_accuracy"] = np.nan
base_dict["calibration.ms2_median_precision"] = np.nan
base_dict["calibration.ms1_median_accuracy"] = np.nan
base_dict["calibration.ms1_median_precision"] = np.nan
prefix = "optimization."
for key in ["ms2_error", "ms1_error", "rt_error", "mobility_error"]:
stats[f"{prefix}{key}"] = optimization_stats[key]

if os.path.exists(calibration_manager_path):
# collect calibration stats
calibration_stats = defaultdict(lambda: np.nan)
if os.path.exists(
calibration_manager_path := os.path.join(
folder,
peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PKL_NAME,
)
):
calibration_manager = manager.CalibrationManager(
path=calibration_manager_path
)
Expand All @@ -1002,10 +1002,10 @@ def _build_run_stat_df(
"fragment", "mz"
)
) and (fragment_mz_metrics := fragment_mz_estimator.metrics):
base_dict["calibration.ms2_median_accuracy"] = fragment_mz_metrics[
calibration_stats["ms2_median_accuracy"] = fragment_mz_metrics[
"median_accuracy"
]
base_dict["calibration.ms2_median_precision"] = fragment_mz_metrics[
calibration_stats["ms2_median_precision"] = fragment_mz_metrics[
"median_precision"
]

Expand All @@ -1014,19 +1014,65 @@ def _build_run_stat_df(
"precursor", "mz"
)
) and (precursor_mz_metrics := precursor_mz_estimator.metrics):
base_dict["calibration.ms1_median_accuracy"] = precursor_mz_metrics[
calibration_stats["ms1_median_accuracy"] = precursor_mz_metrics[
"median_accuracy"
]
base_dict["calibration.ms1_median_precision"] = precursor_mz_metrics[
calibration_stats["ms1_median_precision"] = precursor_mz_metrics[
"median_precision"
]

else:
logger.warning(f"Error reading calibration manager for {raw_name}")

out_df.append(base_dict)
prefix = "calibration."
for key in [
"ms2_median_accuracy",
"ms2_median_precision",
"ms1_median_accuracy",
"ms1_median_precision",
]:
stats[f"{prefix}{key}"] = calibration_stats[key]

# collect raw stats
raw_stats = defaultdict(lambda: np.nan)
if os.path.exists(
raw_file_manager_path := os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.RAW_FILE_MANAGER_PKL_NAME
)
):
raw_stats = RawFileManager(
path=raw_file_manager_path, load_from_file=True
).stats
else:
logger.warning(f"Error reading raw file manager for {raw_name}")

# deliberately mapping explicitly to avoid coupling raw_stats to the output too tightly
prefix = "raw."

stats[f"{prefix}gradient_min_m"] = raw_stats["rt_limit_min"] / 60
stats[f"{prefix}gradient_max_m"] = raw_stats["rt_limit_max"] / 60
stats[f"{prefix}gradient_length_m"] = (
raw_stats["rt_limit_max"] - raw_stats["rt_limit_min"]
) / 60
for key in [
"cycle_length",
"cycle_duration",
"cycle_number",
"msms_range_min",
"msms_range_max",
]:
stats[f"{prefix}{key}"] = raw_stats[key]

all_stats.append(stats)

return pd.DataFrame(all_stats)


return pd.DataFrame(out_df)
def _get_value_or_nan(d: dict, key: str):
try:
return d[key]
except KeyError:
return np.nan


def _build_run_internal_df(
Expand All @@ -1048,7 +1094,7 @@ def _build_run_internal_df(
"""
timing_manager_path = os.path.join(
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME
)
raw_name = os.path.basename(folder_path)

Expand Down
22 changes: 11 additions & 11 deletions alphadia/workflow/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ class WorkflowBase:
It also initializes the calibration_manager and fdr_manager for the workflow.
"""

RAW_FILE_MANAGER_PATH = "raw_file_manager.pkl"
CALIBRATION_MANAGER_PATH = "calibration_manager.pkl"
OPTIMIZATION_MANAGER_PATH = "optimization_manager.pkl"
TIMING_MANAGER_PATH = "timing_manager.pkl"
FDR_MANAGER_PATH = "fdr_manager.pkl"
FIGURE_PATH = "figures"
RAW_FILE_MANAGER_PKL_NAME = "raw_file_manager.pkl"
CALIBRATION_MANAGER_PKL_NAME = "calibration_manager.pkl"
OPTIMIZATION_MANAGER_PKL_NAME = "optimization_manager.pkl"
TIMING_MANAGER_PKL_NAME = "timing_manager.pkl"
FDR_MANAGER_PKL_NAME = "fdr_manager.pkl"
FIGURES_FOLDER_NAME = "figures"

def __init__(
self,
Expand Down Expand Up @@ -87,7 +87,7 @@ def load(
self.reporter.log_event("loading_data", {"progress": 0})
raw_file_manager = RawFileManager(
self.config,
path=os.path.join(self.path, self.RAW_FILE_MANAGER_PATH),
path=os.path.join(self.path, self.RAW_FILE_MANAGER_PKL_NAME),
reporter=self.reporter,
)

Expand All @@ -102,7 +102,7 @@ def load(
# initialize the calibration manager
self._calibration_manager = manager.CalibrationManager(
self.config["calibration_manager"],
path=os.path.join(self.path, self.CALIBRATION_MANAGER_PATH),
path=os.path.join(self.path, self.CALIBRATION_MANAGER_PKL_NAME),
load_from_file=self.config["general"]["reuse_calibration"],
reporter=self.reporter,
)
Expand All @@ -115,14 +115,14 @@ def load(
self._optimization_manager = manager.OptimizationManager(
self.config,
gradient_length=self.dia_data.rt_values.max(),
path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH),
path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PKL_NAME),
load_from_file=self.config["general"]["reuse_calibration"],
figure_path=os.path.join(self.path, self.FIGURE_PATH),
figure_path=os.path.join(self.path, self.FIGURES_FOLDER_NAME),
reporter=self.reporter,
)

self._timing_manager = manager.TimingManager(
path=os.path.join(self.path, self.TIMING_MANAGER_PATH),
path=os.path.join(self.path, self.TIMING_MANAGER_PKL_NAME),
load_from_file=self.config["general"]["reuse_calibration"],
)

Expand Down
72 changes: 36 additions & 36 deletions alphadia/workflow/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,51 +83,51 @@ def is_fitted(self, value):

def save(self):
"""Save the state to pickle file."""
if self.path is not None:
try:
with open(self.path, "wb") as f:
pickle.dump(self, f)
except Exception as e:
self.reporter.log_string(
f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}",
verbosity="error",
)
# Log the full traceback
if self.path is None:
return

self.reporter.log_string(
f"Traceback: {traceback.format_exc()}", verbosity="error"
)
try:
with open(self.path, "wb") as f:
pickle.dump(self, f)
except Exception as e:
self.reporter.log_string(
f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}",
verbosity="error",
)

self.reporter.log_string(
f"Traceback: {traceback.format_exc()}", verbosity="error"
)

def load(self):
"""Load the state from pickle file."""
if self.path is not None:
if os.path.exists(self.path):
try:
with open(self.path, "rb") as f:
loaded_state = pickle.load(f)

if loaded_state._version == self._version:
self.__dict__.update(loaded_state.__dict__)
self.is_loaded_from_file = True
else:
self.reporter.log_string(
f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. Will not load.",
verbosity="warning",
)
except Exception:
if self.path is None or not os.path.exists(self.path):
self.reporter.log_string(
f"{self.__class__.__name__} not found at: {self.path}",
verbosity="warning",
)
return

try:
with open(self.path, "rb") as f:
loaded_state = pickle.load(f)

if loaded_state._version == self._version:
self.__dict__.update(loaded_state.__dict__)
self.is_loaded_from_file = True
self.reporter.log_string(
f"Failed to load {self.__class__.__name__} from {self.path}",
verbosity="error",
f"Loaded {self.__class__.__name__} from {self.path}"
)
else:
self.reporter.log_string(
f"Loaded {self.__class__.__name__} from {self.path}"
f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. Will not load.",
verbosity="warning",
)
else:
self.reporter.log_string(
f"{self.__class__.__name__} not found at: {self.path}",
verbosity="warning",
)
except Exception:
self.reporter.log_string(
f"Failed to load {self.__class__.__name__} from {self.path}",
verbosity="error",
)

def fit(self):
"""Fit the workflow property of the manager."""
Expand Down
15 changes: 8 additions & 7 deletions alphadia/workflow/managers/raw_file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,14 @@ def __init__(
self.stats = {} # needs to be before super().__init__ to avoid overwriting loaded values

super().__init__(path=path, load_from_file=load_from_file, **kwargs)
self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

self._config: Config = config

# deliberately not saving the actual raw data here to avoid the saved manager file being too large

self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

def get_dia_data_object(
self, dia_data_path: str
) -> bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRaw:
Expand Down Expand Up @@ -121,8 +124,6 @@ def _calc_stats(
stats["rt_limit_min"] = rt_values.min()
stats["rt_limit_max"] = rt_values.max()

stats["rt_duration_sec"] = rt_values.max() - rt_values.min()

cycle_length = cycle.shape[1]
stats["cycle_length"] = cycle_length
stats["cycle_duration"] = np.diff(rt_values[::cycle_length]).mean()
Expand All @@ -138,13 +139,13 @@ def _calc_stats(

def _log_stats(self):
"""Log the statistics calculated from the DIA data."""
rt_duration_min = self.stats["rt_duration_sec"] / 60
rt_duration = self.stats["rt_limit_max"] - self.stats["rt_limit_min"]

logger.info(
f"{'RT (min)':<20}: {self.stats['rt_limit_min']/60:.1f} - {self.stats['rt_limit_max']/60:.1f}"
)
logger.info(f"{'RT duration (sec)':<20}: {self.stats['rt_duration_sec']:.1f}")
logger.info(f"{'RT duration (min)':<20}: {rt_duration_min:.1f}")
logger.info(f"{'RT duration (sec)':<20}: {rt_duration:.1f}")
logger.info(f"{'RT duration (min)':<20}: {rt_duration/60:.1f}")

logger.info(f"{'Cycle len (scans)':<20}: {self.stats['cycle_length']:.0f}")
logger.info(f"{'Cycle len (sec)':<20}: {self.stats['cycle_duration']:.2f}")
Expand Down
1 change: 1 addition & 0 deletions docs/methods.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ methods/command-line
methods/configuration
methods/calibration
methods/transfer-learning
methods/output-format
```
Loading

0 comments on commit db55495

Please sign in to comment.