Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Calibrate optimization lock only during optimization #301

Merged
merged 10 commits into from
Aug 15, 2024
48 changes: 38 additions & 10 deletions alphadia/outputtransform.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# native imports
import logging
import os
import pickle
from collections.abc import Iterator

import directlfq.config as lfqconfig
Expand All @@ -25,6 +24,7 @@
TransferLearningAccumulator,
)
from alphadia.transferlearning.train import FinetuneManager
from alphadia.workflow import manager, peptidecentric

logger = logging.getLogger()

Expand Down Expand Up @@ -657,12 +657,11 @@ def build_stat_df(
stat_df_list = []
for folder in folder_list:
raw_name = os.path.basename(folder)
optimization_manager_path = os.path.join(folder, "optimization_manager.pkl")
stat_df_list.append(
_build_run_stat_df(
folder,
raw_name,
psm_df[psm_df["run"] == raw_name],
optimization_manager_path,
all_channels,
)
)
Expand Down Expand Up @@ -820,16 +819,19 @@ def build_library(


def _build_run_stat_df(
folder: str,
raw_name: str,
run_df: pd.DataFrame,
optimization_manager_path: str,
channels: list[int] | None = None,
):
"""Build stat dataframe for a single run.

Parameters
----------

folder: str
Directory containing the raw file and the managers

raw_name: str
Name of the raw file

Expand All @@ -845,6 +847,12 @@ def _build_run_stat_df(
Dataframe containing the statistics

"""
optimization_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
)
timing_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
)

if channels is None:
channels = [0]
Expand All @@ -869,13 +877,33 @@ def _build_run_stat_df(
if "mobility_fwhm" in channel_df.columns:
base_dict["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])

with open(optimization_manager_path, "rb") as f:
optimization_manager = pickle.load(f)
if os.path.exists(optimization_manager_path):
optimization_manager = manager.OptimizationManager(
path=optimization_manager_path
)

base_dict["ms2_error"] = optimization_manager.ms2_error
base_dict["ms1_error"] = optimization_manager.ms1_error
base_dict["rt_error"] = optimization_manager.rt_error
base_dict["mobility_error"] = optimization_manager.mobility_error

base_dict["ms2_error"] = optimization_manager.ms2_error
base_dict["ms1_error"] = optimization_manager.ms1_error
base_dict["rt_error"] = optimization_manager.rt_error
base_dict["mobility_error"] = optimization_manager.mobility_error
else:
logger.warning(f"Error reading optimization manager for {raw_name}")
base_dict["ms2_error"] = np.nan
base_dict["ms1_error"] = np.nan
base_dict["rt_error"] = np.nan
base_dict["mobility_error"] = np.nan

if os.path.exists(timing_manager_path):
timing_manager = manager.TimingManager(path=timing_manager_path)

base_dict["optimization_duration"] = timing_manager.optimization["duration"]
base_dict["extraction_duration"] = timing_manager.extraction["duration"]

else:
logger.warning(f"Error reading timing manager for {raw_name}")
base_dict["optimization_duration"] = np.nan
base_dict["extraction_duration"] = np.nan

out_df.append(base_dict)

Expand Down
12 changes: 12 additions & 0 deletions alphadia/workflow/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class WorkflowBase:

CALIBRATION_MANAGER_PATH = "calibration_manager.pkl"
OPTIMIZATION_MANAGER_PATH = "optimization_manager.pkl"
TIMING_MANAGER_PATH = "timing_manager.pkl"
FDR_MANAGER_PATH = "fdr_manager.pkl"
FIGURE_PATH = "figures"

Expand Down Expand Up @@ -53,6 +54,7 @@ def __init__(
self._spectral_library: SpecLibBase | None = None
self._calibration_manager: manager.CalibrationManager | None = None
self._optimization_manager: manager.OptimizationManager | None = None
self._timing_manager: manager.TimingManager | None = None

if not os.path.exists(self.parent_path):
logger.info(f"Creating parent folder for workflows at {self.parent_path}")
Expand Down Expand Up @@ -109,6 +111,11 @@ def load(
reporter=self.reporter,
)

self._timing_manager = manager.TimingManager(
path=os.path.join(self.path, self.TIMING_MANAGER_PATH),
load_from_file=self.config["general"]["reuse_calibration"],
)

self.reporter.log_event("section_stop", {})

@property
Expand Down Expand Up @@ -141,6 +148,11 @@ def optimization_manager(self) -> str:
"""Optimization manager for the workflow. Owns the optimization data"""
return self._optimization_manager

@property
def timing_manager(self) -> str:
"""Optimization manager for the workflow. Owns the timing data"""
return self._timing_manager

@property
def spectral_library(self) -> SpecLibBase:
"""Spectral library for the workflow. Owns the spectral library data"""
Expand Down
68 changes: 55 additions & 13 deletions alphadia/workflow/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ def fit_predict(
class OptimizationManager(BaseManager):
def __init__(
self,
config: dict,
config: None | dict = None,
path: None | str = None,
load_from_file: bool = True,
**kwargs,
Expand All @@ -463,19 +463,21 @@ def __init__(
self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

initial_parameters = {
"ms1_error": config["search_initial"]["initial_ms1_tolerance"],
"ms2_error": config["search_initial"]["initial_ms2_tolerance"],
"rt_error": config["search_initial"]["initial_rt_tolerance"],
"mobility_error": config["search_initial"]["initial_mobility_tolerance"],
"column_type": "library",
"num_candidates": config["search_initial"]["initial_num_candidates"],
"classifier_version": -1,
"fwhm_rt": config["optimization_manager"]["fwhm_rt"],
"fwhm_mobility": config["optimization_manager"]["fwhm_mobility"],
"score_cutoff": config["optimization_manager"]["score_cutoff"],
}
if not self.is_loaded_from_file:
initial_parameters = {
"ms1_error": config["search_initial"]["initial_ms1_tolerance"],
"ms2_error": config["search_initial"]["initial_ms2_tolerance"],
"rt_error": config["search_initial"]["initial_rt_tolerance"],
"mobility_error": config["search_initial"][
"initial_mobility_tolerance"
],
"column_type": "library",
"num_candidates": config["search_initial"]["initial_num_candidates"],
"classifier_version": -1,
"fwhm_rt": config["optimization_manager"]["fwhm_rt"],
"fwhm_mobility": config["optimization_manager"]["fwhm_mobility"],
"score_cutoff": config["optimization_manager"]["score_cutoff"],
}
self.__dict__.update(initial_parameters)

for key, value in initial_parameters.items():
Expand Down Expand Up @@ -750,3 +752,43 @@ def fit(self, update_dict):
def column_hash(columns):
columns.sort()
return xxhash.xxh64_hexdigest("".join(columns))


class TimingManager(BaseManager):
    """Persists wall-clock timings for the workflow stages.

    Tracks two stages, ``optimization`` and ``extraction``. Each stage is a
    dict with keys ``start`` (pd.Timestamp), ``end`` (pd.Timestamp) and
    ``duration`` (float, minutes), all ``None`` until recorded.
    State is persisted to ``path`` via ``BaseManager.save`` after every update.
    """

    def __init__(
        self,
        path: None | str = None,
        load_from_file: bool = True,
        **kwargs,
    ):
        """Initialize the timing manager.

        Parameters
        ----------
        path : None | str
            File location used by BaseManager to persist/restore the state.
        load_from_file : bool
            If True, attempt to restore a previously saved state from ``path``.
        """
        super().__init__(path=path, load_from_file=load_from_file, **kwargs)
        self.reporter.log_string(f"Initializing {self.__class__.__name__}")
        self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

        # Only seed the empty schema when no persisted state was restored,
        # so a reloaded manager keeps its recorded timings.
        if not self.is_loaded_from_file:
            self.__dict__.update(
                {
                    "optimization": {
                        "start": None,
                        "end": None,
                        "duration": None,
                    },
                    "extraction": {
                        "start": None,
                        "end": None,
                        "duration": None,
                    },
                }
            )

    def start(self, workflow_stage: str):
        """Record the start timestamp for ``workflow_stage`` and persist.

        Resets any previously recorded ``end``/``duration`` for the stage,
        while keeping the full key schema intact so reads between ``start()``
        and ``end()`` return ``None`` instead of raising ``KeyError``.
        """
        # Write the complete schema (not just "start") so the stage dict
        # always has the same keys as the one created in __init__.
        self.__dict__[workflow_stage] = {
            "start": pd.Timestamp.now(),
            "end": None,
            "duration": None,
        }
        self.save()

    def end(self, workflow_stage: str):
        """Record the end timestamp for ``workflow_stage``, compute the
        duration in minutes, and persist.

        Assumes ``start(workflow_stage)`` was called first; otherwise the
        subtraction below fails on a ``None`` start value.
        """
        stage = self.__dict__[workflow_stage]
        stage["end"] = pd.Timestamp.now()
        # Duration is stored in minutes, matching the existing persisted format.
        stage["duration"] = (stage["end"] - stage["start"]).total_seconds() / 60
        self.save()
Loading
Loading