Commit
Merge pull request #309 from MannLabs/new_dynamic_optimization_lock
New dynamic optimization lock
odespard authored Aug 21, 2024
2 parents dad6ed5 + 696e5bf commit 5dc9ac9
Showing 11 changed files with 547 additions and 363 deletions.
5 changes: 1 addition & 4 deletions alphadia/constants/default.yaml
@@ -98,9 +98,6 @@ search_advanced:

calibration:

- # minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted)
- optimization_lock_min_steps: 0

# Number of precursors searched and scored per batch
batch_size: 8000

@@ -110,7 +107,7 @@ calibration:
# the maximum number of steps that a given optimizer is permitted to take
max_steps: 20

- # the maximum number of steps that a given optimizer is permitted to take
+ # the minimum number of steps that a given optimizer must take before it can be said to have converged
min_steps: 2

# TODO: remove this parameter
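For orientation only (not part of this commit): a minimal sketch of how a min_steps/max_steps pair like the one configured above could gate convergence; the function name and the converged flag are illustrative assumptions, not alphadia API.

# Illustrative sketch only; names are assumptions.
def should_stop(step_count: int, converged: bool, min_steps: int = 2, max_steps: int = 20) -> bool:
    if step_count >= max_steps:
        return True  # hard cap: the optimizer never takes more than max_steps steps
    if converged and step_count >= min_steps:
        return True  # convergence is only accepted after at least min_steps steps
    return False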
12 changes: 6 additions & 6 deletions alphadia/exceptions.py
@@ -37,19 +37,19 @@ class NoPsmFoundError(BusinessError):
_msg = "No psm files accumulated, can't continue"


- class NoRecalibrationTargetError(BusinessError):
- """Raise when no recalibration target is found."""
+ class NoOptimizationLockTargetError(BusinessError):
+ """Raise when the optimization lock target is not found."""

- _error_code = "NO_RECALIBRATION_TARGET"
+ _error_code = "NO_OPTIMIZATION_LOCK_TARGET"

- _msg = "Searched all data without finding recalibration target"
+ _msg = "Searched all data without finding optimization lock target"

- _detail_msg = """Search for raw file failed as not enough precursors were found for calibration.
+ _detail_msg = """Search for raw file failed as not enough precursors were found for calibration and optimization.
This can have the following reasons:
1. The sample was empty and therefore no precursors were found.
2. The sample contains only very few precursors.
For small libraries, try to set recalibration_target to a lower value.
- For large libraries, try to reduce the library size and reduce the calibration MS1 and MS2 tolerance.
+ For large libraries, try to reduce the library size and reduce the initial MS1 and MS2 tolerance.
3. There was a fundamental issue with search parameters."""


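As an illustration of the renamed error (assumed usage, not taken from this commit), a caller could handle it like any other BusinessError; the surrounding workflow call and logger are assumptions.

# Illustrative sketch only; the calling context is an assumption.
from alphadia.exceptions import NoOptimizationLockTargetError

try:
    workflow.search_parameter_optimization()  # assumed to raise if the optimization lock target is never reached
except NoOptimizationLockTargetError as err:
    logger.error(f"Not enough precursors for calibration and optimization: {err}")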
96 changes: 83 additions & 13 deletions alphadia/outputtransform.py
@@ -297,6 +297,7 @@ class SearchPlanOutput:
PSM_INPUT = "psm"
PRECURSOR_OUTPUT = "precursors"
STAT_OUTPUT = "stat"
INTERNAL_OUTPUT = "internal"
PG_OUTPUT = "protein_groups"
LIBRARY_OUTPUT = "speclib.mbr"
TRANSFER_OUTPUT = "speclib.transfer"
@@ -365,6 +366,7 @@ def build(
folder_list, save=False, base_spec_lib=base_spec_lib
)
_ = self.build_stat_df(folder_list, psm_df=psm_df, save=True)
_ = self.build_internal_df(folder_list, save=True)
_ = self.build_lfq_tables(folder_list, psm_df=psm_df, save=True)
_ = self.build_library(
base_spec_lib,
@@ -678,6 +680,50 @@ def build_stat_df(

return stat_df

def build_internal_df(
self,
folder_list: list[str],
save: bool = True,
):
"""Build internal data table from a list of seach outputs
Parameters
----------
folder_list: List[str]
List of folders containing the search outputs
save: bool
Save the precursor table to disk
Returns
-------
stat_df: pd.DataFrame
Precursor table
"""
logger.progress("Building internal statistics")

internal_df_list = []
for folder in folder_list:
internal_df_list.append(
_build_run_internal_df(
folder,
)
)

internal_df = pd.concat(internal_df_list)

if save:
logger.info("Writing internal output to disk")
write_df(
internal_df,
os.path.join(self.output_folder, self.INTERNAL_OUTPUT),
file_format="tsv",
)

return internal_df
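A hedged usage sketch of the new internal table (not part of the commit); the SearchPlanOutput constructor arguments and the folder paths are assumptions.

# Illustrative sketch only; constructor arguments and paths are assumptions.
output = SearchPlanOutput(config, output_folder)
internal_df = output.build_internal_df(
    folder_list=["/path/to/output/run_01", "/path/to/output/run_02"],
    save=True,  # presumably writes "<output_folder>/internal.tsv" via write_df(..., file_format="tsv")
)
print(internal_df)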

def build_lfq_tables(
self,
folder_list: list[str],
@@ -850,9 +896,6 @@ def _build_run_stat_df(
optimization_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
)
- timing_manager_path = os.path.join(
- folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
- )

if channels is None:
channels = [0]
Expand Down Expand Up @@ -894,20 +937,47 @@ def _build_run_stat_df(
base_dict["rt_error"] = np.nan
base_dict["mobility_error"] = np.nan

- if os.path.exists(timing_manager_path):
- timing_manager = manager.TimingManager(path=timing_manager_path)
- base_dict["optimization_duration"] = timing_manager.optimization["duration"]
- base_dict["extraction_duration"] = timing_manager.extraction["duration"]
- else:
- logger.warning(f"Error reading timing manager for {raw_name}")
- base_dict["optimization_duration"] = np.nan
- base_dict["extraction_duration"] = np.nan
- out_df.append(base_dict)
- return pd.DataFrame(out_df)

out_df.append(base_dict)

return pd.DataFrame(out_df)


def _build_run_internal_df(
folder_path: str,
):
"""Build internal dataframe for a single run.
Parameters
----------
folder_path: str
Path (from the base directory of the output_folder attribute of the Plan class) to the directory containing the raw file and the managers
Returns
-------
pd.DataFrame
Dataframe containing the internal statistics for the run
"""
timing_manager_path = os.path.join(
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
)
raw_name = os.path.basename(folder_path)

internal_dict = {
"run": raw_name,
}

if os.path.exists(timing_manager_path):
timing_manager = manager.TimingManager(path=timing_manager_path)
for key in timing_manager.timings:
internal_dict[f"duration_{key}"] = [timing_manager.timings[key]["duration"]]

else:
logger.warning(f"Error reading timing manager for {raw_name}")

return pd.DataFrame(internal_dict)
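Given the two stages timed in planning.py below ("optimization" and "extraction"), the frame returned here would plausibly contain one row per run with a duration_<stage> column per timed stage; a sketch with made-up values (durations are in minutes, see the TimingManager changes below):

# Illustrative values only; column names follow the f"duration_{key}" pattern above.
import pandas as pd

example = pd.DataFrame(
    {"run": ["run_01"], "duration_optimization": [12.4], "duration_extraction": [35.7]}
)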


def perform_protein_fdr(psm_df):
9 changes: 8 additions & 1 deletion alphadia/planning.py
@@ -330,9 +330,16 @@ def run(
logger.info(f"No existing quantification found for {raw_name}")

workflow.load(dia_path, speclib)
- workflow.calibration()

workflow.timing_manager.set_start_time("optimization")
workflow.search_parameter_optimization()
workflow.timing_manager.set_end_time("optimization")

workflow.timing_manager.set_start_time("extraction")
psm_df, frag_df = workflow.extraction()
workflow.timing_manager.set_end_time("extraction")
workflow.timing_manager.save()

psm_df = psm_df[psm_df["qval"] <= self.config["fdr"]["fdr"]]

if self.config["multiplexing"]["enabled"]:
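The explicit set_start_time/set_end_time pairs above could also be wrapped in a small context manager; the sketch below is an assumption about an alternative shape, not code from this repository.

# Hypothetical convenience wrapper around the TimingManager API introduced in this commit.
from contextlib import contextmanager

@contextmanager
def timed_stage(timing_manager, stage: str):
    timing_manager.set_start_time(stage)
    try:
        yield
    finally:
        timing_manager.set_end_time(stage)

# Usage (illustrative):
# with timed_stage(workflow.timing_manager, "extraction"):
#     psm_df, frag_df = workflow.extraction()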
2 changes: 1 addition & 1 deletion alphadia/workflow/base.py
@@ -102,9 +102,9 @@ def load(
self._calibration_manager.disable_mobility_calibration()

# initialize the optimization manager

self._optimization_manager = manager.OptimizationManager(
self.config,
gradient_length=self.dia_data.rt_values.max(),
path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH),
load_from_file=self.config["general"]["reuse_calibration"],
figure_path=os.path.join(self.path, self.FIGURE_PATH),
55 changes: 29 additions & 26 deletions alphadia/workflow/manager.py
@@ -455,6 +455,7 @@ class OptimizationManager(BaseManager):
def __init__(
self,
config: None | dict = None,
gradient_length: None | float = None,
path: None | str = None,
load_from_file: bool = True,
**kwargs,
@@ -464,10 +465,15 @@ def __init__(
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

if not self.is_loaded_from_file:
rt_error = (
config["search_initial"]["initial_rt_tolerance"]
if config["search_initial"]["initial_rt_tolerance"] > 1
else config["search_initial"]["initial_rt_tolerance"] * gradient_length
)
initial_parameters = {
"ms1_error": config["search_initial"]["initial_ms1_tolerance"],
"ms2_error": config["search_initial"]["initial_ms2_tolerance"],
"rt_error": config["search_initial"]["initial_rt_tolerance"],
"rt_error": rt_error,
"mobility_error": config["search_initial"][
"initial_mobility_tolerance"
],
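Worked example of the new rt_error logic (values are illustrative): an initial_rt_tolerance greater than 1 is used as an absolute retention-time tolerance, while a value of at most 1 is treated as a fraction of gradient_length (which base.py passes in as dia_data.rt_values.max()).

# Illustrative values only.
gradient_length = 7200.0  # e.g. dia_data.rt_values.max()

initial_rt_tolerance = 300  # > 1: used as-is
rt_error = initial_rt_tolerance if initial_rt_tolerance > 1 else initial_rt_tolerance * gradient_length  # -> 300

initial_rt_tolerance = 0.5  # <= 1: interpreted as a fraction of the gradient
rt_error = initial_rt_tolerance if initial_rt_tolerance > 1 else initial_rt_tolerance * gradient_length  # -> 3600.0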
@@ -487,7 +493,6 @@ def fit(self, update_dict):
"""Update the parameters dict with the values in update_dict."""
self.__dict__.update(update_dict)
self.is_fitted = True
- self.save()

def predict(self):
"""Return the parameters dict."""
@@ -761,34 +766,32 @@ def __init__(
load_from_file: bool = True,
**kwargs,
):
"""Contains and updates timing information for the portions of the workflow."""
super().__init__(path=path, load_from_file=load_from_file, **kwargs)
self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

if not self.is_loaded_from_file:
- self.__dict__.update(
- {
- "optimization": {
- "start": None,
- "end": None,
- "duration": None,
- },
- "extraction": {
- "start": None,
- "end": None,
- "duration": None,
- },
- }
- )
self.timings = {}

- def start(self, workflow_stage: str):
- self.__dict__.update({workflow_stage: {"start": pd.Timestamp.now()}})
- self.save()
def set_start_time(self, workflow_stage: str):
"""Stores the start time of the given stage of the workflow in the timings attribute. Also saves the timing manager to disk.
Parameters
----------
workflow_stage : str
The name under which the timing will be stored in the timings dict
"""
self.timings.update({workflow_stage: {"start": pd.Timestamp.now()}})

- def end(self, workflow_stage: str):
- self.__dict__[workflow_stage]["end"] = pd.Timestamp.now()
- self.__dict__[workflow_stage]["duration"] = (
- self.__dict__[workflow_stage]["end"]
- - self.__dict__[workflow_stage]["start"]
def set_end_time(self, workflow_stage: str):
"""Stores the end time of the given stage of the workflow in the timings attribute and calculates the duration. Also saves the timing manager to disk.
Parameters
----------
workflow_stage : str
The name under which the timing will be stored in the timings dict
"""
self.timings[workflow_stage]["end"] = pd.Timestamp.now()
self.timings[workflow_stage]["duration"] = (
self.timings[workflow_stage]["end"] - self.timings[workflow_stage]["start"]
).total_seconds() / 60
self.save()
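A minimal usage sketch of the reworked TimingManager (not part of the commit); the constructor arguments and the stage name are assumptions.

# Illustrative sketch only; constructor arguments are assumptions.
timing_manager = manager.TimingManager(path="timing.manager.pkl", load_from_file=False)
timing_manager.set_start_time("extraction")
# ... run the stage being timed ...
timing_manager.set_end_time("extraction")
print(timing_manager.timings["extraction"]["duration"])  # duration in minutes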
