From 281e5e5827a5d78c0e20468bd789034fac252f85 Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 26 Jul 2024 13:35:47 +0200 Subject: [PATCH 01/36] added RT, MS1 and mobility optimizers for automatic calibration --- alphadia/workflow/searchoptimization.py | 111 ++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 6 deletions(-) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index ca1d054f..756a24e1 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -125,9 +125,16 @@ def _check_convergence(self): and self.feature[-1] < 1.1 * self.feature[-2] and self.feature[-1] < 1.1 * self.feature[-3] ): - self.optimal_parameter = self.parameters[np.argmax(self.feature)] + backtrack_by = ( + len(self.feature) - np.argmax(self.feature) + ) # Corresponds to the 1 + the number of searches since the last increase in identifications, with 1 indicating an increase in the most recent search. + # This is done instead of directly indexing the parameter of interest because the self.fdr_manager.classifier_store and self.parameters will be of different lengths, but the relevant entries will be the same index from the end. + self.optimal_parameter = self.parameters[-backtrack_by] self.optimization_manager.fit({"rt_error": self.optimal_parameter}) + self.optimization_manager.fit( + {"classifier_version": self.fdr_manager.num_classifiers - backtrack_by} + ) def _update_parameter(self, df: pd.DataFrame): """See base class. The update rule is @@ -145,7 +152,7 @@ def _update_parameter(self, df: pd.DataFrame): return proposed_parameter def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The number of precursor identifications is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" + """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): self.feature.append(len(precursors_df)) self._check_convergence() @@ -302,9 +309,16 @@ def _check_convergence(self): and self.feature[-1] < 1.1 * self.feature[-2] and self.feature[-1] < 1.1 * self.feature[-3] ): - self.optimal_parameter = self.parameters[np.argmax(self.feature)] + backtrack_by = ( + len(self.feature) - np.argmax(self.feature) + ) # Corresponds to the 1 + the number of searches since the last increase in identifications, with 1 indicating an increase in the most recent search. + # This is done instead of directly indexing the parameter of interest because the self.fdr_manager.classifier_store and self.parameters will be of different lengths, but the relevant entries will be the same index from the end. + self.optimal_parameter = self.parameters[-backtrack_by] self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) + self.optimization_manager.fit( + {"classifier_version": self.fdr_manager.num_classifiers - backtrack_by} + ) def _update_parameter(self, df: pd.DataFrame): """See base class. The update rule is @@ -322,7 +336,7 @@ def _update_parameter(self, df: pd.DataFrame): return proposed_parameter def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The number of precursor identifications is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" + """See base class. 
The TODO is used to track the progress of the optimization (stored in .feature) and determine whether it has converged."""
         if not self.has_converged():
             self.feature.append(len(precursors_df))
             self._check_convergence()
@@ -346,6 +360,91 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame):
 
 
 class MobilityOptimizer(BaseOptimizer):
-    """TODO: Implement this class. It will be used to optimize the mobility parameter for the search."""
+    def __init__(
+        self,
+        initial_parameter: float,
+        calibration_manager: manager.CalibrationManager,
+        optimization_manager: manager.OptimizationManager,
+        fdr_manager: manager.FDRManager,
+        **kwargs,
+    ):
+        """See base class.
+
+        Parameters
+        ----------
+
+        initial_parameter: float
+            The parameter used for search in the first round of optimization.
+
+        """
+        super().__init__(
+            calibration_manager, optimization_manager, fdr_manager, **kwargs
+        )
+        self.parameters = [initial_parameter]
+        self.feature = []
+
+    def _check_convergence(self):
+        """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value.
+        This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value.
+        If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute.
+
+        Notes
+        -----
+        Because the check for an increase in the TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds.
+
+
+        """
-        pass
+        if (
+            len(self.feature) > 2
+            and self.feature[-1] < 1.1 * self.feature[-2]
+            and self.feature[-1] < 1.1 * self.feature[-3]
+        ):
+            backtrack_by = (
+                len(self.feature) - np.argmax(self.feature)
+            )  # Corresponds to 1 + the number of searches since the last increase in identifications, with 1 indicating an increase in the most recent search.
+            # This is done instead of directly indexing the parameter of interest because the self.fdr_manager.classifier_store and self.parameters will be of different lengths, but the relevant entries will be the same index from the end.
+
+            self.optimal_parameter = self.parameters[-backtrack_by]
+            self.optimization_manager.fit({"mobility_error": self.optimal_parameter})
+            self.optimization_manager.fit(
+                {"classifier_version": self.fdr_manager.num_classifiers - backtrack_by}
+            )
+
+    def _update_parameter(self, df: pd.DataFrame):
+        """See base class. The update rule is
+        1) calculate the deviation of the predicted mz values from the observed mz values,
+        2) take the mean of the endpoints of the central 99% of these deviations, and
+        3) multiply this value by 1.1.
+        This is implemented by the ci method for the estimator.
+
+
+        """
+        proposed_parameter = 1.1 * self.calibration_manager.get_estimator(
+            "precursor", "mz"
+        ).ci(df, 0.99)
+
+        return proposed_parameter
+
+    def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame):
+        """See base class. The TODO is used to track the progress of the optimization (stored in .feature) and determine whether it has converged."""
+        if not self.has_converged():
+            self.feature.append(len(precursors_df))
+            self._check_convergence()
+
+        if self.has_converged():  # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence
+            self.reporter.log_string(
+                f"✅ Mobility: optimization complete. 
Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + verbosity="progress", + ) + + else: + proposed_parameter = self._update_parameter(precursors_df) + + self.reporter.log_string( + f"❌ Mobility: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + verbosity="progress", + ) + + self.parameters.append(proposed_parameter) + self.optimization_manager.fit({"mobility_error": proposed_parameter}) From 918bfda43b4042a46322750528a7b11bfcb768fb Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 26 Jul 2024 15:11:50 +0200 Subject: [PATCH 02/36] added optimizers for targeted optimization of MS1, MS2, RT and mobility --- alphadia/workflow/searchoptimization.py | 342 +++++++++++++++++++++++- 1 file changed, 329 insertions(+), 13 deletions(-) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 756a24e1..4a5cd0b3 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -82,7 +82,7 @@ def has_converged(self): return self.optimal_parameter is not None -class RTOptimizer(BaseOptimizer): +class AutomaticRTOptimizer(BaseOptimizer): """TODO Finish this optimizer""" def __init__( @@ -159,7 +159,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ RT: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'rt_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", verbosity="progress", ) @@ -167,7 +167,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): proposed_parameter = self._update_parameter(precursors_df) self.reporter.log_string( - f"❌ RT: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + f"❌ {'rt_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", verbosity="progress", ) @@ -175,7 +175,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.optimization_manager.fit({"rt_error": proposed_parameter}) -class MS2Optimizer(BaseOptimizer): +class AutomaticMS2Optimizer(BaseOptimizer): def __init__( self, initial_parameter: float, @@ -184,7 +184,7 @@ def __init__( fdr_manager: manager.FDRManager, **kwargs, ): - """See base class. + """This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number. Parameters ---------- @@ -250,7 +250,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ MS2: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'ms2_error':<15}: optimization complete. 
Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", verbosity="progress", ) @@ -258,7 +258,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): proposed_parameter = self._update_parameter(fragments_df) self.reporter.log_string( - f"❌ MS2: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + f"❌ {'ms2_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", verbosity="progress", ) @@ -266,7 +266,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.optimization_manager.fit({"ms2_error": proposed_parameter}) -class MS1Optimizer(BaseOptimizer): +class AutomaticMS1Optimizer(BaseOptimizer): """TODO Finish this optimizer""" def __init__( @@ -343,7 +343,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ MS1: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'ms1_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", verbosity="progress", ) @@ -351,7 +351,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): proposed_parameter = self._update_parameter(precursors_df) self.reporter.log_string( - f"❌ MS1: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + f"❌ {'ms1_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", verbosity="progress", ) @@ -359,7 +359,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.optimization_manager.fit({"ms1_error": proposed_parameter}) -class MobilityOptimizer(BaseOptimizer): +class AutomaticMobilityOptimizer(BaseOptimizer): def __init__( self, initial_parameter: float, @@ -434,15 +434,331 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ Mobility: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'mobility_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + verbosity="progress", + ) + + else: + proposed_parameter = self._update_parameter(precursors_df) + + self.reporter.log_string( + f"❌ {'mobility_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + verbosity="progress", + ) + + self.parameters.append(proposed_parameter) + self.optimization_manager.fit({"mobility_error": proposed_parameter}) + + +class TargetedRTOptimizer(BaseOptimizer): + """This class optimizes the RT search parameter until it reaches a user-specified target value.""" + + def __init__( + self, + initial_parameter: float, + target_parameter: float, + calibration_manager: manager.CalibrationManager, + optimization_manager: manager.OptimizationManager, + fdr_manager: manager.FDRManager, + **kwargs, + ): + """See base class. 
+ + Parameters + ---------- + + initial_parameter: float + The parameter used for search in the first round of optimization. + + target_parameter: float + Optimization will stop when this parameter is reached. + + """ + super().__init__( + calibration_manager, optimization_manager, fdr_manager, **kwargs + ) + self.target_parameter = target_parameter + self.parameters = [initial_parameter] + + def _check_convergence(self, proposed_parameter): + """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. + + Parameters + ---------- + proposed_parameter: float + The proposed parameter for the next round of optimization. + """ + + if proposed_parameter <= self.target_parameter: + self.optimal_parameter = self.target_parameter + self.optimization_manager.fit({"rt_error": self.optimal_parameter}) + + def _update_parameter(self, df: pd.DataFrame): + """See base class. The update rule is + 1) calculate the deviation of the predicted mz values from the observed mz values, and + 2) take the mean of the endpoints of the central 95% of these deviations + This is implemented by the ci method for the estimator. + + + """ + proposed_parameter = self.calibration_manager.get_estimator( + "precursor", "rt" + ).ci(df, 0.95) + + return proposed_parameter + + def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + """See base class.""" + if not self.has_converged(): + proposed_parameter = self._update_parameter(precursors_df) + self._check_convergence(proposed_parameter) + + if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence + self.reporter.log_string( + f"✅ {'rt_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", + verbosity="progress", + ) + + else: + self.reporter.log_string( + f"❌ {'rt_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", + verbosity="progress", + ) + + self.parameters.append(proposed_parameter) + self.optimization_manager.fit({"rt_error": proposed_parameter}) + + +class TargetedMS2Optimizer(BaseOptimizer): + """This class optimizes the MS2 search parameter until it reaches a user-specified target value.""" + + def __init__( + self, + initial_parameter: float, + target_parameter: float, + calibration_manager: manager.CalibrationManager, + optimization_manager: manager.OptimizationManager, + fdr_manager: manager.FDRManager, + **kwargs, + ): + """See base class. + + Parameters + ---------- + + initial_parameter: float + The parameter used for search in the first round of optimization. + + target_parameter: float + Optimization will stop when this parameter is reached. + + """ + super().__init__( + calibration_manager, optimization_manager, fdr_manager, **kwargs + ) + self.target_parameter = target_parameter + self.parameters = [initial_parameter] + + def _check_convergence(self, proposed_parameter): + """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. + + Parameters + ---------- + proposed_parameter: float + The proposed parameter for the next round of optimization. + """ + + if proposed_parameter <= self.target_parameter: + self.optimal_parameter = self.target_parameter + self.optimization_manager.fit({"ms2_error": self.optimal_parameter}) + + def _update_parameter(self, df: pd.DataFrame): + """See base class. 
The update rule is + 1) calculate the deviation of the predicted mz values from the observed mz values, and + 2) take the mean of the endpoints of the central 95% of these deviations + This is implemented by the ci method for the estimator. + + + """ + proposed_parameter = self.calibration_manager.get_estimator( + "fragment", "mz" + ).ci(df, 0.95) + + return proposed_parameter + + def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + """See base class.""" + if not self.has_converged(): + proposed_parameter = self._update_parameter(fragments_df) + self._check_convergence(proposed_parameter) + + if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence + self.reporter.log_string( + f"✅ {'ms2_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", + verbosity="progress", + ) + + else: + self.reporter.log_string( + f"❌ {'ms2_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", + verbosity="progress", + ) + + self.parameters.append(proposed_parameter) + self.optimization_manager.fit({"ms2_error": proposed_parameter}) + + +class TargetedMS1Optimizer(BaseOptimizer): + """This class optimizes the MS1 search parameter until it reaches a user-specified target value.""" + + def __init__( + self, + initial_parameter: float, + target_parameter: float, + calibration_manager: manager.CalibrationManager, + optimization_manager: manager.OptimizationManager, + fdr_manager: manager.FDRManager, + **kwargs, + ): + """See base class. + + Parameters + ---------- + + initial_parameter: float + The parameter used for search in the first round of optimization. + + target_parameter: float + Optimization will stop when this parameter is reached. + + """ + super().__init__( + calibration_manager, optimization_manager, fdr_manager, **kwargs + ) + self.target_parameter = target_parameter + self.parameters = [initial_parameter] + + def _check_convergence(self, proposed_parameter): + """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. + + Parameters + ---------- + proposed_parameter: float + The proposed parameter for the next round of optimization. + """ + + if proposed_parameter <= self.target_parameter: + self.optimal_parameter = self.target_parameter + self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) + + def _update_parameter(self, df: pd.DataFrame): + """See base class. The update rule is + 1) calculate the deviation of the predicted mz values from the observed mz values, and + 2) take the mean of the endpoints of the central 95% of these deviations + This is implemented by the ci method for the estimator. 
+ + + """ + proposed_parameter = self.calibration_manager.get_estimator( + "precursor", "mz" + ).ci(df, 0.95) + + return proposed_parameter + + def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + """See base class.""" + if not self.has_converged(): + proposed_parameter = self._update_parameter(precursors_df) + self._check_convergence(proposed_parameter) + + if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence + self.reporter.log_string( + f"✅ {'ms1_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", verbosity="progress", ) else: + self.reporter.log_string( + f"❌ {'ms1_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", + verbosity="progress", + ) + + self.parameters.append(proposed_parameter) + self.optimization_manager.fit({"ms1_error": proposed_parameter}) + + +class TargetedMobilityOptimizer(BaseOptimizer): + """This class optimizes the mobility search parameter until it reaches a user-specified target value.""" + + def __init__( + self, + initial_parameter: float, + target_parameter: float, + calibration_manager: manager.CalibrationManager, + optimization_manager: manager.OptimizationManager, + fdr_manager: manager.FDRManager, + **kwargs, + ): + """See base class. + + Parameters + ---------- + + initial_parameter: float + The parameter used for search in the first round of optimization. + + target_parameter: float + Optimization will stop when this parameter is reached. + + """ + super().__init__( + calibration_manager, optimization_manager, fdr_manager, **kwargs + ) + self.target_parameter = target_parameter + self.parameters = [initial_parameter] + + def _check_convergence(self, proposed_parameter): + """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. + + Parameters + ---------- + proposed_parameter: float + The proposed parameter for the next round of optimization. + """ + + if proposed_parameter <= self.target_parameter: + self.optimal_parameter = self.target_parameter + self.optimization_manager.fit({"mobility_error": self.optimal_parameter}) + + def _update_parameter(self, df: pd.DataFrame): + """See base class. The update rule is + 1) calculate the deviation of the predicted mz values from the observed mz values, and + 2) take the mean of the endpoints of the central 95% of these deviations + This is implemented by the ci method for the estimator. + + + """ + proposed_parameter = self.calibration_manager.get_estimator( + "precursor", "mobility" + ).ci(df, 0.95) + + return proposed_parameter + + def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + """See base class.""" + if not self.has_converged(): proposed_parameter = self._update_parameter(precursors_df) + self._check_convergence(proposed_parameter) + if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence + self.reporter.log_string( + f"✅ {'mobility_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", + verbosity="progress", + ) + + else: self.reporter.log_string( - f"❌ Mobility: optimization incomplete after {len(self.parameters)} search(es). 
Will search with parameter {proposed_parameter}.", + f"❌ {'mobility_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", verbosity="progress", ) From 25ef9d23c15f0ac5ae104a603b8ac540ade26c19 Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 26 Jul 2024 16:11:09 +0200 Subject: [PATCH 03/36] added plotting for automatic optimizers --- alphadia/workflow/searchoptimization.py | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 4a5cd0b3..37e6aad0 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -1,6 +1,7 @@ # native imports from abc import ABC, abstractmethod +import matplotlib.pyplot as plt import numpy as np # alpha family imports @@ -174,6 +175,26 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.parameters.append(proposed_parameter) self.optimization_manager.fit({"rt_error": proposed_parameter}) + def plot(self): + """Plot the optimization of the RT error parameter.""" + fig, ax = plt.subplots() + ax.vlines( + self.optimal_parameter, + 0, + max(self.feature), + color="red", + zorder=0, + label="Optimal RT error", + ) + ax.plot(self.parameters, self.feature) + ax.scatter(self.parameters, self.feature) + ax.set_ylabel("Number of precursor identifications") + ax.set_xlabel("RT error") + ax.xaxis.set_inverted(True) + ax.set_ylim(bottom=0, top=max(self.feature) * 1.1) + ax.legend(loc="upper left") + plt.show() + class AutomaticMS2Optimizer(BaseOptimizer): def __init__( @@ -265,6 +286,26 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.parameters.append(proposed_parameter) self.optimization_manager.fit({"ms2_error": proposed_parameter}) + def plot(self): + """Plot the optimization of the MS2 error parameter.""" + fig, ax = plt.subplots() + ax.vlines( + self.optimal_parameter, + 0, + max(self.precursor_ids), + color="red", + zorder=0, + label="Optimal MS2 error", + ) + ax.plot(self.parameters, self.precursor_ids) + ax.scatter(self.parameters, self.precursor_ids) + ax.set_ylabel("Number of precursor identifications") + ax.set_xlabel("MS2 error") + ax.xaxis.set_inverted(True) + ax.set_ylim(bottom=0, top=max(self.precursor_ids) * 1.1) + ax.legend(loc="upper left") + plt.show() + class AutomaticMS1Optimizer(BaseOptimizer): """TODO Finish this optimizer""" @@ -358,6 +399,26 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.parameters.append(proposed_parameter) self.optimization_manager.fit({"ms1_error": proposed_parameter}) + def plot(self): + """Plot the optimization of the MS1 error parameter.""" + fig, ax = plt.subplots() + ax.vlines( + self.optimal_parameter, + 0, + max(self.feature), + color="red", + zorder=0, + label="Optimal MS1 error", + ) + ax.plot(self.parameters, self.feature) + ax.scatter(self.parameters, self.feature) + ax.set_ylabel("Number of precursor identifications") + ax.set_xlabel("MS1 error") + ax.xaxis.set_inverted(True) + ax.set_ylim(bottom=0, top=max(self.feature) * 1.1) + ax.legend(loc="upper left") + plt.show() + class AutomaticMobilityOptimizer(BaseOptimizer): def __init__( @@ -449,6 +510,26 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.parameters.append(proposed_parameter) self.optimization_manager.fit({"mobility_error": proposed_parameter}) + def plot(self): + """Plot the optimization of the mobility error parameter.""" + fig, ax = plt.subplots() + ax.vlines( + 
self.optimal_parameter, + 0, + max(self.feature), + color="red", + zorder=0, + label="Optimal mobility error", + ) + ax.plot(self.parameters, self.feature) + ax.scatter(self.parameters, self.feature) + ax.set_ylabel("Number of precursor identifications") + ax.set_xlabel("Mobility error") + ax.xaxis.set_inverted(True) + ax.set_ylim(bottom=0, top=max(self.feature) * 1.1) + ax.legend(loc="upper left") + plt.show() + class TargetedRTOptimizer(BaseOptimizer): """This class optimizes the RT search parameter until it reaches a user-specified target value.""" From a9eae04a057ccbc5c7acb40b374c383e8fbe1f0c Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 26 Jul 2024 16:43:34 +0200 Subject: [PATCH 04/36] changed calibration and removed now-unused methods --- alphadia/workflow/peptidecentric.py | 367 +++++++++++----------------- 1 file changed, 143 insertions(+), 224 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index c07c6cdc..e22deddf 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -1,5 +1,6 @@ # native imports import logging +from numbers import Number # third party imports import numpy as np @@ -16,7 +17,7 @@ # alphadia imports from alphadia import fragcomp, plexscoring, utils from alphadia.peakgroup import search -from alphadia.workflow import base, manager +from alphadia.workflow import base, manager, searchoptimization logger = logging.getLogger() @@ -324,119 +325,67 @@ def get_batch_plan(self): return plan - def start_of_calibration(self): - self.batch_plan = self.get_batch_plan() + def extract_optimization_data(self, target): + """Search parameter optimization (i.e. refinement of tolerances for RT, MS2, etc.) is performed on a subset of the elution groups in the spectral library. + The number of elution groups which must be searched to get a sufficiently large number for robust calibration varies depending the library used and the data. + This function searches an increasing number of elution groups until a sufficient number (determined by target) of precursors are identified at 1% FDR. + It then returns the elution group indexes which will be used to find the data in the spectral library for search parameter optimization. - def start_of_epoch(self, current_epoch): - self.com.current_epoch = current_epoch + Parameters + ---------- - # if self.neptune is not None: - # self.neptune["eval/epoch"].log(current_epoch) + target : int + The number of precursors which must be identified at 1% FDR to stop the extraction. 
+ + """ self.elution_group_order = self.spectral_library.precursor_df[ "elution_group_idx" ].unique() np.random.shuffle(self.elution_group_order) - self.calibration_manager.predict( - self.spectral_library._precursor_df, "precursor" - ) - self.calibration_manager.predict(self.spectral_library._fragment_df, "fragment") + batch_plan = self.get_batch_plan() - # make updates to the progress dict depending on the epoch - if self.com.current_epoch > 0: - self.com.recalibration_target = self.config["calibration"][ - "recalibration_target" - ] * (1 + current_epoch) - - def start_of_step(self, current_step, start_index, stop_index): - self.com.current_step = current_step - # if self.neptune is not None: - # self.neptune["eval/step"].log(current_step) - - # for key, value in self.com.__dict__.items(): - # self.neptune[f"eval/{key}"].log(value) - - self.reporter.log_string( - f"=== Epoch {self.com.current_epoch}, step {current_step}, extracting elution groups {start_index} to {stop_index} ===", - verbosity="progress", - ) + features = [] + fragments = [] + for current_step, (start_index, stop_index) in enumerate(batch_plan): + self.reporter.log_string( + f"=== Step {current_step}, extracting elution groups {start_index} to {stop_index} ===", + verbosity="progress", + ) - def check_epoch_conditions(self): - continue_calibration = False + eg_idxes = self.elution_group_order[start_index:stop_index] + batch_df = self.spectral_library._precursor_df[ + self.spectral_library._precursor_df["elution_group_idx"].isin(eg_idxes) + ] - self.reporter.log_string( - "=== checking if epoch conditions were reached ===", verbosity="info" - ) - if self.dia_data.has_ms1: - if self.com.ms1_error > self.config["search"]["target_ms1_tolerance"]: - self.reporter.log_string( - f"❌ {'ms1_error':<15}: {self.com.ms1_error:.4f} > {self.config['search']['target_ms1_tolerance']}", - verbosity="info", - ) - continue_calibration = True - else: - self.reporter.log_string( - f"✅ {'ms1_error':<15}: {self.com.ms1_error:.4f} <= {self.config['search']['target_ms1_tolerance']}", - verbosity="info", - ) + feature_df, fragment_df = self.extract_batch(batch_df) + features += [feature_df] + fragments += [fragment_df] + features_df = pd.concat(features) + fragments_df = pd.concat(fragments) - if self.com.ms2_error > self.config["search"]["target_ms2_tolerance"]: - self.reporter.log_string( - f"❌ {'ms2_error':<15}: {self.com.ms2_error:.4f} > {self.config['search']['target_ms2_tolerance']}", - verbosity="info", - ) - continue_calibration = True - else: self.reporter.log_string( - f"✅ {'ms2_error':<15}: {self.com.ms2_error:.4f} <= {self.config['search']['target_ms2_tolerance']}", - verbosity="info", + f"=== Step {current_step}, extracted {len(feature_df)} precursors and {len(fragment_df)} fragments ===", + verbosity="progress", ) + precursor_df = self.fdr_correction(features_df, fragments_df) + + precursors_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) - if self.com.rt_error > self.config["search"]["target_rt_tolerance"]: - self.reporter.log_string( - f"❌ {'rt_error':<15}: {self.com.rt_error:.4f} > {self.config['search']['target_rt_tolerance']}", - verbosity="info", - ) - continue_calibration = True - else: self.reporter.log_string( - f"✅ {'rt_error':<15}: {self.com.rt_error:.4f} <= {self.config['search']['target_rt_tolerance']}", - verbosity="info", + f"=== checking if recalibration conditions were reached, target {self.com.recalibration_target} precursors ===", + verbosity="progress", ) - if self.dia_data.has_mobility: - if ( - 
self.com.mobility_error - > self.config["search"]["target_mobility_tolerance"] - ): - self.reporter.log_string( - f"❌ {'mobility_error':<15}: {self.com.mobility_error:.4f} > {self.config['search']['target_mobility_tolerance']}", - verbosity="info", - ) - continue_calibration = True - else: - self.reporter.log_string( - f"✅ {'mobility_error':<15}: {self.com.mobility_error:.4f} <= {self.config['search']['target_mobility_tolerance']}", - verbosity="info", - ) + self.log_precursor_df(precursor_df) - if self.com.current_epoch < self.config["calibration"]["min_epochs"] - 1: - self.reporter.log_string( - f"❌ {'current_epoch':<15}: {self.com.current_epoch} < {self.config['calibration']['min_epochs']}", - verbosity="info", - ) - continue_calibration = True - else: - self.reporter.log_string( - f"✅ {'current_epoch':<15}: {self.com.current_epoch} >= {self.config['calibration']['min_epochs']}", - verbosity="info", - ) + if precursors_01FDR > target: + final_stop_index = stop_index # final_stop_index is the number of elution groups that will be included in the calibration data + break - self.reporter.log_string( - "==============================================", verbosity="info" - ) - return continue_calibration + self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] + self.com.fit({"classifier_version": self.fdr_manager.current_version}) def calibration(self): if ( @@ -449,95 +398,104 @@ def calibration(self): ) return - self.start_of_calibration() - for current_epoch in range(self.config["calibration"]["max_epochs"]): - if self.check_epoch_conditions(): - pass - else: - break + if ( + isinstance(self.config["search"]["target_ms2_tolerance"], Number) + and isinstance(self.config["search"]["target_rt_tolerance"], Number) + and isinstance(self.config["search"]["target_ms1_tolerance"], Number) + and isinstance(self.config["search"]["target_mobility_tolerance"], Number) + ): + self.reporter.log_string( + "A complete list of target tolerances has been specified. 
Will perform targeted search parameter optimization.", + verbosity="info", + ) + self.ms2_optimizer = searchoptimization.TargetedMS2Optimizer( + self.config["search"]["initial_ms2_tolerance"], + self.config["search"]["target_ms2_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + self.rt_optimizer = searchoptimization.TargetedRTOptimizer( + self.config["search"]["initial_rt_tolerance"], + self.config["search"]["target_rt_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + self.ms1_optimizer = searchoptimization.TargetedMS1Optimizer( + self.config["search"]["target_ms1_tolerance"], + self.config["search"]["target_ms1_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + self.mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( + self.config["search"]["target_mobility_tolerance"], + self.config["search"]["target_mobility_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) - self.start_of_epoch(current_epoch) + order_of_optimization = [ + [ + self.ms2_optimizer, + self.rt_optimizer, + self.ms1_optimizer, + self.mobility_optimizer, + ] + ] - features = [] - fragments = [] - for current_step, (start_index, stop_index) in enumerate(self.batch_plan): - self.start_of_step(current_step, start_index, stop_index) + else: + raise NotImplementedError( + "Automatic search parameter optimization is not yet implemented" + ) - eg_idxes = self.elution_group_order[start_index:stop_index] + for optimizers in order_of_optimization: + for current_step in range(self.config["calibration"]["max_epochs"]): + if np.all([optimizer.has_converged() for optimizer in optimizers]): + break batch_df = self.spectral_library._precursor_df[ self.spectral_library._precursor_df["elution_group_idx"].isin( - eg_idxes + self.eg_idxes_for_calibration ) ] - feature_df, fragment_df = self.extract_batch(batch_df) - features += [feature_df] - fragments += [fragment_df] - features_df = pd.concat(features) - fragments_df = pd.concat(fragments) + features_df, fragments_df = self.extract_batch(batch_df) self.reporter.log_string( - f"=== Epoch {self.com.current_epoch}, step {current_step}, extracted {len(feature_df)} precursors and {len(fragment_df)} fragments ===", + f"=== Step {current_step}, extracted {len(features_df)} precursors and {len(fragments_df)} fragments ===", verbosity="progress", ) - precursor_df = self.fdr_correction(features_df, fragments_df) - if self.check_recalibration(precursor_df): - self.recalibration(precursor_df, fragments_df) - break - else: - # check if last step has been reached - if current_step == len(self.batch_plan) - 1: - self.reporter.log_string( - "Searched all data without finding recalibration target", - verbosity="error", - ) - raise CalibrationError( - "Searched all data without finding recalibration target" - ) - - self.end_of_epoch() - - if self.config["calibration"].get("final_full_calibration", False): - self.reporter.log_string( - "Performing final calibration with all precursors", - verbosity="progress", - ) - features_df, fragments_df = self.extract_batch( - self.spectral_library._precursor_df - ) - precursor_df = self.fdr_correction(features_df, fragments_df) - self.recalibration(precursor_df, fragments_df) + precursors_df = self.fdr_correction( + features_df, fragments_df, self.com.classifier_version + ) + + precursors_df_filtered, fragments_df_filtered = self.filter_dfs( + precursors_df, fragments_df + ) - self.end_of_calibration() + self.recalibration(precursors_df_filtered, 
fragments_df_filtered) - def end_of_epoch(self): - pass + self.reporter.log_string( + "=== checking if optimization conditions were reached ===", + verbosity="info", + ) - def end_of_calibration(self): - # self.calibration_manager.predict(self.spectral_library._precursor_df, 'precursor') - # self.calibration_manager.predict(self.spectral_library._fragment_df, 'fragment') - self.calibration_manager.save() - pass + for optimizer in optimizers: + optimizer.step(precursors_df_filtered, fragments_df_filtered) - def recalibration(self, precursor_df, fragments_df): + self.reporter.log_string( + "==============================================", verbosity="info" + ) + + def filter_dfs(self, precursor_df, fragments_df): precursor_df_filtered = precursor_df[precursor_df["qval"] < 0.01] precursor_df_filtered = precursor_df_filtered[ precursor_df_filtered["decoy"] == 0 ] - self.calibration_manager.fit( - precursor_df_filtered, - "precursor", - plot=True, - skip=["mz"] if not self.dia_data.has_ms1 else [], - # neptune_run = self.neptune - ) - - rt_99 = self.calibration_manager.get_estimator("precursor", "rt").ci( - precursor_df_filtered, 0.95 - ) - fragments_df_filtered = fragments_df[ fragments_df["precursor_idx"].isin(precursor_df_filtered["precursor_idx"]) ] @@ -557,9 +515,24 @@ def recalibration(self, precursor_df, fragments_df): ), max_fragments, ) - fragments_df_filtered = fragments_df_filtered.iloc[:stop_rank] + fragments_df_filtered = fragments_df_filtered.iloc[ + :stop_rank + ] # QUESTION: Should this raise an exception if the length of fragments_df_full is less than min_fragments? + + self.reporter.log_string( + f"fragments_df_filtered: {len(fragments_df_filtered)}", verbosity="info" + ) - print(f"fragments_df_filtered: {len(fragments_df_filtered)}") + return precursor_df_filtered, fragments_df_filtered + + def recalibration(self, precursor_df_filtered, fragments_df_filtered): + self.calibration_manager.fit( + precursor_df_filtered, + "precursor", + plot=True, + skip=["mz"] if not self.dia_data.has_ms1 else [], + # neptune_run = self.neptune + ) self.calibration_manager.fit( fragments_df_filtered, @@ -568,49 +541,21 @@ def recalibration(self, precursor_df, fragments_df): # neptune_run = self.neptune ) - m2_99 = self.calibration_manager.get_estimator("fragment", "mz").ci( - fragments_df_filtered, 0.95 + self.calibration_manager.predict( + self.spectral_library._precursor_df, + "precursor", ) + self.calibration_manager.predict(self.spectral_library._fragment_df, "fragment") + self.com.fit( { - "ms2_error": max(m2_99, self.config["search"]["target_ms2_tolerance"]), - "rt_error": max(rt_99, self.config["search"]["target_rt_tolerance"]), "column_type": "calibrated", "num_candidates": self.config["search"]["target_num_candidates"], } ) - if self.dia_data.has_ms1: - m1_99 = self.calibration_manager.get_estimator("precursor", "mz").ci( - precursor_df_filtered, 0.95 - ) - self.com.fit( - { - "ms1_error": max( - m1_99, self.config["search"]["target_ms1_tolerance"] - ), - } - ) - - if self.dia_data.has_mobility: - mobility_99 = self.calibration_manager.get_estimator( - "precursor", "mobility" - ).ci(precursor_df_filtered, 0.95) - self.com.fit( - { - "mobility_error": max( - mobility_99, self.config["search"]["target_mobility_tolerance"] - ), - } - ) - - # if self.neptune is not None: - # self.neptune['eval/99_mobility_error'].log(mobility_99) - percentile_001 = np.percentile(precursor_df_filtered["score"], 0.1) - print("score cutoff", percentile_001) - self.optimization_manager.fit( { "fwhm_rt": 
precursor_df_filtered["cycle_fwhm"].median(), @@ -619,34 +564,7 @@ def recalibration(self, precursor_df, fragments_df): } ) - # if self.neptune is not None: - # precursor_df_fdr = precursor_df_filtered[precursor_df_filtered['qval'] < 0.01] - # self.neptune["eval/precursors"].log(len(precursor_df_fdr)) - # self.neptune['eval/99_ms1_error'].log(m1_99) - # self.neptune['eval/99_ms2_error'].log(m2_99) - # self.neptune['eval/99_rt_error'].log(rt_99) - - def check_recalibration(self, precursor_df): - self.com.accumulated_precursors = len(precursor_df) - self.com.accumulated_precursors_01FDR = len( - precursor_df[precursor_df["qval"] < 0.01] - ) - - self.reporter.log_string( - f"=== checking if recalibration conditions were reached, target {self.com.recalibration_target} precursors ===", - verbosity="progress", - ) - - self.log_precursor_df(precursor_df) - - perform_recalibration = False - - if self.com.accumulated_precursors_01FDR > self.com.recalibration_target: - perform_recalibration = True - - return perform_recalibration - - def fdr_correction(self, features_df, df_fragments): + def fdr_correction(self, features_df, df_fragments, version=-1): return self.fdr_manager.fit_predict( features_df, decoy_strategy="precursor_channel_wise" @@ -657,6 +575,7 @@ def fdr_correction(self, features_df, df_fragments): if self.config["search"]["compete_for_fragments"] else None, dia_cycle=self.dia_data.cycle, + version=version, # neptune_run=self.neptune ) From 20c91c6fa6cd9dcca466cd7d89dbab45d0fc6c74 Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 26 Jul 2024 17:10:20 +0200 Subject: [PATCH 05/36] fixed errors --- alphadia/workflow/peptidecentric.py | 66 +++++++++++++++++------------ 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index e22deddf..caf142c1 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -149,8 +149,6 @@ def com(self): def init_calibration_optimization_manager(self): self._calibration_optimization_manager = manager.OptimizationManager( { - "current_epoch": 0, - "current_step": 0, "ms1_error": self.config["search_initial"]["initial_ms1_tolerance"], "ms2_error": self.config["search_initial"]["initial_ms2_tolerance"], "rt_error": self.config["search_initial"]["initial_rt_tolerance"], @@ -164,9 +162,7 @@ def init_calibration_optimization_manager(self): "recalibration_target": self.config["calibration"][ "recalibration_target" ], - "accumulated_precursors": 0, - "accumulated_precursors_01FDR": 0, - "accumulated_precursors_001FDR": 0, + "classifier_version": -1, } ) @@ -405,44 +401,58 @@ def calibration(self): and isinstance(self.config["search"]["target_mobility_tolerance"], Number) ): self.reporter.log_string( - "A complete list of target tolerances has been specified. Will perform targeted search parameter optimization.", + "A complete list of target tolerances has been specified. 
Targeted search parameter optimization will be performed.", verbosity="info", ) + self.ms2_optimizer = searchoptimization.TargetedMS2Optimizer( - self.config["search"]["initial_ms2_tolerance"], + self.config["search_initial"]["initial_ms2_tolerance"], self.config["search"]["target_ms2_tolerance"], self.calibration_manager, self.com, self.fdr_manager, ) + self.rt_optimizer = searchoptimization.TargetedRTOptimizer( - self.config["search"]["initial_rt_tolerance"], + self.config["search_initial"]["initial_rt_tolerance"], self.config["search"]["target_rt_tolerance"], self.calibration_manager, self.com, self.fdr_manager, ) - self.ms1_optimizer = searchoptimization.TargetedMS1Optimizer( - self.config["search"]["target_ms1_tolerance"], - self.config["search"]["target_ms1_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, - ) - self.mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( - self.config["search"]["target_mobility_tolerance"], - self.config["search"]["target_mobility_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, - ) + + if self.dia_data.has_ms1: + self.ms1_optimizer = searchoptimization.TargetedMS1Optimizer( + self.config["search_initial"]["initial_ms1_tolerance"], + self.config["search"]["target_ms1_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + else: + self.ms1_optimizer = None + + if self.dia_data.has_mobility: + self.mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( + self.config["search_initial"]["initial_mobility_tolerance"], + self.config["search"]["target_mobility_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + else: + self.mobility_optimizer = None order_of_optimization = [ [ - self.ms2_optimizer, - self.rt_optimizer, - self.ms1_optimizer, - self.mobility_optimizer, + optimizer + for optimizer in [ + self.ms2_optimizer, + self.rt_optimizer, + self.ms1_optimizer, + self.mobility_optimizer, + ] + if optimizer is not None ] ] @@ -451,6 +461,10 @@ def calibration(self): "Automatic search parameter optimization is not yet implemented" ) + self.extract_optimization_data( + self.config["calibration"]["recalibration_target"] + ) + for optimizers in order_of_optimization: for current_step in range(self.config["calibration"]["max_epochs"]): if np.all([optimizer.has_converged() for optimizer in optimizers]): From 6faa3eb8f8ec2928c66b8b59d244ddff6efd1191 Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 29 Jul 2024 14:28:39 +0200 Subject: [PATCH 06/36] update to use df --- alphadia/workflow/searchoptimization.py | 136 ++++++++++++++---------- 1 file changed, 80 insertions(+), 56 deletions(-) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 5f7e5bef..2842ab8a 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -2,7 +2,6 @@ from abc import ABC, abstractmethod import matplotlib.pyplot as plt -import numpy as np # alpha family imports # third party imports @@ -106,8 +105,10 @@ def __init__( super().__init__( calibration_manager, optimization_manager, fdr_manager, **kwargs ) - self.parameters = [initial_parameter] - self.feature = [] + self.history_df = pd.DataFrame( + columns=["parameter", "feature", "classifier_version"] + ) + self.proposed_parameter = initial_parameter def _check_convergence(self): """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. 
@@ -122,19 +123,22 @@ def _check_convergence(self): """ if ( - len(self.feature) > 2 - and self.feature[-1] < 1.1 * self.feature[-2] - and self.feature[-1] < 1.1 * self.feature[-3] + len(self.history_df) > 2 + and self.history_df["feature"].iloc[-1] + < 1.1 * self.history_df["feature"].iloc[-2] + and self.history_df["feature"].iloc[-1] + < 1.1 * self.history_df["feature"].iloc[-3] ): - backtrack_by = ( - len(self.feature) - np.argmax(self.feature) - ) # Corresponds to the 1 + the number of searches since the last increase in identifications, with 1 indicating an increase in the most recent search. - # This is done instead of directly indexing the parameter of interest because the self.fdr_manager.classifier_store and self.parameters will be of different lengths, but the relevant entries will be the same index from the end. - - self.optimal_parameter = self.parameters[-backtrack_by] + self.optimal_parameter = self.history_df.loc[ + self.history_df["feature"].idxmax(), "parameter" + ] self.optimization_manager.fit({"rt_error": self.optimal_parameter}) self.optimization_manager.fit( - {"classifier_version": self.fdr_manager.num_classifiers - backtrack_by} + { + "classifier_version": self.history_df.loc[ + self.history_df["feature"].idxmax(), "classifier_version" + ] + } ) def _update_parameter(self, df: pd.DataFrame): @@ -155,25 +159,28 @@ def _update_parameter(self, df: pd.DataFrame): def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): - self.feature.append(len(precursors_df)) + self.history_df.loc[len(self.history_df)] = [ + self.proposed_parameter, + len(precursors_df), + self.fdr_manager.current_version, + ] self._check_convergence() if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ {'rt_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'rt_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", verbosity="progress", ) else: - proposed_parameter = self._update_parameter(precursors_df) + self.proposed_parameter = self._update_parameter(precursors_df) self.reporter.log_string( - f"❌ {'rt_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + f"❌ {'rt_error':<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.proposed_parameter}.", verbosity="progress", ) - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"rt_error": proposed_parameter}) + self.optimization_manager.fit({"rt_error": self.proposed_parameter}) def plot(self): """Plot the optimization of the RT error parameter.""" @@ -338,8 +345,10 @@ def __init__( super().__init__( calibration_manager, optimization_manager, fdr_manager, **kwargs ) - self.parameters = [initial_parameter] - self.feature = [] + self.history_df = pd.DataFrame( + columns=["parameter", "precursor_ids", "classifier_version"] + ) + self.proposed_parameter = initial_parameter def _check_convergence(self): """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. 
@@ -354,19 +363,22 @@ def _check_convergence(self): """ if ( - len(self.feature) > 2 - and self.feature[-1] < 1.1 * self.feature[-2] - and self.feature[-1] < 1.1 * self.feature[-3] + len(self.history_df) > 2 + and self.history_df["feature"].iloc[-1] + < 1.1 * self.history_df["feature"].iloc[-2] + and self.history_df["feature"].iloc[-1] + < 1.1 * self.history_df["feature"].iloc[-3] ): - backtrack_by = ( - len(self.feature) - np.argmax(self.feature) - ) # Corresponds to the 1 + the number of searches since the last increase in identifications, with 1 indicating an increase in the most recent search. - # This is done instead of directly indexing the parameter of interest because the self.fdr_manager.classifier_store and self.parameters will be of different lengths, but the relevant entries will be the same index from the end. - - self.optimal_parameter = self.parameters[-backtrack_by] + self.optimal_parameter = self.history_df.loc[ + self.history_df["feature"].idxmax(), "parameter" + ] self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) self.optimization_manager.fit( - {"classifier_version": self.fdr_manager.num_classifiers - backtrack_by} + { + "classifier_version": self.history_df.loc[ + self.history_df["feature"].idxmax(), "classifier_version" + ] + } ) def _update_parameter(self, df: pd.DataFrame): @@ -387,25 +399,28 @@ def _update_parameter(self, df: pd.DataFrame): def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): - self.feature.append(len(precursors_df)) + self.history_df.loc[len(self.history_df)] = [ + self.proposed_parameter, + len(precursors_df), + self.fdr_manager.current_version, + ] self._check_convergence() if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ {'ms1_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'ms1_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", verbosity="progress", ) else: - proposed_parameter = self._update_parameter(precursors_df) + self.proposed_parameter = self._update_parameter(precursors_df) self.reporter.log_string( - f"❌ {'ms1_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + f"❌ {'ms1_error':<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.proposed_parameter}.", verbosity="progress", ) - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"ms1_error": proposed_parameter}) + self.optimization_manager.fit({"ms1_error": self.proposed_parameter}) def plot(self): """Plot the optimization of the MS1 error parameter.""" @@ -449,8 +464,10 @@ def __init__( super().__init__( calibration_manager, optimization_manager, fdr_manager, **kwargs ) - self.parameters = [initial_parameter] - self.feature = [] + self.history_df = pd.DataFrame( + columns=["parameter", "precursor_ids", "classifier_version"] + ) + self.proposed_parameter = initial_parameter def _check_convergence(self): """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. 
@@ -465,19 +482,22 @@ def _check_convergence(self): """ if ( - len(self.feature) > 2 - and self.feature[-1] < 1.1 * self.feature[-2] - and self.feature[-1] < 1.1 * self.feature[-3] + len(self.history_df) > 2 + and self.history_df["feature"].iloc[-1] + < 1.1 * self.history_df["feature"].iloc[-2] + and self.history_df["feature"].iloc[-1] + < 1.1 * self.history_df["feature"].iloc[-3] ): - backtrack_by = ( - len(self.feature) - np.argmax(self.feature) - ) # Corresponds to the 1 + the number of searches since the last increase in identifications, with 1 indicating an increase in the most recent search. - # This is done instead of directly indexing the parameter of interest because the self.fdr_manager.classifier_store and self.parameters will be of different lengths, but the relevant entries will be the same index from the end. - - self.optimal_parameter = self.parameters[-backtrack_by] - self.optimization_manager.fit({"mobility_error": self.optimal_parameter}) + self.optimal_parameter = self.history_df.loc[ + self.history_df["feature"].idxmax(), "parameter" + ] + self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) self.optimization_manager.fit( - {"classifier_version": self.fdr_manager.num_classifiers - backtrack_by} + { + "classifier_version": self.history_df.loc[ + self.history_df["feature"].idxmax(), "classifier_version" + ] + } ) def _update_parameter(self, df: pd.DataFrame): @@ -498,25 +518,29 @@ def _update_parameter(self, df: pd.DataFrame): def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): - self.feature.append(len(precursors_df)) + self.history_df.loc[len(self.history_df)] = [ + self.proposed_parameter, + len(precursors_df), + self.fdr_manager.current_version, + ] self._check_convergence() if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ {'mobility_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.parameters)} searches.", + f"✅ {'mobility_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", verbosity="progress", ) else: - proposed_parameter = self._update_parameter(precursors_df) + self.proposed_parameter = self._update_parameter(precursors_df) self.reporter.log_string( - f"❌ {'mobility_error':<15}: optimization incomplete after {len(self.parameters)} search(es). Will search with parameter {proposed_parameter}.", + f"❌ {'mobility_error':<15}: optimization incomplete after {len(self.history_df)} search(es). 
Will search with parameter {self.proposed_parameter}.", verbosity="progress", ) - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"mobility_error": proposed_parameter}) + self.parameters.append(self.proposed_parameter) + self.optimization_manager.fit({"mobility_error": self.proposed_parameter}) def plot(self): """Plot the optimization of the mobility error parameter.""" From 19dde17dd81be6c5370d7d0e0e3344a0be23180f Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 29 Jul 2024 15:19:16 +0200 Subject: [PATCH 07/36] use df for optimization history in automatic optimization --- alphadia/workflow/searchoptimization.py | 57 ++++++++++++++++++------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 207c242f..e6068028 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -159,11 +159,20 @@ def _update_parameter(self, df: pd.DataFrame): def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): - self.history_df.loc[len(self.history_df)] = [ - self.proposed_parameter, - len(precursors_df), - self.fdr_manager.current_version, - ] + new_row = pd.DataFrame( + [ + { + "parameter": float( + self.proposed_parameter + ), # Ensure float dtype + "feature": int(len(precursors_df)), # Ensure int dtype + "classifier_version": int( + self.fdr_manager.current_version + ), # Ensure int dtype + } + ] + ) + self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) self._check_convergence() if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence @@ -408,11 +417,20 @@ def _update_parameter(self, df: pd.DataFrame): def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): - self.history_df.loc[len(self.history_df)] = [ - self.proposed_parameter, - len(precursors_df), - self.fdr_manager.current_version, - ] + new_row = pd.DataFrame( + [ + { + "parameter": float( + self.proposed_parameter + ), # Ensure float dtype + "feature": int(len(precursors_df)), # Ensure int dtype + "classifier_version": int( + self.fdr_manager.current_version + ), # Ensure int dtype + } + ] + ) + self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) self._check_convergence() if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence @@ -527,11 +545,20 @@ def _update_parameter(self, df: pd.DataFrame): def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. 
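A minimal standalone sketch of the append pattern this patch switches to (column names as in the hunk above, values hypothetical); building a one-row DataFrame before concatenating keeps each column's dtype explicit, which is what the inline "Ensure ... dtype" comments refer to:

    import pandas as pd

    history_df = pd.DataFrame()
    new_row = pd.DataFrame(
        [{"parameter": float(9.2), "feature": int(21500), "classifier_version": int(1)}]
    )
    history_df = pd.concat([history_df, new_row], ignore_index=True)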
The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if not self.has_converged(): - self.history_df.loc[len(self.history_df)] = [ - self.proposed_parameter, - len(precursors_df), - self.fdr_manager.current_version, - ] + new_row = pd.DataFrame( + [ + { + "parameter": float( + self.proposed_parameter + ), # Ensure float dtype + "feature": int(len(precursors_df)), # Ensure int dtype + "classifier_version": int( + self.fdr_manager.current_version + ), # Ensure int dtype + } + ] + ) + self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) self._check_convergence() if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence From ecbe63a8b7433fe2b61a1dadd2ed2c2b167a29d3 Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 29 Jul 2024 15:23:07 +0200 Subject: [PATCH 08/36] change check for targeted calibration --- alphadia/workflow/peptidecentric.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index caf142c1..65e0d3e7 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -1,6 +1,5 @@ # native imports import logging -from numbers import Number # third party imports import numpy as np @@ -395,10 +394,10 @@ def calibration(self): return if ( - isinstance(self.config["search"]["target_ms2_tolerance"], Number) - and isinstance(self.config["search"]["target_rt_tolerance"], Number) - and isinstance(self.config["search"]["target_ms1_tolerance"], Number) - and isinstance(self.config["search"]["target_mobility_tolerance"], Number) + self.config["search"]["target_ms2_tolerance"] > 0 + and self.config["search"]["target_rt_tolerance"] > 0 + and self.config["search"]["target_ms1_tolerance"] > 0 + and self.config["search"]["target_mobility_tolerance"] > 0 ): self.reporter.log_string( "A complete list of target tolerances has been specified. Targeted search parameter optimization will be performed.", From de6da9b8e86918edf90e13e0a9d908df8cc01aa1 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 09:41:12 +0200 Subject: [PATCH 09/36] temporary_save --- alphadia/workflow/peptidecentric.py | 105 ++- alphadia/workflow/searchoptimization.py | 943 ++++++++---------------- tests/e2e_tests/e2e_test_cases.yaml | 35 + 3 files changed, 413 insertions(+), 670 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 65e0d3e7..cabde883 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -162,6 +162,9 @@ def init_calibration_optimization_manager(self): "recalibration_target" ], "classifier_version": -1, + "fwhm_rt": self.config["optimization_manager"]["fwhm_rt"], + "fwhm_mobility": self.config["optimization_manager"]["fwhm_mobility"], + "score_cutoff": self.config["optimization_manager"]["score_cutoff"], } ) @@ -393,17 +396,7 @@ def calibration(self): ) return - if ( - self.config["search"]["target_ms2_tolerance"] > 0 - and self.config["search"]["target_rt_tolerance"] > 0 - and self.config["search"]["target_ms1_tolerance"] > 0 - and self.config["search"]["target_mobility_tolerance"] > 0 - ): - self.reporter.log_string( - "A complete list of target tolerances has been specified. 
Targeted search parameter optimization will be performed.", - verbosity="info", - ) - + if self.config["search"]["target_ms2_tolerance"] > 0: self.ms2_optimizer = searchoptimization.TargetedMS2Optimizer( self.config["search_initial"]["initial_ms2_tolerance"], self.config["search"]["target_ms2_tolerance"], @@ -411,7 +404,15 @@ def calibration(self): self.com, self.fdr_manager, ) + else: + self.ms2_optimizer = searchoptimization.AutomaticMS2Optimizer( + self.config["search_initial"]["initial_ms2_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + if self.config["search"]["target_rt_tolerance"] > 0: self.rt_optimizer = searchoptimization.TargetedRTOptimizer( self.config["search_initial"]["initial_rt_tolerance"], self.config["search"]["target_rt_tolerance"], @@ -419,8 +420,15 @@ def calibration(self): self.com, self.fdr_manager, ) - - if self.dia_data.has_ms1: + else: + self.rt_optimizer = searchoptimization.TargetedRTOptimizer( + self.config["search_initial"]["initial_rt_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + if self.dia_data.has_ms1: + if self.config["search"]["target_ms1_tolerance"] > 0: self.ms1_optimizer = searchoptimization.TargetedMS1Optimizer( self.config["search_initial"]["initial_ms1_tolerance"], self.config["search"]["target_ms1_tolerance"], @@ -429,9 +437,16 @@ def calibration(self): self.fdr_manager, ) else: - self.ms1_optimizer = None - - if self.dia_data.has_mobility: + self.ms1_optimizer = searchoptimization.AutomaticMS1Optimizer( + self.config["search_initial"]["initial_ms1_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + else: + self.ms1_optimizer = None + if self.dia_data.has_mobility: + if self.config["search"]["target_mobility_tolerance"] > 0: self.mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( self.config["search_initial"]["initial_mobility_tolerance"], self.config["search"]["target_mobility_tolerance"], @@ -440,25 +455,39 @@ def calibration(self): self.fdr_manager, ) else: - self.mobility_optimizer = None - - order_of_optimization = [ - [ - optimizer - for optimizer in [ - self.ms2_optimizer, - self.rt_optimizer, - self.ms1_optimizer, - self.mobility_optimizer, - ] - if optimizer is not None - ] + self.mobility_optimizer = searchoptimization.AutomaticMobilityOptimizer( + self.config["search_initial"]["initial_mobility_tolerance"], + self.calibration_manager, + self.com, + self.fdr_manager, + ) + else: + self.mobility_optimizer = None + + self.reporter.log_string( + "A complete list of target tolerances has been specified. 
Targeted search parameter optimization will be performed.", + verbosity="info", + ) + optimizers = [ + self.ms2_optimizer, + self.rt_optimizer, + self.ms1_optimizer, + self.mobility_optimizer, + ] + targeted_optimizers = [ + [ + optimizer + for optimizer in optimizers + if isinstance(optimizer, searchoptimization.TargetedOptimizer) ] + ] + automatic_optimizers = [ + [optimizer] + for optimizer in optimizers + if isinstance(optimizer, searchoptimization.AutomaticOptimizer) + ] - else: - raise NotImplementedError( - "Automatic search parameter optimization is not yet implemented" - ) + order_of_optimization = targeted_optimizers + automatic_optimizers self.extract_optimization_data( self.config["calibration"]["recalibration_target"] @@ -569,7 +598,7 @@ def recalibration(self, precursor_df_filtered, fragments_df_filtered): ) percentile_001 = np.percentile(precursor_df_filtered["score"], 0.1) - self.optimization_manager.fit( + self.com.fit( { "fwhm_rt": precursor_df_filtered["cycle_fwhm"].median(), "fwhm_mobility": precursor_df_filtered["mobility_fwhm"].median(), @@ -625,8 +654,8 @@ def extract_batch(self, batch_df, apply_cutoff=False): if self.dia_data.has_ms1 else "mz_library", fragment_mz_column=f"mz_{self.com.column_type}", - fwhm_rt=self.optimization_manager.fwhm_rt, - fwhm_mobility=self.optimization_manager.fwhm_mobility, + fwhm_rt=self.com.fwhm_rt, + fwhm_mobility=self.com.fwhm_mobility, ) candidates_df = extraction(thread_count=self.config["general"]["thread_count"]) @@ -635,11 +664,11 @@ def extract_batch(self, batch_df, apply_cutoff=False): if apply_cutoff: num_before = len(candidates_df) self.reporter.log_string( - f"Applying score cutoff of {self.optimization_manager.score_cutoff}", + f"Applying score cutoff of {self.com.score_cutoff}", verbosity="info", ) candidates_df = candidates_df[ - candidates_df["score"] > self.optimization_manager.score_cutoff + candidates_df["score"] > self.com.score_cutoff ] num_after = len(candidates_df) num_removed = num_before - num_after diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index e6068028..ed9964b2 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -6,17 +6,16 @@ # alpha family imports # third party imports import pandas as pd +import seaborn as sns # alphadia imports -from alphadia.workflow import manager, reporting +from alphadia.workflow import peptidecentric, reporting class BaseOptimizer(ABC): def __init__( self, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): """This class serves as a base class for organizing the search parameter optimization process, which defines the parameters used for search. 
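In rough terms, this refactor changes how the optimizers are constructed; a hedged before/after sketch (the managers and the workflow stand for the existing instances, and 15.0 is simply the initial MS2 tolerance used in the e2e test case added later in this patch):

    # before: the three managers were passed individually
    optimizer = AutomaticMS2Optimizer(15.0, calibration_manager, optimization_manager, fdr_manager)

    # after: the workflow object is passed and the managers are reached through it,
    # e.g. workflow.calibration_manager, workflow.com and workflow.fdr_manager
    optimizer = AutomaticMS2Optimizer(15.0, workflow)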
@@ -24,20 +23,12 @@ def __init__( Parameters ---------- - calibration_manager: manager.CalibrationManager - The calibration manager for the workflow, which is needed to update the search parameter between rounds of optimization - - optimization_manager: manager.OptimizationManager - The optimization manager for the workflow, which is needed so the optimal parameter and classifier version can be saved to the manager - - fdr_manager: manager.FDRManager - The FDR manager for the workflow, which is needed to update the optimal classifier version in the optimization manager + workflow: peptidecentric.PeptideCentricWorkflow + The workflow object that the optimization is being performed on. """ self.optimal_parameter = None - self.calibration_manager = calibration_manager - self.optimization_manager = optimization_manager - self.fdr_manager = fdr_manager + self.workflow = workflow self.reporter = reporting.LogBackend() if reporter is None else reporter @abstractmethod @@ -58,297 +49,242 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """ pass - @abstractmethod - def _update_parameter(self, df): - """This method specifies the rule according to which the search parameter is updated between rounds of optimization. The rule is specific to the parameter being optimized. - - Parameters - ---------- - - df: pd.DataFrame - The dataframe used to update the parameter. This could be the precursor or fragment dataframe, depending on the search parameter being optimized. - - - """ - pass - - @abstractmethod - def _check_convergence(self): - """This method checks if the optimization has converged according to parameter-specific conditions and, if it has, sets the optimal parameter attribute and updates the optimization manager.""" - pass - - def has_converged(self): - """If the optimal parameter has been set, the optimization must have converged and the method returns True. Otherwise, it returns False.""" - return self.optimal_parameter is not None - - -class AutomaticRTOptimizer(BaseOptimizer): - """TODO Finish this optimizer""" +class AutomaticOptimizer(BaseOptimizer): def __init__( self, initial_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): - """See base class. + """This class automatically optimizes the search parameter and stores the progres of optimization in a dataframe, history_df. Parameters ---------- - initial_parameter: float The parameter used for search in the first round of optimization. - """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.history_df = pd.DataFrame( - columns=["parameter", "feature", "classifier_version"] - ) - self.proposed_parameter = initial_parameter - - def _check_convergence(self): - """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. - If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. - - Notes - ----- - Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. 
- """ + super().__init__(workflow, **kwargs) + self.history_df = pd.DataFrame() + self.workflow.com.fit({self.parameter_name: initial_parameter}) + self.has_converged = False - if ( - len(self.history_df) > 2 - and self.history_df["feature"].iloc[-1] - < 1.1 * self.history_df["feature"].iloc[-2] - and self.history_df["feature"].iloc[-1] - < 1.1 * self.history_df["feature"].iloc[-3] - ): - self.optimal_parameter = self.history_df.loc[ - self.history_df["feature"].idxmax(), "parameter" - ] - self.optimization_manager.fit({"rt_error": self.optimal_parameter}) - self.optimization_manager.fit( + def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" + if self.has_converged: + return + + new_row = pd.DataFrame( + [ { - "classifier_version": self.history_df.loc[ - self.history_df["feature"].idxmax(), "classifier_version" - ] + "parameter": float( + self.workflow.com.__dict__[self.parameter_name] + ), # Ensure float dtype + self.feature_name: self._get_feature_value( + precursors_df, fragments_df + ), # Ensure int dtype + "classifier_version": int( + self.workflow.fdr_manager.current_version + ), # Ensure int dtype } - ) + ] + ) + self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) + just_converged = self._check_convergence() - def _update_parameter(self, df: pd.DataFrame): - """See base class. The update rule is - 1) calculate the deviation of the predicted mz values from the observed mz values, - 2) take the mean of the endpoints of the central 99% of these deviations, and - 3) multiply this value by 1.1. - This is implemented by the ci method for the estimator. + if just_converged: + self.has_converged = True + index_of_optimum = self.history_df[self.feature_name].idxmax() - """ - proposed_parameter = 1.1 * self.calibration_manager.get_estimator( - "precursor", "rt" - ).ci(df, 0.99) + optimal_parameter = self.history_df["parameter"].loc[index_of_optimum] + classifier_version_at_optimum = self.history_df["classifier_version"].loc[ + index_of_optimum + ] - return proposed_parameter + self.workflow.com.fit({self.parameter_name: optimal_parameter}) + self.workflow.com.fit({"classifier_version": classifier_version_at_optimum}) - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" - if not self.has_converged(): - new_row = pd.DataFrame( - [ - { - "parameter": float( - self.proposed_parameter - ), # Ensure float dtype - "feature": int(len(precursors_df)), # Ensure int dtype - "classifier_version": int( - self.fdr_manager.current_version - ), # Ensure int dtype - } - ] - ) - self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) - self._check_convergence() - - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence self.reporter.log_string( - f"✅ {'rt_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", + f"✅ {self.parameter_name:<15}: optimization complete. 
Optimal parameter {self.workflow.com.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", verbosity="progress", ) else: - self.proposed_parameter = self._update_parameter(precursors_df) + new_parameter = self._propose_new_parameter( + precursors_df + if self.estimator_group_name == "precursor" + else fragments_df + ) + + self.workflow.com.fit({self.parameter_name: new_parameter}) self.reporter.log_string( - f"❌ {'rt_error':<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.proposed_parameter}.", + f"❌ {self.parameter_name:<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.workflow.com.__dict__[self.parameter_name]}.", verbosity="progress", ) - self.optimization_manager.fit({"rt_error": self.proposed_parameter}) - def plot(self): """Plot the optimization of the RT error parameter.""" fig, ax = plt.subplots() - ax.vlines( - self.optimal_parameter, - 0, - max(self.feature), + + # Plot the vertical line + ax.axvline( + x=self.workflow.com.__dict__[self.parameter_name], + ymin=0, + ymax=self.history_df[self.feature_name].max(), color="red", zorder=0, - label="Optimal RT error", + label=f"Optimal {self.parameter_name}", + ) + + # Plot the line and scatter plot using Seaborn + sns.lineplot( + x=self.history_df["parameter"], + y=self.history_df[self.feature_name], + ax=ax, + ) + sns.scatterplot( + x=self.history_df["parameter"], + y=self.history_df[self.feature_name], + ax=ax, ) - ax.plot(self.parameters, self.feature) - ax.scatter(self.parameters, self.feature) - ax.set_ylabel("Number of precursor identifications") - ax.set_xlabel("RT error") + + # Set labels and other properties + ax.set_xlabel(self.parameter_name) ax.xaxis.set_inverted(True) - ax.set_ylim(bottom=0, top=max(self.feature) * 1.1) + ax.set_ylim(bottom=0, top=self.history_df[self.feature_name].max() * 1.1) ax.legend(loc="upper left") + plt.show() + @abstractmethod + def _propose_new_parameter(self, df): + """This method specifies the rule according to which the search parameter is updated between rounds of optimization. The rule is specific to the parameter being optimized. + + Parameters + ---------- -class AutomaticMS2Optimizer(BaseOptimizer): + df: pd.DataFrame + The dataframe used to update the parameter. This could be the precursor or fragment dataframe, depending on the search parameter being optimized. + + + """ + pass + + @abstractmethod + def _check_convergence(self): + """This method checks if the optimization has converged according to parameter-specific conditions and, if it has, sets the optimal parameter attribute and updates the optimization manager.""" + pass + + @abstractmethod + def _get_feature_value( + self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame + ): + """Each parameter is optimized according to a particular feature. This method gets the value of that feature for a given round of optimization. 
+ + Parameters + ---------- + + precursors_df: pd.DataFrame + The precursor dataframe for the search + + fragments_df: pd.DataFrame + The fragment dataframe for the search + + + """ + pass + + +class TargetedOptimizer(BaseOptimizer): def __init__( self, initial_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + target_parameter: float, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): - """This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number. + """This class optimizes the search parameter until it reaches a user-specified target value. Parameters ---------- + initial_parameter: float The parameter used for search in the first round of optimization. + target_parameter: float + Optimization will stop when this parameter is reached. """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.history_df = pd.DataFrame( - columns=["parameter", "precursor_ids", "classifier_version"] - ) - self.proposed_parameter = initial_parameter - - def _check_convergence(self): - """Optimization should stop if continued narrowing of the MS2 parameter is not improving the number of precursor identifications. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the number of identifications. - If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. - - Notes - ----- - Because the check for an increase in identifications requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + super().__init__(workflow, **kwargs) + self.workflow.com.fit({self.parameter_name: initial_parameter}) + self.target_parameter = target_parameter + self.has_converged = False + def _check_convergence(self, proposed_parameter): + """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. + Parameters + ---------- + proposed_parameter: float + The proposed parameter for the next round of optimization. """ - if ( - len(self.history_df) > 2 - and self.history_df["precursor_ids"].iloc[-1] - < 1.1 * self.history_df["precursor_ids"].iloc[-2] - and self.history_df["precursor_ids"].iloc[-1] - < 1.1 * self.history_df["precursor_ids"].iloc[-3] - ): - self.optimal_parameter = self.history_df.loc[ - self.history_df["precursor_ids"].idxmax(), "parameter" - ] - self.optimization_manager.fit({"ms2_error": self.optimal_parameter}) - self.optimization_manager.fit( - { - "classifier_version": self.history_df.loc[ - self.history_df["precursor_ids"].idxmax(), "classifier_version" - ] - } - ) + return proposed_parameter <= self.target_parameter - def _update_parameter(self, df: pd.DataFrame): + def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, - 2) take the mean of the endpoints of the central 99% of these deviations, and - 3) multiply this value by 1.1. + 2) take the mean of the endpoints of the central 95% of these deviations, and + 3) take the maximum of this value and the target parameter. 
This is implemented by the ci method for the estimator. - - """ - proposed_parameter = 1.1 * self.calibration_manager.get_estimator( - "fragment", "mz" - ).ci(df, 0.99) - - return proposed_parameter + return max( + self.workflow.calibration_manager.get_estimator( + self.estimator_group_name, self.estimator_name + ).ci(df, 0.95), + self.target_parameter, + ) def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The number of precursor identifications is used to track the progres of the optimization (stored in .precursor_ids) and determine whether it has converged.""" - if not self.has_converged(): - new_row = pd.DataFrame( - [ - { - "parameter": float( - self.proposed_parameter - ), # Ensure float dtype - "precursor_ids": int(len(precursors_df)), # Ensure int dtype - "classifier_version": int( - self.fdr_manager.current_version - ), # Ensure int dtype - } - ] + """See base class.""" + if self.has_converged: + self.reporter.log_string( + f"✅ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} <= {self.target_parameter:.4f}", + verbosity="progress", ) - self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) - self._check_convergence() + return - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence + new_parameter = self._propose_new_parameter( + precursors_df if self.estimator_group_name == "precursor" else fragments_df + ) + just_converged = self._check_convergence(new_parameter) + self.workflow.com.fit({self.parameter_name: new_parameter}) + + if just_converged: + self.has_converged = True self.reporter.log_string( - f"✅ {'ms2_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", + f"✅ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} <= {self.target_parameter:.4f}", verbosity="progress", ) else: - self.proposed_parameter = self._update_parameter(fragments_df) - self.reporter.log_string( - f"❌ {'ms2_error':<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.proposed_parameter}.", + f"❌ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} > {self.target_parameter:.4f}", verbosity="progress", ) - self.optimization_manager.fit({"ms2_error": self.proposed_parameter}) - def plot(self): - """Plot the optimization of the MS2 error parameter.""" - fig, ax = plt.subplots() - ax.vlines( - self.optimal_parameter, - 0, - max(self.precursor_ids), - color="red", - zorder=0, - label="Optimal MS2 error", - ) - ax.plot(self.parameters, self.precursor_ids) - ax.scatter(self.parameters, self.precursor_ids) - ax.set_ylabel("Number of precursor identifications") - ax.set_xlabel("MS2 error") - ax.xaxis.set_inverted(True) - ax.set_ylim(bottom=0, top=max(self.precursor_ids) * 1.1) - ax.legend(loc="upper left") - plt.show() - - -class AutomaticMS1Optimizer(BaseOptimizer): +class AutomaticRTOptimizer(AutomaticOptimizer): """TODO Finish this optimizer""" def __init__( self, initial_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): """See base class. @@ -360,13 +296,11 @@ def __init__( The parameter used for search in the first round of optimization. 
""" - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.history_df = pd.DataFrame( - columns=["parameter", "precursor_ids", "classifier_version"] - ) - self.proposed_parameter = initial_parameter + self.parameter_name = "rt_error" + self.estimator_group_name = "precursor" + self.estimator_name = "rt" + self.feature_name = "precursor_count" + super().__init__(initial_parameter, workflow, **kwargs) def _check_convergence(self): """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. @@ -380,26 +314,15 @@ def _check_convergence(self): """ - if ( + return ( len(self.history_df) > 2 - and self.history_df["feature"].iloc[-1] - < 1.1 * self.history_df["feature"].iloc[-2] - and self.history_df["feature"].iloc[-1] - < 1.1 * self.history_df["feature"].iloc[-3] - ): - self.optimal_parameter = self.history_df.loc[ - self.history_df["feature"].idxmax(), "parameter" - ] - self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) - self.optimization_manager.fit( - { - "classifier_version": self.history_df.loc[ - self.history_df["feature"].idxmax(), "classifier_version" - ] - } - ) + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-2] + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-3] + ) - def _update_parameter(self, df: pd.DataFrame): + def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, 2) take the mean of the endpoints of the central 99% of these deviations, and @@ -408,126 +331,59 @@ def _update_parameter(self, df: pd.DataFrame): """ - proposed_parameter = 1.1 * self.calibration_manager.get_estimator( - "precursor", "mz" + return 1.1 * self.workflow.calibration_manager.get_estimator( + self.estimator_group_name, self.estimator_name ).ci(df, 0.99) - return proposed_parameter - - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" - if not self.has_converged(): - new_row = pd.DataFrame( - [ - { - "parameter": float( - self.proposed_parameter - ), # Ensure float dtype - "feature": int(len(precursors_df)), # Ensure int dtype - "classifier_version": int( - self.fdr_manager.current_version - ), # Ensure int dtype - } - ] - ) - self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) - self._check_convergence() - - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence - self.reporter.log_string( - f"✅ {'ms1_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", - verbosity="progress", - ) - - else: - self.proposed_parameter = self._update_parameter(precursors_df) - - self.reporter.log_string( - f"❌ {'ms1_error':<15}: optimization incomplete after {len(self.history_df)} search(es). 
Will search with parameter {self.proposed_parameter}.", - verbosity="progress", - ) - - self.optimization_manager.fit({"ms1_error": self.proposed_parameter}) - - def plot(self): - """Plot the optimization of the MS1 error parameter.""" - fig, ax = plt.subplots() - ax.vlines( - self.optimal_parameter, - 0, - max(self.feature), - color="red", - zorder=0, - label="Optimal MS1 error", - ) - ax.plot(self.parameters, self.feature) - ax.scatter(self.parameters, self.feature) - ax.set_ylabel("Number of precursor identifications") - ax.set_xlabel("MS1 error") - ax.xaxis.set_inverted(True) - ax.set_ylim(bottom=0, top=max(self.feature) * 1.1) - ax.legend(loc="upper left") - plt.show() + def _get_feature_value( + self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame + ): + return len(precursors_df) -class AutomaticMobilityOptimizer(BaseOptimizer): +class AutomaticMS2Optimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): - """See base class. + """This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number. Parameters ---------- - initial_parameter: float The parameter used for search in the first round of optimization. + """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.history_df = pd.DataFrame( - columns=["parameter", "precursor_ids", "classifier_version"] - ) - self.proposed_parameter = initial_parameter + self.parameter_name = "ms2_error" + self.estimator_group_name = "fragment" + self.estimator_name = "mz" + self.feature_name = "precursor_count" + super().__init__(initial_parameter, workflow, **kwargs) def _check_convergence(self): - """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. + """Optimization should stop if continued narrowing of the MS2 parameter is not improving the number of precursor identifications. + This function checks if the previous rounds of optimization have led to a meaningful improvement in the number of identifications. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. Notes ----- - Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + Because the check for an increase in identifications requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. 
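To make the automatic update rule from the `_propose_new_parameter` docstrings concrete (hypothetical numbers): the next tolerance is 1.1 times the central 99% deviation interval reported by the calibration estimator.

    ci_99 = 8.0                         # e.g. fragment m/z deviation in ppm (hypothetical)
    proposed_ms2_error = 1.1 * ci_99    # 8.8 ppm is used for the next search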
""" - if ( + return ( len(self.history_df) > 2 - and self.history_df["feature"].iloc[-1] - < 1.1 * self.history_df["feature"].iloc[-2] - and self.history_df["feature"].iloc[-1] - < 1.1 * self.history_df["feature"].iloc[-3] - ): - self.optimal_parameter = self.history_df.loc[ - self.history_df["feature"].idxmax(), "parameter" - ] - self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) - self.optimization_manager.fit( - { - "classifier_version": self.history_df.loc[ - self.history_df["feature"].idxmax(), "classifier_version" - ] - } - ) + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-2] + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-3] + ) - def _update_parameter(self, df: pd.DataFrame): + def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, 2) take the mean of the endpoints of the central 99% of these deviations, and @@ -536,79 +392,23 @@ def _update_parameter(self, df: pd.DataFrame): """ - proposed_parameter = 1.1 * self.calibration_manager.get_estimator( - "precursor", "mz" + return 1.1 * self.workflow.calibration_manager.get_estimator( + self.estimator_group_name, self.estimator_name ).ci(df, 0.99) - return proposed_parameter - - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" - if not self.has_converged(): - new_row = pd.DataFrame( - [ - { - "parameter": float( - self.proposed_parameter - ), # Ensure float dtype - "feature": int(len(precursors_df)), # Ensure int dtype - "classifier_version": int( - self.fdr_manager.current_version - ), # Ensure int dtype - } - ] - ) - self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) - self._check_convergence() - - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence - self.reporter.log_string( - f"✅ {'mobility_error':<15}: optimization complete. Optimal parameter {self.optimal_parameter} found after {len(self.history_df)} searches.", - verbosity="progress", - ) - - else: - self.proposed_parameter = self._update_parameter(precursors_df) - - self.reporter.log_string( - f"❌ {'mobility_error':<15}: optimization incomplete after {len(self.history_df)} search(es). 
Will search with parameter {self.proposed_parameter}.", - verbosity="progress", - ) - - self.parameters.append(self.proposed_parameter) - self.optimization_manager.fit({"mobility_error": self.proposed_parameter}) - - def plot(self): - """Plot the optimization of the mobility error parameter.""" - fig, ax = plt.subplots() - ax.vlines( - self.optimal_parameter, - 0, - max(self.feature), - color="red", - zorder=0, - label="Optimal mobility error", - ) - ax.plot(self.parameters, self.feature) - ax.scatter(self.parameters, self.feature) - ax.set_ylabel("Number of precursor identifications") - ax.set_xlabel("Mobility error") - ax.xaxis.set_inverted(True) - ax.set_ylim(bottom=0, top=max(self.feature) * 1.1) - ax.legend(loc="upper left") - plt.show() + def _get_feature_value( + self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame + ): + return len(precursors_df) -class TargetedRTOptimizer(BaseOptimizer): - """This class optimizes the RT search parameter until it reaches a user-specified target value.""" +class AutomaticMS1Optimizer(AutomaticOptimizer): + """TODO Finish this optimizer""" def __init__( self, initial_parameter: float, - target_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): """See base class. @@ -619,75 +419,57 @@ def __init__( initial_parameter: float The parameter used for search in the first round of optimization. - target_parameter: float - Optimization will stop when this parameter is reached. - """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.target_parameter = target_parameter - self.parameters = [initial_parameter] + self.parameter_name = "ms1_error" + self.estimator_group_name = "precursor" + self.estimator_name = "mz" + self.feature_name = "precursor_count" + super().__init__(initial_parameter, workflow, **kwargs) + + def _check_convergence(self): + """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. + This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. + If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. + + Notes + ----- + Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. - def _check_convergence(self, proposed_parameter): - """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. - Parameters - ---------- - proposed_parameter: float - The proposed parameter for the next round of optimization. """ - if proposed_parameter <= self.target_parameter: - self.optimal_parameter = self.target_parameter - self.optimization_manager.fit({"rt_error": self.optimal_parameter}) + return ( + len(self.history_df) > 2 + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-2] + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-3] + ) - def _update_parameter(self, df: pd.DataFrame): + def _propose_new_parameter(self, df: pd.DataFrame): """See base class. 
The update rule is - 1) calculate the deviation of the predicted mz values from the observed mz values, and - 2) take the mean of the endpoints of the central 95% of these deviations + 1) calculate the deviation of the predicted mz values from the observed mz values, + 2) take the mean of the endpoints of the central 99% of these deviations, and + 3) multiply this value by 1.1. This is implemented by the ci method for the estimator. """ - proposed_parameter = self.calibration_manager.get_estimator( - "precursor", "rt" - ).ci(df, 0.95) - - return proposed_parameter - - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class.""" - if not self.has_converged(): - proposed_parameter = self._update_parameter(precursors_df) - self._check_convergence(proposed_parameter) - - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence - self.reporter.log_string( - f"✅ {'rt_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", - verbosity="progress", - ) - - else: - self.reporter.log_string( - f"❌ {'rt_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", - verbosity="progress", - ) - - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"rt_error": proposed_parameter}) + return 1.1 * self.workflow.calibration_manager.get_estimator( + self.estimator_group_name, self.estimator_name + ).ci(df, 0.99) + def _get_feature_value( + self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame + ): + return len(precursors_df) -class TargetedMS2Optimizer(BaseOptimizer): - """This class optimizes the MS2 search parameter until it reaches a user-specified target value.""" +class AutomaticMobilityOptimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, - target_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): """See base class. @@ -698,218 +480,115 @@ def __init__( initial_parameter: float The parameter used for search in the first round of optimization. - target_parameter: float - Optimization will stop when this parameter is reached. - """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.target_parameter = target_parameter - self.parameters = [initial_parameter] + self.parameter_name = "mobility_error" + self.estimator_group_name = "precursor" + self.estimator_name = "mobility" + self.feature_name = "precursor_count" + super().__init__(initial_parameter, workflow, **kwargs) + + def _check_convergence(self): + """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. + This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. + If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. + + Notes + ----- + Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. - def _check_convergence(self, proposed_parameter): - """The optimization has converged if the proposed parameter is equal to or less than the target parameter. 
At this point, the target parameter is saved as the optimal parameter. - Parameters - ---------- - proposed_parameter: float - The proposed parameter for the next round of optimization. """ - if proposed_parameter <= self.target_parameter: - self.optimal_parameter = self.target_parameter - self.optimization_manager.fit({"ms2_error": self.optimal_parameter}) + return ( + len(self.history_df) > 2 + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-2] + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-3] + ) - def _update_parameter(self, df: pd.DataFrame): + def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is - 1) calculate the deviation of the predicted mz values from the observed mz values, and - 2) take the mean of the endpoints of the central 95% of these deviations + 1) calculate the deviation of the predicted mz values from the observed mz values, + 2) take the mean of the endpoints of the central 99% of these deviations, and + 3) multiply this value by 1.1. This is implemented by the ci method for the estimator. """ - proposed_parameter = self.calibration_manager.get_estimator( - "fragment", "mz" - ).ci(df, 0.95) - - return proposed_parameter - - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class.""" - if not self.has_converged(): - proposed_parameter = self._update_parameter(fragments_df) - self._check_convergence(proposed_parameter) - - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence - self.reporter.log_string( - f"✅ {'ms2_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", - verbosity="progress", - ) - - else: - self.reporter.log_string( - f"❌ {'ms2_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", - verbosity="progress", - ) + return 1.1 * self.workflow.calibration_manager.get_estimator( + self.estimator_group_name, self.estimator_name + ).ci(df, 0.99) - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"ms2_error": proposed_parameter}) + def _get_feature_value( + self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame + ): + return len(precursors_df) -class TargetedMS1Optimizer(BaseOptimizer): - """This class optimizes the MS1 search parameter until it reaches a user-specified target value.""" +class TargetedRTOptimizer(TargetedOptimizer): + """This class optimizes the RT search parameter until it reaches a user-specified target value.""" def __init__( self, initial_parameter: float, target_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): - """See base class. - - Parameters - ---------- - - initial_parameter: float - The parameter used for search in the first round of optimization. - - target_parameter: float - Optimization will stop when this parameter is reached. - - """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.target_parameter = target_parameter - self.parameters = [initial_parameter] - - def _check_convergence(self, proposed_parameter): - """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. 
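A small worked example of this targeted behaviour as implemented in the shared TargetedOptimizer introduced above (hypothetical ppm values):

    target = 4.0                     # user-specified target tolerance
    proposed = max(6.5, target)      # deviation interval still 6.5, so search again at 6.5
    converged = proposed <= target   # False, keep optimizing
    proposed = max(3.2, target)      # interval now below the target, so propose 4.0
    converged = proposed <= target   # True; 4.0 is used for all further searches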
- - Parameters - ---------- - proposed_parameter: float - The proposed parameter for the next round of optimization. - """ - - if proposed_parameter <= self.target_parameter: - self.optimal_parameter = self.target_parameter - self.optimization_manager.fit({"ms1_error": self.optimal_parameter}) - - def _update_parameter(self, df: pd.DataFrame): - """See base class. The update rule is - 1) calculate the deviation of the predicted mz values from the observed mz values, and - 2) take the mean of the endpoints of the central 95% of these deviations - This is implemented by the ci method for the estimator. - + """See base class.""" + self.parameter_name = "rt_error" + self.estimator_group_name = "precursor" + self.estimator_name = "rt" + super().__init__(initial_parameter, target_parameter, workflow, **kwargs) - """ - proposed_parameter = self.calibration_manager.get_estimator( - "precursor", "mz" - ).ci(df, 0.95) - return proposed_parameter +class TargetedMS2Optimizer(TargetedOptimizer): + """This class optimizes the MS2 search parameter until it reaches a user-specified target value.""" - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + def __init__( + self, + initial_parameter: float, + target_parameter: float, + workflow: peptidecentric.PeptideCentricWorkflow, + **kwargs, + ): """See base class.""" - if not self.has_converged(): - proposed_parameter = self._update_parameter(precursors_df) - self._check_convergence(proposed_parameter) + self.parameter_name = "ms2_error" + self.estimator_group_name = "fragment" + self.estimator_name = "mz" + super().__init__(initial_parameter, target_parameter, workflow, **kwargs) - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence - self.reporter.log_string( - f"✅ {'ms1_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", - verbosity="progress", - ) - else: - self.reporter.log_string( - f"❌ {'ms1_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", - verbosity="progress", - ) +class TargetedMS1Optimizer(TargetedOptimizer): + """This class optimizes the MS1 search parameter until it reaches a user-specified target value.""" - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"ms1_error": proposed_parameter}) + def __init__( + self, + initial_parameter: float, + target_parameter: float, + workflow: peptidecentric.PeptideCentricWorkflow, + **kwargs, + ): + """See base class.""" + self.parameter_name = "ms1_error" + self.estimator_group_name = "precursor" + self.estimator_name = "mz" + super().__init__(initial_parameter, target_parameter, workflow, **kwargs) -class TargetedMobilityOptimizer(BaseOptimizer): +class TargetedMobilityOptimizer(TargetedOptimizer): """This class optimizes the mobility search parameter until it reaches a user-specified target value.""" def __init__( self, initial_parameter: float, target_parameter: float, - calibration_manager: manager.CalibrationManager, - optimization_manager: manager.OptimizationManager, - fdr_manager: manager.FDRManager, + workflow: peptidecentric.PeptideCentricWorkflow, **kwargs, ): - """See base class. - - Parameters - ---------- - - initial_parameter: float - The parameter used for search in the first round of optimization. - - target_parameter: float - Optimization will stop when this parameter is reached. 
- - """ - super().__init__( - calibration_manager, optimization_manager, fdr_manager, **kwargs - ) - self.target_parameter = target_parameter - self.parameters = [initial_parameter] - - def _check_convergence(self, proposed_parameter): - """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. - - Parameters - ---------- - proposed_parameter: float - The proposed parameter for the next round of optimization. - """ - - if proposed_parameter <= self.target_parameter: - self.optimal_parameter = self.target_parameter - self.optimization_manager.fit({"mobility_error": self.optimal_parameter}) - - def _update_parameter(self, df: pd.DataFrame): - """See base class. The update rule is - 1) calculate the deviation of the predicted mz values from the observed mz values, and - 2) take the mean of the endpoints of the central 95% of these deviations - This is implemented by the ci method for the estimator. - - - """ - proposed_parameter = self.calibration_manager.get_estimator( - "precursor", "mobility" - ).ci(df, 0.95) - - return proposed_parameter - - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class.""" - if not self.has_converged(): - proposed_parameter = self._update_parameter(precursors_df) - self._check_convergence(proposed_parameter) - - if self.has_converged(): # Note this may change from the above statement since .optimal_parameter may be set in ._check_convergence - self.reporter.log_string( - f"✅ {'mobility_error':<15}: {self.target_parameter:.4f} <= {self.target_parameter:.4f}", - verbosity="progress", - ) - - else: - self.reporter.log_string( - f"❌ {'mobility_error':<15}: {proposed_parameter:.4f} > {self.target_parameter:.4f}", - verbosity="progress", - ) - - self.parameters.append(proposed_parameter) - self.optimization_manager.fit({"mobility_error": proposed_parameter}) + self.parameter_name = "mobility_error" + self.estimator_group_name = "precursor" + self.estimator_name = "mobility" + super().__init__(initial_parameter, target_parameter, workflow, **kwargs) diff --git a/tests/e2e_tests/e2e_test_cases.yaml b/tests/e2e_tests/e2e_test_cases.yaml index 6d6ddfc8..8cfe2fa6 100644 --- a/tests/e2e_tests/e2e_test_cases.yaml +++ b/tests/e2e_tests/e2e_test_cases.yaml @@ -99,6 +99,41 @@ test_cases: - BasicStats + - name: astral_automatic_calibration + config: + library_prediction: + predict: true + fixed_modifications: 'Carbamidomethyl@C' + variable_modifications: 'Oxidation@M;Acetyl@Protein N-term' + max_var_mod_num: 2 + missed_cleavages: 1 + precursor_mz: + - 380 + - 980 + nce: 25 + instrument: Lumos + search: + target_num_candidates: 3 + target_ms1_tolerance: 4 + target_ms2_tolerance: -1 + target_rt_tolerance: -1 + search_initial: + initial_num_candidates: 1 + initial_ms1_tolerance: 10 + initial_ms2_tolerance: 15 + initial_rt_tolerance: 300 + search_output: + peptide_level_lfq: true + precursor_level_lfq: true + fasta: + - source_url: https://datashare.biochem.mpg.de/s/WTu3rFZHNeb3uG2/download?files=2024_01_12_human.fasta + raw_data: + - source_url: https://datashare.biochem.mpg.de/s/WTu3rFZHNeb3uG2/download?files=20231024_OA3_TiHe_ADIAMA_HeLa_200ng_Evo01_21min_F-40_iO_before_01.raw + - source_url: https://datashare.biochem.mpg.de/s/WTu3rFZHNeb3uG2/download?files=20231024_OA3_TiHe_ADIAMA_HeLa_200ng_Evo01_21min_F-40_iO_before_01.raw + - source_url: 
https://datashare.biochem.mpg.de/s/WTu3rFZHNeb3uG2/download?files=20231024_OA3_TiHe_ADIAMA_HeLa_200ng_Evo01_21min_F-40_iO_before_02.raw + metrics: + - BasicStats + # - name: astral_mixed_species # config: # library_prediction: From 00d533d73da84b927d213afc18ea7e2ecbb0b300 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 10:48:04 +0200 Subject: [PATCH 10/36] automatic calibration in peptidecentric.py with required alterations to config and tests, also refactor optimizers --- alphadia/constants/default.yaml | 8 +- alphadia/workflow/peptidecentric.py | 106 ++++++++++++++---------- alphadia/workflow/searchoptimization.py | 28 ++++--- tests/e2e_tests/e2e_test_cases.yaml | 4 +- 4 files changed, 82 insertions(+), 64 deletions(-) diff --git a/alphadia/constants/default.yaml b/alphadia/constants/default.yaml index 97168d82..c9e3be40 100644 --- a/alphadia/constants/default.yaml +++ b/alphadia/constants/default.yaml @@ -68,11 +68,11 @@ calibration: # Number of precursors searched and scored per batch batch_size: 8000 - # recalibration target for the first epoch. For subsequent epochs, the target will increase by this amount. - recalibration_target: 200 + # minimum number of precursors to be found before search parameter optimization begins + min_precursors_for_optimization: 200 - # TODO: remove as not relevant anymore - max_epochs: 20 + # the maximum number of steps that a given optimizer is permitted to take + max_steps: 20 # TODO: remove this parameter final_full_calibration: False diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index cabde883..00dbedd6 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -158,9 +158,6 @@ def init_calibration_optimization_manager(self): "num_candidates": self.config["search_initial"][ "initial_num_candidates" ], - "recalibration_target": self.config["calibration"][ - "recalibration_target" - ], "classifier_version": -1, "fwhm_rt": self.config["optimization_manager"]["fwhm_rt"], "fwhm_mobility": self.config["optimization_manager"]["fwhm_mobility"], @@ -372,7 +369,7 @@ def extract_optimization_data(self, target): precursors_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) self.reporter.log_string( - f"=== checking if recalibration conditions were reached, target {self.com.recalibration_target} precursors ===", + f"=== checking if minimum number of precursors for optimization found yet; minimum number is {target} ===", verbosity="progress", ) @@ -397,82 +394,64 @@ def calibration(self): return if self.config["search"]["target_ms2_tolerance"] > 0: - self.ms2_optimizer = searchoptimization.TargetedMS2Optimizer( + ms2_optimizer = searchoptimization.TargetedMS2Optimizer( self.config["search_initial"]["initial_ms2_tolerance"], self.config["search"]["target_ms2_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) else: - self.ms2_optimizer = searchoptimization.AutomaticMS2Optimizer( + ms2_optimizer = searchoptimization.AutomaticMS2Optimizer( self.config["search_initial"]["initial_ms2_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) if self.config["search"]["target_rt_tolerance"] > 0: - self.rt_optimizer = searchoptimization.TargetedRTOptimizer( + rt_optimizer = searchoptimization.TargetedRTOptimizer( self.config["search_initial"]["initial_rt_tolerance"], self.config["search"]["target_rt_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) else: - self.rt_optimizer = 
searchoptimization.TargetedRTOptimizer( + rt_optimizer = searchoptimization.AutomaticRTOptimizer( self.config["search_initial"]["initial_rt_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) if self.dia_data.has_ms1: if self.config["search"]["target_ms1_tolerance"] > 0: - self.ms1_optimizer = searchoptimization.TargetedMS1Optimizer( + ms1_optimizer = searchoptimization.TargetedMS1Optimizer( self.config["search_initial"]["initial_ms1_tolerance"], self.config["search"]["target_ms1_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) else: - self.ms1_optimizer = searchoptimization.AutomaticMS1Optimizer( + ms1_optimizer = searchoptimization.AutomaticMS1Optimizer( self.config["search_initial"]["initial_ms1_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) else: - self.ms1_optimizer = None + ms1_optimizer = None if self.dia_data.has_mobility: if self.config["search"]["target_mobility_tolerance"] > 0: - self.mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( + mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( self.config["search_initial"]["initial_mobility_tolerance"], self.config["search"]["target_mobility_tolerance"], - self.calibration_manager, - self.com, - self.fdr_manager, + self, ) else: - self.mobility_optimizer = searchoptimization.AutomaticMobilityOptimizer( + mobility_optimizer = searchoptimization.AutomaticMobilityOptimizer( self.config["search_initial"]["initial_mobility_tolerance"], self.calibration_manager, self.com, self.fdr_manager, ) else: - self.mobility_optimizer = None + mobility_optimizer = None - self.reporter.log_string( - "A complete list of target tolerances has been specified. Targeted search parameter optimization will be performed.", - verbosity="info", - ) optimizers = [ - self.ms2_optimizer, - self.rt_optimizer, - self.ms1_optimizer, - self.mobility_optimizer, + ms2_optimizer, + rt_optimizer, + ms1_optimizer, + mobility_optimizer, ] targeted_optimizers = [ [ @@ -489,13 +468,32 @@ def calibration(self): order_of_optimization = targeted_optimizers + automatic_optimizers + self.reporter.log_string( + "Starting initial classifier training and precursor identification", + verbosity="progress", + ) + self.extract_optimization_data( - self.config["calibration"]["recalibration_target"] + self.config["calibration"]["min_precursors_for_optimization"] + ) + + self.reporter.log_string( + "Target number of precursors found. Starting search parameter optimization.", + verbosity="progress", ) for optimizers in order_of_optimization: - for current_step in range(self.config["calibration"]["max_epochs"]): - if np.all([optimizer.has_converged() for optimizer in optimizers]): + for current_step in range(self.config["calibration"]["max_steps"]): + if np.all([optimizer.has_converged for optimizer in optimizers]): + self.reporter.log_string( + f"Optimization finished for {', '.join([optimizer.parameter_name for optimizer in optimizers])}.", + verbosity="progress", + ) + + for optimizer in optimizers: + if isinstance(optimizer, searchoptimization.AutomaticOptimizer): + optimizer.plot() + break batch_df = self.spectral_library._precursor_df[ self.spectral_library._precursor_df["elution_group_idx"].isin( @@ -531,6 +529,22 @@ def calibration(self): self.reporter.log_string( "==============================================", verbosity="info" ) + self.reporter.log_string( + "Search parameter optimization finished. 
Values taken forward for search are:", + verbosity="progress", + ) + self.reporter.log_string( + "==============================================", verbosity="progress" + ) + for optimizers in order_of_optimization: + for optimizer in optimizers: + self.reporter.log_string( + f"{optimizer.parameter_name:<15}: {self.com.__dict__[optimizer.parameter_name]:.4f}", + verbosity="progress", + ) + self.reporter.log_string( + "==============================================", verbosity="progress" + ) def filter_dfs(self, precursor_df, fragments_df): precursor_df_filtered = precursor_df[precursor_df["qval"] < 0.01] diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index ed9964b2..49752676 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -9,13 +9,13 @@ import seaborn as sns # alphadia imports -from alphadia.workflow import peptidecentric, reporting +from alphadia.workflow import reporting class BaseOptimizer(ABC): def __init__( self, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): """This class serves as a base class for organizing the search parameter optimization process, which defines the parameters used for search. @@ -54,7 +54,7 @@ class AutomaticOptimizer(BaseOptimizer): def __init__( self, initial_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """This class automatically optimizes the search parameter and stores the progres of optimization in a dataframe, history_df. @@ -74,6 +74,10 @@ def __init__( def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if self.has_converged: + self.reporter.log_string( + f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.com.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", + verbosity="progress", + ) return new_row = pd.DataFrame( @@ -204,7 +208,7 @@ def __init__( self, initial_parameter: float, target_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """This class optimizes the search parameter until it reaches a user-specified target value. @@ -284,7 +288,7 @@ class AutomaticRTOptimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class. @@ -345,7 +349,7 @@ class AutomaticMS2Optimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number. @@ -408,7 +412,7 @@ class AutomaticMS1Optimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class. @@ -469,7 +473,7 @@ class AutomaticMobilityOptimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class. 
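For readers following the refactor in this patch: once an AutomaticOptimizer has converged, it looks back through its history_df for the search that produced the best feature value and restores both the parameter and the classifier version recorded for that search. A small self-contained illustration with invented numbers (only the column names `parameter`, `precursor_count` and `classifier_version` come from the patch; the values and variable names are made up):

import pandas as pd

# Toy history: one row per completed search of the optimization loop.
history_df = pd.DataFrame(
    {
        "parameter": [15.0, 10.2, 7.4, 6.1],          # tolerance used in each search
        "precursor_count": [4200, 5100, 5300, 5250],  # feature tracked by the optimizer
        "classifier_version": [1, 2, 3, 4],           # classifier trained in that search
    }
)

index_of_optimum = history_df["precursor_count"].idxmax()                    # -> 2
optimal_parameter = history_df["parameter"].loc[index_of_optimum]            # -> 7.4
classifier_version = history_df["classifier_version"].loc[index_of_optimum]  # -> 3
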
@@ -533,7 +537,7 @@ def __init__( self, initial_parameter: float, target_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class.""" @@ -550,7 +554,7 @@ def __init__( self, initial_parameter: float, target_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class.""" @@ -567,7 +571,7 @@ def __init__( self, initial_parameter: float, target_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class.""" @@ -584,7 +588,7 @@ def __init__( self, initial_parameter: float, target_parameter: float, - workflow: peptidecentric.PeptideCentricWorkflow, + workflow, **kwargs, ): """See base class.""" diff --git a/tests/e2e_tests/e2e_test_cases.yaml b/tests/e2e_tests/e2e_test_cases.yaml index 8cfe2fa6..543870d9 100644 --- a/tests/e2e_tests/e2e_test_cases.yaml +++ b/tests/e2e_tests/e2e_test_cases.yaml @@ -114,9 +114,9 @@ test_cases: instrument: Lumos search: target_num_candidates: 3 - target_ms1_tolerance: 4 + target_ms1_tolerance: -1 target_ms2_tolerance: -1 - target_rt_tolerance: -1 + target_rt_tolerance: 200 search_initial: initial_num_candidates: 1 initial_ms1_tolerance: 10 From 3b243244282fa2d8628447a061c2ad007eca330b Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 10:51:36 +0200 Subject: [PATCH 11/36] formatting --- alphadia/workflow/peptidecentric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 00dbedd6..fb47991a 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -469,7 +469,7 @@ def calibration(self): order_of_optimization = targeted_optimizers + automatic_optimizers self.reporter.log_string( - "Starting initial classifier training and precursor identification", + "Starting initial classifier training and precursor identification.", verbosity="progress", ) From 09c0d707527c52064cfc23d1ebfeb6f7ff69a576 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 11:13:01 +0200 Subject: [PATCH 12/36] changed some TODOs and comments --- alphadia/workflow/searchoptimization.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 49752676..03aec284 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -72,7 +72,7 @@ def __init__( self.has_converged = False def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): - """See base class. The TODO is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" + """See base class. The feature is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if self.has_converged: self.reporter.log_string( f"✅ {self.parameter_name:<15}: optimization complete. 
Optimal parameter {self.workflow.com.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", @@ -88,7 +88,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): ), # Ensure float dtype self.feature_name: self._get_feature_value( precursors_df, fragments_df - ), # Ensure int dtype + ), "classifier_version": int( self.workflow.fdr_manager.current_version ), # Ensure int dtype @@ -283,8 +283,6 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): class AutomaticRTOptimizer(AutomaticOptimizer): - """TODO Finish this optimizer""" - def __init__( self, initial_parameter: float, @@ -307,7 +305,7 @@ def __init__( super().__init__(initial_parameter, workflow, **kwargs) def _check_convergence(self): - """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. + """Optimization should stop if continued optimization of the parameter is not improving the TODO feature value. This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. @@ -407,8 +405,6 @@ def _get_feature_value( class AutomaticMS1Optimizer(AutomaticOptimizer): - """TODO Finish this optimizer""" - def __init__( self, initial_parameter: float, @@ -431,7 +427,7 @@ def __init__( super().__init__(initial_parameter, workflow, **kwargs) def _check_convergence(self): - """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. + """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. @@ -492,7 +488,7 @@ def __init__( super().__init__(initial_parameter, workflow, **kwargs) def _check_convergence(self): - """Optimization should stop if continued narrowing of the TODO parameter is not improving the TODO feature value. + """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. 
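The convergence rule these docstrings describe is the same across the automatic optimizers: optimization stops once the most recent search no longer improves the tracked feature by more than 10% over either of the two preceding searches. A minimal sketch (the function name and the example values are illustrative, not part of the code):

def has_plateaued(feature: list[float]) -> bool:
    # At least three searches are needed before convergence can be declared.
    return (
        len(feature) > 2
        and feature[-1] < 1.1 * feature[-2]
        and feature[-1] < 1.1 * feature[-3]
    )

print(has_plateaued([1000, 1500, 1800]))  # False: identifications are still rising
print(has_plateaued([1500, 1550, 1560]))  # True: no >10% gain over the last two searches

Later patches in this series additionally require a configurable minimum number of optimization steps before this check is allowed to pass.
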
From 60b884e7333ff6e89c9f53d484f986714aee0276 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 11:52:45 +0200 Subject: [PATCH 13/36] e2e now runs automatic calibration test --- .github/workflows/e2e_testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e_testing.yml b/.github/workflows/e2e_testing.yml index 0a6c083a..7198e790 100644 --- a/.github/workflows/e2e_testing.yml +++ b/.github/workflows/e2e_testing.yml @@ -15,7 +15,7 @@ jobs: strategy: matrix: # test case name as defined in e2e_test_cases.yaml - test_case: [ "basic", "synchropasef", "astral", ] + test_case: [ "basic", "synchropasef", "astral", "astral_automatic_calibration", ] env: RUN_NAME: alphadia-${{github.sha}}-${{github.run_id}}-${{github.run_attempt}} BRANCH_NAME: ${{ github.head_ref || github.ref_name }} From 1170dcd596df4efbac38893f1ea214ae2e6640e1 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 15:01:22 +0200 Subject: [PATCH 14/36] fixed bugs with classifier version and workflow.extraction() --- alphadia/workflow/peptidecentric.py | 6 ++---- alphadia/workflow/searchoptimization.py | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index fb47991a..54bde250 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -512,6 +512,8 @@ def calibration(self): features_df, fragments_df, self.com.classifier_version ) + self.log_precursor_df(precursors_df) + precursors_df_filtered, fragments_df_filtered = self.filter_dfs( precursors_df, fragments_df ) @@ -731,10 +733,6 @@ def extraction(self): self.com.fit( { "num_candidates": self.config["search"]["target_num_candidates"], - "ms1_error": self.config["search"]["target_ms1_tolerance"], - "ms2_error": self.config["search"]["target_ms2_tolerance"], - "rt_error": self.config["search"]["target_rt_tolerance"], - "mobility_error": self.config["search"]["target_mobility_tolerance"], "column_type": "calibrated", } ) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 03aec284..6475d91a 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -92,6 +92,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): "classifier_version": int( self.workflow.fdr_manager.current_version ), # Ensure int dtype + "score_cutoff": float(self.workflow.com.score_cutoff), } ] ) @@ -107,9 +108,13 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): classifier_version_at_optimum = self.history_df["classifier_version"].loc[ index_of_optimum ] + score_cutoff_at_optimum = self.history_df["score_cutoff"].loc[ + index_of_optimum + ] self.workflow.com.fit({self.parameter_name: optimal_parameter}) self.workflow.com.fit({"classifier_version": classifier_version_at_optimum}) + self.workflow.com.fit({"score_cutoff": score_cutoff_at_optimum}) self.reporter.log_string( f"✅ {self.parameter_name:<15}: optimization complete. 
Optimal parameter {self.workflow.com.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", @@ -134,7 +139,6 @@ def plot(self): """Plot the optimization of the RT error parameter.""" fig, ax = plt.subplots() - # Plot the vertical line ax.axvline( x=self.workflow.com.__dict__[self.parameter_name], ymin=0, @@ -144,19 +148,19 @@ def plot(self): label=f"Optimal {self.parameter_name}", ) - # Plot the line and scatter plot using Seaborn sns.lineplot( - x=self.history_df["parameter"], - y=self.history_df[self.feature_name], + data=self.history_df, + x="parameter", + y=self.feature_name, ax=ax, ) sns.scatterplot( - x=self.history_df["parameter"], - y=self.history_df[self.feature_name], + data=self.history_df, + x="parameter", + y=self.feature_name, ax=ax, ) - # Set labels and other properties ax.set_xlabel(self.parameter_name) ax.xaxis.set_inverted(True) ax.set_ylim(bottom=0, top=self.history_df[self.feature_name].max() * 1.1) @@ -267,6 +271,9 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): ) just_converged = self._check_convergence(new_parameter) self.workflow.com.fit({self.parameter_name: new_parameter}) + self.workflow.com.fit( + {"classifier_version": self.workflow.fdr_manager.current_version} + ) if just_converged: self.has_converged = True From 599e3a206427fca2afb0f5f22f6ae33489e88bed Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 15:16:55 +0200 Subject: [PATCH 15/36] make precursor_df consistent --- alphadia/workflow/peptidecentric.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 54bde250..ddda447b 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -508,17 +508,17 @@ def calibration(self): verbosity="progress", ) - precursors_df = self.fdr_correction( + precursor_df = self.fdr_correction( features_df, fragments_df, self.com.classifier_version ) - self.log_precursor_df(precursors_df) + self.log_precursor_df(precursor_df) - precursors_df_filtered, fragments_df_filtered = self.filter_dfs( - precursors_df, fragments_df + precursor_df_filtered, fragments_df_filtered = self.filter_dfs( + precursor_df, fragments_df ) - self.recalibration(precursors_df_filtered, fragments_df_filtered) + self.recalibration(precursor_df_filtered, fragments_df_filtered) self.reporter.log_string( "=== checking if optimization conditions were reached ===", @@ -526,7 +526,7 @@ def calibration(self): ) for optimizer in optimizers: - optimizer.step(precursors_df_filtered, fragments_df_filtered) + optimizer.step(precursor_df_filtered, fragments_df_filtered) self.reporter.log_string( "==============================================", verbosity="info" From 0878a7710f022738fda06a6aab709e2c63b44996 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 15:21:43 +0200 Subject: [PATCH 16/36] formatting --- alphadia/workflow/peptidecentric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index e38fa06e..34959fe9 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -15,7 +15,6 @@ # alphadia imports from alphadia import fragcomp, plexscoring, utils -from alphadia.exceptions import NoRecalibrationTargetError from alphadia.peakgroup import search from alphadia.workflow import base, manager, searchoptimization From 1becbc9515b2509c33681f98250495b41e39b5ba Mon Sep 17 00:00:00 2001 From: odespard 
Date: Tue, 30 Jul 2024 17:49:16 +0200 Subject: [PATCH 17/36] improved logging --- alphadia/workflow/peptidecentric.py | 10 +++++++++- alphadia/workflow/searchoptimization.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 34959fe9..f18f2fe0 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -478,7 +478,10 @@ def calibration(self): for optimizers in order_of_optimization: for current_step in range(self.config["calibration"]["max_steps"]): - if np.all([optimizer.has_converged for optimizer in optimizers]): + if ( + np.all([optimizer.has_converged for optimizer in optimizers]) + and len(optimizers) > 0 + ): self.reporter.log_string( f"Optimization finished for {', '.join([optimizer.parameter_name for optimizer in optimizers])}.", verbosity="progress", @@ -506,6 +509,11 @@ def calibration(self): features_df, fragments_df, self.com.classifier_version ) + self.reporter.log_string( + f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", + verbosity="progress", + ) + self.log_precursor_df(precursor_df) precursor_df_filtered, fragments_df_filtered = self.filter_dfs( diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 6475d91a..c5709953 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -117,7 +117,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.workflow.com.fit({"score_cutoff": score_cutoff_at_optimum}) self.reporter.log_string( - f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.com.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", + f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.com.__dict__[self.parameter_name]:.4f} found after {len(self.history_df)} searches.", verbosity="progress", ) @@ -131,7 +131,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.workflow.com.fit({self.parameter_name: new_parameter}) self.reporter.log_string( - f"❌ {self.parameter_name:<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.workflow.com.__dict__[self.parameter_name]}.", + f"❌ {self.parameter_name:<15}: optimization incomplete after {len(self.history_df)} search(es). 
Will search with parameter {self.workflow.com.__dict__[self.parameter_name]:.4f}.", verbosity="progress", ) From b37e70c6b5a677143e581851e88ce50444c8d194 Mon Sep 17 00:00:00 2001 From: odespard Date: Tue, 30 Jul 2024 22:18:50 +0200 Subject: [PATCH 18/36] remove empty list in completely automatic calibration --- alphadia/workflow/peptidecentric.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index f18f2fe0..ecef0a27 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -358,7 +358,15 @@ def extract_optimization_data(self, target): f"=== Step {current_step}, extracted {len(feature_df)} precursors and {len(fragment_df)} fragments ===", verbosity="progress", ) - precursor_df = self.fdr_correction(features_df, fragments_df) + + precursor_df = self.fdr_correction( + features_df, fragments_df, self.com.classifier_version + ) + + self.reporter.log_string( + f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", + verbosity="info", + ) precursors_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) @@ -460,7 +468,11 @@ def calibration(self): if isinstance(optimizer, searchoptimization.AutomaticOptimizer) ] - order_of_optimization = targeted_optimizers + automatic_optimizers + order_of_optimization = ( + targeted_optimizers + automatic_optimizers + if any(targeted_optimizers) + else automatic_optimizers + ) self.reporter.log_string( "Starting initial classifier training and precursor identification.", @@ -478,10 +490,7 @@ def calibration(self): for optimizers in order_of_optimization: for current_step in range(self.config["calibration"]["max_steps"]): - if ( - np.all([optimizer.has_converged for optimizer in optimizers]) - and len(optimizers) > 0 - ): + if np.all([optimizer.has_converged for optimizer in optimizers]): self.reporter.log_string( f"Optimization finished for {', '.join([optimizer.parameter_name for optimizer in optimizers])}.", verbosity="progress", @@ -511,7 +520,7 @@ def calibration(self): self.reporter.log_string( f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", - verbosity="progress", + verbosity="info", ) self.log_precursor_df(precursor_df) From 0cb800f113f83659bef6cbc31fd1931653e91ed3 Mon Sep 17 00:00:00 2001 From: odespard Date: Wed, 31 Jul 2024 09:56:44 +0200 Subject: [PATCH 19/36] enforce minimum training iterations and backtrack for kernel --- alphadia/constants/default.yaml | 4 ++-- alphadia/workflow/peptidecentric.py | 2 +- alphadia/workflow/searchoptimization.py | 14 +++++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/alphadia/constants/default.yaml b/alphadia/constants/default.yaml index d59ec2f0..b0597ad5 100644 --- a/alphadia/constants/default.yaml +++ b/alphadia/constants/default.yaml @@ -95,8 +95,8 @@ search_advanced: calibration: - # minimum number of times (epochs) the updated calibration target has to been passed - min_epochs: 3 + # minimum number of times the classifier is trained. + min_training_iterations: 3 # Number of precursors searched and scored per batch batch_size: 8000 diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index ecef0a27..3be6fe5d 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -484,7 +484,7 @@ def calibration(self): ) self.reporter.log_string( - "Target number of precursors found. 
Starting search parameter optimization.", + "Required number of precursors found. Starting search parameter optimization.", verbosity="progress", ) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index c5709953..6a874bd1 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -93,6 +93,8 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): self.workflow.fdr_manager.current_version ), # Ensure int dtype "score_cutoff": float(self.workflow.com.score_cutoff), + "fwhm_rt": float(self.workflow.com.fwhm_rt), + "fwhm_mobility": float(self.workflow.com.fwhm_mobility), } ] ) @@ -111,10 +113,16 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): score_cutoff_at_optimum = self.history_df["score_cutoff"].loc[ index_of_optimum ] + fwhm_rt_at_optimum = self.history_df["fwhm_rt"].loc[index_of_optimum] + fwhm_mobility_at_optimum = self.history_df["fwhm_mobility"].loc[ + index_of_optimum + ] self.workflow.com.fit({self.parameter_name: optimal_parameter}) self.workflow.com.fit({"classifier_version": classifier_version_at_optimum}) self.workflow.com.fit({"score_cutoff": score_cutoff_at_optimum}) + self.workflow.com.fit({"fwhm_rt": fwhm_rt_at_optimum}) + self.workflow.com.fit({"fwhm_mobility": fwhm_mobility_at_optimum}) self.reporter.log_string( f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.com.__dict__[self.parameter_name]:.4f} found after {len(self.history_df)} searches.", @@ -241,7 +249,11 @@ def _check_convergence(self, proposed_parameter): The proposed parameter for the next round of optimization. """ - return proposed_parameter <= self.target_parameter + return ( + proposed_parameter <= self.target_parameter + and self.workflow.current_version + > self.workflow.config["min_training_iterations"] + ) def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is From 7d3f930d6081d2687d3daa419600ede9de53a8a3 Mon Sep 17 00:00:00 2001 From: odespard Date: Wed, 31 Jul 2024 10:54:27 +0200 Subject: [PATCH 20/36] impose minimum iterations during extraction of optimization data --- alphadia/workflow/peptidecentric.py | 19 ++++++++++++++----- alphadia/workflow/searchoptimization.py | 6 +----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 3be6fe5d..44151f7d 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -314,7 +314,7 @@ def get_batch_plan(self): return plan - def extract_optimization_data(self, target): + def extract_optimization_data(self, target, min_iterations): """Search parameter optimization (i.e. refinement of tolerances for RT, MS2, etc.) is performed on a subset of the elution groups in the spectral library. The number of elution groups which must be searched to get a sufficiently large number for robust calibration varies depending the library used and the data. This function searches an increasing number of elution groups until a sufficient number (determined by target) of precursors are identified at 1% FDR. 
@@ -371,13 +371,21 @@ def extract_optimization_data(self, target): precursors_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) self.reporter.log_string( - f"=== checking if minimum number of precursors for optimization found yet; minimum number is {target} ===", + f"=== Checking if minimum number of precursors for optimization found yet; minimum number is {target} ===", verbosity="progress", ) self.log_precursor_df(precursor_df) - if precursors_01FDR > target: + self.reporter.log_string( + f"=== Classifier has been trained for {self.fdr_manager.current_version + 1} iteration(s); minimum number is {min_iterations} ===", + verbosity="progress", + ) + + if ( + precursors_01FDR > target + and self.fdr_manager.current_version >= min_iterations - 1 + ): final_stop_index = stop_index # final_stop_index is the number of elution groups that will be included in the calibration data break @@ -480,11 +488,12 @@ def calibration(self): ) self.extract_optimization_data( - self.config["calibration"]["min_precursors_for_optimization"] + self.config["calibration"]["min_precursors_for_optimization"], + self.config["calibration"]["min_training_iterations"], ) self.reporter.log_string( - "Required number of precursors found. Starting search parameter optimization.", + "Required number of precursors found and required number of training iterations performed. Starting search parameter optimization.", verbosity="progress", ) diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 6a874bd1..9c8d392a 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -249,11 +249,7 @@ def _check_convergence(self, proposed_parameter): The proposed parameter for the next round of optimization. """ - return ( - proposed_parameter <= self.target_parameter - and self.workflow.current_version - > self.workflow.config["min_training_iterations"] - ) + return proposed_parameter <= self.target_parameter def _propose_new_parameter(self, df: pd.DataFrame): """See base class. 
The update rule is From 4542a7eafcfb8a5aaf8825517b72bca294d7fb13 Mon Sep 17 00:00:00 2001 From: odespard Date: Wed, 31 Jul 2024 14:58:11 +0200 Subject: [PATCH 21/36] introduce extra optimization before loop and enforce minimum steps for optimization --- alphadia/constants/default.yaml | 3 +++ alphadia/workflow/peptidecentric.py | 39 ++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/alphadia/constants/default.yaml b/alphadia/constants/default.yaml index b0597ad5..9a4202f2 100644 --- a/alphadia/constants/default.yaml +++ b/alphadia/constants/default.yaml @@ -107,6 +107,9 @@ calibration: # the maximum number of steps that a given optimizer is permitted to take max_steps: 20 + # the maximum number of steps that a given optimizer is permitted to take + min_steps: 3 + # TODO: remove this parameter final_full_calibration: False diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 44151f7d..8768b6c5 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -389,8 +389,10 @@ def extract_optimization_data(self, target, min_iterations): final_stop_index = stop_index # final_stop_index is the number of elution groups that will be included in the calibration data break - self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] - self.com.fit({"classifier_version": self.fdr_manager.current_version}) + return self.elution_group_order[:final_stop_index], precursor_df, fragments_df + + # self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] + # self.com.fit({"classifier_version": self.fdr_manager.current_version}) def calibration(self): if ( @@ -487,19 +489,38 @@ def calibration(self): verbosity="progress", ) - self.extract_optimization_data( - self.config["calibration"]["min_precursors_for_optimization"], - self.config["calibration"]["min_training_iterations"], + self.eg_idxes_for_calibration, precursor_df, fragments_df = ( + self.extract_optimization_data( + self.config["calibration"]["min_precursors_for_optimization"], + self.config["calibration"]["min_training_iterations"], + ) + ) + + self.com.fit({"classifier_version": self.fdr_manager.current_version}) + + precursor_df_filtered, fragments_df_filtered = self.filter_dfs( + precursor_df, fragments_df ) + self.recalibration(precursor_df_filtered, fragments_df_filtered) + self.reporter.log_string( "Required number of precursors found and required number of training iterations performed. Starting search parameter optimization.", verbosity="progress", ) + # Perform an initial optimization step based on the extracted data to update the initial search parameters. + # This ensures that the classifier is trained at least once prior to the end of optimization, even if the min_steps parameter is 0. 
+ for optimizers in order_of_optimization: + for optimizer in optimizers: + optimizer.step(precursor_df_filtered, fragments_df_filtered) + for optimizers in order_of_optimization: for current_step in range(self.config["calibration"]["max_steps"]): - if np.all([optimizer.has_converged for optimizer in optimizers]): + if ( + np.all([optimizer.has_converged for optimizer in optimizers]) + and current_step > self.config["calibration"]["min_steps"] - 1 + ): self.reporter.log_string( f"Optimization finished for {', '.join([optimizer.parameter_name for optimizer in optimizers])}.", verbosity="progress", @@ -551,6 +572,12 @@ def calibration(self): self.reporter.log_string( "==============================================", verbosity="info" ) + + self.reporter.log_string( + f"=== Optimization has been performed for {current_step + 1} step(s); minimum number is {self.config["calibration"]["min_steps"]} ===", + verbosity="progress", + ) + self.reporter.log_string( "Search parameter optimization finished. Values taken forward for search are:", verbosity="progress", From 087a85642ee0f8726d4486558d6979392395dc97 Mon Sep 17 00:00:00 2001 From: odespard Date: Wed, 31 Jul 2024 16:34:28 +0200 Subject: [PATCH 22/36] check min_steps as part of _check_convergence method --- alphadia/workflow/peptidecentric.py | 24 +++++++++++++++++------- alphadia/workflow/searchoptimization.py | 23 ++++++++++++++++++----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 8768b6c5..b195c6b4 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -509,6 +509,10 @@ def calibration(self): verbosity="progress", ) + self.reporter.log_string( + "Perform initial optimization on extracted data.", + verbosity="info", + ) # Perform an initial optimization step based on the extracted data to update the initial search parameters. # This ensures that the classifier is trained at least once prior to the end of optimization, even if the min_steps parameter is 0. 
for optimizers in order_of_optimization: @@ -517,10 +521,7 @@ def calibration(self): for optimizers in order_of_optimization: for current_step in range(self.config["calibration"]["max_steps"]): - if ( - np.all([optimizer.has_converged for optimizer in optimizers]) - and current_step > self.config["calibration"]["min_steps"] - 1 - ): + if np.all([optimizer.has_converged for optimizer in optimizers]): self.reporter.log_string( f"Optimization finished for {', '.join([optimizer.parameter_name for optimizer in optimizers])}.", verbosity="progress", @@ -567,14 +568,16 @@ def calibration(self): ) for optimizer in optimizers: - optimizer.step(precursor_df_filtered, fragments_df_filtered) + optimizer.step( + precursor_df_filtered, fragments_df_filtered, current_step + ) self.reporter.log_string( "==============================================", verbosity="info" ) self.reporter.log_string( - f"=== Optimization has been performed for {current_step + 1} step(s); minimum number is {self.config["calibration"]["min_steps"]} ===", + f"=== Optimization has been performed for {current_step + 1} step(s); minimum number is {self.config['calibration']['min_steps']} ===", verbosity="progress", ) @@ -794,7 +797,14 @@ def extraction(self): apply_cutoff=True, ) - precursor_df = self.fdr_correction(features_df, fragments_df) + self.reporter.log_string( + f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", + verbosity="info", + ) + + precursor_df = self.fdr_correction( + features_df, fragments_df, self.com.classifier_version + ) precursor_df = precursor_df[precursor_df["qval"] <= self.config["fdr"]["fdr"]] diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 9c8d392a..264a2ca5 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -240,16 +240,24 @@ def __init__( self.target_parameter = target_parameter self.has_converged = False - def _check_convergence(self, proposed_parameter): + def _check_convergence(self, proposed_parameter: float, current_step: int = -1): """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. Parameters ---------- proposed_parameter: float The proposed parameter for the next round of optimization. + + current_step: int + The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. + + """ - return proposed_parameter <= self.target_parameter + return ( + proposed_parameter <= self.target_parameter + and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 + ) def _propose_new_parameter(self, df: pd.DataFrame): """See base class. 
The update rule is @@ -265,7 +273,12 @@ def _propose_new_parameter(self, df: pd.DataFrame): self.target_parameter, ) - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + def step( + self, + precursors_df: pd.DataFrame, + fragments_df: pd.DataFrame, + current_step: int = -1, + ): """See base class.""" if self.has_converged: self.reporter.log_string( @@ -277,7 +290,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): new_parameter = self._propose_new_parameter( precursors_df if self.estimator_group_name == "precursor" else fragments_df ) - just_converged = self._check_convergence(new_parameter) + just_converged = self._check_convergence(new_parameter, current_step) self.workflow.com.fit({self.parameter_name: new_parameter}) self.workflow.com.fit( {"classifier_version": self.workflow.fdr_manager.current_version} @@ -292,7 +305,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): else: self.reporter.log_string( - f"❌ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} > {self.target_parameter:.4f}", + f"❌ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} > {self.target_parameter:.4f} or insufficient steps taken.", verbosity="progress", ) From 70dba731bb1da8c470ff0665503b0a44e397cb5c Mon Sep 17 00:00:00 2001 From: odespard Date: Wed, 31 Jul 2024 20:06:28 +0200 Subject: [PATCH 23/36] add changes to automatic optimizers --- alphadia/workflow/peptidecentric.py | 2 +- alphadia/workflow/searchoptimization.py | 37 +++++++++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index b195c6b4..8a71513b 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -510,7 +510,7 @@ def calibration(self): ) self.reporter.log_string( - "Perform initial optimization on extracted data.", + "=== Performing initial optimization on extracted data. ===", verbosity="info", ) # Perform an initial optimization step based on the extracted data to update the initial search parameters. diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/searchoptimization.py index 264a2ca5..05837a44 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/searchoptimization.py @@ -71,7 +71,12 @@ def __init__( self.workflow.com.fit({self.parameter_name: initial_parameter}) self.has_converged = False - def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): + def step( + self, + precursors_df: pd.DataFrame, + fragments_df: pd.DataFrame, + current_step: int = -1, + ): """See base class. The feature is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if self.has_converged: self.reporter.log_string( @@ -99,7 +104,7 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): ] ) self.history_df = pd.concat([self.history_df, new_row], ignore_index=True) - just_converged = self._check_convergence() + just_converged = self._check_convergence(current_step) if just_converged: self.has_converged = True @@ -332,7 +337,7 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, **kwargs) - def _check_convergence(self): + def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued optimization of the parameter is not improving the TODO feature value. 
This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. @@ -341,6 +346,10 @@ def _check_convergence(self): ----- Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + Parameters + ---------- + current_step: int + The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. """ @@ -350,6 +359,7 @@ def _check_convergence(self): < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] + and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): @@ -393,7 +403,7 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, **kwargs) - def _check_convergence(self): + def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued narrowing of the MS2 parameter is not improving the number of precursor identifications. This function checks if the previous rounds of optimization have led to a meaningful improvement in the number of identifications. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. @@ -402,6 +412,10 @@ def _check_convergence(self): ----- Because the check for an increase in identifications requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + Parameters + ---------- + current_step: int + The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. """ @@ -411,6 +425,7 @@ def _check_convergence(self): < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] + and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): @@ -454,7 +469,7 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, **kwargs) - def _check_convergence(self): + def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. @@ -463,6 +478,10 @@ def _check_convergence(self): ----- Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + Parameters + ---------- + current_step: int + The current step in the optimization process. 
By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. """ @@ -472,6 +491,7 @@ def _check_convergence(self): < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] + and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): @@ -515,7 +535,7 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, **kwargs) - def _check_convergence(self): + def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. @@ -524,6 +544,10 @@ def _check_convergence(self): ----- Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + Parameters + ---------- + current_step: int + The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. """ @@ -533,6 +557,7 @@ def _check_convergence(self): < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] + and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): From e6987c1250bdefaea94219b93b73f6d6dfab2f97 Mon Sep 17 00:00:00 2001 From: odespard Date: Thu, 1 Aug 2024 14:12:38 +0200 Subject: [PATCH 24/36] improved names and formatting --- alphadia/constants/default.yaml | 6 +- ...{searchoptimization.py => optimization.py} | 70 ++++++++------ alphadia/workflow/peptidecentric.py | 91 +++++++++---------- tests/unit_tests/test_workflow.py | 4 +- 4 files changed, 91 insertions(+), 80 deletions(-) rename alphadia/workflow/{searchoptimization.py => optimization.py} (92%) diff --git a/alphadia/constants/default.yaml b/alphadia/constants/default.yaml index 9a4202f2..390c2862 100644 --- a/alphadia/constants/default.yaml +++ b/alphadia/constants/default.yaml @@ -95,14 +95,14 @@ search_advanced: calibration: - # minimum number of times the classifier is trained. 
- min_training_iterations: 3 + # minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted) + optimization_lock_min_steps: 3 # Number of precursors searched and scored per batch batch_size: 8000 # minimum number of precursors to be found before search parameter optimization begins - min_precursors_for_optimization: 200 + optimization_lock_target: 200 # the maximum number of steps that a given optimizer is permitted to take max_steps: 20 diff --git a/alphadia/workflow/searchoptimization.py b/alphadia/workflow/optimization.py similarity index 92% rename from alphadia/workflow/searchoptimization.py rename to alphadia/workflow/optimization.py index 05837a44..7fa6283e 100644 --- a/alphadia/workflow/searchoptimization.py +++ b/alphadia/workflow/optimization.py @@ -49,13 +49,25 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """ pass + @abstractmethod + def plot(self): + """ + This method plots relevant information about optimization of the search parameter. + + Notes + ----- + This can be left blank if there is nothing of interest to plot. + + """ + pass + class AutomaticOptimizer(BaseOptimizer): def __init__( self, initial_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """This class automatically optimizes the search parameter and stores the progres of optimization in a dataframe, history_df. @@ -66,7 +78,7 @@ def __init__( """ - super().__init__(workflow, **kwargs) + super().__init__(workflow, reporter) self.history_df = pd.DataFrame() self.workflow.com.fit({self.parameter_name: initial_parameter}) self.has_converged = False @@ -88,18 +100,14 @@ def step( new_row = pd.DataFrame( [ { - "parameter": float( - self.workflow.com.__dict__[self.parameter_name] - ), # Ensure float dtype + "parameter": self.workflow.com.__dict__[self.parameter_name], self.feature_name: self._get_feature_value( precursors_df, fragments_df ), - "classifier_version": int( - self.workflow.fdr_manager.current_version - ), # Ensure int dtype - "score_cutoff": float(self.workflow.com.score_cutoff), - "fwhm_rt": float(self.workflow.com.fwhm_rt), - "fwhm_mobility": float(self.workflow.com.fwhm_mobility), + "classifier_version": self.workflow.fdr_manager.current_version, + "score_cutoff": self.workflow.com.score_cutoff, + "fwhm_rt": self.workflow.com.fwhm_rt, + "fwhm_mobility": self.workflow.com.fwhm_mobility, } ] ) @@ -226,7 +234,7 @@ def __init__( initial_parameter: float, target_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """This class optimizes the search parameter until it reaches a user-specified target value. @@ -240,7 +248,7 @@ def __init__( Optimization will stop when this parameter is reached. """ - super().__init__(workflow, **kwargs) + super().__init__(workflow, reporter) self.workflow.com.fit({self.parameter_name: initial_parameter}) self.target_parameter = target_parameter self.has_converged = False @@ -314,13 +322,17 @@ def step( verbosity="progress", ) + def plot(self): + """Empty method for consistency with AutomaticOptimizer.""" + pass + class AutomaticRTOptimizer(AutomaticOptimizer): def __init__( self, initial_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class. 
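As the TargetedOptimizer changes in this series show, a targeted optimizer now converges only when the proposed tolerance has reached the user-specified target and the configured minimum number of optimization steps has been taken. A compact restatement of that gate (the helper name and example values are illustrative):

def targeted_has_converged(
    proposed: float, target: float, current_step: int, min_steps: int
) -> bool:
    # current_step is zero-based, so taking min_steps searches corresponds to
    # current_step >= min_steps - 1.
    return proposed <= target and current_step >= min_steps - 1

print(targeted_has_converged(6.5, 7.0, current_step=0, min_steps=2))  # False: too few steps
print(targeted_has_converged(6.5, 7.0, current_step=1, min_steps=2))  # True
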
@@ -335,7 +347,7 @@ def __init__( self.estimator_group_name = "precursor" self.estimator_name = "rt" self.feature_name = "precursor_count" - super().__init__(initial_parameter, workflow, **kwargs) + super().__init__(initial_parameter, workflow, reporter) def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued optimization of the parameter is not improving the TODO feature value. @@ -386,7 +398,7 @@ def __init__( self, initial_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number. @@ -401,7 +413,7 @@ def __init__( self.estimator_group_name = "fragment" self.estimator_name = "mz" self.feature_name = "precursor_count" - super().__init__(initial_parameter, workflow, **kwargs) + super().__init__(initial_parameter, workflow, reporter) def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued narrowing of the MS2 parameter is not improving the number of precursor identifications. @@ -452,7 +464,7 @@ def __init__( self, initial_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class. @@ -467,7 +479,7 @@ def __init__( self.estimator_group_name = "precursor" self.estimator_name = "mz" self.feature_name = "precursor_count" - super().__init__(initial_parameter, workflow, **kwargs) + super().__init__(initial_parameter, workflow, reporter) def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. @@ -518,7 +530,7 @@ def __init__( self, initial_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class. @@ -533,7 +545,7 @@ def __init__( self.estimator_group_name = "precursor" self.estimator_name = "mobility" self.feature_name = "precursor_count" - super().__init__(initial_parameter, workflow, **kwargs) + super().__init__(initial_parameter, workflow, reporter) def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. 
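The update rule documented for the automatic optimizers ("calculate the deviations of predicted from observed values, take the mean of the endpoints of the central 99% of these deviations, and multiply by 1.1") is delegated to the calibration estimator's ci method. The sketch below is only one plausible reading of that docstring, assuming ci returns the mean of the absolute endpoints of the central interval; the authoritative behaviour is whatever alphadia's calibration code implements:

import numpy as np

def propose_tolerance(
    observed: np.ndarray, predicted: np.ndarray, interval: float = 0.99
) -> float:
    deviations = observed - predicted
    # Endpoints of the central `interval` (e.g. 99%) of the deviations.
    lower, upper = np.quantile(
        deviations, [(1 - interval) / 2, 1 - (1 - interval) / 2]
    )
    # Average the absolute endpoints and widen by 10%, as the docstrings describe.
    return 1.1 * float(np.mean([abs(lower), abs(upper)]))
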
@@ -587,13 +599,13 @@ def __init__( initial_parameter: float, target_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class.""" self.parameter_name = "rt_error" self.estimator_group_name = "precursor" self.estimator_name = "rt" - super().__init__(initial_parameter, target_parameter, workflow, **kwargs) + super().__init__(initial_parameter, target_parameter, workflow, reporter) class TargetedMS2Optimizer(TargetedOptimizer): @@ -604,13 +616,13 @@ def __init__( initial_parameter: float, target_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class.""" self.parameter_name = "ms2_error" self.estimator_group_name = "fragment" self.estimator_name = "mz" - super().__init__(initial_parameter, target_parameter, workflow, **kwargs) + super().__init__(initial_parameter, target_parameter, workflow, reporter) class TargetedMS1Optimizer(TargetedOptimizer): @@ -621,13 +633,13 @@ def __init__( initial_parameter: float, target_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class.""" self.parameter_name = "ms1_error" self.estimator_group_name = "precursor" self.estimator_name = "mz" - super().__init__(initial_parameter, target_parameter, workflow, **kwargs) + super().__init__(initial_parameter, target_parameter, workflow, reporter) class TargetedMobilityOptimizer(TargetedOptimizer): @@ -638,10 +650,10 @@ def __init__( initial_parameter: float, target_parameter: float, workflow, - **kwargs, + reporter: None | reporting.Pipeline | reporting.Backend = None, ): """See base class.""" self.parameter_name = "mobility_error" self.estimator_group_name = "precursor" self.estimator_name = "mobility" - super().__init__(initial_parameter, target_parameter, workflow, **kwargs) + super().__init__(initial_parameter, target_parameter, workflow, reporter) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 8a71513b..98ed982c 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -16,7 +16,7 @@ # alphadia imports from alphadia import fragcomp, plexscoring, utils from alphadia.peakgroup import search -from alphadia.workflow import base, manager, searchoptimization +from alphadia.workflow import base, manager, optimization logger = logging.getLogger() @@ -314,17 +314,13 @@ def get_batch_plan(self): return plan - def extract_optimization_data(self, target, min_iterations): + def get_optimization_lock(self): """Search parameter optimization (i.e. refinement of tolerances for RT, MS2, etc.) is performed on a subset of the elution groups in the spectral library. The number of elution groups which must be searched to get a sufficiently large number for robust calibration varies depending the library used and the data. - This function searches an increasing number of elution groups until a sufficient number (determined by target) of precursors are identified at 1% FDR. + This function searches an increasing number of elution groups until a sufficient number of precursors are identified at 1% FDR and a sufficient number of steps have been taken. + The values deemed sufficient are specified in by "optimization_lock_target" and "optmization_lock_min_steps" in the config. It then returns the elution group indexes which will be used to find the data in the spectral library for search parameter optimization. 
- Parameters - ---------- - - target : int - The number of precursors which must be identified at 1% FDR to stop the extraction. """ @@ -371,20 +367,22 @@ def extract_optimization_data(self, target, min_iterations): precursors_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) self.reporter.log_string( - f"=== Checking if minimum number of precursors for optimization found yet; minimum number is {target} ===", + f"=== Checking if minimum number of precursors for optimization found yet; minimum number is {self.config['calibration']['optimization_lock_target']} ===", verbosity="progress", ) self.log_precursor_df(precursor_df) self.reporter.log_string( - f"=== Classifier has been trained for {self.fdr_manager.current_version + 1} iteration(s); minimum number is {min_iterations} ===", + f"=== Classifier has been trained for {self.fdr_manager.current_version + 1} iteration(s); minimum number is {self.config['calibration']['optimization_lock_min_steps']} ===", verbosity="progress", ) if ( - precursors_01FDR > target - and self.fdr_manager.current_version >= min_iterations - 1 + precursors_01FDR + > self.config["calibration"]["optimization_lock_target"] + and self.fdr_manager.current_version + >= self.config["calibration"]["optimization_lock_min_steps"] - 1 ): final_stop_index = stop_index # final_stop_index is the number of elution groups that will be included in the calibration data break @@ -394,49 +392,39 @@ def extract_optimization_data(self, target, min_iterations): # self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] # self.com.fit({"classifier_version": self.fdr_manager.current_version}) - def calibration(self): - if ( - self.calibration_manager.is_fitted - and self.calibration_manager.is_loaded_from_file - ): - self.reporter.log_string( - "Skipping calibration as existing calibration was found", - verbosity="progress", - ) - return - + def get_optimizers(self): if self.config["search"]["target_ms2_tolerance"] > 0: - ms2_optimizer = searchoptimization.TargetedMS2Optimizer( + ms2_optimizer = optimization.TargetedMS2Optimizer( self.config["search_initial"]["initial_ms2_tolerance"], self.config["search"]["target_ms2_tolerance"], self, ) else: - ms2_optimizer = searchoptimization.AutomaticMS2Optimizer( + ms2_optimizer = optimization.AutomaticMS2Optimizer( self.config["search_initial"]["initial_ms2_tolerance"], self, ) if self.config["search"]["target_rt_tolerance"] > 0: - rt_optimizer = searchoptimization.TargetedRTOptimizer( + rt_optimizer = optimization.TargetedRTOptimizer( self.config["search_initial"]["initial_rt_tolerance"], self.config["search"]["target_rt_tolerance"], self, ) else: - rt_optimizer = searchoptimization.AutomaticRTOptimizer( + rt_optimizer = optimization.AutomaticRTOptimizer( self.config["search_initial"]["initial_rt_tolerance"], self, ) if self.dia_data.has_ms1: if self.config["search"]["target_ms1_tolerance"] > 0: - ms1_optimizer = searchoptimization.TargetedMS1Optimizer( + ms1_optimizer = optimization.TargetedMS1Optimizer( self.config["search_initial"]["initial_ms1_tolerance"], self.config["search"]["target_ms1_tolerance"], self, ) else: - ms1_optimizer = searchoptimization.AutomaticMS1Optimizer( + ms1_optimizer = optimization.AutomaticMS1Optimizer( self.config["search_initial"]["initial_ms1_tolerance"], self, ) @@ -444,13 +432,13 @@ def calibration(self): ms1_optimizer = None if self.dia_data.has_mobility: if self.config["search"]["target_mobility_tolerance"] > 0: - mobility_optimizer = searchoptimization.TargetedMobilityOptimizer( + 
mobility_optimizer = optimization.TargetedMobilityOptimizer( self.config["search_initial"]["initial_mobility_tolerance"], self.config["search"]["target_mobility_tolerance"], self, ) else: - mobility_optimizer = searchoptimization.AutomaticMobilityOptimizer( + mobility_optimizer = optimization.AutomaticMobilityOptimizer( self.config["search_initial"]["initial_mobility_tolerance"], self.calibration_manager, self.com, @@ -469,13 +457,13 @@ def calibration(self): [ optimizer for optimizer in optimizers - if isinstance(optimizer, searchoptimization.TargetedOptimizer) + if isinstance(optimizer, optimization.TargetedOptimizer) ] ] automatic_optimizers = [ [optimizer] for optimizer in optimizers - if isinstance(optimizer, searchoptimization.AutomaticOptimizer) + if isinstance(optimizer, optimization.AutomaticOptimizer) ] order_of_optimization = ( @@ -484,20 +472,33 @@ def calibration(self): else automatic_optimizers ) + return order_of_optimization + + def calibration(self): + if ( + self.calibration_manager.is_fitted + and self.calibration_manager.is_loaded_from_file + ): + self.reporter.log_string( + "Skipping calibration as existing calibration was found", + verbosity="progress", + ) + return + + order_of_optimization = self.get_optimizers() + self.reporter.log_string( "Starting initial classifier training and precursor identification.", verbosity="progress", ) - self.eg_idxes_for_calibration, precursor_df, fragments_df = ( - self.extract_optimization_data( - self.config["calibration"]["min_precursors_for_optimization"], - self.config["calibration"]["min_training_iterations"], - ) + eg_idxes_for_calibration, precursor_df, fragments_df = ( + self.get_optimization_lock() ) self.com.fit({"classifier_version": self.fdr_manager.current_version}) + # Perform a first recalibration on the optimization lock. precursor_df_filtered, fragments_df_filtered = self.filter_dfs( precursor_df, fragments_df ) @@ -513,12 +514,13 @@ def calibration(self): "=== Performing initial optimization on extracted data. ===", verbosity="info", ) - # Perform an initial optimization step based on the extracted data to update the initial search parameters. - # This ensures that the classifier is trained at least once prior to the end of optimization, even if the min_steps parameter is 0. + for optimizers in order_of_optimization: for optimizer in optimizers: optimizer.step(precursor_df_filtered, fragments_df_filtered) + # End of first recalibration + # Start of optimization/recalibration loop for optimizers in order_of_optimization: for current_step in range(self.config["calibration"]["max_steps"]): if np.all([optimizer.has_converged for optimizer in optimizers]): @@ -528,13 +530,12 @@ def calibration(self): ) for optimizer in optimizers: - if isinstance(optimizer, searchoptimization.AutomaticOptimizer): - optimizer.plot() + optimizer.plot() break batch_df = self.spectral_library._precursor_df[ self.spectral_library._precursor_df["elution_group_idx"].isin( - self.eg_idxes_for_calibration + eg_idxes_for_calibration ) ] @@ -623,9 +624,7 @@ def filter_dfs(self, precursor_df, fragments_df): ), max_fragments, ) - fragments_df_filtered = fragments_df_filtered.iloc[ - :stop_rank - ] # QUESTION: Should this raise an exception if the length of fragments_df_full is less than min_fragments? 
+ fragments_df_filtered = fragments_df_filtered.iloc[:stop_rank] self.reporter.log_string( f"fragments_df_filtered: {len(fragments_df_filtered)}", verbosity="info" diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 1729a165..3337859e 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -13,7 +13,7 @@ from alphadia.calibration.models import LOESSRegression from alphadia.calibration.property import Calibration from alphadia.fdrexperimental import BinaryClassifierLegacyNewBatching -from alphadia.workflow import base, manager, searchoptimization +from alphadia.workflow import base, manager, optimization def test_base_manager(): @@ -393,7 +393,7 @@ def ms2_optimizer_test(): test_dict = defaultdict(list) test_dict["var"] = list(range(100)) - ms2_optimizer = searchoptimization.MS2Optimizer( + ms2_optimizer = optimization.MS2Optimizer( 100, calibration_manager, optimization_manager, fdr_manager ) From d91689abede6b11e4418115fedaa2cb619cb8cd0 Mon Sep 17 00:00:00 2001 From: odespard Date: Thu, 1 Aug 2024 15:42:50 +0200 Subject: [PATCH 25/36] improve docstrings --- alphadia/workflow/optimization.py | 115 ++++++++++++++++------------ alphadia/workflow/peptidecentric.py | 69 ++++++++++++++++- 2 files changed, 132 insertions(+), 52 deletions(-) diff --git a/alphadia/workflow/optimization.py b/alphadia/workflow/optimization.py index 7fa6283e..f5b14acd 100644 --- a/alphadia/workflow/optimization.py +++ b/alphadia/workflow/optimization.py @@ -24,7 +24,10 @@ def __init__( ---------- workflow: peptidecentric.PeptideCentricWorkflow - The workflow object that the optimization is being performed on. + The workflow object, which includes the calibration, calibration_optimization and FDR managers which are used as part of optimization. + + reporter: None | reporting.Pipeline | reporting.Backend + The reporter object used to log information about the optimization process. If None, a new LogBackend object is created. """ self.optimal_parameter = None @@ -56,7 +59,7 @@ def plot(self): Notes ----- - This can be left blank if there is nothing of interest to plot. + This can be overwritten with pass if there is nothing of interest to plot. """ pass @@ -76,6 +79,7 @@ def __init__( initial_parameter: float The parameter used for search in the first round of optimization. + See base class for other parameters. """ super().__init__(workflow, reporter) @@ -157,7 +161,7 @@ def step( ) def plot(self): - """Plot the optimization of the RT error parameter.""" + """Plot the value of the feature used to assess optimization progress against the parameter value, for each value tested.""" fig, ax = plt.subplots() ax.axvline( @@ -199,13 +203,24 @@ def _propose_new_parameter(self, df): df: pd.DataFrame The dataframe used to update the parameter. This could be the precursor or fragment dataframe, depending on the search parameter being optimized. + Returns + ------- + float + The proposed new value for the search parameter. + """ pass @abstractmethod def _check_convergence(self): - """This method checks if the optimization has converged according to parameter-specific conditions and, if it has, sets the optimal parameter attribute and updates the optimization manager.""" + """This method checks if the optimization has converged according to parameter-specific conditions. + + Returns + ------- + bool + + """ pass @abstractmethod @@ -247,6 +262,8 @@ def __init__( target_parameter: float Optimization will stop when this parameter is reached. 
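The stopping rule for these targeted optimizers reduces to a simple comparison, shown here as an illustrative helper whose name and signature are not part of the patch: the proposed tolerance must have reached the user-specified target and the configured minimum number of steps must have been taken.

def targeted_has_converged(
    proposed_parameter: float, target_parameter: float, current_step: int, min_steps: int
) -> bool:
    min_steps_reached = current_step >= min_steps - 1
    return proposed_parameter <= target_parameter and min_steps_reached

# Example with an assumed target tolerance of 7 and min_steps of 2:
assert targeted_has_converged(6.5, 7.0, current_step=2, min_steps=2)
assert not targeted_has_converged(6.5, 7.0, current_step=0, min_steps=2)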
+ See base class for other parameters. + """ super().__init__(workflow, reporter) self.workflow.com.fit({self.parameter_name: initial_parameter}) @@ -254,7 +271,7 @@ def __init__( self.has_converged = False def _check_convergence(self, proposed_parameter: float, current_step: int = -1): - """The optimization has converged if the proposed parameter is equal to or less than the target parameter. At this point, the target parameter is saved as the optimal parameter. + """The optimization has converged if the proposed parameter is equal to or less than the target parameter and the a sufficient number of steps has been taken. Parameters ---------- @@ -264,6 +281,11 @@ def _check_convergence(self, proposed_parameter: float, current_step: int = -1): current_step: int The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. + Returns + ------- + bool + True if proposed parameter less than target and the current step is greater than the minimum required, False otherwise. + """ @@ -334,15 +356,7 @@ def __init__( workflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): - """See base class. - - Parameters - ---------- - - initial_parameter: float - The parameter used for search in the first round of optimization. - - """ + """See base class. Optimizes retention time error.""" self.parameter_name = "rt_error" self.estimator_group_name = "precursor" self.estimator_name = "rt" @@ -352,7 +366,7 @@ def __init__( def _check_convergence(self, current_step: int = -1): """Optimization should stop if continued optimization of the parameter is not improving the TODO feature value. This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. - If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. + It also checks if the current step is greater than the minimum number of steps required for optimization. Notes ----- @@ -363,6 +377,11 @@ def _check_convergence(self, current_step: int = -1): current_step: int The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. + Returns + ------- + bool + True if the convergence conditions are met, False otherwise. + """ return ( @@ -381,6 +400,10 @@ def _propose_new_parameter(self, df: pd.DataFrame): 3) multiply this value by 1.1. This is implemented by the ci method for the estimator. + Returns + ------- + float + The proposed new value for the search parameter. """ return 1.1 * self.workflow.calibration_manager.get_estimator( @@ -400,15 +423,7 @@ def __init__( workflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): - """This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number. - - Parameters - ---------- - initial_parameter: float - The parameter used for search in the first round of optimization. - - - """ + """See base class. 
This class automatically optimizes the MS2 tolerance parameter by tracking the number of precursor identifications and stopping when further changes do not increase this number.""" self.parameter_name = "ms2_error" self.estimator_group_name = "fragment" self.estimator_name = "mz" @@ -429,6 +444,11 @@ def _check_convergence(self, current_step: int = -1): current_step: int The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. + Returns + ------- + bool + True if the convergence conditions are met, False otherwise. + """ return ( @@ -447,6 +467,10 @@ def _propose_new_parameter(self, df: pd.DataFrame): 3) multiply this value by 1.1. This is implemented by the ci method for the estimator. + Returns + ------- + float + The proposed new value for the search parameter. """ return 1.1 * self.workflow.calibration_manager.get_estimator( @@ -466,15 +490,7 @@ def __init__( workflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): - """See base class. - - Parameters - ---------- - - initial_parameter: float - The parameter used for search in the first round of optimization. - - """ + """See base class. Optimizes MS1 error.""" self.parameter_name = "ms1_error" self.estimator_group_name = "precursor" self.estimator_name = "mz" @@ -514,6 +530,11 @@ def _propose_new_parameter(self, df: pd.DataFrame): This is implemented by the ci method for the estimator. + Returns + ------- + float + The proposed new value for the search parameter. + """ return 1.1 * self.workflow.calibration_manager.get_estimator( self.estimator_group_name, self.estimator_name @@ -532,15 +553,7 @@ def __init__( workflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): - """See base class. - - Parameters - ---------- - - initial_parameter: float - The parameter used for search in the first round of optimization. - - """ + """See base class. Optimizes mobility error.""" self.parameter_name = "mobility_error" self.estimator_group_name = "precursor" self.estimator_name = "mobility" @@ -561,6 +574,11 @@ def _check_convergence(self, current_step: int = -1): current_step: int The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. + Returns + ------- + bool + True if the convergence conditions are met, False otherwise. + """ return ( @@ -579,8 +597,13 @@ def _propose_new_parameter(self, df: pd.DataFrame): 3) multiply this value by 1.1. This is implemented by the ci method for the estimator. + Returns + ------- + float + The proposed new value for the search parameter. 
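Read literally, this three-step update rule can be approximated with numpy as in the sketch below. This is only an approximation: the authoritative computation is whatever the calibration estimator's ci method implements, and taking the mean of the absolute endpoints is an assumption about how "the mean of the endpoints" is intended.

import numpy as np

def approximate_proposed_tolerance(predicted: np.ndarray, observed: np.ndarray) -> float:
    deviations = observed - predicted
    lower, upper = np.percentile(deviations, [0.5, 99.5])  # central 99% of the deviations
    return 1.1 * float(np.mean(np.abs([lower, upper])))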
""" + return 1.1 * self.workflow.calibration_manager.get_estimator( self.estimator_group_name, self.estimator_name ).ci(df, 0.99) @@ -592,8 +615,6 @@ def _get_feature_value( class TargetedRTOptimizer(TargetedOptimizer): - """This class optimizes the RT search parameter until it reaches a user-specified target value.""" - def __init__( self, initial_parameter: float, @@ -609,8 +630,6 @@ def __init__( class TargetedMS2Optimizer(TargetedOptimizer): - """This class optimizes the MS2 search parameter until it reaches a user-specified target value.""" - def __init__( self, initial_parameter: float, @@ -626,8 +645,6 @@ def __init__( class TargetedMS1Optimizer(TargetedOptimizer): - """This class optimizes the MS1 search parameter until it reaches a user-specified target value.""" - def __init__( self, initial_parameter: float, @@ -643,8 +660,6 @@ def __init__( class TargetedMobilityOptimizer(TargetedOptimizer): - """This class optimizes the mobility search parameter until it reaches a user-specified target value.""" - def __init__( self, initial_parameter: float, diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 98ed982c..88b76fa7 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -297,6 +297,7 @@ def get_exponential_batches(self, step): return int(2**step) def get_batch_plan(self): + """Gets an exponential batch plan based on the batch_size value in the config.""" n_eg = self.spectral_library._precursor_df["elution_group_idx"].nunique() plan = [] @@ -316,11 +317,21 @@ def get_batch_plan(self): def get_optimization_lock(self): """Search parameter optimization (i.e. refinement of tolerances for RT, MS2, etc.) is performed on a subset of the elution groups in the spectral library. + This subset is termed the optimization lock. The number of elution groups which must be searched to get a sufficiently large number for robust calibration varies depending the library used and the data. This function searches an increasing number of elution groups until a sufficient number of precursors are identified at 1% FDR and a sufficient number of steps have been taken. The values deemed sufficient are specified in by "optimization_lock_target" and "optmization_lock_min_steps" in the config. - It then returns the elution group indexes which will be used to find the data in the spectral library for search parameter optimization. + Returns + ------- + eg_idxes_for_calibration : np.ndarray + The indices (in .spectral_library._precursor_df) of the precursors which will be used for calibration. + + precursor_df : pd.DataFrame + Dataframe of all precursors accumulated during the optimization lock, including q-values from FDR correction. + + fragments_df : pd.DataFrame + Dataframe of all fragments accumulated during the optimization lock, including q-values from FDR correction. """ @@ -387,12 +398,25 @@ def get_optimization_lock(self): final_stop_index = stop_index # final_stop_index is the number of elution groups that will be included in the calibration data break - return self.elution_group_order[:final_stop_index], precursor_df, fragments_df + eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] + return eg_idxes_for_calibration, precursor_df, fragments_df # self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] # self.com.fit({"classifier_version": self.fdr_manager.current_version}) def get_optimizers(self): + """Select appropriate optimizers. 
Targeted optimization is used if a valid target value (i.e. a number greater than 0) is specified in the config; + if a value less than or equal to 0 is supplied, automatic optimization is used. + Targeted optimizers are run simultaneously; automatic optimizers are run separately in the order MS2, RT, MS1, mobility. + This order is built into the structure of the returned list of lists, order_of_optimization. + For MS1 and mobility, the relevant optimizer will be excluded from the returned list of lists if it is not present in the data. + + Returns + ------- + order_of_optimization : list + List of lists of optimizers + + """ if self.config["search"]["target_ms2_tolerance"] > 0: ms2_optimizer = optimization.TargetedMS2Optimizer( self.config["search_initial"]["initial_ms2_tolerance"], @@ -475,6 +499,14 @@ def get_optimizers(self): return order_of_optimization def calibration(self): + """Performs optimization of the search parameters. This occurs in two stages: + 1) Optimization lock: the data are searched to acquire a locked set of precursors which is used for search parameter optimization. The classifier is also trained during this stage. + 2) Optimization loop: the search parameters are optimized iteratively using the locked set of precursors. + In each iteration, the data are searched with the locked library from stage 1, and the properties -- m/z for both precursors and fragments (i.e. MS1 and MS2), RT and mobility -- are recalibrated. + The optimization loop is repeated for each list of optimizers in order_of_optimization. + + """ + # First check to see if the calibration has already been performed. Return if so. if ( self.calibration_manager.is_fitted and self.calibration_manager.is_loaded_from_file @@ -485,6 +517,7 @@ def calibration(self): ) return + # Get the order of optimization order_of_optimization = self.get_optimizers() self.reporter.log_string( @@ -492,6 +525,7 @@ def calibration(self): verbosity="progress", ) + # Get the optimization lock eg_idxes_for_calibration, precursor_df, fragments_df = ( self.get_optimization_lock() ) @@ -600,6 +634,25 @@ def calibration(self): ) def filter_dfs(self, precursor_df, fragments_df): + """Filters precursor and fragment dataframes to extract the most reliable examples for calibration. + + Parameters + ---------- + precursor_df : pd.DataFrame + Precursor dataframe after FDR correction. + + fragments_df : pd.DataFrame + Fragment dataframe. + + Returns + ------- + precursor_df_filtered : pd.DataFrame + Filtered precursor dataframe. Decoy precursors and those found at worse than 1% FDR are removed from the precursor dataframe. + + fragments_df_filtered : pd.DataFrame + Filtered fragment dataframe. Retained fragments must have a correlation greater than 0.7 and belong to the top 5000 fragments sorted by correlation. + + """ precursor_df_filtered = precursor_df[precursor_df["qval"] < 0.01] precursor_df_filtered = precursor_df_filtered[ precursor_df_filtered["decoy"] == 0 @@ -633,6 +686,18 @@ def filter_dfs(self, precursor_df, fragments_df): return precursor_df_filtered, fragments_df_filtered def recalibration(self, precursor_df_filtered, fragments_df_filtered): + """Performs recalibration of the the MS1, MS2, RT and mobility properties. Also fits the convolution kernel and the score cutoff. + The calibration manager is used to fit the data and predict the calibrated values. 
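Condensed into one outline, the two-stage flow described in this docstring looks roughly as follows. The function below is illustrative only, the per-iteration search, FDR correction, filtering and recalibration are compressed into a comment, and the exact call signatures should be taken from the patch itself.

def run_calibration(workflow):
    ordered_optimizers = workflow.get_optimizers()

    # Stage 1: optimization lock, classifier training and a first recalibration.
    eg_idxes, precursor_df, fragments_df = workflow.get_optimization_lock()
    precursor_df_filtered, fragments_df_filtered = workflow.filter_dfs(precursor_df, fragments_df)
    workflow.recalibration(precursor_df_filtered, fragments_df_filtered)

    # Stage 2: iterative optimization on the locked elution groups.
    for optimizers in ordered_optimizers:
        for current_step in range(workflow.config["calibration"]["max_steps"]):
            if all(optimizer.has_converged for optimizer in optimizers):
                break
            # Search the locked library again, FDR-correct, filter and recalibrate,
            # then let each optimizer take a step:
            for optimizer in optimizers:
                optimizer.step(
                    precursor_df_filtered, fragments_df_filtered, current_step=current_step
                )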
+ + Parameters + ---------- + precursor_df_filtered : pd.DataFrame + Filtered precursor dataframe (see filter_dfs) + + fragments_df_filtered : pd.DataFrame + Filtered fragment dataframe (see filter_dfs) + + """ self.calibration_manager.fit( precursor_df_filtered, "precursor", From 8d5c989b5c2a8c2982c7e08eb52ce4413a7509e6 Mon Sep 17 00:00:00 2001 From: odespard Date: Thu, 1 Aug 2024 17:58:29 +0200 Subject: [PATCH 26/36] add unit tests for optimizers --- tests/unit_tests/test_workflow.py | 425 +++++++++++++++++++++++++++--- 1 file changed, 395 insertions(+), 30 deletions(-) diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 3337859e..638a12c7 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -1,7 +1,6 @@ import os import shutil import tempfile -from collections import defaultdict from pathlib import Path import numpy as np @@ -13,7 +12,8 @@ from alphadia.calibration.models import LOESSRegression from alphadia.calibration.property import Calibration from alphadia.fdrexperimental import BinaryClassifierLegacyNewBatching -from alphadia.workflow import base, manager, optimization +from alphadia.workflow import base, manager, optimization, peptidecentric, reporting +from alphadia.workflow.config import Config def test_base_manager(): @@ -131,12 +131,21 @@ def calibration_testdata(): rt_library + np.random.normal(0, 0.5, 1000) + np.sin(rt_library * 0.05) ) + mobility_library = np.linspace(0, 100, 1000) + mobility_observed = ( + mobility_library + + np.random.normal(0, 0.5, 1000) + + np.sin(mobility_library * 0.05) + ) + return pd.DataFrame( { "mz_library": mz_library, "mz_observed": mz_observed, "rt_library": rt_library, "rt_observed": rt_observed, + "mobility_library": mobility_library, + "mobility_observed": mobility_observed, } ).copy() @@ -372,48 +381,404 @@ def test_fdr_manager_fit_predict(): os.remove(temp_path) -def ms2_optimizer_test(): - temp_path = os.path.join(tempfile.tempdir, "calibration_manager.pkl") - calibration_manager = manager.CalibrationManager( - TEST_CONFIG, path=temp_path, load_from_file=False +def create_workflow_instance(): + config_base_path = os.path.join( + Path(__file__).parents[2], "alphadia", "constants", "default.yaml" ) - temp_path = os.path.join(tempfile.tempdir, "optimization_manager.pkl") + config = Config() + config.from_yaml(config_base_path) + config["output"] = tempfile.mkdtemp() + workflow = peptidecentric.PeptideCentricWorkflow( + "test", + config, + ) + workflow.reporter = reporting.Pipeline( + backends=[ + reporting.LogBackend(), + reporting.JSONLBackend(path=workflow.path), + reporting.FigureBackend(path=workflow.path), + ] + ) + workflow._calibration_manager = manager.CalibrationManager( + workflow.config["calibration_manager"], + path=os.path.join(workflow.path, workflow.CALIBRATION_MANAGER_PATH), + load_from_file=workflow.config["general"]["reuse_calibration"], + reporter=workflow.reporter, + ) - optimization_manager = manager.OptimizationManager( - OPTIMIZATION_TEST_DATA, path=temp_path, load_from_file=False + workflow.init_calibration_optimization_manager() + workflow.init_fdr_manager() + + return workflow + + +def automatic_ms2_optimizer_test(): + workflow = create_workflow_instance() + + calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df2, "fragment", plot=False) + + ms2_optimizer = optimization.AutomaticMS2Optimizer( + 100, + workflow, ) - test_fragment_df = 
calibration_testdata() - calibration_manager.fit(test_fragment_df, "fragment", plot=False) + assert ms2_optimizer.has_converged is False + assert ms2_optimizer.parameter_name == "ms2_error" - fdr_manager = manager.FDRManager(FDR_TEST_FEATURES, FDR_TEST_BASE_CLASSIFIER) - fdr_manager._num_classifiers = 1 + workflow.fdr_manager._current_version += 1 + ms2_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert len(ms2_optimizer.history_df) == 1 + + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + + ms2_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + + ms2_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + + assert ms2_optimizer.has_converged is True + assert ( + ms2_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) + ).all() + assert ( + workflow.com.ms2_error + == ms2_optimizer.history_df.parameter[ + ms2_optimizer.history_df.precursor_count.idxmax() + ] + ) + assert workflow.com.classifier_version == 2 + + +def automatic_rt_optimizer_test(): + workflow = create_workflow_instance() + + calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + rt_optimizer = optimization.AutomaticRTOptimizer( + 100, + workflow, + ) + + assert rt_optimizer.has_converged is False + assert rt_optimizer.parameter_name == "rt_error" + + workflow.fdr_manager._current_version += 1 + rt_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert len(rt_optimizer.history_df) == 1 + + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + + rt_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + + rt_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + + assert rt_optimizer.has_converged is True + assert ( + rt_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) + ).all() + assert ( + workflow.com.rt_error + == rt_optimizer.history_df.parameter[ + rt_optimizer.history_df.precursor_count.idxmax() + ] + ) + assert workflow.com.classifier_version == 2 + + +def automatic_ms1_optimizer_test(): + workflow = create_workflow_instance() + + calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + ms1_optimizer = optimization.AutomaticMS1Optimizer( + 100, + workflow, + ) + + assert ms1_optimizer.has_converged is False + assert ms1_optimizer.parameter_name == "ms1_error" + + workflow.fdr_manager._current_version += 1 + ms1_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + 
assert len(ms1_optimizer.history_df) == 1 + + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + + ms1_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + + ms1_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) - test_dict = defaultdict(list) - test_dict["var"] = list(range(100)) + assert ms1_optimizer.has_converged is True + assert ( + ms1_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) + ).all() + assert ( + workflow.com.ms1_error + == ms1_optimizer.history_df.parameter[ + ms1_optimizer.history_df.precursor_count.idxmax() + ] + ) + assert workflow.com.classifier_version == 2 + + +def automatic_mobility_optimizer_test(): + workflow = create_workflow_instance() - ms2_optimizer = optimization.MS2Optimizer( - 100, calibration_manager, optimization_manager, fdr_manager + calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + mobility_optimizer = optimization.AutomaticMobilityOptimizer( + 100, + workflow, ) - assert ms2_optimizer.optimal_parameter is None + assert mobility_optimizer.has_converged is False + assert mobility_optimizer.parameter_name == "mobility_error" + + workflow.fdr_manager._current_version += 1 + mobility_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert len(mobility_optimizer.history_df) == 1 - ms2_optimizer.step(pd.DataFrame(test_dict), test_fragment_df) + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 + + assert workflow.com.classifier_version == -1 + mobility_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) - assert len(ms2_optimizer.parameters) == 2 + calibration_test_df1 = pd.concat( + [calibration_test_df1, pd.DataFrame(calibration_test_df1.loc[0]).T], + ignore_index=True, + ) + workflow.fdr_manager._current_version += 1 - test_dict["var"].append(1) - ms2_optimizer.step(pd.DataFrame(test_dict), test_fragment_df) + assert workflow.com.classifier_version == -1 - test_dict["var"].append(1) - ms2_optimizer.step(pd.DataFrame(test_dict), test_fragment_df) + mobility_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) - assert ms2_optimizer.optimal_parameter is not None - assert ms2_optimizer.precursor_ids == [100, 101, 102] + assert mobility_optimizer.has_converged is True assert ( - ms2_optimizer.optimal_parameter - == ms2_optimizer.parameters[np.argmax(ms2_optimizer.precursor_ids)] + mobility_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) + ).all() + assert ( + workflow.com.mobility_error + == mobility_optimizer.history_df.parameter[ + mobility_optimizer.history_df.precursor_count.idxmax() + ] ) - assert optimization_manager.ms2_error == ms2_optimizer.optimal_parameter - assert optimization_manager.classifier_version == 0 + assert workflow.com.classifier_version == 2 + + +def targeted_ms2_optimizer_test(): + workflow = create_workflow_instance() + + 
calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + optimizer = optimization.TargetedMS2Optimizer( + 100, + 7, + workflow, + ) + + assert optimizer.has_converged is False + assert optimizer.parameter_name == "ms2_error" + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 0 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 1 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + + assert optimizer.has_converged is True + assert workflow.com.classifier_version == 2 + + assert workflow.com.ms2_error == optimizer.target_parameter + assert workflow.com.classifier_version == 2 + + +def targeted_rt_optimizer_test(): + workflow = create_workflow_instance() + + calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + optimizer = optimization.TargetedRTOptimizer( + 100, + 7, + workflow, + ) + + assert optimizer.has_converged is False + assert optimizer.parameter_name == "rt_error" + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 0 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 1 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + + assert optimizer.has_converged is True + assert workflow.com.classifier_version == 2 + + assert workflow.com.rt_error == optimizer.target_parameter + assert workflow.com.classifier_version == 2 + + +def targeted_ms1_optimizer_test(): + workflow = create_workflow_instance() + + calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + optimizer = optimization.TargetedMS1Optimizer( + 100, + 7, + workflow, + ) + + assert optimizer.has_converged is False + assert optimizer.parameter_name == "ms1_error" + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 0 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 1 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + + assert optimizer.has_converged is True + assert workflow.com.classifier_version == 2 + + assert workflow.com.ms1_error == optimizer.target_parameter + assert workflow.com.classifier_version == 2 + + +def targeted_mobility_optimizer_test(): + workflow = create_workflow_instance() + 
+ calibration_test_df1 = calibration_testdata() + calibration_test_df2 = calibration_testdata() + + workflow.calibration_manager.fit(calibration_test_df1, "precursor", plot=False) + + optimizer = optimization.TargetedMobilityOptimizer( + 100, + 7, + workflow, + ) + + assert optimizer.has_converged is False + assert optimizer.parameter_name == "mobility_error" + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 0 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + + assert optimizer.has_converged is False + assert workflow.com.classifier_version == 1 + + workflow.fdr_manager._current_version += 1 + optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + + assert optimizer.has_converged is True + assert workflow.com.classifier_version == 2 + + assert workflow.com.mobility_error == optimizer.target_parameter + assert workflow.com.classifier_version == 2 From 0d9e2ebcaebeae183681b122b5de10bc1353ea4f Mon Sep 17 00:00:00 2001 From: odespard Date: Thu, 1 Aug 2024 18:39:02 +0200 Subject: [PATCH 27/36] merge optimization and calibration_optimization managers --- alphadia/workflow/base.py | 16 ++++- alphadia/workflow/optimization.py | 52 ++++++++------ alphadia/workflow/peptidecentric.py | 101 ++++++++++------------------ 3 files changed, 82 insertions(+), 87 deletions(-) diff --git a/alphadia/workflow/base.py b/alphadia/workflow/base.py index b06fc5d7..3268fd5d 100644 --- a/alphadia/workflow/base.py +++ b/alphadia/workflow/base.py @@ -95,8 +95,22 @@ def load( self._calibration_manager.disable_mobility_calibration() # initialize the optimization manager + optimization_manager_config = { + "ms1_error": self.config["search_initial"]["initial_ms1_tolerance"], + "ms2_error": self.config["search_initial"]["initial_ms2_tolerance"], + "rt_error": self.config["search_initial"]["initial_rt_tolerance"], + "mobility_error": self.config["search_initial"][ + "initial_mobility_tolerance" + ], + "column_type": "library", + "num_candidates": self.config["search_initial"]["initial_num_candidates"], + "classifier_version": -1, + "fwhm_rt": self.config["optimization_manager"]["fwhm_rt"], + "fwhm_mobility": self.config["optimization_manager"]["fwhm_mobility"], + "score_cutoff": self.config["optimization_manager"]["score_cutoff"], + } self._optimization_manager = manager.OptimizationManager( - self.config["optimization_manager"], + optimization_manager_config, path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH), load_from_file=self.config["general"]["reuse_calibration"], figure_path=os.path.join(self.path, self.FIGURE_PATH), diff --git a/alphadia/workflow/optimization.py b/alphadia/workflow/optimization.py index f5b14acd..79d7f9f8 100644 --- a/alphadia/workflow/optimization.py +++ b/alphadia/workflow/optimization.py @@ -84,7 +84,7 @@ def __init__( """ super().__init__(workflow, reporter) self.history_df = pd.DataFrame() - self.workflow.com.fit({self.parameter_name: initial_parameter}) + self.workflow.optimization_manager.fit({self.parameter_name: initial_parameter}) self.has_converged = False def step( @@ -96,7 +96,7 @@ def step( """See base class. 
The feature is used to track the progres of the optimization (stored in .feature) and determine whether it has converged.""" if self.has_converged: self.reporter.log_string( - f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.com.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", + f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.optimization_manager.__dict__[self.parameter_name]} found after {len(self.history_df)} searches.", verbosity="progress", ) return @@ -104,14 +104,16 @@ def step( new_row = pd.DataFrame( [ { - "parameter": self.workflow.com.__dict__[self.parameter_name], + "parameter": self.workflow.optimization_manager.__dict__[ + self.parameter_name + ], self.feature_name: self._get_feature_value( precursors_df, fragments_df ), "classifier_version": self.workflow.fdr_manager.current_version, - "score_cutoff": self.workflow.com.score_cutoff, - "fwhm_rt": self.workflow.com.fwhm_rt, - "fwhm_mobility": self.workflow.com.fwhm_mobility, + "score_cutoff": self.workflow.optimization_manager.score_cutoff, + "fwhm_rt": self.workflow.optimization_manager.fwhm_rt, + "fwhm_mobility": self.workflow.optimization_manager.fwhm_mobility, } ] ) @@ -135,14 +137,22 @@ def step( index_of_optimum ] - self.workflow.com.fit({self.parameter_name: optimal_parameter}) - self.workflow.com.fit({"classifier_version": classifier_version_at_optimum}) - self.workflow.com.fit({"score_cutoff": score_cutoff_at_optimum}) - self.workflow.com.fit({"fwhm_rt": fwhm_rt_at_optimum}) - self.workflow.com.fit({"fwhm_mobility": fwhm_mobility_at_optimum}) + self.workflow.optimization_manager.fit( + {self.parameter_name: optimal_parameter} + ) + self.workflow.optimization_manager.fit( + {"classifier_version": classifier_version_at_optimum} + ) + self.workflow.optimization_manager.fit( + {"score_cutoff": score_cutoff_at_optimum} + ) + self.workflow.optimization_manager.fit({"fwhm_rt": fwhm_rt_at_optimum}) + self.workflow.optimization_manager.fit( + {"fwhm_mobility": fwhm_mobility_at_optimum} + ) self.reporter.log_string( - f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.com.__dict__[self.parameter_name]:.4f} found after {len(self.history_df)} searches.", + f"✅ {self.parameter_name:<15}: optimization complete. Optimal parameter {self.workflow.optimization_manager.__dict__[self.parameter_name]:.4f} found after {len(self.history_df)} searches.", verbosity="progress", ) @@ -153,10 +163,10 @@ def step( else fragments_df ) - self.workflow.com.fit({self.parameter_name: new_parameter}) + self.workflow.optimization_manager.fit({self.parameter_name: new_parameter}) self.reporter.log_string( - f"❌ {self.parameter_name:<15}: optimization incomplete after {len(self.history_df)} search(es). Will search with parameter {self.workflow.com.__dict__[self.parameter_name]:.4f}.", + f"❌ {self.parameter_name:<15}: optimization incomplete after {len(self.history_df)} search(es). 
Will search with parameter {self.workflow.optimization_manager.__dict__[self.parameter_name]:.4f}.", verbosity="progress", ) @@ -165,7 +175,7 @@ def plot(self): fig, ax = plt.subplots() ax.axvline( - x=self.workflow.com.__dict__[self.parameter_name], + x=self.workflow.optimization_manager.__dict__[self.parameter_name], ymin=0, ymax=self.history_df[self.feature_name].max(), color="red", @@ -266,7 +276,7 @@ def __init__( """ super().__init__(workflow, reporter) - self.workflow.com.fit({self.parameter_name: initial_parameter}) + self.workflow.optimization_manager.fit({self.parameter_name: initial_parameter}) self.target_parameter = target_parameter self.has_converged = False @@ -317,7 +327,7 @@ def step( """See base class.""" if self.has_converged: self.reporter.log_string( - f"✅ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} <= {self.target_parameter:.4f}", + f"✅ {self.parameter_name:<15}: {self.workflow.optimization_manager.__dict__[self.parameter_name]:.4f} <= {self.target_parameter:.4f}", verbosity="progress", ) return @@ -326,21 +336,21 @@ def step( precursors_df if self.estimator_group_name == "precursor" else fragments_df ) just_converged = self._check_convergence(new_parameter, current_step) - self.workflow.com.fit({self.parameter_name: new_parameter}) - self.workflow.com.fit( + self.workflow.optimization_manager.fit({self.parameter_name: new_parameter}) + self.workflow.optimization_manager.fit( {"classifier_version": self.workflow.fdr_manager.current_version} ) if just_converged: self.has_converged = True self.reporter.log_string( - f"✅ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} <= {self.target_parameter:.4f}", + f"✅ {self.parameter_name:<15}: {self.workflow.optimization_manager.__dict__[self.parameter_name]:.4f} <= {self.target_parameter:.4f}", verbosity="progress", ) else: self.reporter.log_string( - f"❌ {self.parameter_name:<15}: {self.workflow.com.__dict__[self.parameter_name]:.4f} > {self.target_parameter:.4f} or insufficient steps taken.", + f"❌ {self.parameter_name:<15}: {self.workflow.optimization_manager.__dict__[self.parameter_name]:.4f} > {self.target_parameter:.4f} or insufficient steps taken.", verbosity="progress", ) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 88b76fa7..649ada78 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -123,42 +123,9 @@ def load( f"Initializing workflow {self.instance_name}", verbosity="progress" ) - self.init_calibration_optimization_manager() self.init_fdr_manager() self.init_spectral_library() - @property - def calibration_optimization_manager(self): - """Is used during the iterative optimization of the calibration parameters. - Should not be stored on disk. 
- """ - return self._calibration_optimization_manager - - @property - def com(self): - """alias for calibration_optimization_manager""" - return self.calibration_optimization_manager - - def init_calibration_optimization_manager(self): - self._calibration_optimization_manager = manager.OptimizationManager( - { - "ms1_error": self.config["search_initial"]["initial_ms1_tolerance"], - "ms2_error": self.config["search_initial"]["initial_ms2_tolerance"], - "rt_error": self.config["search_initial"]["initial_rt_tolerance"], - "mobility_error": self.config["search_initial"][ - "initial_mobility_tolerance" - ], - "column_type": "library", - "num_candidates": self.config["search_initial"][ - "initial_num_candidates" - ], - "classifier_version": -1, - "fwhm_rt": self.config["optimization_manager"]["fwhm_rt"], - "fwhm_mobility": self.config["optimization_manager"]["fwhm_mobility"], - "score_cutoff": self.config["optimization_manager"]["score_cutoff"], - } - ) - def init_fdr_manager(self): self.fdr_manager = manager.FDRManager( feature_columns=feature_columns, @@ -367,11 +334,11 @@ def get_optimization_lock(self): ) precursor_df = self.fdr_correction( - features_df, fragments_df, self.com.classifier_version + features_df, fragments_df, self.optimization_manager.classifier_version ) self.reporter.log_string( - f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", + f"=== FDR correction performed with classifier version {self.optimization_manager.classifier_version} ===", verbosity="info", ) @@ -402,7 +369,7 @@ def get_optimization_lock(self): return eg_idxes_for_calibration, precursor_df, fragments_df # self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] - # self.com.fit({"classifier_version": self.fdr_manager.current_version}) + # self.optimization_manager.fit({"classifier_version": self.fdr_manager.current_version}) def get_optimizers(self): """Select appropriate optimizers. Targeted optimization is used if a valid target value (i.e. a number greater than 0) is specified in the config; @@ -465,7 +432,7 @@ def get_optimizers(self): mobility_optimizer = optimization.AutomaticMobilityOptimizer( self.config["search_initial"]["initial_mobility_tolerance"], self.calibration_manager, - self.com, + self.optimization_manager, self.fdr_manager, ) else: @@ -530,7 +497,9 @@ def calibration(self): self.get_optimization_lock() ) - self.com.fit({"classifier_version": self.fdr_manager.current_version}) + self.optimization_manager.fit( + {"classifier_version": self.fdr_manager.current_version} + ) # Perform a first recalibration on the optimization lock. 
precursor_df_filtered, fragments_df_filtered = self.filter_dfs( @@ -581,11 +550,13 @@ def calibration(self): ) precursor_df = self.fdr_correction( - features_df, fragments_df, self.com.classifier_version + features_df, + fragments_df, + self.optimization_manager.classifier_version, ) self.reporter.log_string( - f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", + f"=== FDR correction performed with classifier version {self.optimization_manager.classifier_version} ===", verbosity="info", ) @@ -626,7 +597,7 @@ def calibration(self): for optimizers in order_of_optimization: for optimizer in optimizers: self.reporter.log_string( - f"{optimizer.parameter_name:<15}: {self.com.__dict__[optimizer.parameter_name]:.4f}", + f"{optimizer.parameter_name:<15}: {self.optimization_manager.__dict__[optimizer.parameter_name]:.4f}", verbosity="progress", ) self.reporter.log_string( @@ -720,7 +691,7 @@ def recalibration(self, precursor_df_filtered, fragments_df_filtered): self.calibration_manager.predict(self.spectral_library._fragment_df, "fragment") - self.com.fit( + self.optimization_manager.fit( { "column_type": "calibrated", "num_candidates": self.config["search"]["target_num_candidates"], @@ -728,7 +699,7 @@ def recalibration(self, precursor_df_filtered, fragments_df_filtered): ) percentile_001 = np.percentile(precursor_df_filtered["score"], 0.1) - self.com.fit( + self.optimization_manager.fit( { "fwhm_rt": precursor_df_filtered["cycle_fwhm"].median(), "fwhm_mobility": precursor_df_filtered["mobility_fwhm"].median(), @@ -761,11 +732,11 @@ def extract_batch(self, batch_df, apply_cutoff=False): config.update( { "top_k_fragments": self.config["search_advanced"]["top_k_fragments"], - "rt_tolerance": self.com.rt_error, - "mobility_tolerance": self.com.mobility_error, - "candidate_count": self.com.num_candidates, - "precursor_mz_tolerance": self.com.ms1_error, - "fragment_mz_tolerance": self.com.ms2_error, + "rt_tolerance": self.optimization_manager.rt_error, + "mobility_tolerance": self.optimization_manager.mobility_error, + "candidate_count": self.optimization_manager.num_candidates, + "precursor_mz_tolerance": self.optimization_manager.ms1_error, + "fragment_mz_tolerance": self.optimization_manager.ms2_error, "exclude_shared_ions": self.config["search"]["exclude_shared_ions"], "min_size_rt": self.config["search"]["quant_window"], } @@ -776,16 +747,16 @@ def extract_batch(self, batch_df, apply_cutoff=False): batch_df, self.spectral_library.fragment_df, config.jitclass(), - rt_column=f"rt_{self.com.column_type}", - mobility_column=f"mobility_{self.com.column_type}" + rt_column=f"rt_{self.optimization_manager.column_type}", + mobility_column=f"mobility_{self.optimization_manager.column_type}" if self.dia_data.has_mobility else "mobility_library", - precursor_mz_column=f"mz_{self.com.column_type}" + precursor_mz_column=f"mz_{self.optimization_manager.column_type}" if self.dia_data.has_ms1 else "mz_library", - fragment_mz_column=f"mz_{self.com.column_type}", - fwhm_rt=self.com.fwhm_rt, - fwhm_mobility=self.com.fwhm_mobility, + fragment_mz_column=f"mz_{self.optimization_manager.column_type}", + fwhm_rt=self.optimization_manager.fwhm_rt, + fwhm_mobility=self.optimization_manager.fwhm_mobility, ) candidates_df = extraction(thread_count=self.config["general"]["thread_count"]) @@ -794,11 +765,11 @@ def extract_batch(self, batch_df, apply_cutoff=False): if apply_cutoff: num_before = len(candidates_df) self.reporter.log_string( - f"Applying score cutoff of 
{self.com.score_cutoff}", + f"Applying score cutoff of {self.optimization_manager.score_cutoff}", verbosity="info", ) candidates_df = candidates_df[ - candidates_df["score"] > self.com.score_cutoff + candidates_df["score"] > self.optimization_manager.score_cutoff ] num_after = len(candidates_df) num_removed = num_before - num_after @@ -812,8 +783,8 @@ def extract_batch(self, batch_df, apply_cutoff=False): config.update( { "top_k_fragments": self.config["search_advanced"]["top_k_fragments"], - "precursor_mz_tolerance": self.com.ms1_error, - "fragment_mz_tolerance": self.com.ms2_error, + "precursor_mz_tolerance": self.optimization_manager.ms1_error, + "fragment_mz_tolerance": self.optimization_manager.ms2_error, "exclude_shared_ions": self.config["search"]["exclude_shared_ions"], "quant_window": self.config["search"]["quant_window"], "quant_all": self.config["search"]["quant_all"], @@ -825,14 +796,14 @@ def extract_batch(self, batch_df, apply_cutoff=False): self.spectral_library._precursor_df, self.spectral_library._fragment_df, config=config, - rt_column=f"rt_{self.com.column_type}", - mobility_column=f"mobility_{self.com.column_type}" + rt_column=f"rt_{self.optimization_manager.column_type}", + mobility_column=f"mobility_{self.optimization_manager.column_type}" if self.dia_data.has_mobility else "mobility_library", - precursor_mz_column=f"mz_{self.com.column_type}" + precursor_mz_column=f"mz_{self.optimization_manager.column_type}" if self.dia_data.has_ms1 else "mz_library", - fragment_mz_column=f"mz_{self.com.column_type}", + fragment_mz_column=f"mz_{self.optimization_manager.column_type}", ) features_df, fragments_df = candidate_scoring( @@ -844,7 +815,7 @@ def extract_batch(self, batch_df, apply_cutoff=False): return features_df, fragments_df def extraction(self): - self.com.fit( + self.optimization_manager.fit( { "num_candidates": self.config["search"]["target_num_candidates"], "column_type": "calibrated", @@ -862,12 +833,12 @@ def extraction(self): ) self.reporter.log_string( - f"=== FDR correction performed with classifier version {self.com.classifier_version} ===", + f"=== FDR correction performed with classifier version {self.optimization_manager.classifier_version} ===", verbosity="info", ) precursor_df = self.fdr_correction( - features_df, fragments_df, self.com.classifier_version + features_df, fragments_df, self.optimization_manager.classifier_version ) precursor_df = precursor_df[precursor_df["qval"] <= self.config["fdr"]["fdr"]] From f6d56f17f56485892ab317fc8f2c2818deef50dc Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 2 Aug 2024 11:36:39 +0200 Subject: [PATCH 28/36] review comments --- alphadia/workflow/optimization.py | 40 +++++---- alphadia/workflow/peptidecentric.py | 125 ++++++++++++++++++---------- tests/unit_tests/test_workflow.py | 16 ++-- 3 files changed, 111 insertions(+), 70 deletions(-) diff --git a/alphadia/workflow/optimization.py b/alphadia/workflow/optimization.py index 79d7f9f8..b7ed45d5 100644 --- a/alphadia/workflow/optimization.py +++ b/alphadia/workflow/optimization.py @@ -50,18 +50,13 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """ + pass @abstractmethod def plot(self): - """ - This method plots relevant information about optimization of the search parameter. - - Notes - ----- - This can be overwritten with pass if there is nothing of interest to plot. + """This method plots relevant information about optimization of the search parameter. 
This can be overwritten with an empty function if there is nothing to plot.""" - """ pass @@ -355,7 +350,7 @@ def step( ) def plot(self): - """Empty method for consistency with AutomaticOptimizer.""" + """See base class. There is nothing of interest to plot here.""" pass @@ -394,13 +389,16 @@ def _check_convergence(self, current_step: int = -1): """ + min_steps_reached = ( + current_step >= self.workflow.config["calibration"]["min_steps"] - 1 + ) return ( - len(self.history_df) > 2 + min_steps_reached + and len(self.history_df) > 2 and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] - and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): @@ -460,14 +458,16 @@ def _check_convergence(self, current_step: int = -1): True if the convergence conditions are met, False otherwise. """ - + min_steps_reached = ( + current_step >= self.workflow.config["calibration"]["min_steps"] - 1 + ) return ( - len(self.history_df) > 2 + min_steps_reached + and len(self.history_df) > 2 and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] - and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): @@ -523,13 +523,16 @@ def _check_convergence(self, current_step: int = -1): """ + min_steps_reached = ( + current_step >= self.workflow.config["calibration"]["min_steps"] - 1 + ) return ( - len(self.history_df) > 2 + min_steps_reached + and len(self.history_df) > 2 and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] - and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): @@ -591,13 +594,16 @@ def _check_convergence(self, current_step: int = -1): """ + min_steps_reached = ( + current_step >= self.workflow.config["calibration"]["min_steps"] - 1 + ) return ( - len(self.history_df) > 2 + min_steps_reached + and len(self.history_df) > 2 and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-2] and self.history_df[self.feature_name].iloc[-1] < 1.1 * self.history_df[self.feature_name].iloc[-3] - and current_step >= self.workflow.config["calibration"]["min_steps"] - 1 ) def _propose_new_parameter(self, df: pd.DataFrame): diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 649ada78..cb85d4e1 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -342,7 +342,7 @@ def get_optimization_lock(self): verbosity="info", ) - precursors_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) + num_precursors_at_01FDR = len(precursor_df[precursor_df["qval"] < 0.01]) self.reporter.log_string( f"=== Checking if minimum number of precursors for optimization found yet; minimum number is {self.config['calibration']['optimization_lock_target']} ===", @@ -357,9 +357,9 @@ def get_optimization_lock(self): ) if ( - precursors_01FDR + num_precursors_at_01FDR > self.config["calibration"]["optimization_lock_target"] - and self.fdr_manager.current_version + and current_step >= 
self.config["calibration"]["optimization_lock_min_steps"] - 1 ): final_stop_index = stop_index # final_stop_index is the number of elution groups that will be included in the calibration data @@ -371,66 +371,69 @@ def get_optimization_lock(self): # self.eg_idxes_for_calibration = self.elution_group_order[:final_stop_index] # self.optimization_manager.fit({"classifier_version": self.fdr_manager.current_version}) - def get_optimizers(self): + def get_ordered_optimizers(self): """Select appropriate optimizers. Targeted optimization is used if a valid target value (i.e. a number greater than 0) is specified in the config; if a value less than or equal to 0 is supplied, automatic optimization is used. Targeted optimizers are run simultaneously; automatic optimizers are run separately in the order MS2, RT, MS1, mobility. - This order is built into the structure of the returned list of lists, order_of_optimization. + This order is built into the structure of the returned list of lists, ordered_optimizers. For MS1 and mobility, the relevant optimizer will be excluded from the returned list of lists if it is not present in the data. Returns ------- - order_of_optimization : list + ordered_optimizers : list List of lists of optimizers """ - if self.config["search"]["target_ms2_tolerance"] > 0: + config_search_initial = self.config["search_initial"] + config_search = self.config["search"] + + if config_search["target_ms2_tolerance"] > 0: ms2_optimizer = optimization.TargetedMS2Optimizer( - self.config["search_initial"]["initial_ms2_tolerance"], - self.config["search"]["target_ms2_tolerance"], + config_search_initial["initial_ms2_tolerance"], + config_search["target_ms2_tolerance"], self, ) else: ms2_optimizer = optimization.AutomaticMS2Optimizer( - self.config["search_initial"]["initial_ms2_tolerance"], + config_search_initial["initial_ms2_tolerance"], self, ) - if self.config["search"]["target_rt_tolerance"] > 0: + if config_search["target_rt_tolerance"] > 0: rt_optimizer = optimization.TargetedRTOptimizer( - self.config["search_initial"]["initial_rt_tolerance"], - self.config["search"]["target_rt_tolerance"], + config_search_initial["initial_rt_tolerance"], + config_search["target_rt_tolerance"], self, ) else: rt_optimizer = optimization.AutomaticRTOptimizer( - self.config["search_initial"]["initial_rt_tolerance"], + config_search_initial["initial_rt_tolerance"], self, ) if self.dia_data.has_ms1: - if self.config["search"]["target_ms1_tolerance"] > 0: + if config_search["target_ms1_tolerance"] > 0: ms1_optimizer = optimization.TargetedMS1Optimizer( - self.config["search_initial"]["initial_ms1_tolerance"], - self.config["search"]["target_ms1_tolerance"], + config_search_initial["initial_ms1_tolerance"], + config_search["target_ms1_tolerance"], self, ) else: ms1_optimizer = optimization.AutomaticMS1Optimizer( - self.config["search_initial"]["initial_ms1_tolerance"], + config_search_initial["initial_ms1_tolerance"], self, ) else: ms1_optimizer = None if self.dia_data.has_mobility: - if self.config["search"]["target_mobility_tolerance"] > 0: + if config_search["target_mobility_tolerance"] > 0: mobility_optimizer = optimization.TargetedMobilityOptimizer( - self.config["search_initial"]["initial_mobility_tolerance"], - self.config["search"]["target_mobility_tolerance"], + config_search_initial["initial_mobility_tolerance"], + config_search["target_mobility_tolerance"], self, ) else: mobility_optimizer = optimization.AutomaticMobilityOptimizer( - 
self.config["search_initial"]["initial_mobility_tolerance"], + config_search_initial["initial_mobility_tolerance"], self.calibration_manager, self.optimization_manager, self.fdr_manager, @@ -457,20 +460,56 @@ def get_optimizers(self): if isinstance(optimizer, optimization.AutomaticOptimizer) ] - order_of_optimization = ( + ordered_optimizers = ( targeted_optimizers + automatic_optimizers - if any(targeted_optimizers) + if any( + targeted_optimizers + ) # This line is required so no empty list is added to the ordered_optimizers list else automatic_optimizers ) - return order_of_optimization + return ordered_optimizers + + def first_recalibration_and_optimization( + self, + precursor_df: pd.DataFrame, + fragments_df: pd.DataFrame, + ordered_optimizers: list, + ): + """Performs the first recalibration and optimization step. + + Parameters + ---------- + precursor_df : pd.DataFrame + Precursor dataframe from optimization lock + + fragments_df : pd.DataFrame + Fragment dataframe from optimization lock + + ordered_optimizers : list + List of lists of optimizers in correct order + """ + precursor_df_filtered, fragments_df_filtered = self.filter_dfs( + precursor_df, fragments_df + ) + + self.recalibration(precursor_df_filtered, fragments_df_filtered) + + self.reporter.log_string( + "=== Performing initial optimization on extracted data. ===", + verbosity="info", + ) + + for optimizers in ordered_optimizers: + for optimizer in optimizers: + optimizer.step(precursor_df_filtered, fragments_df_filtered) def calibration(self): """Performs optimization of the search parameters. This occurs in two stages: 1) Optimization lock: the data are searched to acquire a locked set of precursors which is used for search parameter optimization. The classifier is also trained during this stage. 2) Optimization loop: the search parameters are optimized iteratively using the locked set of precursors. In each iteration, the data are searched with the locked library from stage 1, and the properties -- m/z for both precursors and fragments (i.e. MS1 and MS2), RT and mobility -- are recalibrated. - The optimization loop is repeated for each list of optimizers in order_of_optimization. + The optimization loop is repeated for each list of optimizers in ordered_optimizers. """ # First check to see if the calibration has already been performed. Return if so. @@ -485,7 +524,7 @@ def calibration(self): return # Get the order of optimization - order_of_optimization = self.get_optimizers() + ordered_optimizers = self.get_ordered_optimizers() self.reporter.log_string( "Starting initial classifier training and precursor identification.", @@ -501,30 +540,18 @@ def calibration(self): {"classifier_version": self.fdr_manager.current_version} ) - # Perform a first recalibration on the optimization lock. - precursor_df_filtered, fragments_df_filtered = self.filter_dfs( - precursor_df, fragments_df - ) - - self.recalibration(precursor_df_filtered, fragments_df_filtered) - self.reporter.log_string( "Required number of precursors found and required number of training iterations performed. Starting search parameter optimization.", verbosity="progress", ) - self.reporter.log_string( - "=== Performing initial optimization on extracted data. ===", - verbosity="info", + # Perform a first recalibration on the optimization lock. 
+        self.first_recalibration_and_optimization(
+            precursor_df, fragments_df, ordered_optimizers
         )
 
-        for optimizers in order_of_optimization:
-            for optimizer in optimizers:
-                optimizer.step(precursor_df_filtered, fragments_df_filtered)
-
         # End of first recalibration
 
         # Start of optimization/recalibration loop
 
-        for optimizers in order_of_optimization:
+        for optimizers in ordered_optimizers:
             for current_step in range(self.config["calibration"]["max_steps"]):
                 if np.all([optimizer.has_converged for optimizer in optimizers]):
                     self.reporter.log_string(
@@ -587,6 +614,12 @@ def calibration(self):
                     verbosity="progress",
                 )
 
+            else:
+                self.reporter.log_string(
+                    f"Optimization did not converge within the maximum number of steps, which is {self.config['calibration']['max_steps']}.",
+                    verbosity="warning",
+                )
+
         self.reporter.log_string(
             "Search parameter optimization finished. Values taken forward for search are:",
             verbosity="progress",
@@ -594,7 +627,7 @@ def calibration(self):
         self.reporter.log_string(
             "==============================================", verbosity="progress"
         )
-        for optimizers in order_of_optimization:
+        for optimizers in ordered_optimizers:
             for optimizer in optimizers:
                 self.reporter.log_string(
                     f"{optimizer.parameter_name:<15}: {self.optimization_manager.__dict__[optimizer.parameter_name]:.4f}",
@@ -621,7 +654,9 @@ def filter_dfs(self, precursor_df, fragments_df):
             Filtered precursor dataframe. Decoy precursors and those found at worse than 1% FDR are removed from the precursor dataframe.
 
         fragments_df_filtered : pd.DataFrame
-            Filtered fragment dataframe. Retained fragments must have a correlation greater than 0.7 and belong to the top 5000 fragments sorted by correlation.
+            Filtered fragment dataframe. Retained fragments must either:
+            1) have a correlation greater than 0.7 and belong to the top 5000 fragments sorted by correlation, if there are more than 500 with a correlation greater than 0.7, or
+            2) belong to the top 500 fragments sorted by correlation otherwise.
""" precursor_df_filtered = precursor_df[precursor_df["qval"] < 0.01] diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 638a12c7..a5b5f27b 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -413,7 +413,7 @@ def create_workflow_instance(): return workflow -def automatic_ms2_optimizer_test(): +def test_automatic_ms2_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -467,7 +467,7 @@ def automatic_ms2_optimizer_test(): assert workflow.com.classifier_version == 2 -def automatic_rt_optimizer_test(): +def test_automatic_rt_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -521,7 +521,7 @@ def automatic_rt_optimizer_test(): assert workflow.com.classifier_version == 2 -def automatic_ms1_optimizer_test(): +def test_automatic_ms1_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -575,7 +575,7 @@ def automatic_ms1_optimizer_test(): assert workflow.com.classifier_version == 2 -def automatic_mobility_optimizer_test(): +def test_automatic_mobility_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -628,7 +628,7 @@ def automatic_mobility_optimizer_test(): assert workflow.com.classifier_version == 2 -def targeted_ms2_optimizer_test(): +def test_targeted_ms2_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -667,7 +667,7 @@ def targeted_ms2_optimizer_test(): assert workflow.com.classifier_version == 2 -def targeted_rt_optimizer_test(): +def test_targeted_rt_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -706,7 +706,7 @@ def targeted_rt_optimizer_test(): assert workflow.com.classifier_version == 2 -def targeted_ms1_optimizer_test(): +def test_targeted_ms1_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() @@ -745,7 +745,7 @@ def targeted_ms1_optimizer_test(): assert workflow.com.classifier_version == 2 -def targeted_mobility_optimizer_test(): +def test_targeted_mobility_optimizer(): workflow = create_workflow_instance() calibration_test_df1 = calibration_testdata() From 157c14a0155357169f8869985815f1d4f3bce01c Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 2 Aug 2024 13:48:31 +0200 Subject: [PATCH 29/36] update unit tests --- tests/unit_tests/test_workflow.py | 96 +++++++++++++++++++------------ 1 file changed, 59 insertions(+), 37 deletions(-) diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index a5b5f27b..02e4434a 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -407,7 +407,29 @@ def create_workflow_instance(): reporter=workflow.reporter, ) - workflow.init_calibration_optimization_manager() + optimization_manager_config = { + "ms1_error": workflow.config["search_initial"]["initial_ms1_tolerance"], + "ms2_error": workflow.config["search_initial"]["initial_ms2_tolerance"], + "rt_error": workflow.config["search_initial"]["initial_rt_tolerance"], + "mobility_error": workflow.config["search_initial"][ + "initial_mobility_tolerance" + ], + "column_type": "library", + "num_candidates": workflow.config["search_initial"]["initial_num_candidates"], + "classifier_version": -1, + "fwhm_rt": workflow.config["optimization_manager"]["fwhm_rt"], + "fwhm_mobility": workflow.config["optimization_manager"]["fwhm_mobility"], + 
"score_cutoff": workflow.config["optimization_manager"]["score_cutoff"], + } + + workflow._optimization_manager = manager.OptimizationManager( + optimization_manager_config, + path=os.path.join(workflow.path, workflow.OPTIMIZATION_MANAGER_PATH), + load_from_file=workflow.config["general"]["reuse_calibration"], + figure_path=os.path.join(workflow.path, workflow.FIGURE_PATH), + reporter=workflow.reporter, + ) + workflow.init_fdr_manager() return workflow @@ -440,7 +462,7 @@ def test_automatic_ms2_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 ms2_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) @@ -450,7 +472,7 @@ def test_automatic_ms2_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 ms2_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) @@ -459,12 +481,12 @@ def test_automatic_ms2_optimizer(): ms2_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) ).all() assert ( - workflow.com.ms2_error + workflow.optimization_manager.ms2_error == ms2_optimizer.history_df.parameter[ ms2_optimizer.history_df.precursor_count.idxmax() ] ) - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 def test_automatic_rt_optimizer(): @@ -494,7 +516,7 @@ def test_automatic_rt_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 rt_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) @@ -504,7 +526,7 @@ def test_automatic_rt_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 rt_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) @@ -513,12 +535,12 @@ def test_automatic_rt_optimizer(): rt_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) ).all() assert ( - workflow.com.rt_error + workflow.optimization_manager.rt_error == rt_optimizer.history_df.parameter[ rt_optimizer.history_df.precursor_count.idxmax() ] ) - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 def test_automatic_ms1_optimizer(): @@ -548,7 +570,7 @@ def test_automatic_ms1_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 ms1_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) @@ -558,7 +580,7 @@ def test_automatic_ms1_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 ms1_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) @@ -567,12 +589,12 @@ def test_automatic_ms1_optimizer(): ms1_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) ).all() assert ( - workflow.com.ms1_error + workflow.optimization_manager.ms1_error == ms1_optimizer.history_df.parameter[ ms1_optimizer.history_df.precursor_count.idxmax() ] ) - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 def 
test_automatic_mobility_optimizer(): @@ -602,7 +624,7 @@ def test_automatic_mobility_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 mobility_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) calibration_test_df1 = pd.concat( @@ -611,7 +633,7 @@ def test_automatic_mobility_optimizer(): ) workflow.fdr_manager._current_version += 1 - assert workflow.com.classifier_version == -1 + assert workflow.optimization_manager.classifier_version == -1 mobility_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) @@ -620,12 +642,12 @@ def test_automatic_mobility_optimizer(): mobility_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) ).all() assert ( - workflow.com.mobility_error + workflow.optimization_manager.mobility_error == mobility_optimizer.history_df.parameter[ mobility_optimizer.history_df.precursor_count.idxmax() ] ) - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 def test_targeted_ms2_optimizer(): @@ -649,22 +671,22 @@ def test_targeted_ms2_optimizer(): optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 0 + assert workflow.optimization_manager.classifier_version == 0 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 1 + assert workflow.optimization_manager.classifier_version == 1 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) assert optimizer.has_converged is True - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 - assert workflow.com.ms2_error == optimizer.target_parameter - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.ms2_error == optimizer.target_parameter + assert workflow.optimization_manager.classifier_version == 2 def test_targeted_rt_optimizer(): @@ -688,22 +710,22 @@ def test_targeted_rt_optimizer(): optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 0 + assert workflow.optimization_manager.classifier_version == 0 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 1 + assert workflow.optimization_manager.classifier_version == 1 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) assert optimizer.has_converged is True - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 - assert workflow.com.rt_error == optimizer.target_parameter - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.rt_error == optimizer.target_parameter + assert workflow.optimization_manager.classifier_version == 2 def test_targeted_ms1_optimizer(): @@ -727,22 +749,22 @@ def test_targeted_ms1_optimizer(): optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) assert optimizer.has_converged is False - assert 
workflow.com.classifier_version == 0 + assert workflow.optimization_manager.classifier_version == 0 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 1 + assert workflow.optimization_manager.classifier_version == 1 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) assert optimizer.has_converged is True - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 - assert workflow.com.ms1_error == optimizer.target_parameter - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.ms1_error == optimizer.target_parameter + assert workflow.optimization_manager.classifier_version == 2 def test_targeted_mobility_optimizer(): @@ -766,19 +788,19 @@ def test_targeted_mobility_optimizer(): optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 0 + assert workflow.optimization_manager.classifier_version == 0 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) assert optimizer.has_converged is False - assert workflow.com.classifier_version == 1 + assert workflow.optimization_manager.classifier_version == 1 workflow.fdr_manager._current_version += 1 optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) assert optimizer.has_converged is True - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 2 - assert workflow.com.mobility_error == optimizer.target_parameter - assert workflow.com.classifier_version == 2 + assert workflow.optimization_manager.mobility_error == optimizer.target_parameter + assert workflow.optimization_manager.classifier_version == 2 From 8c61993063fd907d7d974605cc192fdaf369e114 Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 2 Aug 2024 17:32:01 +0200 Subject: [PATCH 30/36] change feature for AutomaticMS1Optimizer --- alphadia/workflow/optimization.py | 163 +++++------------------------- 1 file changed, 27 insertions(+), 136 deletions(-) diff --git a/alphadia/workflow/optimization.py b/alphadia/workflow/optimization.py index b7ed45d5..2370af1c 100644 --- a/alphadia/workflow/optimization.py +++ b/alphadia/workflow/optimization.py @@ -169,7 +169,7 @@ def plot(self): """Plot the value of the feature used to assess optimization progress against the parameter value, for each value tested.""" fig, ax = plt.subplots() - ax.axvline( + ax.vlines( x=self.workflow.optimization_manager.__dict__[self.parameter_name], ymin=0, ymax=self.history_df[self.feature_name].max(), @@ -217,16 +217,33 @@ def _propose_new_parameter(self, df): """ pass - @abstractmethod - def _check_convergence(self): - """This method checks if the optimization has converged according to parameter-specific conditions. + def _check_convergence(self, current_step: int = -1): + """Optimization should stop if continued narrowing of the parameter is not improving the feature value. + This function checks if the previous rounds of optimization have led to a meaningful improvement in the feature value. + If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. 
- Returns - ------- - bool + Notes + ----- + Because the check for an increase in feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. + + Parameters + ---------- + current_step: int + The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. """ - pass + + min_steps_reached = ( + current_step >= self.workflow.config["calibration"]["min_steps"] - 1 + ) + return ( + min_steps_reached + and len(self.history_df) > 2 + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-2] + and self.history_df[self.feature_name].iloc[-1] + < 1.1 * self.history_df[self.feature_name].iloc[-3] + ) @abstractmethod def _get_feature_value( @@ -368,39 +385,6 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, reporter) - def _check_convergence(self, current_step: int = -1): - """Optimization should stop if continued optimization of the parameter is not improving the TODO feature value. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. - It also checks if the current step is greater than the minimum number of steps required for optimization. - - Notes - ----- - Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. - - Parameters - ---------- - current_step: int - The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. - - Returns - ------- - bool - True if the convergence conditions are met, False otherwise. - - """ - - min_steps_reached = ( - current_step >= self.workflow.config["calibration"]["min_steps"] - 1 - ) - return ( - min_steps_reached - and len(self.history_df) > 2 - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-2] - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-3] - ) - def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, @@ -438,38 +422,6 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, reporter) - def _check_convergence(self, current_step: int = -1): - """Optimization should stop if continued narrowing of the MS2 parameter is not improving the number of precursor identifications. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the number of identifications. - If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. - - Notes - ----- - Because the check for an increase in identifications requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. - - Parameters - ---------- - current_step: int - The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. 
- - Returns - ------- - bool - True if the convergence conditions are met, False otherwise. - - """ - min_steps_reached = ( - current_step >= self.workflow.config["calibration"]["min_steps"] - 1 - ) - return ( - min_steps_reached - and len(self.history_df) > 2 - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-2] - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-3] - ) - def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, @@ -504,37 +456,9 @@ def __init__( self.parameter_name = "ms1_error" self.estimator_group_name = "precursor" self.estimator_name = "mz" - self.feature_name = "precursor_count" + self.feature_name = "mean_isotope_intensity_correlation" super().__init__(initial_parameter, workflow, reporter) - def _check_convergence(self, current_step: int = -1): - """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. - If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. - - Notes - ----- - Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. - - Parameters - ---------- - current_step: int - The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. - - """ - - min_steps_reached = ( - current_step >= self.workflow.config["calibration"]["min_steps"] - 1 - ) - return ( - min_steps_reached - and len(self.history_df) > 2 - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-2] - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-3] - ) - def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, @@ -556,7 +480,7 @@ def _propose_new_parameter(self, df: pd.DataFrame): def _get_feature_value( self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame ): - return len(precursors_df) + return precursors_df.isotope_intensity_correlation.mean() class AutomaticMobilityOptimizer(AutomaticOptimizer): @@ -573,39 +497,6 @@ def __init__( self.feature_name = "precursor_count" super().__init__(initial_parameter, workflow, reporter) - def _check_convergence(self, current_step: int = -1): - """Optimization should stop if continued narrowing of the parameter is not improving the TODO feature value. - This function checks if the previous rounds of optimization have led to a meaningful improvement in the TODO feature value. - If so, it continues optimization and appends the proposed new parameter to the list of parameters. If not, it stops optimization and sets the optimal parameter attribute. - - Notes - ----- - Because the check for an increase in TODO feature value requires two previous rounds, the function will also initialize for another round of optimization if there have been fewer than 3 rounds. 
- - Parameters - ---------- - current_step: int - The current step in the optimization process. By default it is set to -1, which prevents the optimizer from converging unless min_steps has been set to 0. - - Returns - ------- - bool - True if the convergence conditions are met, False otherwise. - - """ - - min_steps_reached = ( - current_step >= self.workflow.config["calibration"]["min_steps"] - 1 - ) - return ( - min_steps_reached - and len(self.history_df) > 2 - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-2] - and self.history_df[self.feature_name].iloc[-1] - < 1.1 * self.history_df[self.feature_name].iloc[-3] - ) - def _propose_new_parameter(self, df: pd.DataFrame): """See base class. The update rule is 1) calculate the deviation of the predicted mz values from the observed mz values, From 724085752767f3fcc673f33bd4cb893b7896346b Mon Sep 17 00:00:00 2001 From: odespard Date: Fri, 2 Aug 2024 22:23:41 +0200 Subject: [PATCH 31/36] fixed unit test --- tests/unit_tests/test_workflow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 02e4434a..9de6eee7 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -138,6 +138,8 @@ def calibration_testdata(): + np.sin(mobility_library * 0.05) ) + isotope_intensity_correlation = np.linspace(0, 100, 1000) + return pd.DataFrame( { "mz_library": mz_library, @@ -146,6 +148,7 @@ def calibration_testdata(): "rt_observed": rt_observed, "mobility_library": mobility_library, "mobility_observed": mobility_observed, + "isotope_intensity_correlation": isotope_intensity_correlation, } ).copy() @@ -585,16 +588,13 @@ def test_automatic_ms1_optimizer(): ms1_optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) assert ms1_optimizer.has_converged is True - assert ( - ms1_optimizer.history_df.precursor_count == pd.Series([1000, 1001, 1002]) - ).all() assert ( workflow.optimization_manager.ms1_error == ms1_optimizer.history_df.parameter[ - ms1_optimizer.history_df.precursor_count.idxmax() + ms1_optimizer.history_df.mean_isotope_intensity_correlation.idxmax() ] ) - assert workflow.optimization_manager.classifier_version == 2 + assert workflow.optimization_manager.classifier_version == 0 def test_automatic_mobility_optimizer(): From 84899f8ec662ae94abdb0de94994d94299ff2426 Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 5 Aug 2024 09:21:02 +0200 Subject: [PATCH 32/36] change e2e test for automatic calibration --- tests/e2e_tests/e2e_test_cases.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_tests/e2e_test_cases.yaml b/tests/e2e_tests/e2e_test_cases.yaml index 543870d9..74e842ba 100644 --- a/tests/e2e_tests/e2e_test_cases.yaml +++ b/tests/e2e_tests/e2e_test_cases.yaml @@ -116,7 +116,7 @@ test_cases: target_num_candidates: 3 target_ms1_tolerance: -1 target_ms2_tolerance: -1 - target_rt_tolerance: 200 + target_rt_tolerance: -1 search_initial: initial_num_candidates: 1 initial_ms1_tolerance: 10 From a5d01f2f25e3ac535a031ba7c89a9782360968b0 Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 5 Aug 2024 10:20:20 +0200 Subject: [PATCH 33/36] adjust min_steps and optimization_lock_min_steps in default.yaml --- alphadia/constants/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alphadia/constants/default.yaml b/alphadia/constants/default.yaml index 390c2862..365c1b96 100644 --- 
a/alphadia/constants/default.yaml +++ b/alphadia/constants/default.yaml @@ -96,7 +96,7 @@ search_advanced: calibration: # minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted) - optimization_lock_min_steps: 3 + optimization_lock_min_steps: 0 # Number of precursors searched and scored per batch batch_size: 8000 @@ -108,7 +108,7 @@ calibration: max_steps: 20 # the maximum number of steps that a given optimizer is permitted to take - min_steps: 3 + min_steps: 2 # TODO: remove this parameter final_full_calibration: False From 4f9b0949d4fa6c1da28f2d2fa7163a05a59a467d Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 5 Aug 2024 10:22:29 +0200 Subject: [PATCH 34/36] formatting --- alphadia/workflow/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alphadia/workflow/optimization.py b/alphadia/workflow/optimization.py index 2370af1c..16e2e2b6 100644 --- a/alphadia/workflow/optimization.py +++ b/alphadia/workflow/optimization.py @@ -18,7 +18,7 @@ def __init__( workflow, reporter: None | reporting.Pipeline | reporting.Backend = None, ): - """This class serves as a base class for organizing the search parameter optimization process, which defines the parameters used for search. + """This class serves as a base class for the search parameter optimization process, which defines the parameters used for search. Parameters ---------- From bd55b6cb3736346404682dfa8d9bcba47a818384 Mon Sep 17 00:00:00 2001 From: odespard Date: Mon, 5 Aug 2024 10:59:30 +0200 Subject: [PATCH 35/36] update unit test --- tests/unit_tests/test_workflow.py | 91 ++++++++++--------------------- 1 file changed, 28 insertions(+), 63 deletions(-) diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 9de6eee7..b62737a9 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -664,29 +664,20 @@ def test_targeted_ms2_optimizer(): workflow, ) - assert optimizer.has_converged is False assert optimizer.parameter_name == "ms2_error" - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + for current_step in range(workflow.config["calibration"]["min_steps"]): + assert optimizer.has_converged is False - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 0 + workflow.fdr_manager._current_version += 1 + optimizer.step( + calibration_test_df1, calibration_test_df2, current_step=current_step + ) - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) - - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 1 - - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + assert workflow.optimization_manager.classifier_version == current_step assert optimizer.has_converged is True - assert workflow.optimization_manager.classifier_version == 2 - assert workflow.optimization_manager.ms2_error == optimizer.target_parameter - assert workflow.optimization_manager.classifier_version == 2 def test_targeted_rt_optimizer(): @@ -703,29 +694,20 @@ def test_targeted_rt_optimizer(): workflow, ) - assert optimizer.has_converged is False assert optimizer.parameter_name == "rt_error" - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + 
for current_step in range(workflow.config["calibration"]["min_steps"]): + assert optimizer.has_converged is False - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 0 + workflow.fdr_manager._current_version += 1 + optimizer.step( + calibration_test_df1, calibration_test_df2, current_step=current_step + ) - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) - - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 1 - - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + assert workflow.optimization_manager.classifier_version == current_step assert optimizer.has_converged is True - assert workflow.optimization_manager.classifier_version == 2 - assert workflow.optimization_manager.rt_error == optimizer.target_parameter - assert workflow.optimization_manager.classifier_version == 2 def test_targeted_ms1_optimizer(): @@ -742,29 +724,20 @@ def test_targeted_ms1_optimizer(): workflow, ) - assert optimizer.has_converged is False assert optimizer.parameter_name == "ms1_error" - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) + for current_step in range(workflow.config["calibration"]["min_steps"]): + assert optimizer.has_converged is False - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 0 + workflow.fdr_manager._current_version += 1 + optimizer.step( + calibration_test_df1, calibration_test_df2, current_step=current_step + ) - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) - - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 1 - - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + assert workflow.optimization_manager.classifier_version == current_step assert optimizer.has_converged is True - assert workflow.optimization_manager.classifier_version == 2 - assert workflow.optimization_manager.ms1_error == optimizer.target_parameter - assert workflow.optimization_manager.classifier_version == 2 def test_targeted_mobility_optimizer(): @@ -781,26 +754,18 @@ def test_targeted_mobility_optimizer(): workflow, ) - assert optimizer.has_converged is False assert optimizer.parameter_name == "mobility_error" - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=0) - - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 0 - - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=1) + for current_step in range(workflow.config["calibration"]["min_steps"]): + assert optimizer.has_converged is False - assert optimizer.has_converged is False - assert workflow.optimization_manager.classifier_version == 1 + workflow.fdr_manager._current_version += 1 + optimizer.step( + calibration_test_df1, calibration_test_df2, current_step=current_step + ) - workflow.fdr_manager._current_version += 1 - optimizer.step(calibration_test_df1, calibration_test_df2, current_step=2) + assert workflow.optimization_manager.classifier_version == current_step assert optimizer.has_converged is True - assert 
workflow.optimization_manager.classifier_version == 2 assert workflow.optimization_manager.mobility_error == optimizer.target_parameter - assert workflow.optimization_manager.classifier_version == 2 From b5ea3bb5211baedc854e24e69464baaed4f6a69c Mon Sep 17 00:00:00 2001 From: odespard Date: Wed, 7 Aug 2024 09:07:58 +0200 Subject: [PATCH 36/36] modify_e2e_test_for_automatic_calibration --- tests/e2e_tests/e2e_test_cases.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/e2e_tests/e2e_test_cases.yaml b/tests/e2e_tests/e2e_test_cases.yaml index 74e842ba..4b286aa9 100644 --- a/tests/e2e_tests/e2e_test_cases.yaml +++ b/tests/e2e_tests/e2e_test_cases.yaml @@ -119,9 +119,9 @@ test_cases: target_rt_tolerance: -1 search_initial: initial_num_candidates: 1 - initial_ms1_tolerance: 10 - initial_ms2_tolerance: 15 - initial_rt_tolerance: 300 + initial_ms1_tolerance: 100 + initial_ms2_tolerance: 100 + initial_rt_tolerance: 600 search_output: peptide_level_lfq: true precursor_level_lfq: true
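
Note on the convergence criterion shared by the automatic optimizers in the patches above: an optimizer is only allowed to converge once at least min_steps searches have been performed, at least three feature values have been recorded, and the most recent feature value is less than 1.1 times each of the two preceding values (i.e. the last search improved the tracked feature by less than 10% over both earlier searches). The sketch below restates that rule outside the diff context as a self-contained Python function; has_converged, history, current_step and min_steps are illustrative names for this sketch only and are not the alphadia API (the actual check lives in AutomaticOptimizer._check_convergence and reads self.history_df and the calibration config).

def has_converged(history: list[float], current_step: int, min_steps: int) -> bool:
    # history holds one feature value per completed search, e.g. the precursor
    # count (RT, MS2 and mobility optimizers) or the mean isotope intensity
    # correlation (MS1 optimizer after PATCH 30/36).
    min_steps_reached = current_step >= min_steps - 1
    return (
        min_steps_reached
        and len(history) > 2
        and history[-1] < 1.1 * history[-2]
        and history[-1] < 1.1 * history[-3]
    )

# The third search gained less than 10% over each of the two previous searches,
# so the optimizer may stop (min_steps: 2, as set in default.yaml by PATCH 33/36).
assert has_converged([1000, 1001, 1002], current_step=2, min_steps=2)
# A 20% jump in the latest search keeps the optimization running.
assert not has_converged([1000, 1001, 1200], current_step=2, min_steps=2)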