Commit
Merge pull request #309 from MannLabs/new_dynamic_optimization_lock
New dynamic optimization lock
odespard authored Aug 21, 2024
2 parents dad6ed5 + 696e5bf commit 5dc9ac9
Showing 11 changed files with 547 additions and 363 deletions.
5 changes: 1 addition & 4 deletions alphadia/constants/default.yaml
@@ -98,9 +98,6 @@ search_advanced:

calibration:

- # minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted)
- optimization_lock_min_steps: 0

# Number of precursors searched and scored per batch
batch_size: 8000

@@ -110,7 +107,7 @@ calibration:
# the maximum number of steps that a given optimizer is permitted to take
max_steps: 20

- # the maximum number of steps that a given optimizer is permitted to take
+ # the minimum number of steps that a given optimizer must take before it can be said to have converged
min_steps: 2

# TODO: remove this parameter
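For orientation only (not part of this commit): a minimal sketch of how a min_steps/max_steps pair like the one configured above could gate convergence; the function name and the converged flag are illustrative assumptions, not alphadia API.

# Illustrative sketch only; names are assumptions.
def should_stop(step_count: int, converged: bool, min_steps: int = 2, max_steps: int = 20) -> bool:
    if step_count >= max_steps:
        return True  # hard cap: the optimizer never takes more than max_steps steps
    if converged and step_count >= min_steps:
        return True  # convergence is only accepted after at least min_steps steps
    return False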
12 changes: 6 additions & 6 deletions alphadia/exceptions.py
@@ -37,19 +37,19 @@ class NoPsmFoundError(BusinessError):
_msg = "No psm files accumulated, can't continue"


- class NoRecalibrationTargetError(BusinessError):
- """Raise when no recalibration target is found."""
+ class NoOptimizationLockTargetError(BusinessError):
+ """Raise when the optimization lock target is not found."""

- _error_code = "NO_RECALIBRATION_TARGET"
+ _error_code = "NO_OPTIMIZATION_LOCK_TARGET"

- _msg = "Searched all data without finding recalibration target"
+ _msg = "Searched all data without finding optimization lock target"

- _detail_msg = """Search for raw file failed as not enough precursors were found for calibration.
+ _detail_msg = """Search for raw file failed as not enough precursors were found for calibration and optimization.
This can have the following reasons:
1. The sample was empty and therefore no precursors were found.
2. The sample contains only very few precursors.
For small libraries, try to set recalibration_target to a lower value.
- For large libraries, try to reduce the library size and reduce the calibration MS1 and MS2 tolerance.
+ For large libraries, try to reduce the library size and reduce the initial MS1 and MS2 tolerance.
3. There was a fundamental issue with search parameters."""


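As an illustration of the renamed error (assumed usage, not taken from this commit), a caller could handle it like any other BusinessError; the surrounding workflow call and logger are assumptions.

# Illustrative sketch only; the calling context is an assumption.
from alphadia.exceptions import NoOptimizationLockTargetError

try:
    workflow.search_parameter_optimization()  # assumed to raise if the optimization lock target is never reached
except NoOptimizationLockTargetError as err:
    logger.error(f"Not enough precursors for calibration and optimization: {err}")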
96 changes: 83 additions & 13 deletions alphadia/outputtransform.py
@@ -297,6 +297,7 @@ class SearchPlanOutput:
PSM_INPUT = "psm"
PRECURSOR_OUTPUT = "precursors"
STAT_OUTPUT = "stat"
INTERNAL_OUTPUT = "internal"
PG_OUTPUT = "protein_groups"
LIBRARY_OUTPUT = "speclib.mbr"
TRANSFER_OUTPUT = "speclib.transfer"
@@ -365,6 +366,7 @@ def build(
folder_list, save=False, base_spec_lib=base_spec_lib
)
_ = self.build_stat_df(folder_list, psm_df=psm_df, save=True)
_ = self.build_internal_df(folder_list, save=True)
_ = self.build_lfq_tables(folder_list, psm_df=psm_df, save=True)
_ = self.build_library(
base_spec_lib,
@@ -678,6 +680,50 @@ def build_stat_df(

return stat_df

def build_internal_df(
self,
folder_list: list[str],
save: bool = True,
):
"""Build internal data table from a list of seach outputs
Parameters
----------
folder_list: List[str]
List of folders containing the search outputs
save: bool
Save the precursor table to disk
Returns
-------
stat_df: pd.DataFrame
Precursor table
"""
logger.progress("Building internal statistics")

internal_df_list = []
for folder in folder_list:
internal_df_list.append(
_build_run_internal_df(
folder,
)
)

internal_df = pd.concat(internal_df_list)

if save:
logger.info("Writing internal output to disk")
write_df(
internal_df,
os.path.join(self.output_folder, self.INTERNAL_OUTPUT),
file_format="tsv",
)

return internal_df
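A hedged usage sketch of the new internal table (not part of the commit); the SearchPlanOutput constructor arguments and the folder paths are assumptions.

# Illustrative sketch only; constructor arguments and paths are assumptions.
output = SearchPlanOutput(config, output_folder)
internal_df = output.build_internal_df(
    folder_list=["/path/to/output/run_01", "/path/to/output/run_02"],
    save=True,  # presumably writes "<output_folder>/internal.tsv" via write_df(..., file_format="tsv")
)
print(internal_df)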

def build_lfq_tables(
self,
folder_list: list[str],
@@ -850,9 +896,6 @@ def _build_run_stat_df(
optimization_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
)
- timing_manager_path = os.path.join(
- folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
- )

if channels is None:
channels = [0]
Expand Down Expand Up @@ -894,20 +937,47 @@ def _build_run_stat_df(
base_dict["rt_error"] = np.nan
base_dict["mobility_error"] = np.nan

- if os.path.exists(timing_manager_path):
- timing_manager = manager.TimingManager(path=timing_manager_path)
- base_dict["optimization_duration"] = timing_manager.optimization["duration"]
- base_dict["extraction_duration"] = timing_manager.extraction["duration"]
- else:
- logger.warning(f"Error reading timing manager for {raw_name}")
- base_dict["optimization_duration"] = np.nan
- base_dict["extraction_duration"] = np.nan
- out_df.append(base_dict)
- return pd.DataFrame(out_df)

out_df.append(base_dict)

return pd.DataFrame(out_df)


def _build_run_internal_df(
folder_path: str,
):
"""Build internal dataframe for a single run.
Parameters
----------
folder_path: str
Path (from the base directory of the output_folder attribute of the Plan class) to the directory containing the raw file and the managers
Returns
-------
pd.DataFrame
Dataframe containing the internal statistics for the run
"""
timing_manager_path = os.path.join(
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
)
raw_name = os.path.basename(folder_path)

internal_dict = {
"run": raw_name,
}

if os.path.exists(timing_manager_path):
timing_manager = manager.TimingManager(path=timing_manager_path)
for key in timing_manager.timings:
internal_dict[f"duration_{key}"] = [timing_manager.timings[key]["duration"]]

else:
logger.warning(f"Error reading timing manager for {raw_name}")

return pd.DataFrame(internal_dict)
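Given the two stages timed in planning.py below ("optimization" and "extraction"), the frame returned here would plausibly contain one row per run with a duration_<stage> column per timed stage; a sketch with made-up values (durations are in minutes, see the TimingManager changes below):

# Illustrative values only; column names follow the f"duration_{key}" pattern above.
import pandas as pd

example = pd.DataFrame(
    {"run": ["run_01"], "duration_optimization": [12.4], "duration_extraction": [35.7]}
)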


def perform_protein_fdr(psm_df):
9 changes: 8 additions & 1 deletion alphadia/planning.py
@@ -330,9 +330,16 @@ def run(
logger.info(f"No existing quantification found for {raw_name}")

workflow.load(dia_path, speclib)
- workflow.calibration()

workflow.timing_manager.set_start_time("optimization")
workflow.search_parameter_optimization()
workflow.timing_manager.set_end_time("optimization")

workflow.timing_manager.set_start_time("extraction")
psm_df, frag_df = workflow.extraction()
workflow.timing_manager.set_end_time("extraction")
workflow.timing_manager.save()

psm_df = psm_df[psm_df["qval"] <= self.config["fdr"]["fdr"]]

if self.config["multiplexing"]["enabled"]:
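The explicit set_start_time/set_end_time pairs above could also be wrapped in a small context manager; the sketch below is an assumption about an alternative shape, not code from this repository.

# Hypothetical convenience wrapper around the TimingManager API introduced in this commit.
from contextlib import contextmanager

@contextmanager
def timed_stage(timing_manager, stage: str):
    timing_manager.set_start_time(stage)
    try:
        yield
    finally:
        timing_manager.set_end_time(stage)

# Usage (illustrative):
# with timed_stage(workflow.timing_manager, "extraction"):
#     psm_df, frag_df = workflow.extraction()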
2 changes: 1 addition & 1 deletion alphadia/workflow/base.py
@@ -102,9 +102,9 @@ def load(
self._calibration_manager.disable_mobility_calibration()

# initialize the optimization manager

self._optimization_manager = manager.OptimizationManager(
self.config,
gradient_length=self.dia_data.rt_values.max(),
path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH),
load_from_file=self.config["general"]["reuse_calibration"],
figure_path=os.path.join(self.path, self.FIGURE_PATH),
55 changes: 29 additions & 26 deletions alphadia/workflow/manager.py
@@ -455,6 +455,7 @@ class OptimizationManager(BaseManager):
def __init__(
self,
config: None | dict = None,
gradient_length: None | float = None,
path: None | str = None,
load_from_file: bool = True,
**kwargs,
@@ -464,10 +465,15 @@ def __init__(
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

if not self.is_loaded_from_file:
rt_error = (
config["search_initial"]["initial_rt_tolerance"]
if config["search_initial"]["initial_rt_tolerance"] > 1
else config["search_initial"]["initial_rt_tolerance"] * gradient_length
)
initial_parameters = {
"ms1_error": config["search_initial"]["initial_ms1_tolerance"],
"ms2_error": config["search_initial"]["initial_ms2_tolerance"],
"rt_error": config["search_initial"]["initial_rt_tolerance"],
"rt_error": rt_error,
"mobility_error": config["search_initial"][
"initial_mobility_tolerance"
],
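Worked example of the new rt_error logic (values are illustrative): an initial_rt_tolerance greater than 1 is used as an absolute retention-time tolerance, while a value of at most 1 is treated as a fraction of gradient_length (which base.py passes in as dia_data.rt_values.max()).

# Illustrative values only.
gradient_length = 7200.0  # e.g. dia_data.rt_values.max()

initial_rt_tolerance = 300  # > 1: used as-is
rt_error = initial_rt_tolerance if initial_rt_tolerance > 1 else initial_rt_tolerance * gradient_length  # -> 300

initial_rt_tolerance = 0.5  # <= 1: interpreted as a fraction of the gradient
rt_error = initial_rt_tolerance if initial_rt_tolerance > 1 else initial_rt_tolerance * gradient_length  # -> 3600.0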
@@ -487,7 +493,6 @@ def fit(self, update_dict):
"""Update the parameters dict with the values in update_dict."""
self.__dict__.update(update_dict)
self.is_fitted = True
- self.save()

def predict(self):
"""Return the parameters dict."""
@@ -761,34 +766,32 @@ def __init__(
load_from_file: bool = True,
**kwargs,
):
"""Contains and updates timing information for the portions of the workflow."""
super().__init__(path=path, load_from_file=load_from_file, **kwargs)
self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

if not self.is_loaded_from_file:
- self.__dict__.update(
- {
- "optimization": {
- "start": None,
- "end": None,
- "duration": None,
- },
- "extraction": {
- "start": None,
- "end": None,
- "duration": None,
- },
- }
- )
self.timings = {}

- def start(self, workflow_stage: str):
- self.__dict__.update({workflow_stage: {"start": pd.Timestamp.now()}})
- self.save()
def set_start_time(self, workflow_stage: str):
"""Stores the start time of the given stage of the workflow in the timings attribute. Also saves the timing manager to disk.
Parameters
----------
workflow_stage : str
The name under which the timing will be stored in the timings dict
"""
self.timings.update({workflow_stage: {"start": pd.Timestamp.now()}})

- def end(self, workflow_stage: str):
- self.__dict__[workflow_stage]["end"] = pd.Timestamp.now()
- self.__dict__[workflow_stage]["duration"] = (
- self.__dict__[workflow_stage]["end"]
- - self.__dict__[workflow_stage]["start"]
def set_end_time(self, workflow_stage: str):
"""Stores the end time of the given stage of the workflow in the timings attribute and calculates the duration. Also saves the timing manager to disk.
Parameters
----------
workflow_stage : str
The name under which the timing will be stored in the timings dict
"""
self.timings[workflow_stage]["end"] = pd.Timestamp.now()
self.timings[workflow_stage]["duration"] = (
self.timings[workflow_stage]["end"] - self.timings[workflow_stage]["start"]
).total_seconds() / 60
self.save()
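A minimal usage sketch of the reworked TimingManager (not part of the commit); the constructor arguments and the stage name are assumptions.

# Illustrative sketch only; constructor arguments are assumptions.
timing_manager = manager.TimingManager(path="timing.manager.pkl", load_from_file=False)
timing_manager.set_start_time("extraction")
# ... run the stage being timed ...
timing_manager.set_end_time("extraction")
print(timing_manager.timings["extraction"]["duration"])  # duration in minutes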
