New dynamic optimization lock #309

Merged: 20 commits, Aug 21, 2024
5 changes: 1 addition & 4 deletions alphadia/constants/default.yaml
@@ -98,9 +98,6 @@ search_advanced:

calibration:

-  # minimum number of steps taken during the optimization lock (during which the elution groups used for optimization are extracted)
-  optimization_lock_min_steps: 0
-
  # Number of precursors searched and scored per batch
  batch_size: 8000

@@ -110,7 +107,7 @@ calibration:
  # the maximum number of steps that a given optimizer is permitted to take
  max_steps: 20

-  # the maximum number of steps that a given optimizer is permitted to take
+  # the minimum number of steps that a given optimizer must take before it can be said to have converged
  min_steps: 2

# TODO: remove this parameter
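As a side note on these two settings: a minimal sketch of the stopping rule they imply, with hypothetical names (the actual logic lives in the optimizer classes, not in this config file):

def should_stop(steps_taken: int, criterion_met: bool,
                min_steps: int = 2, max_steps: int = 20) -> bool:
    # Hard cap: an optimizer never takes more than max_steps steps.
    if steps_taken >= max_steps:
        return True
    # Convergence may only be declared after at least min_steps steps.
    return criterion_met and steps_taken >= min_steps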
12 changes: 6 additions & 6 deletions alphadia/exceptions.py
@@ -37,19 +37,19 @@ class NoPsmFoundError(BusinessError):
    _msg = "No psm files accumulated, can't continue"


-class NoRecalibrationTargetError(BusinessError):
-    """Raise when no recalibration target is found."""
+class NoOptimizationLockTargetError(BusinessError):
+    """Raise when the optimization lock target is not found."""

-    _error_code = "NO_RECALIBRATION_TARGET"
+    _error_code = "NO_OPTIMIZATION_LOCK_TARGET"

-    _msg = "Searched all data without finding recalibration target"
+    _msg = "Searched all data without finding optimization lock target"

-    _detail_msg = """Search for raw file failed as not enough precursors were found for calibration.
+    _detail_msg = """Search for raw file failed as not enough precursors were found for calibration and optimization.
This can have the following reasons:
1. The sample was empty and therefore no precursors were found.
2. The sample contains only very few precursors.
For small libraries, try to set recalibration_target to a lower value.
-For large libraries, try to reduce the library size and reduce the calibration MS1 and MS2 tolerance.
+For large libraries, try to reduce the library size and reduce the initial MS1 and MS2 tolerance.
3. There was a fundamental issue with search parameters."""


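For context, a minimal sketch of how the renamed error would surface during a search; the raise site is an assumption, only the class rename above is part of this diff:

from alphadia.exceptions import NoOptimizationLockTargetError

def assemble_optimization_lock(n_precursors_found: int, target: int) -> None:
    # Hypothetical check; the real one lives in the optimization lock logic.
    if n_precursors_found < target:
        raise NoOptimizationLockTargetError()  # carries the _msg and _detail_msg defined above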
96 changes: 83 additions & 13 deletions alphadia/outputtransform.py
@@ -297,6 +297,7 @@ class SearchPlanOutput:
    PSM_INPUT = "psm"
    PRECURSOR_OUTPUT = "precursors"
    STAT_OUTPUT = "stat"
+    INTERNAL_OUTPUT = "internal"
    PG_OUTPUT = "protein_groups"
    LIBRARY_OUTPUT = "speclib.mbr"
    TRANSFER_OUTPUT = "speclib.transfer"
@@ -365,6 +366,7 @@ def build(
            folder_list, save=False, base_spec_lib=base_spec_lib
        )
        _ = self.build_stat_df(folder_list, psm_df=psm_df, save=True)
+        _ = self.build_internal_df(folder_list, save=True)
        _ = self.build_lfq_tables(folder_list, psm_df=psm_df, save=True)
        _ = self.build_library(
            base_spec_lib,
@@ -678,6 +680,50 @@ def build_stat_df(

        return stat_df

+    def build_internal_df(
+        self,
+        folder_list: list[str],
+        save: bool = True,
+    ):
+        """Build internal data table from a list of search outputs.
+
+        Parameters
+        ----------
+        folder_list: List[str]
+            List of folders containing the search outputs
+
+        save: bool
+            Save the internal table to disk
+
+        Returns
+        -------
+        internal_df: pd.DataFrame
+            Internal statistics table
+        """
+        logger.progress("Building internal statistics")
+
+        internal_df_list = []
+        for folder in folder_list:
+            internal_df_list.append(
+                _build_run_internal_df(
+                    folder,
+                )
+            )
+
+        internal_df = pd.concat(internal_df_list)
+
+        if save:
+            logger.info("Writing internal output to disk")
+            write_df(
+                internal_df,
+                os.path.join(self.output_folder, self.INTERNAL_OUTPUT),
+                file_format="tsv",
+            )
+
+        return internal_df

    def build_lfq_tables(
        self,
        folder_list: list[str],
Expand Down Expand Up @@ -850,9 +896,6 @@ def _build_run_stat_df(
    optimization_manager_path = os.path.join(
        folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
    )
-    timing_manager_path = os.path.join(
-        folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
-    )

    if channels is None:
        channels = [0]
Expand Down Expand Up @@ -894,20 +937,47 @@ def _build_run_stat_df(
base_dict["rt_error"] = np.nan
base_dict["mobility_error"] = np.nan

if os.path.exists(timing_manager_path):
timing_manager = manager.TimingManager(path=timing_manager_path)
out_df.append(base_dict)

base_dict["optimization_duration"] = timing_manager.optimization["duration"]
base_dict["extraction_duration"] = timing_manager.extraction["duration"]
return pd.DataFrame(out_df)

else:
logger.warning(f"Error reading timing manager for {raw_name}")
base_dict["optimization_duration"] = np.nan
base_dict["extraction_duration"] = np.nan

out_df.append(base_dict)
def _build_run_internal_df(
odespard marked this conversation as resolved.
Show resolved Hide resolved
folder_path: str,
):
"""Build stat dataframe for a single run.

return pd.DataFrame(out_df)
Parameters
----------

folder_path: str
Path (from the base directory of the output_folder attribute of the Plan class) to the directory containing the raw file and the managers


Returns
-------
pd.DataFrame
Dataframe containing the statistics

"""
timing_manager_path = os.path.join(
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
)
raw_name = os.path.basename(folder_path)

internal_dict = {
"run": raw_name,
}

if os.path.exists(timing_manager_path):
timing_manager = manager.TimingManager(path=timing_manager_path)
for key in timing_manager.timings:
internal_dict[f"duration_{key}"] = [timing_manager.timings[key]["duration"]]

else:
logger.warning(f"Error reading timing manager for {raw_name}")

return pd.DataFrame(internal_dict)


def perform_protein_fdr(psm_df):
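To make the new output concrete: a sketch of reading the resulting internal.tsv, assuming the two stages timed in planning.py below ("optimization" and "extraction") and a purely illustrative output path:

import pandas as pd

internal_df = pd.read_csv("output/internal.tsv", sep="\t")
print(internal_df.columns.tolist())
# Expected, given the duration_{key} pattern in _build_run_internal_df above:
# ['run', 'duration_optimization', 'duration_extraction']  (durations in minutes)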
9 changes: 8 additions & 1 deletion alphadia/planning.py
@@ -321,9 +321,16 @@ def run(
logger.info(f"No existing quantification found for {raw_name}")

workflow.load(dia_path, speclib)
workflow.calibration()

workflow.timing_manager.set_start_time("optimization")
workflow.search_parameter_optimization()
workflow.timing_manager.set_end_time("optimization")

workflow.timing_manager.set_start_time("extraction")
psm_df, frag_df = workflow.extraction()
workflow.timing_manager.set_end_time("extraction")
workflow.timing_manager.save()
odespard marked this conversation as resolved.
Show resolved Hide resolved

psm_df = psm_df[psm_df["qval"] <= self.config["fdr"]["fdr"]]

if self.config["multiplexing"]["enabled"]:
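A design note on the paired set_start_time/set_end_time calls: they could equally be wrapped in a context manager. A minimal sketch, not part of this PR, assuming the TimingManager API stays as introduced below:

from contextlib import contextmanager

@contextmanager
def timed_stage(timing_manager, stage: str):
    # Times a workflow stage via the TimingManager introduced in this PR.
    timing_manager.set_start_time(stage)
    try:
        yield
    finally:
        timing_manager.set_end_time(stage)

# Usage equivalent to the calls in the diff above:
# with timed_stage(workflow.timing_manager, "optimization"):
#     workflow.search_parameter_optimization()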
2 changes: 1 addition & 1 deletion alphadia/workflow/base.py
@@ -102,9 +102,9 @@ def load(
        self._calibration_manager.disable_mobility_calibration()

        # initialize the optimization manager
-
        self._optimization_manager = manager.OptimizationManager(
            self.config,
+            gradient_length=self.dia_data.rt_values.max(),
            path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH),
            load_from_file=self.config["general"]["reuse_calibration"],
            figure_path=os.path.join(self.path, self.FIGURE_PATH),
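The gradient length passed here is simply the maximum retention time observed in the data; a toy example (the numpy array stands in for dia_data.rt_values, assumed to be in seconds):

import numpy as np

rt_values = np.array([0.0, 310.5, 1250.0, 3599.8])  # toy retention times
gradient_length = rt_values.max()  # 3599.8, as passed to OptimizationManager above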
55 changes: 29 additions & 26 deletions alphadia/workflow/manager.py
@@ -455,6 +455,7 @@ class OptimizationManager(BaseManager):
    def __init__(
        self,
        config: None | dict = None,
+        gradient_length: None | float = None,
        path: None | str = None,
        load_from_file: bool = True,
        **kwargs,
@@ -464,10 +465,15 @@ def __init__(
        self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

        if not self.is_loaded_from_file:
+            rt_error = (
+                config["search_initial"]["initial_rt_tolerance"]
+                if config["search_initial"]["initial_rt_tolerance"] > 1
+                else config["search_initial"]["initial_rt_tolerance"] * gradient_length
+            )
            initial_parameters = {
                "ms1_error": config["search_initial"]["initial_ms1_tolerance"],
                "ms2_error": config["search_initial"]["initial_ms2_tolerance"],
-                "rt_error": config["search_initial"]["initial_rt_tolerance"],
+                "rt_error": rt_error,
                "mobility_error": config["search_initial"][
                    "initial_mobility_tolerance"
                ],
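The new rt_error logic reads initial_rt_tolerance as an absolute tolerance when it is greater than 1 and as a fraction of the gradient length otherwise. A quick worked example, assuming rt values in seconds and an illustrative gradient length:

def initial_rt_error(initial_rt_tolerance: float, gradient_length: float) -> float:
    # Mirrors the conditional above: absolute if > 1, else fractional.
    return (
        initial_rt_tolerance
        if initial_rt_tolerance > 1
        else initial_rt_tolerance * gradient_length
    )

assert initial_rt_error(200.0, 3600.0) == 200.0   # absolute tolerance
assert initial_rt_error(0.5, 3600.0) == 1800.0    # half the gradient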
@@ -487,7 +493,6 @@ def fit(self, update_dict):
"""Update the parameters dict with the values in update_dict."""
self.__dict__.update(update_dict)
self.is_fitted = True
self.save()

    def predict(self):
        """Return the parameters dict."""
@@ -761,34 +766,32 @@ def __init__(
        load_from_file: bool = True,
        **kwargs,
    ):
+        """Contains and updates timing information for the portions of the workflow."""
        super().__init__(path=path, load_from_file=load_from_file, **kwargs)
        self.reporter.log_string(f"Initializing {self.__class__.__name__}")
        self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

        if not self.is_loaded_from_file:
-            self.__dict__.update(
-                {
-                    "optimization": {
-                        "start": None,
-                        "end": None,
-                        "duration": None,
-                    },
-                    "extraction": {
-                        "start": None,
-                        "end": None,
-                        "duration": None,
-                    },
-                }
-            )
+            self.timings = {}

-    def start(self, workflow_stage: str):
-        self.__dict__.update({workflow_stage: {"start": pd.Timestamp.now()}})
-        self.save()
+    def set_start_time(self, workflow_stage: str):
+        """Stores the start time of the given stage of the workflow in the timings attribute.
+
+        Parameters
+        ----------
+        workflow_stage : str
+            The name under which the timing will be stored in the timings dict
+        """
+        self.timings.update({workflow_stage: {"start": pd.Timestamp.now()}})

-    def end(self, workflow_stage: str):
-        self.__dict__[workflow_stage]["end"] = pd.Timestamp.now()
-        self.__dict__[workflow_stage]["duration"] = (
-            self.__dict__[workflow_stage]["end"]
-            - self.__dict__[workflow_stage]["start"]
+    def set_end_time(self, workflow_stage: str):
+        """Stores the end time of the given stage of the workflow in the timings attribute and calculates the duration. Also saves the timing manager to disk.
+
+        Parameters
+        ----------
+        workflow_stage : str
+            The name under which the timing will be stored in the timings dict
+        """
+        self.timings[workflow_stage]["end"] = pd.Timestamp.now()
+        self.timings[workflow_stage]["duration"] = (
+            self.timings[workflow_stage]["end"] - self.timings[workflow_stage]["start"]
        ).total_seconds() / 60
        self.save()
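A minimal usage sketch of the reworked TimingManager API; the path and stage name are illustrative:

tm = TimingManager(path="/tmp/timing_manager.pkl", load_from_file=False)
tm.set_start_time("extraction")
...  # the timed work happens here
tm.set_end_time("extraction")  # computes the duration and saves to disk
print(tm.timings["extraction"]["duration"])  # minutes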