diff --git a/alphadia/outputtransform.py b/alphadia/outputtransform.py
index f9d900e6..d98ad9cd 100644
--- a/alphadia/outputtransform.py
+++ b/alphadia/outputtransform.py
@@ -1,6 +1,7 @@
 # native imports
 import logging
 import os
+from collections import defaultdict
 from collections.abc import Iterator
 
 import directlfq.config as lfqconfig
@@ -25,6 +26,7 @@
 )
 from alphadia.transferlearning.train import FinetuneManager
 from alphadia.workflow import manager, peptidecentric
+from alphadia.workflow.managers.raw_file_manager import RawFileManager
 
 logger = logging.getLogger()
 
@@ -938,61 +940,59 @@ def _build_run_stat_df(
         Dataframe containing the statistics
 
     """
 
-    optimization_manager_path = os.path.join(
-        folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
-    )
-
-    calibration_manager_path = os.path.join(
-        folder, peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PATH
-    )
     if channels is None:
         channels = [0]
-    out_df = []
+    all_stats = []
 
     for channel in channels:
         channel_df = run_df[run_df["channel"] == channel]
 
-        base_dict = {
+        stats = {
             "run": raw_name,
             "channel": channel,
             "precursors": len(channel_df),
             "proteins": channel_df["pg"].nunique(),
         }
 
+        stats["fwhm_rt"] = np.nan
         if "cycle_fwhm" in channel_df.columns:
-            base_dict["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"])
+            stats["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"])
 
+        stats["fwhm_mobility"] = np.nan
        if "mobility_fwhm" in channel_df.columns:
-            base_dict["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])
+            stats["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])
 
         # collect optimization stats
-        base_dict["optimization.ms2_error"] = np.nan
-        base_dict["optimization.ms1_error"] = np.nan
-        base_dict["optimization.rt_error"] = np.nan
-        base_dict["optimization.mobility_error"] = np.nan
-
-        if os.path.exists(optimization_manager_path):
+        optimization_stats = defaultdict(lambda: np.nan)
+        if os.path.exists(
+            optimization_manager_path := os.path.join(
+                folder,
+                peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME,
+            )
+        ):
             optimization_manager = manager.OptimizationManager(
                 path=optimization_manager_path
             )
-            base_dict["optimization.ms2_error"] = optimization_manager.ms2_error
-            base_dict["optimization.ms1_error"] = optimization_manager.ms1_error
-            base_dict["optimization.rt_error"] = optimization_manager.rt_error
-            base_dict["optimization.mobility_error"] = (
-                optimization_manager.mobility_error
-            )
-
+            optimization_stats["ms2_error"] = optimization_manager.ms2_error
+            optimization_stats["ms1_error"] = optimization_manager.ms1_error
+            optimization_stats["rt_error"] = optimization_manager.rt_error
+            optimization_stats["mobility_error"] = optimization_manager.mobility_error
         else:
             logger.warning(f"Error reading optimization manager for {raw_name}")
 
-        # collect calibration stats
-        base_dict["calibration.ms2_median_accuracy"] = np.nan
-        base_dict["calibration.ms2_median_precision"] = np.nan
-        base_dict["calibration.ms1_median_accuracy"] = np.nan
-        base_dict["calibration.ms1_median_precision"] = np.nan
+        prefix = "optimization."
+        for key in ["ms2_error", "ms1_error", "rt_error", "mobility_error"]:
+            stats[f"{prefix}{key}"] = optimization_stats[key]
 
-        if os.path.exists(calibration_manager_path):
+        # collect calibration stats
+        calibration_stats = defaultdict(lambda: np.nan)
+        if os.path.exists(
+            calibration_manager_path := os.path.join(
+                folder,
+                peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PKL_NAME,
+            )
+        ):
             calibration_manager = manager.CalibrationManager(
                 path=calibration_manager_path
             )
@@ -1002,10 +1002,10 @@ def _build_run_stat_df(
                     "fragment", "mz"
                 )
             ) and (fragment_mz_metrics := fragment_mz_estimator.metrics):
-                base_dict["calibration.ms2_median_accuracy"] = fragment_mz_metrics[
+                calibration_stats["ms2_median_accuracy"] = fragment_mz_metrics[
                     "median_accuracy"
                 ]
-                base_dict["calibration.ms2_median_precision"] = fragment_mz_metrics[
+                calibration_stats["ms2_median_precision"] = fragment_mz_metrics[
                     "median_precision"
                 ]
 
@@ -1014,19 +1014,65 @@ def _build_run_stat_df(
                     "precursor", "mz"
                 )
             ) and (precursor_mz_metrics := precursor_mz_estimator.metrics):
-                base_dict["calibration.ms1_median_accuracy"] = precursor_mz_metrics[
+                calibration_stats["ms1_median_accuracy"] = precursor_mz_metrics[
                     "median_accuracy"
                 ]
-                base_dict["calibration.ms1_median_precision"] = precursor_mz_metrics[
+                calibration_stats["ms1_median_precision"] = precursor_mz_metrics[
                     "median_precision"
                 ]
 
         else:
             logger.warning(f"Error reading calibration manager for {raw_name}")
 
-        out_df.append(base_dict)
+        prefix = "calibration."
+        for key in [
+            "ms2_median_accuracy",
+            "ms2_median_precision",
+            "ms1_median_accuracy",
+            "ms1_median_precision",
+        ]:
+            stats[f"{prefix}{key}"] = calibration_stats[key]
+
+        # collect raw stats
+        raw_stats = defaultdict(lambda: np.nan)
+        if os.path.exists(
+            raw_file_manager_path := os.path.join(
+                folder, peptidecentric.PeptideCentricWorkflow.RAW_FILE_MANAGER_PKL_NAME
+            )
+        ):
+            raw_stats = RawFileManager(
+                path=raw_file_manager_path, load_from_file=True
+            ).stats
+        else:
+            logger.warning(f"Error reading raw file manager for {raw_name}")
+
+        # deliberately mapping explicitly to avoid coupling raw_stats to the output too tightly
+        prefix = "raw."
+
+        stats[f"{prefix}gradient_min_m"] = raw_stats["rt_limit_min"] / 60
+        stats[f"{prefix}gradient_max_m"] = raw_stats["rt_limit_max"] / 60
+        stats[f"{prefix}gradient_length_m"] = (
+            raw_stats["rt_limit_max"] - raw_stats["rt_limit_min"]
+        ) / 60
+        for key in [
+            "cycle_length",
+            "cycle_duration",
+            "cycle_number",
+            "msms_range_min",
+            "msms_range_max",
+        ]:
+            stats[f"{prefix}{key}"] = raw_stats[key]
+
+        all_stats.append(stats)
+
+    return pd.DataFrame(all_stats)
+
 
-    return pd.DataFrame(out_df)
+def _get_value_or_nan(d: dict, key: str):
+    try:
+        return d[key]
+    except KeyError:
+        return np.nan
 
 
 def _build_run_internal_df(
@@ -1048,7 +1094,7 @@ def _build_run_internal_df(
     """
     timing_manager_path = os.path.join(
-        folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
+        folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME
     )
 
     raw_name = os.path.basename(folder_path)
diff --git a/alphadia/workflow/base.py b/alphadia/workflow/base.py
index a0a8a592..06e64d14 100644
--- a/alphadia/workflow/base.py
+++ b/alphadia/workflow/base.py
@@ -23,12 +23,12 @@ class WorkflowBase:
     It also initializes the calibration_manager and fdr_manager for the workflow.
""" - RAW_FILE_MANAGER_PATH = "raw_file_manager.pkl" - CALIBRATION_MANAGER_PATH = "calibration_manager.pkl" - OPTIMIZATION_MANAGER_PATH = "optimization_manager.pkl" - TIMING_MANAGER_PATH = "timing_manager.pkl" - FDR_MANAGER_PATH = "fdr_manager.pkl" - FIGURE_PATH = "figures" + RAW_FILE_MANAGER_PKL_NAME = "raw_file_manager.pkl" + CALIBRATION_MANAGER_PKL_NAME = "calibration_manager.pkl" + OPTIMIZATION_MANAGER_PKL_NAME = "optimization_manager.pkl" + TIMING_MANAGER_PKL_NAME = "timing_manager.pkl" + FDR_MANAGER_PKL_NAME = "fdr_manager.pkl" + FIGURES_FOLDER_NAME = "figures" def __init__( self, @@ -87,7 +87,7 @@ def load( self.reporter.log_event("loading_data", {"progress": 0}) raw_file_manager = RawFileManager( self.config, - path=os.path.join(self.path, self.RAW_FILE_MANAGER_PATH), + path=os.path.join(self.path, self.RAW_FILE_MANAGER_PKL_NAME), reporter=self.reporter, ) @@ -102,7 +102,7 @@ def load( # initialize the calibration manager self._calibration_manager = manager.CalibrationManager( self.config["calibration_manager"], - path=os.path.join(self.path, self.CALIBRATION_MANAGER_PATH), + path=os.path.join(self.path, self.CALIBRATION_MANAGER_PKL_NAME), load_from_file=self.config["general"]["reuse_calibration"], reporter=self.reporter, ) @@ -115,14 +115,14 @@ def load( self._optimization_manager = manager.OptimizationManager( self.config, gradient_length=self.dia_data.rt_values.max(), - path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH), + path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PKL_NAME), load_from_file=self.config["general"]["reuse_calibration"], - figure_path=os.path.join(self.path, self.FIGURE_PATH), + figure_path=os.path.join(self.path, self.FIGURES_FOLDER_NAME), reporter=self.reporter, ) self._timing_manager = manager.TimingManager( - path=os.path.join(self.path, self.TIMING_MANAGER_PATH), + path=os.path.join(self.path, self.TIMING_MANAGER_PKL_NAME), load_from_file=self.config["general"]["reuse_calibration"], ) diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index ea70d2f2..04c12bbf 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -83,51 +83,51 @@ def is_fitted(self, value): def save(self): """Save the state to pickle file.""" - if self.path is not None: - try: - with open(self.path, "wb") as f: - pickle.dump(self, f) - except Exception as e: - self.reporter.log_string( - f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}", - verbosity="error", - ) - # Log the full traceback + if self.path is None: + return - self.reporter.log_string( - f"Traceback: {traceback.format_exc()}", verbosity="error" - ) + try: + with open(self.path, "wb") as f: + pickle.dump(self, f) + except Exception as e: + self.reporter.log_string( + f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}", + verbosity="error", + ) + + self.reporter.log_string( + f"Traceback: {traceback.format_exc()}", verbosity="error" + ) def load(self): """Load the state from pickle file.""" - if self.path is not None: - if os.path.exists(self.path): - try: - with open(self.path, "rb") as f: - loaded_state = pickle.load(f) - - if loaded_state._version == self._version: - self.__dict__.update(loaded_state.__dict__) - self.is_loaded_from_file = True - else: - self.reporter.log_string( - f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. 
Will not load.", - verbosity="warning", - ) - except Exception: + if self.path is None or not os.path.exists(self.path): + self.reporter.log_string( + f"{self.__class__.__name__} not found at: {self.path}", + verbosity="warning", + ) + return + + try: + with open(self.path, "rb") as f: + loaded_state = pickle.load(f) + + if loaded_state._version == self._version: + self.__dict__.update(loaded_state.__dict__) + self.is_loaded_from_file = True self.reporter.log_string( - f"Failed to load {self.__class__.__name__} from {self.path}", - verbosity="error", + f"Loaded {self.__class__.__name__} from {self.path}" ) else: self.reporter.log_string( - f"Loaded {self.__class__.__name__} from {self.path}" + f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. Will not load.", + verbosity="warning", ) - else: - self.reporter.log_string( - f"{self.__class__.__name__} not found at: {self.path}", - verbosity="warning", - ) + except Exception: + self.reporter.log_string( + f"Failed to load {self.__class__.__name__} from {self.path}", + verbosity="error", + ) def fit(self): """Fit the workflow property of the manager.""" diff --git a/alphadia/workflow/managers/raw_file_manager.py b/alphadia/workflow/managers/raw_file_manager.py index 0a4f369b..2d7b1776 100644 --- a/alphadia/workflow/managers/raw_file_manager.py +++ b/alphadia/workflow/managers/raw_file_manager.py @@ -24,11 +24,14 @@ def __init__( self.stats = {} # needs to be before super().__init__ to avoid overwriting loaded values super().__init__(path=path, load_from_file=load_from_file, **kwargs) - self.reporter.log_string(f"Initializing {self.__class__.__name__}") - self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"}) self._config: Config = config + # deliberately not saving the actual raw data here to avoid the saved manager file being too large + + self.reporter.log_string(f"Initializing {self.__class__.__name__}") + self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"}) + def get_dia_data_object( self, dia_data_path: str ) -> bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRaw: @@ -121,8 +124,6 @@ def _calc_stats( stats["rt_limit_min"] = rt_values.min() stats["rt_limit_max"] = rt_values.max() - stats["rt_duration_sec"] = rt_values.max() - rt_values.min() - cycle_length = cycle.shape[1] stats["cycle_length"] = cycle_length stats["cycle_duration"] = np.diff(rt_values[::cycle_length]).mean() @@ -138,13 +139,13 @@ def _calc_stats( def _log_stats(self): """Log the statistics calculated from the DIA data.""" - rt_duration_min = self.stats["rt_duration_sec"] / 60 + rt_duration = self.stats["rt_limit_max"] - self.stats["rt_limit_min"] logger.info( f"{'RT (min)':<20}: {self.stats['rt_limit_min']/60:.1f} - {self.stats['rt_limit_max']/60:.1f}" ) - logger.info(f"{'RT duration (sec)':<20}: {self.stats['rt_duration_sec']:.1f}") - logger.info(f"{'RT duration (min)':<20}: {rt_duration_min:.1f}") + logger.info(f"{'RT duration (sec)':<20}: {rt_duration:.1f}") + logger.info(f"{'RT duration (min)':<20}: {rt_duration/60:.1f}") logger.info(f"{'Cycle len (scans)':<20}: {self.stats['cycle_length']:.0f}") logger.info(f"{'Cycle len (sec)':<20}: {self.stats['cycle_duration']:.2f}") diff --git a/docs/methods.md b/docs/methods.md index 83e3ad8b..5eb82707 100644 --- a/docs/methods.md +++ b/docs/methods.md @@ -6,5 +6,6 @@ methods/command-line methods/configuration methods/calibration methods/transfer-learning +methods/output-format ``` diff --git a/docs/methods/output-format.md 
b/docs/methods/output-format.md new file mode 100644 index 00000000..67954550 --- /dev/null +++ b/docs/methods/output-format.md @@ -0,0 +1,78 @@ +# Output format +AlphaDIA writes tab-separated values (TSV) files. + +## `stats.tsv` +The `stats.tsv` file contains summary statistics and quality metrics for each run and channel in the analysis. +It provides insights into the search results, calibration quality, and general performance metrics. + +Format: one row per run/channel combination. + +### Columns + +#### Basic Information +- `run`: Name of the raw file/run +- `channel`: Channel number (0 for label-free data, or channel numbers for multiplexed data) + +#### Search Results Statistics +- `precursors`: Number of identified precursors in this run/channel +- `proteins`: Number of unique protein groups identified in this run/channel + +#### Peak Width Statistics +These columns are only present if the data contains the relevant measurements: +- `fwhm_rt`: Mean full width at half maximum (FWHM) of peaks in retention time dimension +- `fwhm_mobility`: Mean FWHM of peaks in mobility dimension (only for ion mobility data) + +#### Optimization Statistics +These metrics show the final values used for various search parameters after optimization: + +- `optimization.ms2_error`: Final MS2 mass error tolerance used +- `optimization.ms1_error`: Final MS1 mass error tolerance used +- `optimization.rt_error`: Final retention time tolerance used +- `optimization.mobility_error`: Final ion mobility tolerance used (only for ion mobility data) + +#### Calibration Statistics +These metrics show the quality of mass calibration: + +##### MS2 Level +- `calibration.ms2_median_accuracy`: Median mass accuracy for fragment ions +- `calibration.ms2_median_precision`: Median mass precision for fragment ions + +##### MS1 Level +- `calibration.ms1_median_accuracy`: Median mass accuracy for precursor ions +- `calibration.ms1_median_precision`: Median mass precision for precursor ions + +### Notes + +- All FWHM values are reported in the native units of the measurement (minutes for RT, mobility units for IM) +- Mass accuracy values are typically reported in parts per million (ppm) +- Some columns may be NaN if the corresponding measurements or calibrations were not performed +- For label-free data, there will typically be one row per run with channel=0 +- For multiplexed data, there will be multiple rows per run (one for each channel) + +#### Raw File Statistics +These metrics are derived from the raw data file analysis: + +- `raw.gradient_min_m`: Minimum retention time in the run (minutes) +- `raw.gradient_max_m`: Maximum retention time in the run (minutes) +- `raw.gradient_length_m`: Total duration of the run (minutes) +- `raw.cycle_length`: Number of scans per cycle +- `raw.cycle_duration`: Average duration of each cycle in seconds +- `raw.cycle_number`: Total number of cycles in the run +- `raw.msms_range_min`: Minimum MS2 m/z value measured +- `raw.msms_range_max`: Maximum MS2 m/z value measured + + +## `precursors.tsv` +TODO + +## `pg.matrix.tsv` +TODO + +## `internal.tsv` +TODO + +## `speclib.mbr.hdf ` +TODO + +## `quant folder` +TODO diff --git a/tests/unit_tests/test_outputtransform.py b/tests/unit_tests/test_outputtransform.py index 4b68447f..63e51088 100644 --- a/tests/unit_tests/test_outputtransform.py +++ b/tests/unit_tests/test_outputtransform.py @@ -79,13 +79,13 @@ def test_output_transform(): config, path=os.path.join( raw_folder, - peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH, + 
peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME, ), ) timing_manager = manager.TimingManager( path=os.path.join( - raw_folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH + raw_folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME ) ) @@ -141,11 +141,9 @@ def test_output_transform(): assert all( [ col in stat_df.columns - for col in [ - "run", - "precursors", - "proteins", - ] + for col in ['run', 'channel', 'precursors', 'proteins', 'fwhm_rt', 'fwhm_mobility', 'optimization.ms2_error', + 'optimization.ms1_error', 'optimization.rt_error', 'optimization.mobility_error', 'calibration.ms2_median_accuracy', 'calibration.ms2_median_precision', 'calibration.ms1_median_accuracy', 'calibration.ms1_median_precision', 'raw.gradient_min_m', 'raw.gradient_max_m', 'raw.gradient_length_m', 'raw.cycle_length', 'raw.cycle_duration', 'raw.cycle_number', 'raw.msms_range_min', 'raw.msms_range_max'] + ] ) diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 20c4cc0a..2b53cf14 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -434,16 +434,16 @@ def create_workflow_instance(): ) workflow._calibration_manager = manager.CalibrationManager( workflow.config["calibration_manager"], - path=os.path.join(workflow.path, workflow.CALIBRATION_MANAGER_PATH), + path=os.path.join(workflow.path, workflow.CALIBRATION_MANAGER_PKL_NAME), load_from_file=workflow.config["general"]["reuse_calibration"], reporter=workflow.reporter, ) workflow._optimization_manager = manager.OptimizationManager( TEST_OPTIMIZATION_CONFIG, - path=os.path.join(workflow.path, workflow.OPTIMIZATION_MANAGER_PATH), + path=os.path.join(workflow.path, workflow.OPTIMIZATION_MANAGER_PKL_NAME), load_from_file=workflow.config["general"]["reuse_calibration"], - figure_path=os.path.join(workflow.path, workflow.FIGURE_PATH), + figure_path=os.path.join(workflow.path, workflow.FIGURES_FOLDER_NAME), reporter=workflow.reporter, )