From 187cb207db34e7121a78ecf4cc8fa0634e63ede5 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Thu, 5 Dec 2024 14:52:59 -0800 Subject: [PATCH 01/16] Rework lipid wdl to expect list of files --- Makefile | 9 +++++---- wdl/metaMS_lcmslipidomics.wdl | 32 ++++++++++++++++++++++++++++++++ wdl/metams_input_lipidomics.json | 10 +++++++++- 3 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 wdl/metaMS_lcmslipidomics.wdl diff --git a/Makefile b/Makefile index c0cc5d5..df7cfcc 100644 --- a/Makefile +++ b/Makefile @@ -63,9 +63,10 @@ docker-run: @echo $(config_dir) @docker run -v $(data_dir):/metams/data -v $(config_dir):/metams/configuration microbiomedata/metams:latest metaMS run-gcms-workflow /metams/configuration/metams.toml -wdl-run : +wdl-run-gcms : - miniwdl run wdl/metaMS.wdl -i wdl/metams_input.json --verbose --no-cache --copy-input-files + miniwdl run wdl/metaMS_gcms.wdl -i wdl/metams_input_gcms.json --verbose --no-cache --copy-input-files - - \ No newline at end of file +wdl-run-lipid : + + miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files \ No newline at end of file diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl new file mode 100644 index 0000000..fcaf979 --- /dev/null +++ b/wdl/metaMS_lcmslipidomics.wdl @@ -0,0 +1,32 @@ +version 1.0 + +workflow lcmsLipidomics { + call runMetaMSLCMSLipidomics + + output { + Array[File] output_files = runMetaMSLCMSLipidomics.output_files + } +} + +task runMetaMSLCMSLipidomics { + input { + Array[File] file_paths + String output_directory + File corems_toml_path + File metabref_token_path + File scan_translator_path + Int cores + } + + command { + + } + + output { + Array[File] output_files = glob('${output_directory}/**/*.*') + } + + runtime { + docker: "microbiomedata/metams:2.2.2" + } +} \ No newline at end of file diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json index 056f702..2243f1c 100644 --- a/wdl/metams_input_lipidomics.json +++ b/wdl/metams_input_lipidomics.json @@ -1,3 +1,11 @@ { - "lcmsLipidomics.runLipidomicsMetaMS.config_file": "./configuration/lipidomics_metams.toml" + "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [ + "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", + "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" + ], + "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output", + "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.metabref_token_path": "./configuration/gcms_corems.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1 } \ No newline at end of file From fdd817d123bf9011a870fa310c1f57fb523298ed Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Thu, 5 Dec 2024 15:17:21 -0800 Subject: [PATCH 02/16] Reconfigured CLI and WDL for lipid workflow --- metaMS/cli.py | 54 ++++++++++++++++++++++++----------- wdl/metaMS_lcmslipidomics.wdl | 10 +++++-- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/metaMS/cli.py b/metaMS/cli.py index a6f719e..8321766 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -148,10 +148,10 @@ def dump_lipidomics_corems_toml_template(toml_file_name): ) @click.option( "-i", - "--directory", + "--file_paths", required=False, type=str, - help="The directory where the data is stored, all files in the directory will be processed", + help="The path to the directory with the input files", ) @click.option( "-o", @@ -160,6 +160,13 @@ def dump_lipidomics_corems_toml_template(toml_file_name): type=str, help="The directory where the output files will be stored", ) +@click.option( + "-c", + "--corems_params", + required=False, + type=str, + help="The path corems parameters toml file", +) @click.option( "-t", "--token_path", required=False, type=str, help="The path to the metabref token" ) @@ -169,36 +176,51 @@ def dump_lipidomics_corems_toml_template(toml_file_name): @click.option( "-j", "--cores", required=False, type=int, help="'cpu's to use for processing" ) -def run_lipidomics_workflow(paramaters_file, directory, output_directory, token_path, scan_translator_path, cores): +def run_lipidomics_workflow( + paramaters_file, + file_paths, + output_directory, + corems_params, + token_path, + scan_translator_path, + cores + ): """Run the lipidomics workflow Parameters ---------- - paramaters_file : str - The path to the toml file with the lipidomics workflow parameters - This file can be generated using the dump-lipidomics-toml-template command + #TODO KRH: Add this in """ if paramaters_file is not None: - if cores is not None or directory is not None: + if cores is not None or file_paths is not None: + #TODO KRH: Add all other parameters above click.echo("Using parameters file, ignoring other parameters") - run_lcms_lipidomics_workflow( - lipidomics_workflow_paramaters_file=paramaters_file - ) + #run_lcms_lipidomics_workflow( + # lipidomics_workflow_paramaters_file=paramaters_file + #) else: if cores is None: cores = 1 - if directory is None: - click.echo("No directory provided, no data to process") + if file_paths is None: + click.echo("No file paths provided, no data to process") return + if corems_params is None: + click.echo("No corems parameters provided") + if scan_translator is None: + click.echo("No scan translator provided") if output_directory is None: click.echo( "Must provide an output directory if not using a parameters file" ) return - run_lcms_lipidomics_workflow( - directory=directory, output_directory=output_directory, cores=cores - ) - click.echo("Running lipidomics workflow") + if token_path is None: + click.echo("No metabref token provided") + return + #run_lcms_lipidomics_workflow( + # directory=directory, output_directory=output_directory, cores=cores + #) + click.echo("Ready to run lipidomics workflow") + # test call: # MetaMS run-lipidomics-workflow -p configuration/lipidomics_metams.toml # miniwdl run wdl/metaMS_lipidomics.wdl -i wdl/metams_input_lipidomics.json diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl index fcaf979..de2214e 100644 --- a/wdl/metaMS_lcmslipidomics.wdl +++ b/wdl/metaMS_lcmslipidomics.wdl @@ -19,7 +19,13 @@ task runMetaMSLCMSLipidomics { } command { - + metaMS run-lipidomics-workflow \ + -i ${sep=',' file_paths} \ + -o ${output_directory} \ + -c ${corems_toml_path} \ + -t ${metabref_token_path} \ + -s ${scan_translator_path} \ + -j ${cores} } output { @@ -27,6 +33,6 @@ task runMetaMSLCMSLipidomics { } runtime { - docker: "microbiomedata/metams:2.2.2" + docker: "local-metams:latest" } } \ No newline at end of file From ca01692a65822c49d13cd4e80990d23c92b3d0ae Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Thu, 5 Dec 2024 16:56:11 -0800 Subject: [PATCH 03/16] Add check function to workflow --- Makefile | 3 ++ metaMS/cli.py | 13 ++++--- metaMS/lcms_lipidomics_workflow.py | 55 ++++++++++++++++++++++++------ wdl/metaMS_lcmslipidomics.wdl | 4 ++- wdl/metaMS_lipidomics.wdl | 20 ----------- wdl/metams_input_lipidomics.json | 2 +- 6 files changed, 61 insertions(+), 36 deletions(-) delete mode 100644 wdl/metaMS_lipidomics.wdl diff --git a/Makefile b/Makefile index df7cfcc..4f5e855 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,9 @@ docker-build: docker build -t microbiomedata/metams:latest . +docker-build-local: + docker build -t local-metams:latest . + docker-run: @echo $(data_dir) @echo $(config_dir) diff --git a/metaMS/cli.py b/metaMS/cli.py index 8321766..d99c7ad 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -206,7 +206,7 @@ def run_lipidomics_workflow( return if corems_params is None: click.echo("No corems parameters provided") - if scan_translator is None: + if scan_translator_path is None: click.echo("No scan translator provided") if output_directory is None: click.echo( @@ -216,9 +216,14 @@ def run_lipidomics_workflow( if token_path is None: click.echo("No metabref token provided") return - #run_lcms_lipidomics_workflow( - # directory=directory, output_directory=output_directory, cores=cores - #) + run_lcms_lipidomics_workflow( + file_paths=file_paths, + output_directory=output_directory, + corems_toml_path=corems_params, + metabref_token_path=token_path, + scan_translator_path=scan_translator_path, + cores=cores, + ) click.echo("Ready to run lipidomics workflow") # test call: diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index c72052f..b905df9 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -3,6 +3,8 @@ from pathlib import Path import datetime from multiprocessing import Pool +from typing import List +import click from corems.mass_spectra.input.mzml import MZMLSpectraParser from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader @@ -29,14 +31,41 @@ class LipidomicsWorkflowParameters: The number of cores to use for processing, optional """ - - directory: str = "data/..." - output_directory: str = "output/..." - corems_toml_path: str = "configuration/lipidomics_corems.toml" + file_paths: tuple = ('data/...', 'data/...') + output_directory: str = "output" + corems_toml_path: str = None metabref_token_path: str = None scan_translator_path: str = None cores: int = 1 +def check_lipidomics_workflow_params(lipid_workflow_params): + # Check that corems_toml_path exists + if not Path(lipid_workflow_params.corems_toml_path).exists(): + click.echo("Corems toml file not found, exiting workflow") + return + + # Check that metabref_token_path exists + if not Path(lipid_workflow_params.metabref_token_path).exists(): + click.echo("Metabref token file not found, exiting workflow") + return + + # Check that scan_translator_path exists + if not Path(lipid_workflow_params.scan_translator_path).exists(): + click.echo("Scan translator file not found, exiting workflow") + return + + # Check that output_directory exists + if not Path(lipid_workflow_params.output_directory).exists(): + click.echo("Output directory not found, exiting workflow") + return + + # Check that file_paths exist + for file_path in lipid_workflow_params.file_paths: + if not Path(file_path).exists(): + click.echo(f"File path {file_path} not found, exiting workflow") + return + + def instantiate_lcms_obj(file_in): """Instantiate a corems LCMS object from a binary file. Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory) @@ -75,7 +104,7 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): def run_lcms_lipidomics_workflow( lipidomics_workflow_paramaters_file=None, - directory=None, + file_paths=None, output_directory=None, corems_toml_path=None, metabref_token_path=None, @@ -88,7 +117,7 @@ def run_lcms_lipidomics_workflow( lipid_workflow_params = LipidomicsWorkflowParameters(**toml.load(infile)) else: lipid_workflow_params = LipidomicsWorkflowParameters( - directory=directory, + file_paths= file_paths.split(","), output_directory=output_directory, metabref_token_path=metabref_token_path, scan_translator_path=scan_translator_path, @@ -100,14 +129,20 @@ def run_lcms_lipidomics_workflow( out_dir = Path(lipid_workflow_params.output_directory) out_dir.mkdir(parents=True, exist_ok=True) - file_dir = Path(lipid_workflow_params.directory) - files_list = list(file_dir.glob("*.raw")) + # Check that all parameters are valid and exist + check_lipidomics_workflow_params(lipid_workflow_params) + + # Organize input and output paths + file_paths = [Path(file_path) for file_path in lipid_workflow_params.file_paths] + files_list = list(file_paths) out_paths_list = [out_dir / f.stem for f in files_list] - + # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results cores = lipid_workflow_params.cores params_toml = lipid_workflow_params.corems_toml_path scan_translator = lipid_workflow_params.scan_translator_path + + """ if cores == 1 or len(files_list) == 1: mz_dicts = [] for file_in, file_out in list(zip(files_list, out_paths_list)): @@ -133,5 +168,5 @@ def run_lcms_lipidomics_workflow( pool.close() pool.join() print("Finished processing all files") - + """ # TODO KRH: Add full lipidomics workflow here diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl index de2214e..802fe2d 100644 --- a/wdl/metaMS_lcmslipidomics.wdl +++ b/wdl/metaMS_lcmslipidomics.wdl @@ -4,6 +4,7 @@ workflow lcmsLipidomics { call runMetaMSLCMSLipidomics output { + String out = runMetaMSLCMSLipidomics.out Array[File] output_files = runMetaMSLCMSLipidomics.output_files } } @@ -29,7 +30,8 @@ task runMetaMSLCMSLipidomics { } output { - Array[File] output_files = glob('${output_directory}/**/*.*') + String out = read_string(stdout()) + Array[File] output_files = glob('${output_directory}/*') } runtime { diff --git a/wdl/metaMS_lipidomics.wdl b/wdl/metaMS_lipidomics.wdl deleted file mode 100644 index dea4d11..0000000 --- a/wdl/metaMS_lipidomics.wdl +++ /dev/null @@ -1,20 +0,0 @@ -version 1.0 - -workflow lcmsLipidomics { - call runLipidomicsMetaMS -} - -task runLipidomicsMetaMS { - input { - File config_file - } - - command { - metaMS run-lipidomics-workflow -p ${config_file} - } - - runtime { - docker: "local-metams:latest" - #TODO KRH: Change to dockerhub version after we've pushed the updated image - } -} \ No newline at end of file diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json index 2243f1c..b7fedfb 100644 --- a/wdl/metams_input_lipidomics.json +++ b/wdl/metams_input_lipidomics.json @@ -1,7 +1,7 @@ { "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [ "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", - "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" + "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml" ], "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output", "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml", From 34a120b3ee04eb9741e950daee910f064d6614f9 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Thu, 5 Dec 2024 17:32:00 -0800 Subject: [PATCH 04/16] Add error handling to lipid workflow --- metaMS/cli.py | 5 ----- metaMS/lcms_lipidomics_workflow.py | 24 ++++++++++++++---------- wdl/metams_input_lipidomics.json | 3 +-- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/metaMS/cli.py b/metaMS/cli.py index d99c7ad..32b2b6c 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -224,8 +224,3 @@ def run_lipidomics_workflow( scan_translator_path=scan_translator_path, cores=cores, ) - click.echo("Ready to run lipidomics workflow") - - # test call: - # MetaMS run-lipidomics-workflow -p configuration/lipidomics_metams.toml - # miniwdl run wdl/metaMS_lipidomics.wdl -i wdl/metams_input_lipidomics.json diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index b905df9..59b1c4e 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -41,29 +41,31 @@ class LipidomicsWorkflowParameters: def check_lipidomics_workflow_params(lipid_workflow_params): # Check that corems_toml_path exists if not Path(lipid_workflow_params.corems_toml_path).exists(): - click.echo("Corems toml file not found, exiting workflow") - return + raise FileNotFoundError("Corems toml file not found, exiting workflow") # Check that metabref_token_path exists if not Path(lipid_workflow_params.metabref_token_path).exists(): - click.echo("Metabref token file not found, exiting workflow") - return + raise FileNotFoundError("Metabref token file not found, exiting workflow") # Check that scan_translator_path exists if not Path(lipid_workflow_params.scan_translator_path).exists(): - click.echo("Scan translator file not found, exiting workflow") - return + raise FileNotFoundError("Scan translator file not found, exiting workflow") # Check that output_directory exists if not Path(lipid_workflow_params.output_directory).exists(): - click.echo("Output directory not found, exiting workflow") - return + raise FileNotFoundError("Output directory not found, exiting workflow") # Check that file_paths exist for file_path in lipid_workflow_params.file_paths: if not Path(file_path).exists(): - click.echo(f"File path {file_path} not found, exiting workflow") - return + raise FileNotFoundError(f"File path {file_path} not found, exiting workflow") + + # Check that all file_paths end in .raw or .mzML + for file_path in lipid_workflow_params.file_paths: + if ".raw" not in file_path and ".mzML" not in file_path: + raise ValueError(f"File path {file_path} is not a .raw or .mzML file, exiting workflow") + + #TODO KRH: Add a check that we can access the metabref API with the token def instantiate_lcms_obj(file_in): @@ -141,6 +143,8 @@ def run_lcms_lipidomics_workflow( cores = lipid_workflow_params.cores params_toml = lipid_workflow_params.corems_toml_path scan_translator = lipid_workflow_params.scan_translator_path + + click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " files, using " + str(cores) + " core(s)") """ if cores == 1 or len(files_list) == 1: diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json index b7fedfb..4cc08d4 100644 --- a/wdl/metams_input_lipidomics.json +++ b/wdl/metams_input_lipidomics.json @@ -1,7 +1,6 @@ { "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [ - "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", - "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml" + "./data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw" ], "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output", "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml", From eb7da1959edc54288a9d144bdf7c390143f3bf69 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Fri, 6 Dec 2024 09:09:23 -0800 Subject: [PATCH 05/16] WIP lipid wdl working through parameter setting --- .gitignore | 5 +- Makefile | 4 +- .../emsl_lipidomics_corems_params.toml | 333 ++++++++++++++++-- metaMS/lcms_lipidomics_workflow.py | 44 ++- 4 files changed, 346 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 13134e4..c742356 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,7 @@ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg -*.egg \ No newline at end of file +*.egg + +# Raw thermo data +*.raw \ No newline at end of file diff --git a/Makefile b/Makefile index 4f5e855..1c909c0 100644 --- a/Makefile +++ b/Makefile @@ -71,5 +71,7 @@ wdl-run-gcms : miniwdl run wdl/metaMS_gcms.wdl -i wdl/metams_input_gcms.json --verbose --no-cache --copy-input-files wdl-run-lipid : - +#TODO KRH: remove the docker-build-local when the docker image is available in dockerhub and +# update the docker image in the wdl file + make docker-build-local miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files \ No newline at end of file diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml index 222d6d0..4d43664 100644 --- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml +++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml @@ -17,26 +17,67 @@ peak_picking_method = "persistent homology" implemented_peak_picking_methods = [ "persistent homology",] mass_feature_cluster_mz_tolerance_rel = 5e-6 mass_feature_cluster_rt_tolerance = 0.3 -ms1_scans_to_average = 1 +ms1_scans_to_average = 5 ms1_deconvolution_corr_min = 0.95 ms2_dda_rt_tolerance = 0.15 ms2_dda_mz_tolerance = 0.05 ms2_min_fe_score = 0.3 search_as_lipids = true include_fragment_types = true -ph_inten_min_rel = 0.05 -ph_persis_min_rel = 0.005 -ph_smooth_it = 0 -export_profile_spectra = false +export_profile_spectra = true export_eics = true export_unprocessed_ms1 = false +verbose_processing = false +ph_inten_min_rel = 0.05 #TODO KRH: Change this back to reasonable setting after dev (0.005 or 0.0005?) +ph_persis_min_rel = 0.05 +ph_smooth_it = 0 -[MassSpectrum] +[mass_spectrum.ms1.molecular_search] +use_isotopologue_filter = false +isotopologue_filter_threshold = 33.0 +isotopologue_filter_atoms = [ "Cl", "Br",] +use_runtime_kendrick_filter = false +use_min_peaks_filter = true +min_peaks_per_class = 15 +url_database = "" +db_jobs = 3 +db_chunk_size = 300 +ion_charge = -1 +min_hc_filter = 0.3 +max_hc_filter = 3.0 +min_oc_filter = 0.0 +max_oc_filter = 1.2 +min_op_filter = 2.0 +use_pah_line_rule = false +min_dbe = 0.0 +max_dbe = 40.0 +mz_error_score_weight = 0.6 +isotopologue_score_weight = 0.4 +adduct_atoms_neg = [ "Cl", "Br",] +adduct_atoms_pos = [ "Na", "K",] +score_methods = [ "S_P_lowest_error", "N_S_P_lowest_error", "lowest_error", "prob_score", "air_filter_error", "water_filter_error", "earth_filter_error",] +score_method = "prob_score" +output_min_score = 0.1 +output_score_method = "All Candidates" +isRadical = false +isProtonated = true +isAdduct = false +ion_types_excluded = [] +ionization_type = "ESI" +min_ppm_error = -10.0 +max_ppm_error = 10.0 +min_abun_error = -100.0 +max_abun_error = 100.0 +mz_error_range = 1.5 +error_method = "None" +mz_error_average = 0.0 + +[mass_spectrum.ms1.mass_spectrum] noise_threshold_method = "relative_abundance" noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] noise_threshold_min_std = 6 noise_threshold_min_s2n = 4.0 -noise_threshold_min_relative_abundance = 1 +noise_threshold_min_relative_abundance = 0.1 noise_threshold_absolute_abundance = 1000000.0 noise_threshold_log_nsigma = 6 noise_threshold_log_nsigma_corr_factor = 0.463 @@ -51,9 +92,14 @@ calib_pol_order = 2 max_calib_ppm_error = 1.0 min_calib_ppm_error = -1.0 calib_sn_threshold = 2.0 +calibration_ref_match_method = "legacy" +calibration_ref_match_method_implemented = [ "legacy", "merged",] +calibration_ref_match_tolerance = 0.003 +calibration_ref_match_std_raw_error_limit = 1.5 do_calibration = true +verbose_processing = true -[MassSpecPeak] +[mass_spectrum.ms1.ms_peak] kendrick_rounding_method = "floor" implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",] peak_derivative_threshold = 0.0 @@ -62,26 +108,27 @@ min_peak_datapoints = 5.0 peak_max_prominence_percent = 0.1 peak_height_max_percent = 10.0 legacy_resolving_power = false +legacy_centroid_polyfit = false -[MS1MolecularSearch] +[mass_spectrum.ms2.molecular_search] use_isotopologue_filter = false isotopologue_filter_threshold = 33.0 isotopologue_filter_atoms = [ "Cl", "Br",] use_runtime_kendrick_filter = false use_min_peaks_filter = true min_peaks_per_class = 15 -url_database = "" +url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@molformdb:5432/coremsapp" db_jobs = 3 db_chunk_size = 300 ion_charge = -1 min_hc_filter = 0.3 max_hc_filter = 3.0 min_oc_filter = 0.0 -max_oc_filter = 5 +max_oc_filter = 1.2 min_op_filter = 2.0 use_pah_line_rule = false -min_dbe = 0 -max_dbe = 50 +min_dbe = 0.0 +max_dbe = 40.0 mz_error_score_weight = 0.6 isotopologue_score_weight = 0.4 adduct_atoms_neg = [ "Cl", "Br",] @@ -93,17 +140,63 @@ output_score_method = "All Candidates" isRadical = false isProtonated = true isAdduct = false -ion_types_excluded = [] +ion_types_excluded = [ "[M+HCOO]-",] ionization_type = "ESI" -min_ppm_error = -5 -max_ppm_error = 5 +min_ppm_error = -10.0 +max_ppm_error = 10.0 min_abun_error = -100.0 max_abun_error = 100.0 mz_error_range = 1.5 error_method = "None" mz_error_average = 0.0 -[MS2MolecularSearch] +[mass_spectrum.ms2.transient] +implemented_apodization_function = [ "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",] +apodization_method = "Hanning" +number_of_truncations = 0 +number_of_zero_fills = 1 +next_power_of_two = false +kaiser_beta = 8.6 + +[mass_spectrum.ms2.mass_spectrum] +noise_threshold_method = "relative_abundance" +noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] +noise_threshold_min_std = 6 +noise_threshold_min_s2n = 4.0 +noise_threshold_min_relative_abundance = 0.1 +noise_threshold_absolute_abundance = 1000000.0 +noise_threshold_log_nsigma = 6 +noise_threshold_log_nsigma_corr_factor = 0.463 +noise_threshold_log_nsigma_bins = 500 +noise_min_mz = 0.0 +noise_max_mz = inf +min_picking_mz = 0.0 +max_picking_mz = inf +picking_point_extrapolate = 3 +calib_minimize_method = "Powell" +calib_pol_order = 2 +max_calib_ppm_error = 1.0 +min_calib_ppm_error = -1.0 +calib_sn_threshold = 2.0 +calibration_ref_match_method = "legacy" +calibration_ref_match_method_implemented = [ "legacy", "merged",] +calibration_ref_match_tolerance = 0.003 +calibration_ref_match_std_raw_error_limit = 1.5 +do_calibration = true +verbose_processing = true + +[mass_spectrum.ms2.ms_peak] +kendrick_rounding_method = "floor" +implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",] +peak_derivative_threshold = 0.0 +peak_min_prominence_percent = 0.1 +min_peak_datapoints = 5.0 +peak_max_prominence_percent = 0.1 +peak_height_max_percent = 10.0 +legacy_resolving_power = false +legacy_centroid_polyfit = false + +[mass_spectrum.ms2_cid.molecular_search] use_isotopologue_filter = false isotopologue_filter_threshold = 33.0 isotopologue_filter_atoms = [ "Cl", "Br",] @@ -136,18 +229,60 @@ isAdduct = false ion_types_excluded = [ "[M+HCOO]-",] ionization_type = "ESI" min_ppm_error = -10.0 -max_ppm_error = 10.0 +max_ppm_error = 200 min_abun_error = -100.0 max_abun_error = 100.0 mz_error_range = 1.5 error_method = "None" mz_error_average = 0.0 -[MassSpecPeak.kendrick_base] -C = 1 -H = 2 +[mass_spectrum.ms2_cid.transient] +implemented_apodization_function = [ "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",] +apodization_method = "Hanning" +number_of_truncations = 0 +number_of_zero_fills = 1 +next_power_of_two = false +kaiser_beta = 8.6 + +[mass_spectrum.ms2_cid.mass_spectrum] +noise_threshold_method = "relative_abundance" +noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] +noise_threshold_min_std = 6 +noise_threshold_min_s2n = 4.0 +noise_threshold_min_relative_abundance = 0.01 +noise_threshold_absolute_abundance = 1000000.0 +noise_threshold_log_nsigma = 6 +noise_threshold_log_nsigma_corr_factor = 0.463 +noise_threshold_log_nsigma_bins = 500 +noise_min_mz = 0.0 +noise_max_mz = inf +min_picking_mz = 0.0 +max_picking_mz = inf +picking_point_extrapolate = 3 +calib_minimize_method = "Powell" +calib_pol_order = 2 +max_calib_ppm_error = 1.0 +min_calib_ppm_error = -1.0 +calib_sn_threshold = 2.0 +calibration_ref_match_method = "legacy" +calibration_ref_match_method_implemented = [ "legacy", "merged",] +calibration_ref_match_tolerance = 0.003 +calibration_ref_match_std_raw_error_limit = 1.5 +do_calibration = true +verbose_processing = true + +[mass_spectrum.ms2_cid.ms_peak] +kendrick_rounding_method = "floor" +implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",] +peak_derivative_threshold = 0.0 +peak_min_prominence_percent = 0.1 +min_peak_datapoints = 5.0 +peak_max_prominence_percent = 0.1 +peak_height_max_percent = 10.0 +legacy_resolving_power = false +legacy_centroid_polyfit = false -[MS1MolecularSearch.usedAtoms] +[mass_spectrum.ms1.molecular_search.usedAtoms] C = [ 10, 100,] H = [ 18, 200,] N = [ 0, 3,] @@ -157,7 +292,7 @@ S = [ 0, 1,] Na = [ 0, 1,] Cl = [ 0, 1,] -[MS1MolecularSearch.used_atom_valences] +[mass_spectrum.ms1.molecular_search.used_atom_valences] C = 4 13C = 4 N = 3 @@ -229,11 +364,35 @@ Bi = 3 Po = 2 Ac = 3 -[MS2MolecularSearch.usedAtoms] -C = [ 1, 100,] -H = [ 1, 200,] +[mass_spectrum.ms1.ms_peak.kendrick_base] +C = 1 +H = 2 + +[mass_spectrum.ms1.data_input.header_translate] +"m/z" = "m/z" +mOz = "m/z" +Mass = "m/z" +"Resolving Power" = "Resolving Power" +"Res." = "Resolving Power" +resolution = "Resolving Power" +Intensity = "Peak Height" +"Peak Height" = "Peak Height" +I = "Peak Height" +Abundance = "Peak Height" +abs_abu = "Peak Height" +"Signal/Noise" = "S/N" +"S/N" = "S/N" +sn = "S/N" + +[mass_spectrum.ms2.molecular_search.usedAtoms] +C = [ 10, 30,] +H = [ 18, 200,] +O = [ 1, 23,] +N = [ 0, 3,] +P = [ 0, 1,] +S = [ 0, 1,] -[MS2MolecularSearch.used_atom_valences] +[mass_spectrum.ms2.molecular_search.used_atom_valences] C = 4 13C = 4 N = 3 @@ -304,3 +463,123 @@ Pb = 4 Bi = 3 Po = 2 Ac = 3 + +[mass_spectrum.ms2.ms_peak.kendrick_base] +C = 1 +H = 2 + +[mass_spectrum.ms2.data_input.header_translate] +"m/z" = "m/z" +mOz = "m/z" +Mass = "m/z" +"Resolving Power" = "Resolving Power" +"Res." = "Resolving Power" +resolution = "Resolving Power" +Intensity = "Peak Height" +"Peak Height" = "Peak Height" +I = "Peak Height" +Abundance = "Peak Height" +abs_abu = "Peak Height" +"Signal/Noise" = "S/N" +"S/N" = "S/N" +sn = "S/N" + +[mass_spectrum.ms2_cid.molecular_search.usedAtoms] +C = [ 10, 30,] +H = [ 18, 200,] +O = [ 1, 23,] +N = [ 0, 3,] +P = [ 0, 1,] +S = [ 0, 1,] + +[mass_spectrum.ms2_cid.molecular_search.used_atom_valences] +C = 4 +13C = 4 +N = 3 +O = 2 +S = 2 +H = 1 +F = 1 +Cl = 1 +Br = 1 +I = 1 +At = 1 +Li = 1 +Na = 1 +K = 1 +Rb = 1 +Cs = 1 +Fr = 1 +B = 4 +In = 3 +Al = 3 +P = 3 +Ga = 3 +Mg = 2 +Be = 2 +Ca = 2 +Sr = 2 +Ba = 2 +Ra = 2 +V = 5 +Fe = 3 +Si = 4 +Sc = 3 +Ti = 4 +Cr = 1 +Mn = 1 +Co = 1 +Ni = 1 +Cu = 2 +Zn = 2 +Ge = 4 +As = 5 +Se = 6 +Y = 3 +Zr = 4 +Nb = 5 +Mo = 6 +Tc = 7 +Ru = 8 +Rh = 6 +Pd = 4 +Ag = 0 +Cd = 2 +Sn = 4 +Sb = 5 +Te = 6 +La = 3 +Hf = 4 +Ta = 5 +W = 6 +Re = 4 +Os = 4 +Ir = 4 +Pt = 4 +Au = 3 +Hg = 1 +Tl = 3 +Pb = 4 +Bi = 3 +Po = 2 +Ac = 3 + +[mass_spectrum.ms2_cid.ms_peak.kendrick_base] +C = 1 +H = 2 + +[mass_spectrum.ms2_cid.data_input.header_translate] +"m/z" = "m/z" +mOz = "m/z" +Mass = "m/z" +"Resolving Power" = "Resolving Power" +"Res." = "Resolving Power" +resolution = "Resolving Power" +Intensity = "Peak Height" +"Peak Height" = "Peak Height" +I = "Peak Height" +Abundance = "Peak Height" +abs_abu = "Peak Height" +"Signal/Noise" = "S/N" +"S/N" = "S/N" +sn = "S/N" diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index 59b1c4e..8eb935f 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -8,6 +8,9 @@ from corems.mass_spectra.input.mzml import MZMLSpectraParser from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader +from corems.encapsulation.input.parameter_from_json import ( + load_and_set_toml_parameters_lcms, +) @dataclass class LipidomicsWorkflowParameters: @@ -67,7 +70,6 @@ def check_lipidomics_workflow_params(lipid_workflow_params): #TODO KRH: Add a check that we can access the metabref API with the token - def instantiate_lcms_obj(file_in): """Instantiate a corems LCMS object from a binary file. Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory) @@ -85,23 +87,45 @@ def instantiate_lcms_obj(file_in): """ # Instantiate parser based on binary file type if ".raw" in str(file_in): - #TODO KRH: Add real functionality here - pass - #parser = ImportMassSpectraThermoMSFileReader(file_in) + parser = ImportMassSpectraThermoMSFileReader(file_in) if ".mzML" in str(file_in): - #parser = MZMLSpectraParser(file_in) - pass + parser = MZMLSpectraParser(file_in) # Instantiate lc-ms data object using parser and pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory) - #myLCMSobj = parser.get_lcms_obj(spectra="ms1") - myLCMSobj = None + myLCMSobj = parser.get_lcms_obj(spectra="ms1") return myLCMSobj +def set_params_on_lcms_obj(myLCMSobj, params_toml): + """Set parameters on the LCMS object + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to set parameters on + params_toml : str or Path + Path to toml file with parameters + + Returns + ------- + None, sets parameters on the LCMS object + """ + # Load parameters from toml file + load_and_set_toml_parameters_lcms(myLCMSobj, params_toml) + + # If myLCMSobj is a positive mode, remove Cl from atoms used in molecular search + # This cuts down on the number of molecular formulas searched hugely + if myLCMSobj.polarity == "positive": + myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Cl") + elif myLCMSobj.polarity == "negative": + myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Na") + def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): time_start = datetime.datetime.now() - myLCMSobj = instantiate_lcms_obj(file_in) + myLCMSobj = instantiate_lcms_obj(file_in) + set_params_on_lcms_obj(myLCMSobj, params_toml) + # TODO KRH: Add signal processing and ms1 molecular search here def run_lcms_lipidomics_workflow( @@ -146,7 +170,6 @@ def run_lcms_lipidomics_workflow( click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " files, using " + str(cores) + " core(s)") - """ if cores == 1 or len(files_list) == 1: mz_dicts = [] for file_in, file_out in list(zip(files_list, out_paths_list)): @@ -172,5 +195,4 @@ def run_lcms_lipidomics_workflow( pool.close() pool.join() print("Finished processing all files") - """ # TODO KRH: Add full lipidomics workflow here From e6af514d56a204732058d9d6471514528694626b Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Fri, 6 Dec 2024 09:20:02 -0800 Subject: [PATCH 06/16] WIP lipid wdl working through scan translator setting --- metaMS/lcms_lipidomics_workflow.py | 59 ++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index 8eb935f..1e3aa97 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -121,10 +121,69 @@ def set_params_on_lcms_obj(myLCMSobj, params_toml): elif myLCMSobj.polarity == "negative": myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Na") +def load_scan_translator(scan_translator=None): + """Translate scans using a scan translator + + Parameters + ---------- + scan_translator : str or Path + Path to scan translator yaml file + + Returns + ------- + scan_dict : dict + Dict with keys as parameter keys and values as lists of scans + """ + # Convert the scan translator to a dictionary + if scan_translator is None: + scan_translator_dict = {"ms2": {"scan_filter": "", "resolution": "high"}} + else: + # Convert the scan translator to a dictionary + if isinstance(scan_translator, str): + scan_translator = Path(scan_translator) + # read in the scan translator from toml + with open(scan_translator, "r") as f: + scan_translator_dict = toml.load(f) + for param_key in scan_translator_dict.keys(): + if scan_translator_dict[param_key]["scan_filter"] == "": + scan_translator_dict[param_key]["scan_filter"] = None + return scan_translator_dict + +def check_scan_translator(myLCMSobj, scan_translator): + """Check if scan translator is provided and that it maps correctly to scans and parameters""" + scan_translator_dict = load_scan_translator(scan_translator) + # Check that the scan translator maps correctly to scans and parameters + scan_df = myLCMSobj.scan_df + scans_pulled_out = [] + for param_key in scan_translator_dict.keys(): + assert param_key in myLCMSobj.parameters.mass_spectrum.keys() + assert "scan_filter" in scan_translator_dict[param_key].keys() + assert "resolution" in scan_translator_dict[param_key].keys() + # Pull out scans that match the scan filter + scan_df_sub = scan_df[ + scan_df.scan_text.str.contains( + scan_translator_dict[param_key]["scan_filter"] + ) + ] + scans_pulled_out.extend(scan_df_sub.scan.tolist()) + if len(scan_df_sub) == 0: + raise ValueError( + "No scans pulled out by scan translator for parameter key: ", + param_key, + " and scan filter: ", + scan_translator_dict[param_key]["scan_filter"], + ) + + # Check that the scans pulled out by the scan translator are not overlapping and assert error if they are + if len(set(scans_pulled_out)) != len(scans_pulled_out): + raise ValueError("Overlapping scans pulled out by scan translator") + def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): time_start = datetime.datetime.now() myLCMSobj = instantiate_lcms_obj(file_in) set_params_on_lcms_obj(myLCMSobj, params_toml) + check_scan_translator(myLCMSobj, scan_translator) + # TODO KRH: Add signal processing and ms1 molecular search here From 98fa8341e0477e53673993f20fddc1a7da77e971 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Fri, 6 Dec 2024 09:40:42 -0800 Subject: [PATCH 07/16] WIP lipid wdl working through ms1 signal processing and export Mono crash is consistently happening --- metaMS/lcms_lipidomics_workflow.py | 86 +++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index 1e3aa97..3e9d5df 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -5,9 +5,11 @@ from multiprocessing import Pool from typing import List import click +import warnings from corems.mass_spectra.input.mzml import MZMLSpectraParser from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader +from corems.mass_spectra.output.export import LipidomicsExport from corems.encapsulation.input.parameter_from_json import ( load_and_set_toml_parameters_lcms, ) @@ -178,13 +180,93 @@ def check_scan_translator(myLCMSobj, scan_translator): if len(set(scans_pulled_out)) != len(scans_pulled_out): raise ValueError("Overlapping scans pulled out by scan translator") +def add_mass_features(myLCMSobj, scan_translator): + """Process ms1 spectra and perform molecular search + + This includes peak picking, adding and processing associated ms1 spectra, + integration of mass features, annotation of c13 mass features, deconvolution of ms1 mass features, + and adding of peak shape metrics of mass features to the mass feature dataframe. + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + scan_translator : str or Path + Path to scan translator yaml file + + Returns + ------- + None, processes the LCMS object + """ + # Process ms1 spectra + myLCMSobj.find_mass_features() + myLCMSobj.add_associated_ms1( + auto_process=True, use_parser=False, spectrum_mode="profile" + ) + myLCMSobj.integrate_mass_features(drop_if_fail=True) + myLCMSobj.find_c13_mass_features() + myLCMSobj.deconvolute_ms1_mass_features() + myLCMSobj.add_peak_metrics() + + # Add associated ms2 spectra to mass features + scan_dictionary = load_scan_translator(scan_translator=scan_translator) + for param_key in scan_dictionary.keys(): + scan_filter = scan_dictionary[param_key]["scan_filter"] + if scan_filter == "": + scan_filter = None + myLCMSobj.add_associated_ms2_dda( + spectrum_mode="centroid", ms_params_key=param_key, scan_filter=scan_filter + ) + +def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): + """Export results to hdf5 and csv as a lipid report + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + out_path : str or Path + Path to output file + molecular_metadata : dict + Dict with molecular metadata + final : bool + Whether to export final results + + Returns + ------- + None, exports results to hdf5 and csv as a lipid report + """ + exporter = LipidomicsExport(out_path, myLCMSobj) + exporter.to_hdf(overwrite=True) + if final: + # Do not show warnings, these are expected + exporter.report_to_csv(molecular_metadata=molecular_metadata) + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + exporter.report_to_csv() + + def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): time_start = datetime.datetime.now() myLCMSobj = instantiate_lcms_obj(file_in) set_params_on_lcms_obj(myLCMSobj, params_toml) check_scan_translator(myLCMSobj, scan_translator) - - + add_mass_features(myLCMSobj, scan_translator) + myLCMSobj.remove_unprocessed_data() + #TODO KRH: add molecular search here + export_results(myLCMSobj, out_path=out_path, final=False) + precursor_mz_list = list( + set( + [ + v.mz + for k, v in myLCMSobj.mass_features.items() + if len(v.ms2_scan_numbers) > 0 and v.isotopologue_type is None + ] + ) + ) + mz_dict = {myLCMSobj.polarity: precursor_mz_list} + return mz_dict # TODO KRH: Add signal processing and ms1 molecular search here def run_lcms_lipidomics_workflow( From e98799011edb5aebdfbcd63e8b5b24bfb984d03c Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Fri, 6 Dec 2024 13:00:08 -0800 Subject: [PATCH 08/16] Add alternate working dockerfile --- Dockerfile_test.txt | 26 ++++++++++++++++++++++++++ Makefile | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 Dockerfile_test.txt diff --git a/Dockerfile_test.txt b/Dockerfile_test.txt new file mode 100644 index 0000000..00369b8 --- /dev/null +++ b/Dockerfile_test.txt @@ -0,0 +1,26 @@ +FROM jcarr87/corems-base-py3.10 + +# set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONNET_RUNTIME=coreclr + +# set working directory +WORKDIR /metams + +# Set up the directory structure +COPY metaMS/ /metams/metaMS/ +COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/ +COPY db/ /metams/db/ + +# install dependencies +RUN pip install --upgrade pip +RUN apt-get update && apt-get install -y libgomp1 + +# Install the requirements +#RUN pip install pythonnet +RUN pip install -r requirements.txt + +# Install the MetaMS package in editable mode +RUN pip install --editable . + diff --git a/Makefile b/Makefile index 1c909c0..a8b8da3 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,6 @@ wdl-run-gcms : wdl-run-lipid : #TODO KRH: remove the docker-build-local when the docker image is available in dockerhub and -# update the docker image in the wdl file +# update the docker image in the wdl file. Good to rebuild for each run while in development make docker-build-local miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files \ No newline at end of file From 1776ff73cbe3237c62e1e21ba253d4812b900dab Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Mon, 16 Dec 2024 08:31:31 -0800 Subject: [PATCH 09/16] WIP Switch lipid workflow to query local sqlite database --- configuration/lipidomics_metams.toml | 2 +- metaMS/lcms_lipidomics_workflow.py | 107 +++++++++++- metaMS/lipid_metadata_prepper.py | 240 +++++++++++++++++++++++++++ 3 files changed, 341 insertions(+), 8 deletions(-) create mode 100644 metaMS/lipid_metadata_prepper.py diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml index 0b4e426..28be8c6 100644 --- a/configuration/lipidomics_metams.toml +++ b/configuration/lipidomics_metams.toml @@ -1,4 +1,4 @@ -directory = "data/raw_data/lipid_data" +file_paths = ["data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw"] output_directory = "output" corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml" metabref_token_path = "metabref.token" diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index 3e9d5df..dff7812 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -6,6 +6,7 @@ from typing import List import click import warnings +import pandas as pd from corems.mass_spectra.input.mzml import MZMLSpectraParser from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader @@ -14,6 +15,8 @@ load_and_set_toml_parameters_lcms, ) +from metaMS.lipid_metadata_prepper import get_lipid_library + @dataclass class LipidomicsWorkflowParameters: """ @@ -42,16 +45,13 @@ class LipidomicsWorkflowParameters: metabref_token_path: str = None scan_translator_path: str = None cores: int = 1 + #TODO KRH: Add a parameter for the location of the sqlite database def check_lipidomics_workflow_params(lipid_workflow_params): # Check that corems_toml_path exists if not Path(lipid_workflow_params.corems_toml_path).exists(): raise FileNotFoundError("Corems toml file not found, exiting workflow") - # Check that metabref_token_path exists - if not Path(lipid_workflow_params.metabref_token_path).exists(): - raise FileNotFoundError("Metabref token file not found, exiting workflow") - # Check that scan_translator_path exists if not Path(lipid_workflow_params.scan_translator_path).exists(): raise FileNotFoundError("Scan translator file not found, exiting workflow") @@ -267,7 +267,91 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): ) mz_dict = {myLCMSobj.polarity: precursor_mz_list} return mz_dict - # TODO KRH: Add signal processing and ms1 molecular search here + +def prep_metadata(mz_dicts, out_dir): + """Prepare metadata for ms2 spectral search + + Parameters + ---------- + mz_dicts : list of dicts + List of dicts with keys "positive" and "negative" and values of lists of precursor mzs + out_dir : Path + Path to output directory + + Returns + ------- + metadata : dict + Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively + + Notes + ------- + Also writes out files for the flash entropy search databases and molecular metadata + """ + metadata = { + "mzs": {"positive": None, "negative": None}, + "fe": {"positive": None, "negative": None}, + "molecular_metadata": {}, + } + for d in mz_dicts: + metadata["mzs"].update(d) + + print("Preparing negative lipid library") + + if metadata["mzs"]["negative"] is not None: + metabref_negative, lipidmetadata_negative = get_lipid_library( + mz_list=metadata["mzs"]["negative"], + polarity="negative", + mz_tol_ppm=5, + format="flashentropy", + normalize=True, + fe_kwargs={ + "normalize_intensity": True, + "min_ms2_difference_in_da": 0.02, # for cleaning spectra + "max_ms2_tolerance_in_da": 0.01, # for setting search space + "max_indexed_mz": 3000, + "precursor_ions_removal_da": None, + "noise_threshold": 0, + }, + ) + metadata["fe"]["negative"] = metabref_negative + metadata["molecular_metadata"].update(lipidmetadata_negative) + fe_negative_df = pd.DataFrame.from_dict( + {k: v for k, v in enumerate(metadata["fe"]["negative"])}, orient="index" + ) + fe_negative_df.to_csv(out_dir / "ms2_db_negative.csv") + + print("Preparing positive lipid library") + if metadata["mzs"]["positive"] is not None: + metabref_positive, lipidmetadata_positive = get_lipid_library( + mz_list=metadata["mzs"]["positive"], + polarity="positive", + mz_tol_ppm=5, + format="flashentropy", + normalize=True, + fe_kwargs={ + "normalize_intensity": True, + "min_ms2_difference_in_da": 0.02, # for cleaning spectra + "max_ms2_tolerance_in_da": 0.01, # for setting search space + "max_indexed_mz": 3000, + "precursor_ions_removal_da": None, + "noise_threshold": 0, + }, + ) + metadata["fe"]["positive"] = metabref_positive + metadata["molecular_metadata"].update(lipidmetadata_positive) + fe_positive_df = pd.DataFrame.from_dict( + {k: v for k, v in enumerate(metadata["fe"]["positive"])}, orient="index" + ) + fe_positive_df.to_csv(out_dir / "ms2_db_positive.csv") + + mol_metadata_df = pd.concat( + [ + pd.DataFrame.from_dict(v.__dict__, orient="index").transpose() + for k, v in metadata["molecular_metadata"].items() + ], + ignore_index=True, + ) + mol_metadata_df.to_csv(out_dir / "molecular_metadata.csv") def run_lcms_lipidomics_workflow( lipidomics_workflow_paramaters_file=None, @@ -304,13 +388,14 @@ def run_lcms_lipidomics_workflow( files_list = list(file_paths) out_paths_list = [out_dir / f.stem for f in files_list] - # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results + # Set the workflow parameters cores = lipid_workflow_params.cores params_toml = lipid_workflow_params.corems_toml_path scan_translator = lipid_workflow_params.scan_translator_path - click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " files, using " + str(cores) + " core(s)") + click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " file(s), using " + str(cores) + " core(s)") + # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results if cores == 1 or len(files_list) == 1: mz_dicts = [] for file_in, file_out in list(zip(files_list, out_paths_list)): @@ -335,5 +420,13 @@ def run_lcms_lipidomics_workflow( mz_dicts = pool.starmap(run_lipid_sp_ms1, args) pool.close() pool.join() + + # Prepare metadata for searching + prep_metadata(mz_dicts, out_dir) print("Finished processing all files") # TODO KRH: Add full lipidomics workflow here + +if __name__ == "__main__": + run_lcms_lipidomics_workflow( + lipidomics_workflow_paramaters_file="configuration/lipidomics_metams.toml" + ) \ No newline at end of file diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py new file mode 100644 index 0000000..95d75e9 --- /dev/null +++ b/metaMS/lipid_metadata_prepper.py @@ -0,0 +1,240 @@ +import pandas as pd +import numpy as np +import sqlite3 +import re +from ms_entropy import FlashEntropySearch +from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata + + +def find_closest(A, target): + """Find the index of closest value in A to each value in target. + + Parameters + ---------- + A : :obj:`~numpy.array` + The array to search (blueprint). A must be sorted. + target : :obj:`~numpy.array` + The array of values to search for. target must be sorted. + + Returns + ------- + :obj:`~numpy.array` + The indices of the closest values in A to each value in target. + """ + idx = A.searchsorted(target) + idx = np.clip(idx, 1, len(A) - 1) + left = A[idx - 1] + right = A[idx] + idx -= target - left < right - target + return idx + +def spectrum_to_array(spectrum, normalize=True): + """ + Convert MetabRef-formatted spectrum to array. + + Parameters + ---------- + spectrum : str + MetabRef spectrum, i.e. list of (m/z,abundance) pairs. + normalize : bool + Normalize the spectrum by its magnitude. + + Returns + ------- + :obj:`~numpy.array` + Array of shape (N, 2), with m/z in the first column and abundance in + the second. + + """ + + # Convert parenthesis-delimited string to array + arr = np.array( + re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float + ).reshape(-1, 2) + + # Normalize the array + if normalize: + arr[:, -1] = arr[:, -1] / arr[:, -1].sum() + + return arr + +def _to_flashentropy(metabref_lib, normalize=True, fe_kwargs={}): + """ + Convert metabref-formatted library to FlashEntropy library. + + Parameters + ---------- + metabref_lib : dict + MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation). + normalize : bool + Normalize each spectrum by its magnitude. + fe_kwargs : dict, optional + Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; + any keys not recognized will be ignored. By default, all parameters set to defaults. + + Returns + ------- + :obj:`~ms_entropy.FlashEntropySearch` + MS2 library as FlashEntropy search instance. + + Raises + ------ + ValueError + If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal. + + """ + # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da + if ( + "min_ms2_difference_in_da" in fe_kwargs + or "max_ms2_tolerance_in_da" in fe_kwargs + ): + if ( + "min_ms2_difference_in_da" not in fe_kwargs + or "max_ms2_tolerance_in_da" not in fe_kwargs + ): + raise ValueError( + "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified." + ) + if ( + fe_kwargs["min_ms2_difference_in_da"] + != 2 * fe_kwargs["max_ms2_tolerance_in_da"] + ): + raise ValueError( + "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'." + ) + + # Initialize empty library + fe_lib = [] + + # Enumerate spectra + for i, source in enumerate(metabref_lib): + # Reorganize source dict, if necessary + if "spectrum_data" in source.keys(): + spectrum = source["spectrum_data"] + else: + spectrum = source + + # Rename precursor_mz key for FlashEntropy + if "precursor_mz" not in spectrum.keys(): + spectrum["precursor_mz"] = spectrum.pop("precursor_ion") + + # Convert CoreMS spectrum to array and clean, store as `peaks` + spectrum["peaks"] = spectrum_to_array( + spectrum["mz"], normalize=normalize + ) + + # Add spectrum to library + fe_lib.append(spectrum) + + # Initialize FlashEntropy + fe_init_kws = [ + "max_ms2_tolerance_in_da", + "mz_index_step", + "low_memory", + "path_data", + ] + fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws} + fes = FlashEntropySearch(**fe_init_kws) + + # Build FlashEntropy index + fe_index_kws = [ + "max_indexed_mz", + "precursor_ions_removal_da", + "noise_threshold", + "min_ms2_difference_in_da", + "max_peak_num", + ] + fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws} + fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True) + + return fes + +def _dict_to_dataclass(metabref_lib, data_class): + """ + Convert dictionary to dataclass. + + Notes + ----- + This function will pull the attributes a dataclass and its parent class + and convert the dictionary to a dataclass instance with the appropriate + attributes. + + Parameters + ---------- + data_class : :obj:`~dataclasses.dataclass` + Dataclass to convert to. + metabref_lib : dict + Metabref dictionary object to convert to dataclass. + + Returns + ------- + :obj:`~dataclasses.dataclass` + Dataclass instance. + + """ + + # Get list of expected attributes of data_class + data_class_keys = list(data_class.__annotations__.keys()) + + # Does the data_class inherit from another class, if so, get the attributes of the parent class as well + if len(data_class.__mro__) > 2: + parent_class_keys = list(data_class.__bases__[0].__annotations__.keys()) + data_class_keys = list(set(data_class_keys + parent_class_keys)) + + # Remove keys that are not in the data_class from the input dictionary + input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys} + + # Add keys that are in the data class but not in the input dictionary as None + for key in data_class_keys: + if key not in input_dict.keys(): + input_dict[key] = None + return data_class(**input_dict) + +def get_lipid_library( + mz_list, + polarity, + mz_tol_ppm, + format='flashentropy', + normalize=True, + fe_kwargs={}, +): + + # connect to the database + conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite') + + # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz + mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn) + mz_all = mz_all.sort_values(by='precursor_mz') + + # filter by polarity + mz_subset = mz_all[mz_all['polarity'] == polarity].copy() + mz_subset['closest_mz'] = mz_all['precursor_mz'].apply(lambda x: mz_list[find_closest(np.array(mz_list), x)]) + mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz']) / mz_subset['closest_mz'] * 1e6 + mz_subset = mz_subset[np.abs(mz_subset['ppm_error']) <= mz_tol_ppm] + + # get the full lipidMassSpectrumObject table for the filtered ms2 ids + mz_subset_ids = mz_subset['id'].tolist() + mz_subset_ids = tuple(mz_subset_ids) + mz_subset_full = pd.read_sql_query(f"SELECT * FROM lipidMassSpectrumObject WHERE id IN {mz_subset_ids}", conn) + + # get the lipid tree for the filtered molecular ids + mol_ids = mz_subset_full['molecular_data_id'].tolist() + mol_ids = tuple(mol_ids) + lipid_tree = pd.read_sql_query(f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", conn) + + # convert molecular data to dictionary of LipidMetadata objects, with mol_id as key + lipid_tree = lipid_tree.set_index('id') + lipid_tree = lipid_tree.to_dict(orient='index') + lipid_metadata = { + k: _dict_to_dataclass(v, LipidMetadata) + for k, v in lipid_tree.items() + } + + # convert ms2 data to flashentropy library + mz_subset_full = mz_subset_full.to_dict(orient='records') + fe_lib = _to_flashentropy(mz_subset_full, normalize=normalize) + + # close the connection + conn.close() + + return fe_lib, lipid_metadata \ No newline at end of file From 3ad017e36faeab79b3b4faa9ca77581cda49394f Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Mon, 16 Dec 2024 09:27:20 -0800 Subject: [PATCH 10/16] WIP add processing of MS2 to lipid workflow' --- metaMS/lcms_lipidomics_workflow.py | 124 ++++++++++++++++++++++++++++- metaMS/lipid_metadata_prepper.py | 7 +- 2 files changed, 124 insertions(+), 7 deletions(-) diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index dff7812..a73e943 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -10,12 +10,13 @@ from corems.mass_spectra.input.mzml import MZMLSpectraParser from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader +from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra from corems.mass_spectra.output.export import LipidomicsExport from corems.encapsulation.input.parameter_from_json import ( load_and_set_toml_parameters_lcms, ) -from metaMS.lipid_metadata_prepper import get_lipid_library +from metaMS.lipid_metadata_prepper import get_lipid_library, _to_flashentropy @dataclass class LipidomicsWorkflowParameters: @@ -353,6 +354,109 @@ def prep_metadata(mz_dicts, out_dir): ) mol_metadata_df.to_csv(out_dir / "molecular_metadata.csv") + return metadata + +def process_ms2(myLCMSobj, metadata, scan_translator): + """Process ms2 spectra and perform molecular search + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + metadata : dict + Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively + + Returns + ------- + None, processes the LCMS object + """ + # Perform molecular search on ms2 spectra + # Grab fe from metatdata associated with polarity (this is inherently high resolution as its from a in-silico high res library) + fe_search = metadata["fe"][myLCMSobj.polarity] + + scan_dictionary = load_scan_translator(scan_translator) + ms2_scan_df = myLCMSobj.scan_df[myLCMSobj.scan_df.ms_level == 2] + + # Process high resolution MS2 scans + # Collect all high resolution MS2 scans using the scan translator + ms2_scans_oi_hr = [] + for param_key in scan_dictionary.keys(): + if scan_dictionary[param_key]["resolution"] == "high": + scan_filter = scan_dictionary[param_key]["scan_filter"] + if scan_filter is not None: + ms2_scan_df_hr = ms2_scan_df[ + ms2_scan_df.scan_text.str.contains(scan_filter) + ] + else: + ms2_scan_df_hr = ms2_scan_df + ms2_scans_oi_hr_i = [ + x for x in ms2_scan_df_hr.scan.tolist() if x in myLCMSobj._ms.keys() + ] + ms2_scans_oi_hr.extend(ms2_scans_oi_hr_i) + # Perform search on high res scans + if len(ms2_scans_oi_hr) > 0: + myLCMSobj.fe_search( + scan_list=ms2_scans_oi_hr, fe_lib=fe_search, peak_sep_da=0.01 + ) + + # Process low resolution MS2 scans + # Collect all low resolution MS2 scans using the scan translator + ms2_scans_oi_lr = [] + for param_key in scan_dictionary.keys(): + if scan_dictionary[param_key]["resolution"] == "low": + scan_filter = scan_dictionary[param_key]["scan_filter"] + if scan_filter is not None: + ms2_scan_df_lr = ms2_scan_df[ + ms2_scan_df.scan_text.str.contains(scan_filter) + ] + else: + ms2_scan_df_lr = ms2_scan_df + ms2_scans_oi_lri = [ + x for x in ms2_scan_df_lr.scan.tolist() if x in myLCMSobj._ms.keys() + ] + ms2_scans_oi_lr.extend(ms2_scans_oi_lri) + # Perform search on low res scans + if len(ms2_scans_oi_lr) > 0: + # Recast the flashentropy search database to low resolution + fe_search_lr = _to_flashentropy( + metabref_lib=fe_search, + normalize=True, + fe_kwargs={ + "normalize_intensity": True, + "min_ms2_difference_in_da": 0.4, + "max_ms2_tolerance_in_da": 0.2, + "max_indexed_mz": 3000, + "precursor_ions_removal_da": None, + "noise_threshold": 0, + }, + ) + myLCMSobj.fe_search( + scan_list=ms2_scans_oi_lr, fe_lib=fe_search_lr, peak_sep_da=0.3 + ) + +def run_lipid_ms2(out_path, metadata, scan_translator=None): + """Run ms2 spectral search and export final results + + Parameters + ---------- + out_path : str or Path + Path to output file + metadata : dict + Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively + + Returns + ------- + None, runs ms2 spectral search and exports final results + """ + # Read in the intermediate results + out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5" + parser = ReadCoreMSHDFMassSpectra(out_path_hdf5) + myLCMSobj = parser.get_lcms_obj() + + # Process ms2 spectra, perform spectral search, and export final results + process_ms2(myLCMSobj, metadata, scan_translator=scan_translator) + export_results(myLCMSobj, str(out_path), metadata["molecular_metadata"], final=True) + def run_lcms_lipidomics_workflow( lipidomics_workflow_paramaters_file=None, file_paths=None, @@ -422,9 +526,21 @@ def run_lcms_lipidomics_workflow( pool.join() # Prepare metadata for searching - prep_metadata(mz_dicts, out_dir) - print("Finished processing all files") - # TODO KRH: Add full lipidomics workflow here + metadata = prep_metadata(mz_dicts, out_dir) + + # Run ms2 spectral search and export final results + click.echo("Starting ms2 spectral search and exporting final results") + if cores == 1 or len(files_list) == 1: + for file_out in out_paths_list: + mz_dicts = run_lipid_ms2( + file_out, metadata, scan_translator=scan_translator + ) + elif cores > 1: + pool = Pool(cores) + args = [(file_out, metadata, scan_translator) for file_out in out_paths_list] + mz_dicts = pool.starmap(run_lipid_ms2, args) + pool.close() + pool.join() if __name__ == "__main__": run_lcms_lipidomics_workflow( diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py index 95d75e9..771c12f 100644 --- a/metaMS/lipid_metadata_prepper.py +++ b/metaMS/lipid_metadata_prepper.py @@ -223,7 +223,8 @@ def get_lipid_library( lipid_tree = pd.read_sql_query(f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", conn) # convert molecular data to dictionary of LipidMetadata objects, with mol_id as key - lipid_tree = lipid_tree.set_index('id') + lipid_tree['id_index'] = lipid_tree['id'] + lipid_tree = lipid_tree.set_index('id_index') lipid_tree = lipid_tree.to_dict(orient='index') lipid_metadata = { k: _dict_to_dataclass(v, LipidMetadata) @@ -231,8 +232,8 @@ def get_lipid_library( } # convert ms2 data to flashentropy library - mz_subset_full = mz_subset_full.to_dict(orient='records') - fe_lib = _to_flashentropy(mz_subset_full, normalize=normalize) + mz_subset_full_dict = mz_subset_full.to_dict(orient='records') + fe_lib = _to_flashentropy(mz_subset_full_dict, normalize=normalize, fe_kwargs=fe_kwargs) # close the connection conn.close() From f6c37ba24c2c53ea9ca4e99f13e306fdf1d976d8 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Mon, 16 Dec 2024 10:15:40 -0800 Subject: [PATCH 11/16] WIP Reconfigure multiprocessing for lipid workflow --- metaMS/lcms_lipidomics_workflow.py | 44 ++++++++++++++++-------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index a73e943..7330b82 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -7,6 +7,7 @@ import click import warnings import pandas as pd +import gc from corems.mass_spectra.input.mzml import MZMLSpectraParser from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader @@ -247,7 +248,6 @@ def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): warnings.simplefilter("ignore") exporter.report_to_csv() - def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): time_start = datetime.datetime.now() myLCMSobj = instantiate_lcms_obj(file_in) @@ -498,7 +498,8 @@ def run_lcms_lipidomics_workflow( scan_translator = lipid_workflow_params.scan_translator_path click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " file(s), using " + str(cores) + " core(s)") - + gc.collect() + # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results if cores == 1 or len(files_list) == 1: mz_dicts = [] @@ -511,22 +512,23 @@ def run_lcms_lipidomics_workflow( ) mz_dicts.append(mz_dict) elif cores > 1: - pool = Pool(cores) - args = [ - ( - str(file_in), - str(file_out), - params_toml, - scan_translator, - ) - for file_in, file_out in list(zip(files_list, out_paths_list)) - ] - mz_dicts = pool.starmap(run_lipid_sp_ms1, args) - pool.close() - pool.join() - + with Pool(cores) as pool: + args = [ + ( + str(file_in), + str(file_out), + params_toml, + scan_translator, + ) + for file_in, file_out in zip(files_list, out_paths_list) + ] + mz_dicts = pool.starmap(run_lipid_sp_ms1, args) + gc.collect() + # Prepare metadata for searching metadata = prep_metadata(mz_dicts, out_dir) + del mz_dicts + gc.collect() # Run ms2 spectral search and export final results click.echo("Starting ms2 spectral search and exporting final results") @@ -536,11 +538,11 @@ def run_lcms_lipidomics_workflow( file_out, metadata, scan_translator=scan_translator ) elif cores > 1: - pool = Pool(cores) - args = [(file_out, metadata, scan_translator) for file_out in out_paths_list] - mz_dicts = pool.starmap(run_lipid_ms2, args) - pool.close() - pool.join() + with Pool(cores) as pool: + args = [(file_out, metadata, scan_translator) for file_out in out_paths_list] + pool.starmap(run_lipid_ms2, args) + + gc.collect() if __name__ == "__main__": run_lcms_lipidomics_workflow( From de72860da24d8fa1785aa26e5d2be5b3a2c0457f Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Mon, 16 Dec 2024 10:38:21 -0800 Subject: [PATCH 12/16] WIP Fix query for metadata prepper --- metaMS/lipid_metadata_prepper.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py index 771c12f..af5d57f 100644 --- a/metaMS/lipid_metadata_prepper.py +++ b/metaMS/lipid_metadata_prepper.py @@ -199,6 +199,12 @@ def get_lipid_library( fe_kwargs={}, ): + # prepare the mz_list for searching against the database + mz_list = pd.DataFrame(mz_list, columns=['mz_obs']) + mz_list = mz_list.sort_values(by='mz_obs') + mz_list = mz_list.reset_index(drop=True) + mz_obs_arr = mz_list['mz_obs'].values + # connect to the database conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite') @@ -206,10 +212,14 @@ def get_lipid_library( mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn) mz_all = mz_all.sort_values(by='precursor_mz') - # filter by polarity + # filter by polarity and if there are any matches within mz_tol_ppm mz_subset = mz_all[mz_all['polarity'] == polarity].copy() - mz_subset['closest_mz'] = mz_all['precursor_mz'].apply(lambda x: mz_list[find_closest(np.array(mz_list), x)]) - mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz']) / mz_subset['closest_mz'] * 1e6 + mz_subset = mz_subset.sort_values(by='precursor_mz') + mz_subset = mz_subset.reset_index(drop=True) + mz_subset['closest_mz_obs'] = mz_obs_arr[ + find_closest(mz_obs_arr, mz_subset.precursor_mz.values) + ] + mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz_obs']) / mz_subset['precursor_mz'] * 1e6 mz_subset = mz_subset[np.abs(mz_subset['ppm_error']) <= mz_tol_ppm] # get the full lipidMassSpectrumObject table for the filtered ms2 ids From bf880558f3940705866e842a33bf9ffc37220540 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Mon, 16 Dec 2024 14:08:34 -0800 Subject: [PATCH 13/16] WIP lipid workflow working with formula searching --- .../emsl_lipidomics_corems_params.toml | 30 ++++++------- configuration/lipidomics_metams.toml | 2 +- metaMS/lcms_lipidomics_workflow.py | 45 +++++++++++++++++-- metaMS/lipid_metadata_prepper.py | 1 + 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml index 4d43664..647be6a 100644 --- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml +++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml @@ -17,19 +17,19 @@ peak_picking_method = "persistent homology" implemented_peak_picking_methods = [ "persistent homology",] mass_feature_cluster_mz_tolerance_rel = 5e-6 mass_feature_cluster_rt_tolerance = 0.3 -ms1_scans_to_average = 5 +ms1_scans_to_average = 1 ms1_deconvolution_corr_min = 0.95 ms2_dda_rt_tolerance = 0.15 ms2_dda_mz_tolerance = 0.05 ms2_min_fe_score = 0.3 search_as_lipids = true include_fragment_types = true -export_profile_spectra = true -export_eics = true +export_profile_spectra = false +export_eics = false export_unprocessed_ms1 = false verbose_processing = false -ph_inten_min_rel = 0.05 #TODO KRH: Change this back to reasonable setting after dev (0.005 or 0.0005?) -ph_persis_min_rel = 0.05 +ph_inten_min_rel = 0.0005 +ph_persis_min_rel = 0.0005 ph_smooth_it = 0 [mass_spectrum.ms1.molecular_search] @@ -40,7 +40,7 @@ use_runtime_kendrick_filter = false use_min_peaks_filter = true min_peaks_per_class = 15 url_database = "" -db_jobs = 3 +db_jobs = 1 db_chunk_size = 300 ion_charge = -1 min_hc_filter = 0.3 @@ -64,8 +64,8 @@ isProtonated = true isAdduct = false ion_types_excluded = [] ionization_type = "ESI" -min_ppm_error = -10.0 -max_ppm_error = 10.0 +min_ppm_error = -5.0 +max_ppm_error = 5.0 min_abun_error = -100.0 max_abun_error = 100.0 mz_error_range = 1.5 @@ -73,13 +73,13 @@ error_method = "None" mz_error_average = 0.0 [mass_spectrum.ms1.mass_spectrum] -noise_threshold_method = "relative_abundance" +noise_threshold_method = "log" noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] noise_threshold_min_std = 6 noise_threshold_min_s2n = 4.0 -noise_threshold_min_relative_abundance = 0.1 +noise_threshold_min_relative_abundance = 0.01 noise_threshold_absolute_abundance = 1000000.0 -noise_threshold_log_nsigma = 6 +noise_threshold_log_nsigma = 8 noise_threshold_log_nsigma_corr_factor = 0.463 noise_threshold_log_nsigma_bins = 500 noise_min_mz = 0 @@ -163,7 +163,7 @@ noise_threshold_method = "relative_abundance" noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] noise_threshold_min_std = 6 noise_threshold_min_s2n = 4.0 -noise_threshold_min_relative_abundance = 0.1 +noise_threshold_min_relative_abundance = 0.01 noise_threshold_absolute_abundance = 1000000.0 noise_threshold_log_nsigma = 6 noise_threshold_log_nsigma_corr_factor = 0.463 @@ -283,9 +283,9 @@ legacy_resolving_power = false legacy_centroid_polyfit = false [mass_spectrum.ms1.molecular_search.usedAtoms] -C = [ 10, 100,] -H = [ 18, 200,] -N = [ 0, 3,] +C = [ 11, 70,] +H = [ 19, 150,] +N = [ 0, 2,] P = [ 0, 1,] O = [ 1, 23,] S = [ 0, 1,] diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml index 28be8c6..d14d324 100644 --- a/configuration/lipidomics_metams.toml +++ b/configuration/lipidomics_metams.toml @@ -3,4 +3,4 @@ output_directory = "output" corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml" metabref_token_path = "metabref.token" scan_translator_path = "configuration/lipid_configs/emsl_lipidomics_scan_translator.toml" -cores = 1 \ No newline at end of file +cores = 2 \ No newline at end of file diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index 7330b82..c089d32 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -13,6 +13,7 @@ from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra from corems.mass_spectra.output.export import LipidomicsExport +from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas from corems.encapsulation.input.parameter_from_json import ( load_and_set_toml_parameters_lcms, ) @@ -220,6 +221,45 @@ def add_mass_features(myLCMSobj, scan_translator): spectrum_mode="centroid", ms_params_key=param_key, scan_filter=scan_filter ) +def molecular_formula_search(myLCMSobj): + """Perform molecular search on ms1 spectra + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + + Returns + ------- + None, processes the LCMS object + """ + i = 1 + # get df of mass features + mf_df = myLCMSobj.mass_features_to_df() + + # search molecular formulas for each mass features that are the deconvoluted parent and have a ms2 + mf_searched = [] + for mf_id in mf_df.index: + if myLCMSobj.mass_features[mf_id].mass_spectrum_deconvoluted_parent: + if len(myLCMSobj.mass_features[mf_id].ms2_scan_numbers) > 0: + if mf_id not in mf_searched: + if i > 5: # TODO KRH: remove this when ready + break + + scan = myLCMSobj.mass_features[mf_id].apex_scan + # Search single spectrum for all peaks that correspond to the same scan + mf_df_scan = mf_df[mf_df.apex_scan == scan] + peaks_to_search = [ + myLCMSobj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist() + ] + SearchMolecularFormulas( + myLCMSobj._ms[scan], + first_hit=False, + find_isotopologues=True, + ).run_worker_ms_peaks(peaks_to_search) + mf_searched.extend(mf_df_scan.index.tolist()) + i += 1 + def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): """Export results to hdf5 and csv as a lipid report @@ -249,13 +289,12 @@ def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): exporter.report_to_csv() def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): - time_start = datetime.datetime.now() myLCMSobj = instantiate_lcms_obj(file_in) set_params_on_lcms_obj(myLCMSobj, params_toml) check_scan_translator(myLCMSobj, scan_translator) add_mass_features(myLCMSobj, scan_translator) myLCMSobj.remove_unprocessed_data() - #TODO KRH: add molecular search here + molecular_formula_search(myLCMSobj) export_results(myLCMSobj, out_path=out_path, final=False) precursor_mz_list = list( set( @@ -480,7 +519,7 @@ def run_lcms_lipidomics_workflow( cores=cores, ) - # Make output dir and get list of files to process + # Make output dir out_dir = Path(lipid_workflow_params.output_directory) out_dir.mkdir(parents=True, exist_ok=True) diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py index af5d57f..ce7cfab 100644 --- a/metaMS/lipid_metadata_prepper.py +++ b/metaMS/lipid_metadata_prepper.py @@ -207,6 +207,7 @@ def get_lipid_library( # connect to the database conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite') + #TODO KRH: change path to a parameter # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn) From 412511b991a4d535be4573cb50e952560e1be33e Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Wed, 18 Dec 2024 11:41:30 -0800 Subject: [PATCH 14/16] Modify lipid workflow to comply with corems 3.2 --- metaMS/cli.py | 49 +++++++++--------- metaMS/lcms_lipidomics_workflow.py | 79 +++++++++++++----------------- metaMS/lipid_metadata_prepper.py | 9 +++- wdl/metaMS_lcmslipidomics.wdl | 5 +- wdl/metams_input_lipidomics.json | 2 +- 5 files changed, 68 insertions(+), 76 deletions(-) diff --git a/metaMS/cli.py b/metaMS/cli.py index 32b2b6c..bcafaf8 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -121,23 +121,6 @@ def dump_lipidomics_toml_template(toml_file_name): toml.dump(LipidomicsWorkflowParameters().__dict__, workflow_param) -@cli.command(name="dump-lipidomics-corems-toml-template") -@click.argument("toml_file_name", required=True, type=str) -def dump_lipidomics_corems_toml_template(toml_file_name): - """ - Writes a toml file with the CoreMS parameters to be used in the lipidomics workflow - - Parameters - ---------- - toml_file_name : str - The name of the toml file to write the parameters to - """ - path_obj = Path(toml_file_name).with_suffix(".toml") - print("dumping lipidomics corems toml template") - pass - # TODO KRH: add call for dumping lipidomics corems toml template from corems once we can import it - - @cli.command(name="run-lipidomics-workflow") @click.option( "-p", @@ -168,7 +151,7 @@ def dump_lipidomics_corems_toml_template(toml_file_name): help="The path corems parameters toml file", ) @click.option( - "-t", "--token_path", required=False, type=str, help="The path to the metabref token" + "-d", "--db_location", required=False, type=str, help="The path to the local database" ) @click.option( "-s", "--scan_translator_path", required=False, type=str, help="The path to the scan translator file" @@ -181,7 +164,7 @@ def run_lipidomics_workflow( file_paths, output_directory, corems_params, - token_path, + db_location, scan_translator_path, cores ): @@ -189,15 +172,27 @@ def run_lipidomics_workflow( Parameters ---------- - #TODO KRH: Add this in + paramaters_file : str + The path to the toml file with the lipidomics workflow parameters + file_paths : str + The paths to the input files, separated by commas as one string + output_directory : str + The directory where the output files will be stored + corems_params : str + The path corems parameters toml file + db_location : str + The path to the sqlite database for lipid spectra searching + scan_translator_path : str + The path to the scan translator file + cores : int + The number of cores to use for processing """ if paramaters_file is not None: if cores is not None or file_paths is not None: - #TODO KRH: Add all other parameters above click.echo("Using parameters file, ignoring other parameters") - #run_lcms_lipidomics_workflow( - # lipidomics_workflow_paramaters_file=paramaters_file - #) + run_lcms_lipidomics_workflow( + lipidomics_workflow_paramaters_file=paramaters_file + ) else: if cores is None: cores = 1 @@ -213,14 +208,14 @@ def run_lipidomics_workflow( "Must provide an output directory if not using a parameters file" ) return - if token_path is None: - click.echo("No metabref token provided") + if db_location is None: + click.echo("No database path provided") return run_lcms_lipidomics_workflow( file_paths=file_paths, output_directory=output_directory, corems_toml_path=corems_params, - metabref_token_path=token_path, + db_location=db_location, scan_translator_path=scan_translator_path, cores=cores, ) diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index c089d32..10c7dc9 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -1,9 +1,7 @@ from dataclasses import dataclass import toml from pathlib import Path -import datetime from multiprocessing import Pool -from typing import List import click import warnings import pandas as pd @@ -13,13 +11,20 @@ from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra from corems.mass_spectra.output.export import LipidomicsExport -from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas +from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulasLC from corems.encapsulation.input.parameter_from_json import ( load_and_set_toml_parameters_lcms, ) from metaMS.lipid_metadata_prepper import get_lipid_library, _to_flashentropy +# Suppress specific warning from the corems.mass_spectrum.input.massList module +warnings.filterwarnings( + "ignore", + message="No isotopologue matched the formula_dict: {formula_dict}", + module="corems.mass_spectrum.input.massList" +) + @dataclass class LipidomicsWorkflowParameters: """ @@ -33,9 +38,8 @@ class LipidomicsWorkflowParameters: The directory where the output files will be stored corems_toml_path : str The path to the corems configuration file - metabref_token_path : str - The path to the metabref token file - See https://metabref.emsl.pnnl.gov/api for more information for how to get a token + db_location : str + The path to the local sqlite database used for searching lipid ms2 spectra scan_translator_path : str The path to the scan translator file, optional cores : int @@ -45,10 +49,9 @@ class LipidomicsWorkflowParameters: file_paths: tuple = ('data/...', 'data/...') output_directory: str = "output" corems_toml_path: str = None - metabref_token_path: str = None + db_location: str = None scan_translator_path: str = None cores: int = 1 - #TODO KRH: Add a parameter for the location of the sqlite database def check_lipidomics_workflow_params(lipid_workflow_params): # Check that corems_toml_path exists @@ -72,8 +75,11 @@ def check_lipidomics_workflow_params(lipid_workflow_params): for file_path in lipid_workflow_params.file_paths: if ".raw" not in file_path and ".mzML" not in file_path: raise ValueError(f"File path {file_path} is not a .raw or .mzML file, exiting workflow") - - #TODO KRH: Add a check that we can access the metabref API with the token + + # Check that db_location exists + if lipid_workflow_params.db_location is not None: + if not Path(lipid_workflow_params.db_location).exists(): + raise FileNotFoundError("Database location not found, exiting workflow") def instantiate_lcms_obj(file_in): """Instantiate a corems LCMS object from a binary file. Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory) @@ -233,32 +239,9 @@ def molecular_formula_search(myLCMSobj): ------- None, processes the LCMS object """ - i = 1 - # get df of mass features - mf_df = myLCMSobj.mass_features_to_df() - - # search molecular formulas for each mass features that are the deconvoluted parent and have a ms2 - mf_searched = [] - for mf_id in mf_df.index: - if myLCMSobj.mass_features[mf_id].mass_spectrum_deconvoluted_parent: - if len(myLCMSobj.mass_features[mf_id].ms2_scan_numbers) > 0: - if mf_id not in mf_searched: - if i > 5: # TODO KRH: remove this when ready - break - - scan = myLCMSobj.mass_features[mf_id].apex_scan - # Search single spectrum for all peaks that correspond to the same scan - mf_df_scan = mf_df[mf_df.apex_scan == scan] - peaks_to_search = [ - myLCMSobj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist() - ] - SearchMolecularFormulas( - myLCMSobj._ms[scan], - first_hit=False, - find_isotopologues=True, - ).run_worker_ms_peaks(peaks_to_search) - mf_searched.extend(mf_df_scan.index.tolist()) - i += 1 + # Perform a molecular search on all of the mass features + mol_form_search = SearchMolecularFormulasLC(myLCMSobj) + mol_form_search.run_mass_feature_search() def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): """Export results to hdf5 and csv as a lipid report @@ -281,7 +264,6 @@ def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): exporter = LipidomicsExport(out_path, myLCMSobj) exporter.to_hdf(overwrite=True) if final: - # Do not show warnings, these are expected exporter.report_to_csv(molecular_metadata=molecular_metadata) else: with warnings.catch_warnings(): @@ -294,6 +276,7 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): check_scan_translator(myLCMSobj, scan_translator) add_mass_features(myLCMSobj, scan_translator) myLCMSobj.remove_unprocessed_data() + #Finally, perform molecular formula search on all ms1 spectra associated with mass features molecular_formula_search(myLCMSobj) export_results(myLCMSobj, out_path=out_path, final=False) precursor_mz_list = list( @@ -308,7 +291,7 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): mz_dict = {myLCMSobj.polarity: precursor_mz_list} return mz_dict -def prep_metadata(mz_dicts, out_dir): +def prep_metadata(mz_dicts, out_dir, db_location): """Prepare metadata for ms2 spectral search Parameters @@ -317,6 +300,8 @@ def prep_metadata(mz_dicts, out_dir): List of dicts with keys "positive" and "negative" and values of lists of precursor mzs out_dir : Path Path to output directory + db_location : str + Path to lipid database Returns ------- @@ -339,6 +324,7 @@ def prep_metadata(mz_dicts, out_dir): if metadata["mzs"]["negative"] is not None: metabref_negative, lipidmetadata_negative = get_lipid_library( + db_location=db_location, mz_list=metadata["mzs"]["negative"], polarity="negative", mz_tol_ppm=5, @@ -363,6 +349,7 @@ def prep_metadata(mz_dicts, out_dir): print("Preparing positive lipid library") if metadata["mzs"]["positive"] is not None: metabref_positive, lipidmetadata_positive = get_lipid_library( + db_location=db_location, mz_list=metadata["mzs"]["positive"], polarity="positive", mz_tol_ppm=5, @@ -489,8 +476,11 @@ def run_lipid_ms2(out_path, metadata, scan_translator=None): """ # Read in the intermediate results out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5" - parser = ReadCoreMSHDFMassSpectra(out_path_hdf5) - myLCMSobj = parser.get_lcms_obj() + # Catch known UserWarning from corems and ignore it + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + parser = ReadCoreMSHDFMassSpectra(out_path_hdf5) + myLCMSobj = parser.get_lcms_obj(load_raw=False) # Process ms2 spectra, perform spectral search, and export final results process_ms2(myLCMSobj, metadata, scan_translator=scan_translator) @@ -501,7 +491,7 @@ def run_lcms_lipidomics_workflow( file_paths=None, output_directory=None, corems_toml_path=None, - metabref_token_path=None, + db_location=None, scan_translator_path=None, cores=None, ): @@ -513,7 +503,7 @@ def run_lcms_lipidomics_workflow( lipid_workflow_params = LipidomicsWorkflowParameters( file_paths= file_paths.split(","), output_directory=output_directory, - metabref_token_path=metabref_token_path, + db_location=db_location, scan_translator_path=scan_translator_path, corems_toml_path=corems_toml_path, cores=cores, @@ -565,7 +555,8 @@ def run_lcms_lipidomics_workflow( gc.collect() # Prepare metadata for searching - metadata = prep_metadata(mz_dicts, out_dir) + click.echo("Preparing metadata for ms2 spectral search") + metadata = prep_metadata(mz_dicts, out_dir, lipid_workflow_params.db_location) del mz_dicts gc.collect() @@ -573,7 +564,7 @@ def run_lcms_lipidomics_workflow( click.echo("Starting ms2 spectral search and exporting final results") if cores == 1 or len(files_list) == 1: for file_out in out_paths_list: - mz_dicts = run_lipid_ms2( + run_lipid_ms2( file_out, metadata, scan_translator=scan_translator ) elif cores > 1: diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py index ce7cfab..b33d07a 100644 --- a/metaMS/lipid_metadata_prepper.py +++ b/metaMS/lipid_metadata_prepper.py @@ -123,6 +123,11 @@ def _to_flashentropy(metabref_lib, normalize=True, fe_kwargs={}): spectrum["mz"], normalize=normalize ) + # Cast "fragment_types" to a list (if present and not already a list) + if "fragment_types" in spectrum.keys(): + if not isinstance(spectrum["fragment_types"], list): + spectrum["fragment_types"] = spectrum["fragment_types"].split(",") + # Add spectrum to library fe_lib.append(spectrum) @@ -191,6 +196,7 @@ def _dict_to_dataclass(metabref_lib, data_class): return data_class(**input_dict) def get_lipid_library( + db_location, mz_list, polarity, mz_tol_ppm, @@ -206,8 +212,7 @@ def get_lipid_library( mz_obs_arr = mz_list['mz_obs'].values # connect to the database - conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite') - #TODO KRH: change path to a parameter + conn = sqlite3.connect(db_location) # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn) diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl index 802fe2d..a5d4bba 100644 --- a/wdl/metaMS_lcmslipidomics.wdl +++ b/wdl/metaMS_lcmslipidomics.wdl @@ -14,7 +14,7 @@ task runMetaMSLCMSLipidomics { Array[File] file_paths String output_directory File corems_toml_path - File metabref_token_path + File db_location File scan_translator_path Int cores } @@ -24,7 +24,7 @@ task runMetaMSLCMSLipidomics { -i ${sep=',' file_paths} \ -o ${output_directory} \ -c ${corems_toml_path} \ - -t ${metabref_token_path} \ + -d ${db_location} \ -s ${scan_translator_path} \ -j ${cores} } @@ -36,5 +36,6 @@ task runMetaMSLCMSLipidomics { runtime { docker: "local-metams:latest" + #TODO KRH: Update to pushed version when available } } \ No newline at end of file diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json index 4cc08d4..e6dabb6 100644 --- a/wdl/metams_input_lipidomics.json +++ b/wdl/metams_input_lipidomics.json @@ -4,7 +4,7 @@ ], "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output", "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml", - "lcmsLipidomics.runMetaMSLCMSLipidomics.metabref_token_path": "./configuration/gcms_corems.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "", "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml", "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1 } \ No newline at end of file From c0d272046d0a47944ca59ffbcf667f5a81a243ff Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Wed, 18 Dec 2024 14:14:06 -0800 Subject: [PATCH 15/16] WIP Add functioning wdl and configs for lipid workflow Pending PiPy release of corems 3.2 --- Dockerfile | 7 +++++++ .../emsl_lipidomics_corems_params.toml | 17 +++++++++-------- configuration/lipidomics_metams.toml | 15 +++++++++------ requirements.txt | 5 ++--- wdl/metams_input_lipidomics.json | 7 ++++--- 5 files changed, 31 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9308ee2..174f5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,13 @@ COPY metaMS/ /metams/metaMS/ COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/ COPY db/ /metams/db/ +#TODO KRH: Remove this section once the CoreMS package is available on PyPI and installable via the requirements.txt file +# Copy the CoreMS tar.gz file into the Docker image +COPY CoreMS-3.2.0.tar.gz /metams/ + +# Install the CoreMS package from the tar.gz file +RUN pip install /metams/CoreMS-3.2.0.tar.gz + # Install the MetaMS package in editable mode RUN pip3 install --editable . diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml index 647be6a..9620f12 100644 --- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml +++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml @@ -28,8 +28,8 @@ export_profile_spectra = false export_eics = false export_unprocessed_ms1 = false verbose_processing = false -ph_inten_min_rel = 0.0005 -ph_persis_min_rel = 0.0005 +ph_inten_min_rel = 0.01 +ph_persis_min_rel = 0.01 ph_smooth_it = 0 [mass_spectrum.ms1.molecular_search] @@ -57,7 +57,7 @@ adduct_atoms_neg = [ "Cl", "Br",] adduct_atoms_pos = [ "Na", "K",] score_methods = [ "S_P_lowest_error", "N_S_P_lowest_error", "lowest_error", "prob_score", "air_filter_error", "water_filter_error", "earth_filter_error",] score_method = "prob_score" -output_min_score = 0.1 +output_min_score = 0.3 output_score_method = "All Candidates" isRadical = false isProtonated = true @@ -66,11 +66,12 @@ ion_types_excluded = [] ionization_type = "ESI" min_ppm_error = -5.0 max_ppm_error = 5.0 -min_abun_error = -100.0 -max_abun_error = 100.0 +min_abun_error = -50.0 +max_abun_error = 50.0 mz_error_range = 1.5 error_method = "None" mz_error_average = 0.0 +verbose_processing = false [mass_spectrum.ms1.mass_spectrum] noise_threshold_method = "log" @@ -97,7 +98,7 @@ calibration_ref_match_method_implemented = [ "legacy", "merged",] calibration_ref_match_tolerance = 0.003 calibration_ref_match_std_raw_error_limit = 1.5 do_calibration = true -verbose_processing = true +verbose_processing = false [mass_spectrum.ms1.ms_peak] kendrick_rounding_method = "floor" @@ -183,7 +184,7 @@ calibration_ref_match_method_implemented = [ "legacy", "merged",] calibration_ref_match_tolerance = 0.003 calibration_ref_match_std_raw_error_limit = 1.5 do_calibration = true -verbose_processing = true +verbose_processing = false [mass_spectrum.ms2.ms_peak] kendrick_rounding_method = "floor" @@ -269,7 +270,7 @@ calibration_ref_match_method_implemented = [ "legacy", "merged",] calibration_ref_match_tolerance = 0.003 calibration_ref_match_std_raw_error_limit = 1.5 do_calibration = true -verbose_processing = true +verbose_processing = false [mass_spectrum.ms2_cid.ms_peak] kendrick_rounding_method = "floor" diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml index d14d324..c7957f6 100644 --- a/configuration/lipidomics_metams.toml +++ b/configuration/lipidomics_metams.toml @@ -1,6 +1,9 @@ -file_paths = ["data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw"] -output_directory = "output" -corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml" -metabref_token_path = "metabref.token" -scan_translator_path = "configuration/lipid_configs/emsl_lipidomics_scan_translator.toml" -cores = 2 \ No newline at end of file +file_paths = [ + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_POS_23Jan18_Brandi-WCSH5801.raw", + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_NEG_25Jan18_Brandi-WCSH5801.raw" +] +output_directory = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/_blanchard_11_8ws97026/processed_20241218" +corems_toml_path = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_corems_params.toml" +db_location = "/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite" +scan_translator_path = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_scan_translator.toml" +cores = 4 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index edeb4e1..66e74e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -corems==3.0.2 +corems==3.2.0 Click>=7.1.1 -requests -nmdc-schema>=7.0.0 \ No newline at end of file +requests \ No newline at end of file diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json index e6dabb6..c9d9796 100644 --- a/wdl/metams_input_lipidomics.json +++ b/wdl/metams_input_lipidomics.json @@ -1,10 +1,11 @@ { "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [ - "./data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw" + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_POS_23Jan18_Brandi-WCSH5801.raw", + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_NEG_25Jan18_Brandi-WCSH5801.raw" ], "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output", - "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml", - "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "", + "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_corems_params.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite", "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml", "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1 } \ No newline at end of file From ca45251aa2d7ea2110c069bb70d01175cb99f3d0 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Thu, 19 Dec 2024 10:36:56 -0800 Subject: [PATCH 16/16] Fix biosample API check --- .gitignore | 5 +++- .../api_info_retriever.py | 28 ++++++++++--------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index c742356..78f7f33 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,7 @@ share/python-wheels/ *.egg # Raw thermo data -*.raw \ No newline at end of file +*.raw + +# Metabref token +*.token \ No newline at end of file diff --git a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py index 3b4f862..1b8f357 100644 --- a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py +++ b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py @@ -147,18 +147,20 @@ def check_if_ids_exist(self, ids: list) -> bool: If there's an error in making the API request. """ ids_test = list(set(ids)) - ids_test = [id.replace('"', "'") for id in ids_test] - ids_test_str = ", ".join(f'"{id}"' for id in ids_test) - filter_param = f'{{"id": {{"$in": [{ids_test_str}]}}}}' - og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection=id" - - try: - resp = requests.get(og_url) - resp.raise_for_status() # Raises an HTTPError for bad responses - data = resp.json() - if len(data["resources"]) != len(ids_test): - return False - except requests.RequestException as e: - raise requests.RequestException(f"Error making API request: {e}") + for id in ids_test: + filter_param = f'{{"id": "{id}"}}' + field = "id" + + og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}" + + try: + resp = requests.get(og_url) + resp.raise_for_status() # Raises an HTTPError for bad responses + data = resp.json() + if len(data["resources"]) == 0: + print(f"ID {id} not found") + return False + except requests.RequestException as e: + raise requests.RequestException(f"Error making API request: {e}") return True