From 187cb207db34e7121a78ecf4cc8fa0634e63ede5 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Thu, 5 Dec 2024 14:52:59 -0800
Subject: [PATCH 01/16] Rework lipid wdl to expect list of files

---
 Makefile                         |  9 +++++----
 wdl/metaMS_lcmslipidomics.wdl    | 32 ++++++++++++++++++++++++++++++++
 wdl/metams_input_lipidomics.json | 10 +++++++++-
 3 files changed, 46 insertions(+), 5 deletions(-)
 create mode 100644 wdl/metaMS_lcmslipidomics.wdl

diff --git a/Makefile b/Makefile
index c0cc5d5..df7cfcc 100644
--- a/Makefile
+++ b/Makefile
@@ -63,9 +63,10 @@ docker-run:
 	@echo $(config_dir)
 	@docker run -v $(data_dir):/metams/data -v $(config_dir):/metams/configuration microbiomedata/metams:latest metaMS run-gcms-workflow /metams/configuration/metams.toml
 
-wdl-run :
+wdl-run-gcms :
  	 
-	 miniwdl run wdl/metaMS.wdl -i wdl/metams_input.json --verbose --no-cache --copy-input-files
+	 miniwdl run wdl/metaMS_gcms.wdl -i wdl/metams_input_gcms.json --verbose --no-cache --copy-input-files
 
-
-	
\ No newline at end of file
+wdl-run-lipid :
+ 	 
+	 miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files
\ No newline at end of file
diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl
new file mode 100644
index 0000000..fcaf979
--- /dev/null
+++ b/wdl/metaMS_lcmslipidomics.wdl
@@ -0,0 +1,32 @@
+version 1.0
+
+workflow lcmsLipidomics {
+    call runMetaMSLCMSLipidomics
+
+    output {
+        Array[File] output_files = runMetaMSLCMSLipidomics.output_files
+    }
+}
+
+task runMetaMSLCMSLipidomics {
+    input {
+        Array[File] file_paths
+        String output_directory
+        File corems_toml_path
+        File metabref_token_path
+        File scan_translator_path
+        Int cores
+    }
+
+    command {
+
+    }
+
+    output {
+        Array[File] output_files = glob('${output_directory}/**/*.*')
+    }
+
+    runtime {
+        docker: "microbiomedata/metams:2.2.2"
+    }
+}
\ No newline at end of file
diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json
index 056f702..2243f1c 100644
--- a/wdl/metams_input_lipidomics.json
+++ b/wdl/metams_input_lipidomics.json
@@ -1,3 +1,11 @@
 {
-    "lcmsLipidomics.runLipidomicsMetaMS.config_file": "./configuration/lipidomics_metams.toml"
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [
+        "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf",
+        "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf"
+    ],
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.metabref_token_path": "./configuration/gcms_corems.toml",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1
 }
\ No newline at end of file

From fdd817d123bf9011a870fa310c1f57fb523298ed Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Thu, 5 Dec 2024 15:17:21 -0800
Subject: [PATCH 02/16] Reconfigured CLI and WDL for lipid workflow

---
 metaMS/cli.py                 | 54 ++++++++++++++++++++++++-----------
 wdl/metaMS_lcmslipidomics.wdl | 10 +++++--
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/metaMS/cli.py b/metaMS/cli.py
index a6f719e..8321766 100644
--- a/metaMS/cli.py
+++ b/metaMS/cli.py
@@ -148,10 +148,10 @@ def dump_lipidomics_corems_toml_template(toml_file_name):
 )
 @click.option(
     "-i",
-    "--directory",
+    "--file_paths",
     required=False,
     type=str,
-    help="The directory where the data is stored, all files in the directory will be processed",
+    help="The path to the directory with the input files",
 )
 @click.option(
     "-o",
@@ -160,6 +160,13 @@ def dump_lipidomics_corems_toml_template(toml_file_name):
     type=str,
     help="The directory where the output files will be stored",
 )
+@click.option(
+    "-c",
+    "--corems_params",
+    required=False,
+    type=str,
+    help="The path corems parameters toml file",
+)
 @click.option(
     "-t", "--token_path", required=False, type=str, help="The path to the metabref token"
 )
@@ -169,36 +176,51 @@ def dump_lipidomics_corems_toml_template(toml_file_name):
 @click.option(
     "-j", "--cores", required=False, type=int, help="'cpu's to use for processing"
 )
-def run_lipidomics_workflow(paramaters_file, directory, output_directory, token_path, scan_translator_path, cores):
+def run_lipidomics_workflow(
+    paramaters_file, 
+    file_paths, 
+    output_directory, 
+    corems_params, 
+    token_path, 
+    scan_translator_path, 
+    cores
+    ):
     """Run the lipidomics workflow
 
     Parameters
     ----------
-    paramaters_file : str
-        The path to the toml file with the lipidomics workflow parameters
-        This file can be generated using the dump-lipidomics-toml-template command
+    #TODO KRH: Add this in
     """
     if paramaters_file is not None:
-        if cores is not None or directory is not None:
+        if cores is not None or file_paths is not None:
+        #TODO KRH: Add all other parameters above
             click.echo("Using parameters file, ignoring other parameters")
-        run_lcms_lipidomics_workflow(
-            lipidomics_workflow_paramaters_file=paramaters_file
-        )
+        #run_lcms_lipidomics_workflow(
+        #    lipidomics_workflow_paramaters_file=paramaters_file
+        #)
     else:
         if cores is None:
             cores = 1
-        if directory is None:
-            click.echo("No directory provided, no data to process")
+        if file_paths is None:
+            click.echo("No file paths provided, no data to process")
             return
+        if corems_params is None:
+            click.echo("No corems parameters provided")
+        if scan_translator is None:
+            click.echo("No scan translator provided")
         if output_directory is None:
             click.echo(
                 "Must provide an output directory if not using a parameters file"
             )
             return
-        run_lcms_lipidomics_workflow(
-            directory=directory, output_directory=output_directory, cores=cores
-        )
-    click.echo("Running lipidomics workflow")
+        if token_path is None:
+            click.echo("No metabref token provided")
+            return
+        #run_lcms_lipidomics_workflow(
+        #    directory=directory, output_directory=output_directory, cores=cores
+        #)
+    click.echo("Ready to run lipidomics workflow")
+    
     # test call:
     # MetaMS run-lipidomics-workflow -p configuration/lipidomics_metams.toml
     # miniwdl run wdl/metaMS_lipidomics.wdl -i wdl/metams_input_lipidomics.json
diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl
index fcaf979..de2214e 100644
--- a/wdl/metaMS_lcmslipidomics.wdl
+++ b/wdl/metaMS_lcmslipidomics.wdl
@@ -19,7 +19,13 @@ task runMetaMSLCMSLipidomics {
     }
 
     command {
-
+        metaMS run-lipidomics-workflow \
+            -i ${sep=',' file_paths} \
+            -o ${output_directory} \
+            -c ${corems_toml_path} \
+            -t ${metabref_token_path} \
+            -s ${scan_translator_path} \
+            -j ${cores}
     }
 
     output {
@@ -27,6 +33,6 @@ task runMetaMSLCMSLipidomics {
     }
 
     runtime {
-        docker: "microbiomedata/metams:2.2.2"
+        docker: "local-metams:latest"
     }
 }
\ No newline at end of file

From ca01692a65822c49d13cd4e80990d23c92b3d0ae Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Thu, 5 Dec 2024 16:56:11 -0800
Subject: [PATCH 03/16] Add check function to workflow

---
 Makefile                           |  3 ++
 metaMS/cli.py                      | 13 ++++---
 metaMS/lcms_lipidomics_workflow.py | 55 ++++++++++++++++++++++++------
 wdl/metaMS_lcmslipidomics.wdl      |  4 ++-
 wdl/metaMS_lipidomics.wdl          | 20 -----------
 wdl/metams_input_lipidomics.json   |  2 +-
 6 files changed, 61 insertions(+), 36 deletions(-)
 delete mode 100644 wdl/metaMS_lipidomics.wdl

diff --git a/Makefile b/Makefile
index df7cfcc..4f5e855 100644
--- a/Makefile
+++ b/Makefile
@@ -58,6 +58,9 @@ docker-build:
 
 	docker build -t microbiomedata/metams:latest .
 
+docker-build-local:
+	docker build -t local-metams:latest .
+	
 docker-run:
 	@echo $(data_dir)
 	@echo $(config_dir)
diff --git a/metaMS/cli.py b/metaMS/cli.py
index 8321766..d99c7ad 100644
--- a/metaMS/cli.py
+++ b/metaMS/cli.py
@@ -206,7 +206,7 @@ def run_lipidomics_workflow(
             return
         if corems_params is None:
             click.echo("No corems parameters provided")
-        if scan_translator is None:
+        if scan_translator_path is None:
             click.echo("No scan translator provided")
         if output_directory is None:
             click.echo(
@@ -216,9 +216,14 @@ def run_lipidomics_workflow(
         if token_path is None:
             click.echo("No metabref token provided")
             return
-        #run_lcms_lipidomics_workflow(
-        #    directory=directory, output_directory=output_directory, cores=cores
-        #)
+        run_lcms_lipidomics_workflow(
+            file_paths=file_paths,
+            output_directory=output_directory,
+            corems_toml_path=corems_params,
+            metabref_token_path=token_path,
+            scan_translator_path=scan_translator_path,
+            cores=cores,
+        )
     click.echo("Ready to run lipidomics workflow")
     
     # test call:
diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index c72052f..b905df9 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -3,6 +3,8 @@
 from pathlib import Path
 import datetime
 from multiprocessing import Pool
+from typing import List
+import click
 
 from corems.mass_spectra.input.mzml import MZMLSpectraParser
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
@@ -29,14 +31,41 @@ class LipidomicsWorkflowParameters:
         The number of cores to use for processing, optional
 
     """
-
-    directory: str = "data/..."
-    output_directory: str = "output/..."
-    corems_toml_path: str = "configuration/lipidomics_corems.toml"
+    file_paths: tuple = ('data/...', 'data/...')
+    output_directory: str = "output"
+    corems_toml_path: str = None
     metabref_token_path: str = None
     scan_translator_path: str = None
     cores: int = 1
 
+def check_lipidomics_workflow_params(lipid_workflow_params):
+    # Check that corems_toml_path exists
+    if not Path(lipid_workflow_params.corems_toml_path).exists():
+        click.echo("Corems toml file not found, exiting workflow")
+        return
+    
+    # Check that metabref_token_path exists
+    if not Path(lipid_workflow_params.metabref_token_path).exists():
+        click.echo("Metabref token file not found, exiting workflow")
+        return
+    
+    # Check that scan_translator_path exists
+    if not Path(lipid_workflow_params.scan_translator_path).exists():
+        click.echo("Scan translator file not found, exiting workflow")
+        return
+    
+    # Check that output_directory exists
+    if not Path(lipid_workflow_params.output_directory).exists():
+        click.echo("Output directory not found, exiting workflow")
+        return
+    
+    # Check that file_paths exist
+    for file_path in lipid_workflow_params.file_paths:
+        if not Path(file_path).exists():
+            click.echo(f"File path {file_path} not found, exiting workflow")
+            return
+
+
 def instantiate_lcms_obj(file_in):
     """Instantiate a corems LCMS object from a binary file.  Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory)
 
@@ -75,7 +104,7 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
 
 def run_lcms_lipidomics_workflow(
     lipidomics_workflow_paramaters_file=None,
-    directory=None,
+    file_paths=None,
     output_directory=None,
     corems_toml_path=None,
     metabref_token_path=None,
@@ -88,7 +117,7 @@ def run_lcms_lipidomics_workflow(
             lipid_workflow_params = LipidomicsWorkflowParameters(**toml.load(infile))
     else:
         lipid_workflow_params = LipidomicsWorkflowParameters(
-            directory=directory,
+            file_paths= file_paths.split(","),
             output_directory=output_directory,
             metabref_token_path=metabref_token_path,
             scan_translator_path=scan_translator_path,
@@ -100,14 +129,20 @@ def run_lcms_lipidomics_workflow(
     out_dir = Path(lipid_workflow_params.output_directory)
     out_dir.mkdir(parents=True, exist_ok=True)
 
-    file_dir = Path(lipid_workflow_params.directory)
-    files_list = list(file_dir.glob("*.raw"))
+    # Check that all parameters are valid and exist
+    check_lipidomics_workflow_params(lipid_workflow_params)
+
+    # Organize input and output paths
+    file_paths = [Path(file_path) for file_path in lipid_workflow_params.file_paths]
+    files_list = list(file_paths)
     out_paths_list = [out_dir / f.stem for f in files_list]
-        
+    
     # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results
     cores = lipid_workflow_params.cores
     params_toml = lipid_workflow_params.corems_toml_path
     scan_translator = lipid_workflow_params.scan_translator_path
+    
+    """
     if cores == 1 or len(files_list) == 1:
         mz_dicts = []
         for file_in, file_out in list(zip(files_list, out_paths_list)):
@@ -133,5 +168,5 @@ def run_lcms_lipidomics_workflow(
         pool.close()
         pool.join()
     print("Finished processing all files")
-
+    """
     # TODO KRH: Add full lipidomics workflow here
diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl
index de2214e..802fe2d 100644
--- a/wdl/metaMS_lcmslipidomics.wdl
+++ b/wdl/metaMS_lcmslipidomics.wdl
@@ -4,6 +4,7 @@ workflow lcmsLipidomics {
     call runMetaMSLCMSLipidomics
 
     output {
+        String out = runMetaMSLCMSLipidomics.out
         Array[File] output_files = runMetaMSLCMSLipidomics.output_files
     }
 }
@@ -29,7 +30,8 @@ task runMetaMSLCMSLipidomics {
     }
 
     output {
-        Array[File] output_files = glob('${output_directory}/**/*.*')
+        String out = read_string(stdout())
+        Array[File] output_files = glob('${output_directory}/*')
     }
 
     runtime {
diff --git a/wdl/metaMS_lipidomics.wdl b/wdl/metaMS_lipidomics.wdl
deleted file mode 100644
index dea4d11..0000000
--- a/wdl/metaMS_lipidomics.wdl
+++ /dev/null
@@ -1,20 +0,0 @@
-version 1.0
-
-workflow lcmsLipidomics {
-    call runLipidomicsMetaMS
-}
-
-task runLipidomicsMetaMS {
-    input {
-        File config_file
-    }
-
-    command {
-        metaMS run-lipidomics-workflow -p ${config_file}
-    }
-
-    runtime {
-        docker: "local-metams:latest"
-        #TODO KRH: Change to dockerhub version after we've pushed the updated image
-    }
-}
\ No newline at end of file
diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json
index 2243f1c..b7fedfb 100644
--- a/wdl/metams_input_lipidomics.json
+++ b/wdl/metams_input_lipidomics.json
@@ -1,7 +1,7 @@
 {
     "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [
         "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf",
-        "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf"
+        "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml"
     ],
     "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml",

From 34a120b3ee04eb9741e950daee910f064d6614f9 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Thu, 5 Dec 2024 17:32:00 -0800
Subject: [PATCH 04/16] Add error handling to lipid workflow

---
 metaMS/cli.py                      |  5 -----
 metaMS/lcms_lipidomics_workflow.py | 24 ++++++++++++++----------
 wdl/metams_input_lipidomics.json   |  3 +--
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/metaMS/cli.py b/metaMS/cli.py
index d99c7ad..32b2b6c 100644
--- a/metaMS/cli.py
+++ b/metaMS/cli.py
@@ -224,8 +224,3 @@ def run_lipidomics_workflow(
             scan_translator_path=scan_translator_path,
             cores=cores,
         )
-    click.echo("Ready to run lipidomics workflow")
-    
-    # test call:
-    # MetaMS run-lipidomics-workflow -p configuration/lipidomics_metams.toml
-    # miniwdl run wdl/metaMS_lipidomics.wdl -i wdl/metams_input_lipidomics.json
diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index b905df9..59b1c4e 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -41,29 +41,31 @@ class LipidomicsWorkflowParameters:
 def check_lipidomics_workflow_params(lipid_workflow_params):
     # Check that corems_toml_path exists
     if not Path(lipid_workflow_params.corems_toml_path).exists():
-        click.echo("Corems toml file not found, exiting workflow")
-        return
+        raise FileNotFoundError("Corems toml file not found, exiting workflow")
     
     # Check that metabref_token_path exists
     if not Path(lipid_workflow_params.metabref_token_path).exists():
-        click.echo("Metabref token file not found, exiting workflow")
-        return
+        raise FileNotFoundError("Metabref token file not found, exiting workflow")
     
     # Check that scan_translator_path exists
     if not Path(lipid_workflow_params.scan_translator_path).exists():
-        click.echo("Scan translator file not found, exiting workflow")
-        return
+        raise FileNotFoundError("Scan translator file not found, exiting workflow")
     
     # Check that output_directory exists
     if not Path(lipid_workflow_params.output_directory).exists():
-        click.echo("Output directory not found, exiting workflow")
-        return
+        raise FileNotFoundError("Output directory not found, exiting workflow")
     
     # Check that file_paths exist
     for file_path in lipid_workflow_params.file_paths:
         if not Path(file_path).exists():
-            click.echo(f"File path {file_path} not found, exiting workflow")
-            return
+            raise FileNotFoundError(f"File path {file_path} not found, exiting workflow")
+    
+    # Check that all file_paths end in .raw or .mzML
+    for file_path in lipid_workflow_params.file_paths:
+        if ".raw" not in file_path and ".mzML" not in file_path:
+            raise ValueError(f"File path {file_path} is not a .raw or .mzML file, exiting workflow")
+
+    #TODO KRH: Add a check that we can access the metabref API with the token
 
 
 def instantiate_lcms_obj(file_in):
@@ -141,6 +143,8 @@ def run_lcms_lipidomics_workflow(
     cores = lipid_workflow_params.cores
     params_toml = lipid_workflow_params.corems_toml_path
     scan_translator = lipid_workflow_params.scan_translator_path
+
+    click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " files, using " +  str(cores) + " core(s)")
     
     """
     if cores == 1 or len(files_list) == 1:
diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json
index b7fedfb..4cc08d4 100644
--- a/wdl/metams_input_lipidomics.json
+++ b/wdl/metams_input_lipidomics.json
@@ -1,7 +1,6 @@
 {
     "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [
-        "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf",
-        "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml"
+        "./data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw"
     ],
     "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml",

From eb7da1959edc54288a9d144bdf7c390143f3bf69 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Fri, 6 Dec 2024 09:09:23 -0800
Subject: [PATCH 05/16] WIP lipid wdl working through parameter setting

---
 .gitignore                                    |   5 +-
 Makefile                                      |   4 +-
 .../emsl_lipidomics_corems_params.toml        | 333 ++++++++++++++++--
 metaMS/lcms_lipidomics_workflow.py            |  44 ++-
 4 files changed, 346 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
index 13134e4..c742356 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,4 +35,7 @@ wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
-*.egg
\ No newline at end of file
+*.egg
+
+# Raw thermo data
+*.raw
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 4f5e855..1c909c0 100644
--- a/Makefile
+++ b/Makefile
@@ -71,5 +71,7 @@ wdl-run-gcms :
 	 miniwdl run wdl/metaMS_gcms.wdl -i wdl/metams_input_gcms.json --verbose --no-cache --copy-input-files
 
 wdl-run-lipid :
- 	 
+#TODO KRH: remove the docker-build-local when the docker image is available in dockerhub and
+# update the docker image in the wdl file
+	 make docker-build-local
 	 miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files
\ No newline at end of file
diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
index 222d6d0..4d43664 100644
--- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
+++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
@@ -17,26 +17,67 @@ peak_picking_method = "persistent homology"
 implemented_peak_picking_methods = [ "persistent homology",]
 mass_feature_cluster_mz_tolerance_rel = 5e-6
 mass_feature_cluster_rt_tolerance = 0.3
-ms1_scans_to_average = 1
+ms1_scans_to_average = 5
 ms1_deconvolution_corr_min = 0.95
 ms2_dda_rt_tolerance = 0.15
 ms2_dda_mz_tolerance = 0.05
 ms2_min_fe_score = 0.3
 search_as_lipids = true
 include_fragment_types = true
-ph_inten_min_rel = 0.05
-ph_persis_min_rel = 0.005
-ph_smooth_it = 0
-export_profile_spectra = false
+export_profile_spectra = true
 export_eics = true
 export_unprocessed_ms1 = false
+verbose_processing = false
+ph_inten_min_rel = 0.05 #TODO KRH: Change this back to reasonable setting after dev (0.005 or 0.0005?)
+ph_persis_min_rel = 0.05
+ph_smooth_it = 0
 
-[MassSpectrum]
+[mass_spectrum.ms1.molecular_search]
+use_isotopologue_filter = false
+isotopologue_filter_threshold = 33.0
+isotopologue_filter_atoms = [ "Cl", "Br",]
+use_runtime_kendrick_filter = false
+use_min_peaks_filter = true
+min_peaks_per_class = 15
+url_database = ""
+db_jobs = 3
+db_chunk_size = 300
+ion_charge = -1
+min_hc_filter = 0.3
+max_hc_filter = 3.0
+min_oc_filter = 0.0
+max_oc_filter = 1.2
+min_op_filter = 2.0
+use_pah_line_rule = false
+min_dbe = 0.0
+max_dbe = 40.0
+mz_error_score_weight = 0.6
+isotopologue_score_weight = 0.4
+adduct_atoms_neg = [ "Cl", "Br",]
+adduct_atoms_pos = [ "Na", "K",]
+score_methods = [ "S_P_lowest_error", "N_S_P_lowest_error", "lowest_error", "prob_score", "air_filter_error", "water_filter_error", "earth_filter_error",]
+score_method = "prob_score"
+output_min_score = 0.1
+output_score_method = "All Candidates"
+isRadical = false
+isProtonated = true
+isAdduct = false
+ion_types_excluded = []
+ionization_type = "ESI"
+min_ppm_error = -10.0
+max_ppm_error = 10.0
+min_abun_error = -100.0
+max_abun_error = 100.0
+mz_error_range = 1.5
+error_method = "None"
+mz_error_average = 0.0
+
+[mass_spectrum.ms1.mass_spectrum]
 noise_threshold_method = "relative_abundance"
 noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",]
 noise_threshold_min_std = 6
 noise_threshold_min_s2n = 4.0
-noise_threshold_min_relative_abundance = 1
+noise_threshold_min_relative_abundance = 0.1
 noise_threshold_absolute_abundance = 1000000.0
 noise_threshold_log_nsigma = 6
 noise_threshold_log_nsigma_corr_factor = 0.463
@@ -51,9 +92,14 @@ calib_pol_order = 2
 max_calib_ppm_error = 1.0
 min_calib_ppm_error = -1.0
 calib_sn_threshold = 2.0
+calibration_ref_match_method = "legacy"
+calibration_ref_match_method_implemented = [ "legacy", "merged",]
+calibration_ref_match_tolerance = 0.003
+calibration_ref_match_std_raw_error_limit = 1.5
 do_calibration = true
+verbose_processing = true
 
-[MassSpecPeak]
+[mass_spectrum.ms1.ms_peak]
 kendrick_rounding_method = "floor"
 implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",]
 peak_derivative_threshold = 0.0
@@ -62,26 +108,27 @@ min_peak_datapoints = 5.0
 peak_max_prominence_percent = 0.1
 peak_height_max_percent = 10.0
 legacy_resolving_power = false
+legacy_centroid_polyfit = false
 
-[MS1MolecularSearch]
+[mass_spectrum.ms2.molecular_search]
 use_isotopologue_filter = false
 isotopologue_filter_threshold = 33.0
 isotopologue_filter_atoms = [ "Cl", "Br",]
 use_runtime_kendrick_filter = false
 use_min_peaks_filter = true
 min_peaks_per_class = 15
-url_database = ""
+url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@molformdb:5432/coremsapp"
 db_jobs = 3
 db_chunk_size = 300
 ion_charge = -1
 min_hc_filter = 0.3
 max_hc_filter = 3.0
 min_oc_filter = 0.0
-max_oc_filter = 5
+max_oc_filter = 1.2
 min_op_filter = 2.0
 use_pah_line_rule = false
-min_dbe = 0
-max_dbe = 50
+min_dbe = 0.0
+max_dbe = 40.0
 mz_error_score_weight = 0.6
 isotopologue_score_weight = 0.4
 adduct_atoms_neg = [ "Cl", "Br",]
@@ -93,17 +140,63 @@ output_score_method = "All Candidates"
 isRadical = false
 isProtonated = true
 isAdduct = false
-ion_types_excluded = []
+ion_types_excluded = [ "[M+HCOO]-",]
 ionization_type = "ESI"
-min_ppm_error = -5
-max_ppm_error = 5
+min_ppm_error = -10.0
+max_ppm_error = 10.0
 min_abun_error = -100.0
 max_abun_error = 100.0
 mz_error_range = 1.5
 error_method = "None"
 mz_error_average = 0.0
 
-[MS2MolecularSearch]
+[mass_spectrum.ms2.transient]
+implemented_apodization_function = [ "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",]
+apodization_method = "Hanning"
+number_of_truncations = 0
+number_of_zero_fills = 1
+next_power_of_two = false
+kaiser_beta = 8.6
+
+[mass_spectrum.ms2.mass_spectrum]
+noise_threshold_method = "relative_abundance"
+noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",]
+noise_threshold_min_std = 6
+noise_threshold_min_s2n = 4.0
+noise_threshold_min_relative_abundance = 0.1
+noise_threshold_absolute_abundance = 1000000.0
+noise_threshold_log_nsigma = 6
+noise_threshold_log_nsigma_corr_factor = 0.463
+noise_threshold_log_nsigma_bins = 500
+noise_min_mz = 0.0
+noise_max_mz = inf
+min_picking_mz = 0.0
+max_picking_mz = inf
+picking_point_extrapolate = 3
+calib_minimize_method = "Powell"
+calib_pol_order = 2
+max_calib_ppm_error = 1.0
+min_calib_ppm_error = -1.0
+calib_sn_threshold = 2.0
+calibration_ref_match_method = "legacy"
+calibration_ref_match_method_implemented = [ "legacy", "merged",]
+calibration_ref_match_tolerance = 0.003
+calibration_ref_match_std_raw_error_limit = 1.5
+do_calibration = true
+verbose_processing = true
+
+[mass_spectrum.ms2.ms_peak]
+kendrick_rounding_method = "floor"
+implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",]
+peak_derivative_threshold = 0.0
+peak_min_prominence_percent = 0.1
+min_peak_datapoints = 5.0
+peak_max_prominence_percent = 0.1
+peak_height_max_percent = 10.0
+legacy_resolving_power = false
+legacy_centroid_polyfit = false
+
+[mass_spectrum.ms2_cid.molecular_search]
 use_isotopologue_filter = false
 isotopologue_filter_threshold = 33.0
 isotopologue_filter_atoms = [ "Cl", "Br",]
@@ -136,18 +229,60 @@ isAdduct = false
 ion_types_excluded = [ "[M+HCOO]-",]
 ionization_type = "ESI"
 min_ppm_error = -10.0
-max_ppm_error = 10.0
+max_ppm_error = 200
 min_abun_error = -100.0
 max_abun_error = 100.0
 mz_error_range = 1.5
 error_method = "None"
 mz_error_average = 0.0
 
-[MassSpecPeak.kendrick_base]
-C = 1
-H = 2
+[mass_spectrum.ms2_cid.transient]
+implemented_apodization_function = [ "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",]
+apodization_method = "Hanning"
+number_of_truncations = 0
+number_of_zero_fills = 1
+next_power_of_two = false
+kaiser_beta = 8.6
+
+[mass_spectrum.ms2_cid.mass_spectrum]
+noise_threshold_method = "relative_abundance"
+noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",]
+noise_threshold_min_std = 6
+noise_threshold_min_s2n = 4.0
+noise_threshold_min_relative_abundance = 0.01
+noise_threshold_absolute_abundance = 1000000.0
+noise_threshold_log_nsigma = 6
+noise_threshold_log_nsigma_corr_factor = 0.463
+noise_threshold_log_nsigma_bins = 500
+noise_min_mz = 0.0
+noise_max_mz = inf
+min_picking_mz = 0.0
+max_picking_mz = inf
+picking_point_extrapolate = 3
+calib_minimize_method = "Powell"
+calib_pol_order = 2
+max_calib_ppm_error = 1.0
+min_calib_ppm_error = -1.0
+calib_sn_threshold = 2.0
+calibration_ref_match_method = "legacy"
+calibration_ref_match_method_implemented = [ "legacy", "merged",]
+calibration_ref_match_tolerance = 0.003
+calibration_ref_match_std_raw_error_limit = 1.5
+do_calibration = true
+verbose_processing = true
+
+[mass_spectrum.ms2_cid.ms_peak]
+kendrick_rounding_method = "floor"
+implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",]
+peak_derivative_threshold = 0.0
+peak_min_prominence_percent = 0.1
+min_peak_datapoints = 5.0
+peak_max_prominence_percent = 0.1
+peak_height_max_percent = 10.0
+legacy_resolving_power = false
+legacy_centroid_polyfit = false
 
-[MS1MolecularSearch.usedAtoms]
+[mass_spectrum.ms1.molecular_search.usedAtoms]
 C = [ 10, 100,]
 H = [ 18, 200,]
 N = [ 0, 3,]
@@ -157,7 +292,7 @@ S = [ 0, 1,]
 Na = [ 0, 1,]
 Cl = [ 0, 1,]
 
-[MS1MolecularSearch.used_atom_valences]
+[mass_spectrum.ms1.molecular_search.used_atom_valences]
 C = 4
 13C = 4
 N = 3
@@ -229,11 +364,35 @@ Bi = 3
 Po = 2
 Ac = 3
 
-[MS2MolecularSearch.usedAtoms]
-C = [ 1, 100,]
-H = [ 1, 200,]
+[mass_spectrum.ms1.ms_peak.kendrick_base]
+C = 1
+H = 2
+
+[mass_spectrum.ms1.data_input.header_translate]
+"m/z" = "m/z"
+mOz = "m/z"
+Mass = "m/z"
+"Resolving Power" = "Resolving Power"
+"Res." = "Resolving Power"
+resolution = "Resolving Power"
+Intensity = "Peak Height"
+"Peak Height" = "Peak Height"
+I = "Peak Height"
+Abundance = "Peak Height"
+abs_abu = "Peak Height"
+"Signal/Noise" = "S/N"
+"S/N" = "S/N"
+sn = "S/N"
+
+[mass_spectrum.ms2.molecular_search.usedAtoms]
+C = [ 10, 30,]
+H = [ 18, 200,]
+O = [ 1, 23,]
+N = [ 0, 3,]
+P = [ 0, 1,]
+S = [ 0, 1,]
 
-[MS2MolecularSearch.used_atom_valences]
+[mass_spectrum.ms2.molecular_search.used_atom_valences]
 C = 4
 13C = 4
 N = 3
@@ -304,3 +463,123 @@ Pb = 4
 Bi = 3
 Po = 2
 Ac = 3
+
+[mass_spectrum.ms2.ms_peak.kendrick_base]
+C = 1
+H = 2
+
+[mass_spectrum.ms2.data_input.header_translate]
+"m/z" = "m/z"
+mOz = "m/z"
+Mass = "m/z"
+"Resolving Power" = "Resolving Power"
+"Res." = "Resolving Power"
+resolution = "Resolving Power"
+Intensity = "Peak Height"
+"Peak Height" = "Peak Height"
+I = "Peak Height"
+Abundance = "Peak Height"
+abs_abu = "Peak Height"
+"Signal/Noise" = "S/N"
+"S/N" = "S/N"
+sn = "S/N"
+
+[mass_spectrum.ms2_cid.molecular_search.usedAtoms]
+C = [ 10, 30,]
+H = [ 18, 200,]
+O = [ 1, 23,]
+N = [ 0, 3,]
+P = [ 0, 1,]
+S = [ 0, 1,]
+
+[mass_spectrum.ms2_cid.molecular_search.used_atom_valences]
+C = 4
+13C = 4
+N = 3
+O = 2
+S = 2
+H = 1
+F = 1
+Cl = 1
+Br = 1
+I = 1
+At = 1
+Li = 1
+Na = 1
+K = 1
+Rb = 1
+Cs = 1
+Fr = 1
+B = 4
+In = 3
+Al = 3
+P = 3
+Ga = 3
+Mg = 2
+Be = 2
+Ca = 2
+Sr = 2
+Ba = 2
+Ra = 2
+V = 5
+Fe = 3
+Si = 4
+Sc = 3
+Ti = 4
+Cr = 1
+Mn = 1
+Co = 1
+Ni = 1
+Cu = 2
+Zn = 2
+Ge = 4
+As = 5
+Se = 6
+Y = 3
+Zr = 4
+Nb = 5
+Mo = 6
+Tc = 7
+Ru = 8
+Rh = 6
+Pd = 4
+Ag = 0
+Cd = 2
+Sn = 4
+Sb = 5
+Te = 6
+La = 3
+Hf = 4
+Ta = 5
+W = 6
+Re = 4
+Os = 4
+Ir = 4
+Pt = 4
+Au = 3
+Hg = 1
+Tl = 3
+Pb = 4
+Bi = 3
+Po = 2
+Ac = 3
+
+[mass_spectrum.ms2_cid.ms_peak.kendrick_base]
+C = 1
+H = 2
+
+[mass_spectrum.ms2_cid.data_input.header_translate]
+"m/z" = "m/z"
+mOz = "m/z"
+Mass = "m/z"
+"Resolving Power" = "Resolving Power"
+"Res." = "Resolving Power"
+resolution = "Resolving Power"
+Intensity = "Peak Height"
+"Peak Height" = "Peak Height"
+I = "Peak Height"
+Abundance = "Peak Height"
+abs_abu = "Peak Height"
+"Signal/Noise" = "S/N"
+"S/N" = "S/N"
+sn = "S/N"
diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index 59b1c4e..8eb935f 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -8,6 +8,9 @@
 
 from corems.mass_spectra.input.mzml import MZMLSpectraParser
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
+from corems.encapsulation.input.parameter_from_json import (
+    load_and_set_toml_parameters_lcms,
+)
 
 @dataclass
 class LipidomicsWorkflowParameters:
@@ -67,7 +70,6 @@ def check_lipidomics_workflow_params(lipid_workflow_params):
 
     #TODO KRH: Add a check that we can access the metabref API with the token
 
-
 def instantiate_lcms_obj(file_in):
     """Instantiate a corems LCMS object from a binary file.  Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory)
 
@@ -85,23 +87,45 @@ def instantiate_lcms_obj(file_in):
     """
     # Instantiate parser based on binary file type
     if ".raw" in str(file_in):
-        #TODO KRH: Add real functionality here
-        pass
-        #parser = ImportMassSpectraThermoMSFileReader(file_in)
+        parser = ImportMassSpectraThermoMSFileReader(file_in)
 
     if ".mzML" in str(file_in):
-        #parser = MZMLSpectraParser(file_in)
-        pass
+        parser = MZMLSpectraParser(file_in)
 
     # Instantiate lc-ms data object using parser and pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory)
-    #myLCMSobj = parser.get_lcms_obj(spectra="ms1")
-    myLCMSobj = None
+    myLCMSobj = parser.get_lcms_obj(spectra="ms1")
 
     return myLCMSobj
 
+def set_params_on_lcms_obj(myLCMSobj, params_toml):
+    """Set parameters on the LCMS object
+
+    Parameters
+    ----------
+    myLCMSobj : corems LCMS object
+        LCMS object to set parameters on
+    params_toml : str or Path
+        Path to toml file with parameters
+
+    Returns
+    -------
+    None, sets parameters on the LCMS object
+    """
+    # Load parameters from toml file
+    load_and_set_toml_parameters_lcms(myLCMSobj, params_toml)
+
+    # If myLCMSobj is a positive mode, remove Cl from atoms used in molecular search
+    # This cuts down on the number of molecular formulas searched hugely
+    if myLCMSobj.polarity == "positive":
+        myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Cl")
+    elif myLCMSobj.polarity == "negative":
+        myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Na")
+
 def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     time_start = datetime.datetime.now()
-    myLCMSobj = instantiate_lcms_obj(file_in)   
+    myLCMSobj = instantiate_lcms_obj(file_in)           
+    set_params_on_lcms_obj(myLCMSobj, params_toml)
+
     # TODO KRH: Add signal processing and ms1 molecular search here
 
 def run_lcms_lipidomics_workflow(
@@ -146,7 +170,6 @@ def run_lcms_lipidomics_workflow(
 
     click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " files, using " +  str(cores) + " core(s)")
     
-    """
     if cores == 1 or len(files_list) == 1:
         mz_dicts = []
         for file_in, file_out in list(zip(files_list, out_paths_list)):
@@ -172,5 +195,4 @@ def run_lcms_lipidomics_workflow(
         pool.close()
         pool.join()
     print("Finished processing all files")
-    """
     # TODO KRH: Add full lipidomics workflow here

From e6af514d56a204732058d9d6471514528694626b Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Fri, 6 Dec 2024 09:20:02 -0800
Subject: [PATCH 06/16] WIP lipid wdl working through scan translator setting

---
 metaMS/lcms_lipidomics_workflow.py | 59 ++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index 8eb935f..1e3aa97 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -121,10 +121,69 @@ def set_params_on_lcms_obj(myLCMSobj, params_toml):
     elif myLCMSobj.polarity == "negative":
         myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Na")
 
+def load_scan_translator(scan_translator=None):
+    """Translate scans using a scan translator
+
+    Parameters
+    ----------
+    scan_translator : str or Path
+        Path to scan translator yaml file
+
+    Returns
+    -------
+    scan_dict : dict
+        Dict with keys as parameter keys and values as lists of scans
+    """
+    # Convert the scan translator to a dictionary
+    if scan_translator is None:
+        scan_translator_dict = {"ms2": {"scan_filter": "", "resolution": "high"}}
+    else:
+        # Convert the scan translator to a dictionary
+        if isinstance(scan_translator, str):
+            scan_translator = Path(scan_translator)
+        # read in the scan translator from toml
+        with open(scan_translator, "r") as f:
+            scan_translator_dict = toml.load(f)
+    for param_key in scan_translator_dict.keys():
+        if scan_translator_dict[param_key]["scan_filter"] == "":
+            scan_translator_dict[param_key]["scan_filter"] = None
+    return scan_translator_dict
+
+def check_scan_translator(myLCMSobj, scan_translator):
+    """Check if scan translator is provided and that it maps correctly to scans and parameters"""
+    scan_translator_dict = load_scan_translator(scan_translator)
+    # Check that the scan translator maps correctly to scans and parameters
+    scan_df = myLCMSobj.scan_df
+    scans_pulled_out = []
+    for param_key in scan_translator_dict.keys():
+        assert param_key in myLCMSobj.parameters.mass_spectrum.keys()
+        assert "scan_filter" in scan_translator_dict[param_key].keys()
+        assert "resolution" in scan_translator_dict[param_key].keys()
+        # Pull out scans that match the scan filter
+        scan_df_sub = scan_df[
+            scan_df.scan_text.str.contains(
+                scan_translator_dict[param_key]["scan_filter"]
+            )
+        ]
+        scans_pulled_out.extend(scan_df_sub.scan.tolist())
+        if len(scan_df_sub) == 0:
+            raise ValueError(
+                "No scans pulled out by scan translator for parameter key: ",
+                param_key,
+                " and scan filter: ",
+                scan_translator_dict[param_key]["scan_filter"],
+            )
+
+    # Check that the scans pulled out by the scan translator are not overlapping and assert error if they are
+    if len(set(scans_pulled_out)) != len(scans_pulled_out):
+        raise ValueError("Overlapping scans pulled out by scan translator")
+
 def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     time_start = datetime.datetime.now()
     myLCMSobj = instantiate_lcms_obj(file_in)           
     set_params_on_lcms_obj(myLCMSobj, params_toml)
+    check_scan_translator(myLCMSobj, scan_translator)
+
 
     # TODO KRH: Add signal processing and ms1 molecular search here
 

From 98fa8341e0477e53673993f20fddc1a7da77e971 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Fri, 6 Dec 2024 09:40:42 -0800
Subject: [PATCH 07/16] WIP lipid wdl working through ms1 signal processing and
 export Mono crash is consistently happening

---
 metaMS/lcms_lipidomics_workflow.py | 86 +++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index 1e3aa97..3e9d5df 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -5,9 +5,11 @@
 from multiprocessing import Pool
 from typing import List
 import click
+import warnings
 
 from corems.mass_spectra.input.mzml import MZMLSpectraParser
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
+from corems.mass_spectra.output.export import LipidomicsExport
 from corems.encapsulation.input.parameter_from_json import (
     load_and_set_toml_parameters_lcms,
 )
@@ -178,13 +180,93 @@ def check_scan_translator(myLCMSobj, scan_translator):
     if len(set(scans_pulled_out)) != len(scans_pulled_out):
         raise ValueError("Overlapping scans pulled out by scan translator")
 
+def add_mass_features(myLCMSobj, scan_translator):
+    """Process ms1 spectra and perform molecular search
+
+    This includes peak picking, adding and processing associated ms1 spectra,
+    integration of mass features, annotation of c13 mass features, deconvolution of ms1 mass features,
+    and adding of peak shape metrics of mass features to the mass feature dataframe.
+
+    Parameters
+    ----------
+    myLCMSobj : corems LCMS object
+        LCMS object to process
+    scan_translator : str or Path
+        Path to scan translator yaml file
+
+    Returns
+    -------
+    None, processes the LCMS object
+    """
+    # Process ms1 spectra
+    myLCMSobj.find_mass_features()
+    myLCMSobj.add_associated_ms1(
+        auto_process=True, use_parser=False, spectrum_mode="profile"
+    )
+    myLCMSobj.integrate_mass_features(drop_if_fail=True)
+    myLCMSobj.find_c13_mass_features()
+    myLCMSobj.deconvolute_ms1_mass_features()
+    myLCMSobj.add_peak_metrics()
+
+    # Add associated ms2 spectra to mass features
+    scan_dictionary = load_scan_translator(scan_translator=scan_translator)
+    for param_key in scan_dictionary.keys():
+        scan_filter = scan_dictionary[param_key]["scan_filter"]
+        if scan_filter == "":
+            scan_filter = None
+        myLCMSobj.add_associated_ms2_dda(
+            spectrum_mode="centroid", ms_params_key=param_key, scan_filter=scan_filter
+        )
+
+def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False):
+    """Export results to hdf5 and csv as a lipid report
+
+    Parameters
+    ----------
+    myLCMSobj : corems LCMS object
+        LCMS object to process
+    out_path : str or Path
+        Path to output file
+    molecular_metadata : dict
+        Dict with molecular metadata
+    final : bool
+        Whether to export final results
+
+    Returns
+    -------
+    None, exports results to hdf5 and csv as a lipid report
+    """
+    exporter = LipidomicsExport(out_path, myLCMSobj)
+    exporter.to_hdf(overwrite=True)
+    if final:
+        # Do not show warnings, these are expected
+        exporter.report_to_csv(molecular_metadata=molecular_metadata)
+    else:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            exporter.report_to_csv()
+
+
 def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     time_start = datetime.datetime.now()
     myLCMSobj = instantiate_lcms_obj(file_in)           
     set_params_on_lcms_obj(myLCMSobj, params_toml)
     check_scan_translator(myLCMSobj, scan_translator)
-
-
+    add_mass_features(myLCMSobj, scan_translator)
+    myLCMSobj.remove_unprocessed_data()
+    #TODO KRH: add molecular search here
+    export_results(myLCMSobj, out_path=out_path, final=False)
+    precursor_mz_list = list(
+        set(
+            [
+                v.mz
+                for k, v in myLCMSobj.mass_features.items()
+                if len(v.ms2_scan_numbers) > 0 and v.isotopologue_type is None
+            ]
+        )
+    )
+    mz_dict = {myLCMSobj.polarity: precursor_mz_list}
+    return mz_dict
     # TODO KRH: Add signal processing and ms1 molecular search here
 
 def run_lcms_lipidomics_workflow(

From e98799011edb5aebdfbcd63e8b5b24bfb984d03c Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Fri, 6 Dec 2024 13:00:08 -0800
Subject: [PATCH 08/16] Add alternate working dockerfile

---
 Dockerfile_test.txt | 26 ++++++++++++++++++++++++++
 Makefile            |  2 +-
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile_test.txt

diff --git a/Dockerfile_test.txt b/Dockerfile_test.txt
new file mode 100644
index 0000000..00369b8
--- /dev/null
+++ b/Dockerfile_test.txt
@@ -0,0 +1,26 @@
+FROM jcarr87/corems-base-py3.10
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONNET_RUNTIME=coreclr
+
+# set working directory
+WORKDIR /metams
+
+# Set up the directory structure
+COPY metaMS/ /metams/metaMS/
+COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/
+COPY db/ /metams/db/
+
+# install dependencies
+RUN pip install --upgrade pip
+RUN apt-get update && apt-get install -y libgomp1
+
+# Install the requirements
+#RUN pip install pythonnet
+RUN pip install -r requirements.txt
+
+# Install the MetaMS package in editable mode
+RUN pip install --editable .
+
diff --git a/Makefile b/Makefile
index 1c909c0..a8b8da3 100644
--- a/Makefile
+++ b/Makefile
@@ -72,6 +72,6 @@ wdl-run-gcms :
 
 wdl-run-lipid :
 #TODO KRH: remove the docker-build-local when the docker image is available in dockerhub and
-# update the docker image in the wdl file
+# update the docker image in the wdl file.  Good to rebuild for each run while in development
 	 make docker-build-local
 	 miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files
\ No newline at end of file

From 1776ff73cbe3237c62e1e21ba253d4812b900dab Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Mon, 16 Dec 2024 08:31:31 -0800
Subject: [PATCH 09/16] WIP Switch lipid workflow to query local sqlite
 database

---
 configuration/lipidomics_metams.toml |   2 +-
 metaMS/lcms_lipidomics_workflow.py   | 107 +++++++++++-
 metaMS/lipid_metadata_prepper.py     | 240 +++++++++++++++++++++++++++
 3 files changed, 341 insertions(+), 8 deletions(-)
 create mode 100644 metaMS/lipid_metadata_prepper.py

diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml
index 0b4e426..28be8c6 100644
--- a/configuration/lipidomics_metams.toml
+++ b/configuration/lipidomics_metams.toml
@@ -1,4 +1,4 @@
-directory = "data/raw_data/lipid_data"
+file_paths = ["data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw"]
 output_directory = "output"
 corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml"
 metabref_token_path = "metabref.token"
diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index 3e9d5df..dff7812 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -6,6 +6,7 @@
 from typing import List
 import click
 import warnings
+import pandas as pd
 
 from corems.mass_spectra.input.mzml import MZMLSpectraParser
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
@@ -14,6 +15,8 @@
     load_and_set_toml_parameters_lcms,
 )
 
+from metaMS.lipid_metadata_prepper import get_lipid_library
+
 @dataclass
 class LipidomicsWorkflowParameters:
     """
@@ -42,16 +45,13 @@ class LipidomicsWorkflowParameters:
     metabref_token_path: str = None
     scan_translator_path: str = None
     cores: int = 1
+    #TODO KRH: Add a parameter for the location of the sqlite database
 
 def check_lipidomics_workflow_params(lipid_workflow_params):
     # Check that corems_toml_path exists
     if not Path(lipid_workflow_params.corems_toml_path).exists():
         raise FileNotFoundError("Corems toml file not found, exiting workflow")
     
-    # Check that metabref_token_path exists
-    if not Path(lipid_workflow_params.metabref_token_path).exists():
-        raise FileNotFoundError("Metabref token file not found, exiting workflow")
-    
     # Check that scan_translator_path exists
     if not Path(lipid_workflow_params.scan_translator_path).exists():
         raise FileNotFoundError("Scan translator file not found, exiting workflow")
@@ -267,7 +267,91 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     )
     mz_dict = {myLCMSobj.polarity: precursor_mz_list}
     return mz_dict
-    # TODO KRH: Add signal processing and ms1 molecular search here
+
+def prep_metadata(mz_dicts, out_dir):
+    """Prepare metadata for ms2 spectral search
+
+    Parameters
+    ----------
+    mz_dicts : list of dicts
+        List of dicts with keys "positive" and "negative" and values of lists of precursor mzs
+    out_dir : Path
+        Path to output directory
+
+    Returns
+    -------
+    metadata : dict
+        Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively
+
+    Notes
+    -------
+    Also writes out files for the flash entropy search databases and molecular metadata
+    """
+    metadata = {
+        "mzs": {"positive": None, "negative": None},
+        "fe": {"positive": None, "negative": None},
+        "molecular_metadata": {},
+    }
+    for d in mz_dicts:
+        metadata["mzs"].update(d)
+
+    print("Preparing negative lipid library")
+
+    if metadata["mzs"]["negative"] is not None:
+        metabref_negative, lipidmetadata_negative = get_lipid_library(
+            mz_list=metadata["mzs"]["negative"],
+            polarity="negative",
+            mz_tol_ppm=5,
+            format="flashentropy",
+            normalize=True,
+            fe_kwargs={
+                "normalize_intensity": True,
+                "min_ms2_difference_in_da": 0.02,  # for cleaning spectra
+                "max_ms2_tolerance_in_da": 0.01,  # for setting search space
+                "max_indexed_mz": 3000,
+                "precursor_ions_removal_da": None,
+                "noise_threshold": 0,
+            },
+        )
+        metadata["fe"]["negative"] = metabref_negative
+        metadata["molecular_metadata"].update(lipidmetadata_negative)
+        fe_negative_df = pd.DataFrame.from_dict(
+            {k: v for k, v in enumerate(metadata["fe"]["negative"])}, orient="index"
+        )
+        fe_negative_df.to_csv(out_dir / "ms2_db_negative.csv")
+    
+    print("Preparing positive lipid library")
+    if metadata["mzs"]["positive"] is not None:
+        metabref_positive, lipidmetadata_positive = get_lipid_library(
+            mz_list=metadata["mzs"]["positive"],
+            polarity="positive",
+            mz_tol_ppm=5,
+            format="flashentropy",
+            normalize=True,
+            fe_kwargs={
+                "normalize_intensity": True,
+                "min_ms2_difference_in_da": 0.02,  # for cleaning spectra
+                "max_ms2_tolerance_in_da": 0.01,  # for setting search space
+                "max_indexed_mz": 3000,
+                "precursor_ions_removal_da": None,
+                "noise_threshold": 0,
+            },
+        )
+        metadata["fe"]["positive"] = metabref_positive
+        metadata["molecular_metadata"].update(lipidmetadata_positive)
+        fe_positive_df = pd.DataFrame.from_dict(
+            {k: v for k, v in enumerate(metadata["fe"]["positive"])}, orient="index"
+        )
+        fe_positive_df.to_csv(out_dir / "ms2_db_positive.csv")
+
+    mol_metadata_df = pd.concat(
+        [
+            pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
+            for k, v in metadata["molecular_metadata"].items()
+        ],
+        ignore_index=True,
+    )
+    mol_metadata_df.to_csv(out_dir / "molecular_metadata.csv")
 
 def run_lcms_lipidomics_workflow(
     lipidomics_workflow_paramaters_file=None,
@@ -304,13 +388,14 @@ def run_lcms_lipidomics_workflow(
     files_list = list(file_paths)
     out_paths_list = [out_dir / f.stem for f in files_list]
     
-    # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results
+    # Set the workflow parameters
     cores = lipid_workflow_params.cores
     params_toml = lipid_workflow_params.corems_toml_path
     scan_translator = lipid_workflow_params.scan_translator_path
 
-    click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " files, using " +  str(cores) + " core(s)")
+    click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " file(s), using " +  str(cores) + " core(s)")
     
+    # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results
     if cores == 1 or len(files_list) == 1:
         mz_dicts = []
         for file_in, file_out in list(zip(files_list, out_paths_list)):
@@ -335,5 +420,13 @@ def run_lcms_lipidomics_workflow(
         mz_dicts = pool.starmap(run_lipid_sp_ms1, args)
         pool.close()
         pool.join()
+    
+    # Prepare metadata for searching
+    prep_metadata(mz_dicts, out_dir)
     print("Finished processing all files")
     # TODO KRH: Add full lipidomics workflow here
+
+if __name__ == "__main__":
+    run_lcms_lipidomics_workflow(
+        lipidomics_workflow_paramaters_file="configuration/lipidomics_metams.toml"
+    )
\ No newline at end of file
diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py
new file mode 100644
index 0000000..95d75e9
--- /dev/null
+++ b/metaMS/lipid_metadata_prepper.py
@@ -0,0 +1,240 @@
+import pandas as pd
+import numpy as np
+import sqlite3
+import re
+from ms_entropy import FlashEntropySearch
+from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata
+
+
+def find_closest(A, target):
+    """Find the index of closest value in A to each value in target.
+
+    Parameters
+    ----------
+    A : :obj:`~numpy.array`
+        The array to search (blueprint). A must be sorted.
+    target : :obj:`~numpy.array`
+        The array of values to search for. target must be sorted.
+
+    Returns
+    -------
+    :obj:`~numpy.array`
+        The indices of the closest values in A to each value in target.
+    """
+    idx = A.searchsorted(target)
+    idx = np.clip(idx, 1, len(A) - 1)
+    left = A[idx - 1]
+    right = A[idx]
+    idx -= target - left < right - target
+    return idx
+
+def spectrum_to_array(spectrum, normalize=True):
+    """
+    Convert MetabRef-formatted spectrum to array.
+
+    Parameters
+    ----------
+    spectrum : str
+        MetabRef spectrum, i.e. list of (m/z,abundance) pairs.
+    normalize : bool
+        Normalize the spectrum by its magnitude.
+
+    Returns
+    -------
+    :obj:`~numpy.array`
+        Array of shape (N, 2), with m/z in the first column and abundance in
+        the second.
+
+    """
+
+    # Convert parenthesis-delimited string to array
+    arr = np.array(
+        re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float
+    ).reshape(-1, 2)
+
+    # Normalize the array
+    if normalize:
+        arr[:, -1] = arr[:, -1] / arr[:, -1].sum()
+
+    return arr
+
+def _to_flashentropy(metabref_lib, normalize=True, fe_kwargs={}):
+    """
+    Convert metabref-formatted library to FlashEntropy library.
+
+    Parameters
+    ----------
+    metabref_lib : dict
+        MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation).
+    normalize : bool
+        Normalize each spectrum by its magnitude.
+    fe_kwargs : dict, optional
+        Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search;
+        any keys not recognized will be ignored. By default, all parameters set to defaults.
+
+    Returns
+    -------
+    :obj:`~ms_entropy.FlashEntropySearch`
+        MS2 library as FlashEntropy search instance.
+
+    Raises
+    ------
+    ValueError
+        If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal.
+
+    """
+    # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da
+    if (
+        "min_ms2_difference_in_da" in fe_kwargs
+        or "max_ms2_tolerance_in_da" in fe_kwargs
+    ):
+        if (
+            "min_ms2_difference_in_da" not in fe_kwargs
+            or "max_ms2_tolerance_in_da" not in fe_kwargs
+        ):
+            raise ValueError(
+                "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified."
+            )
+        if (
+            fe_kwargs["min_ms2_difference_in_da"]
+            != 2 * fe_kwargs["max_ms2_tolerance_in_da"]
+        ):
+            raise ValueError(
+                "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'."
+            )
+
+    # Initialize empty library
+    fe_lib = []
+
+    # Enumerate spectra
+    for i, source in enumerate(metabref_lib):
+        # Reorganize source dict, if necessary
+        if "spectrum_data" in source.keys():
+            spectrum = source["spectrum_data"]
+        else:
+            spectrum = source
+
+        # Rename precursor_mz key for FlashEntropy
+        if "precursor_mz" not in spectrum.keys():
+            spectrum["precursor_mz"] = spectrum.pop("precursor_ion")
+
+        # Convert CoreMS spectrum to array and clean, store as `peaks`
+        spectrum["peaks"] = spectrum_to_array(
+            spectrum["mz"], normalize=normalize
+        )
+
+        # Add spectrum to library
+        fe_lib.append(spectrum)
+
+    # Initialize FlashEntropy
+    fe_init_kws = [
+        "max_ms2_tolerance_in_da",
+        "mz_index_step",
+        "low_memory",
+        "path_data",
+    ]
+    fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws}
+    fes = FlashEntropySearch(**fe_init_kws)
+
+    # Build FlashEntropy index
+    fe_index_kws = [
+        "max_indexed_mz",
+        "precursor_ions_removal_da",
+        "noise_threshold",
+        "min_ms2_difference_in_da",
+        "max_peak_num",
+    ]
+    fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws}
+    fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True)
+
+    return fes
+
+def _dict_to_dataclass(metabref_lib, data_class):
+    """
+    Convert dictionary to dataclass.
+
+    Notes
+    -----
+    This function will pull the attributes a dataclass and its parent class
+    and convert the dictionary to a dataclass instance with the appropriate
+    attributes.
+
+    Parameters
+    ----------
+    data_class : :obj:`~dataclasses.dataclass`
+        Dataclass to convert to.
+    metabref_lib : dict
+        Metabref dictionary object to convert to dataclass.
+
+    Returns
+    -------
+    :obj:`~dataclasses.dataclass`
+        Dataclass instance.
+
+    """
+
+    # Get list of expected attributes of data_class
+    data_class_keys = list(data_class.__annotations__.keys())
+
+    # Does the data_class inherit from another class, if so, get the attributes of the parent class as well
+    if len(data_class.__mro__) > 2:
+        parent_class_keys = list(data_class.__bases__[0].__annotations__.keys())
+        data_class_keys = list(set(data_class_keys + parent_class_keys))
+
+    # Remove keys that are not in the data_class from the input dictionary
+    input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys}
+
+    # Add keys that are in the data class but not in the input dictionary as None
+    for key in data_class_keys:
+        if key not in input_dict.keys():
+            input_dict[key] = None
+    return data_class(**input_dict)
+
+def get_lipid_library(
+        mz_list,
+        polarity,
+        mz_tol_ppm,
+        format='flashentropy',
+        normalize=True,
+        fe_kwargs={},
+):
+
+    # connect to the database
+    conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite')
+
+    # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz
+    mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn)
+    mz_all = mz_all.sort_values(by='precursor_mz')
+
+    # filter by polarity
+    mz_subset = mz_all[mz_all['polarity'] == polarity].copy()
+    mz_subset['closest_mz'] = mz_all['precursor_mz'].apply(lambda x: mz_list[find_closest(np.array(mz_list), x)])
+    mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz']) / mz_subset['closest_mz'] * 1e6
+    mz_subset = mz_subset[np.abs(mz_subset['ppm_error']) <= mz_tol_ppm]
+
+    # get the full lipidMassSpectrumObject table for the filtered ms2 ids
+    mz_subset_ids = mz_subset['id'].tolist()
+    mz_subset_ids = tuple(mz_subset_ids)
+    mz_subset_full = pd.read_sql_query(f"SELECT * FROM lipidMassSpectrumObject WHERE id IN {mz_subset_ids}", conn)
+
+    # get the lipid tree for the filtered molecular ids
+    mol_ids = mz_subset_full['molecular_data_id'].tolist()
+    mol_ids = tuple(mol_ids)
+    lipid_tree = pd.read_sql_query(f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", conn)
+
+    # convert molecular data to dictionary of LipidMetadata objects, with mol_id as key
+    lipid_tree = lipid_tree.set_index('id')
+    lipid_tree = lipid_tree.to_dict(orient='index')
+    lipid_metadata = {
+            k: _dict_to_dataclass(v, LipidMetadata)
+            for k, v in lipid_tree.items()
+        }
+
+    # convert ms2 data to flashentropy library
+    mz_subset_full = mz_subset_full.to_dict(orient='records')
+    fe_lib = _to_flashentropy(mz_subset_full, normalize=normalize)
+
+    # close the connection
+    conn.close()
+
+    return fe_lib, lipid_metadata
\ No newline at end of file

From 3ad017e36faeab79b3b4faa9ca77581cda49394f Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Mon, 16 Dec 2024 09:27:20 -0800
Subject: [PATCH 10/16] WIP add processing of MS2 to lipid workflow'

---
 metaMS/lcms_lipidomics_workflow.py | 124 ++++++++++++++++++++++++++++-
 metaMS/lipid_metadata_prepper.py   |   7 +-
 2 files changed, 124 insertions(+), 7 deletions(-)

diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index dff7812..a73e943 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -10,12 +10,13 @@
 
 from corems.mass_spectra.input.mzml import MZMLSpectraParser
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
+from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra
 from corems.mass_spectra.output.export import LipidomicsExport
 from corems.encapsulation.input.parameter_from_json import (
     load_and_set_toml_parameters_lcms,
 )
 
-from metaMS.lipid_metadata_prepper import get_lipid_library
+from metaMS.lipid_metadata_prepper import get_lipid_library, _to_flashentropy
 
 @dataclass
 class LipidomicsWorkflowParameters:
@@ -353,6 +354,109 @@ def prep_metadata(mz_dicts, out_dir):
     )
     mol_metadata_df.to_csv(out_dir / "molecular_metadata.csv")
 
+    return metadata
+
+def process_ms2(myLCMSobj, metadata, scan_translator):
+    """Process ms2 spectra and perform molecular search
+
+    Parameters
+    ----------
+    myLCMSobj : corems LCMS object
+        LCMS object to process
+    metadata : dict
+        Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively
+
+    Returns
+    -------
+    None, processes the LCMS object
+    """
+    # Perform molecular search on ms2 spectra
+    # Grab fe from metatdata associated with polarity (this is inherently high resolution as its from a in-silico high res library)
+    fe_search = metadata["fe"][myLCMSobj.polarity]
+
+    scan_dictionary = load_scan_translator(scan_translator)
+    ms2_scan_df = myLCMSobj.scan_df[myLCMSobj.scan_df.ms_level == 2]
+
+    # Process high resolution MS2 scans
+    # Collect all high resolution MS2 scans using the scan translator
+    ms2_scans_oi_hr = []
+    for param_key in scan_dictionary.keys():
+        if scan_dictionary[param_key]["resolution"] == "high":
+            scan_filter = scan_dictionary[param_key]["scan_filter"]
+            if scan_filter is not None:
+                ms2_scan_df_hr = ms2_scan_df[
+                    ms2_scan_df.scan_text.str.contains(scan_filter)
+                ]
+            else:
+                ms2_scan_df_hr = ms2_scan_df
+            ms2_scans_oi_hr_i = [
+                x for x in ms2_scan_df_hr.scan.tolist() if x in myLCMSobj._ms.keys()
+            ]
+            ms2_scans_oi_hr.extend(ms2_scans_oi_hr_i)
+    # Perform search on high res scans
+    if len(ms2_scans_oi_hr) > 0:
+        myLCMSobj.fe_search(
+            scan_list=ms2_scans_oi_hr, fe_lib=fe_search, peak_sep_da=0.01
+        )
+
+    # Process low resolution MS2 scans
+    # Collect all low resolution MS2 scans using the scan translator
+    ms2_scans_oi_lr = []
+    for param_key in scan_dictionary.keys():
+        if scan_dictionary[param_key]["resolution"] == "low":
+            scan_filter = scan_dictionary[param_key]["scan_filter"]
+            if scan_filter is not None:
+                ms2_scan_df_lr = ms2_scan_df[
+                    ms2_scan_df.scan_text.str.contains(scan_filter)
+                ]
+            else:
+                ms2_scan_df_lr = ms2_scan_df
+            ms2_scans_oi_lri = [
+                x for x in ms2_scan_df_lr.scan.tolist() if x in myLCMSobj._ms.keys()
+            ]
+            ms2_scans_oi_lr.extend(ms2_scans_oi_lri)
+    # Perform search on low res scans
+    if len(ms2_scans_oi_lr) > 0:
+        # Recast the flashentropy search database to low resolution
+        fe_search_lr = _to_flashentropy(
+            metabref_lib=fe_search,
+            normalize=True,
+            fe_kwargs={
+                "normalize_intensity": True,
+                "min_ms2_difference_in_da": 0.4,
+                "max_ms2_tolerance_in_da": 0.2,
+                "max_indexed_mz": 3000,
+                "precursor_ions_removal_da": None,
+                "noise_threshold": 0,
+            },
+        )
+        myLCMSobj.fe_search(
+            scan_list=ms2_scans_oi_lr, fe_lib=fe_search_lr, peak_sep_da=0.3
+        )
+
+def run_lipid_ms2(out_path, metadata, scan_translator=None):
+    """Run ms2 spectral search and export final results
+
+    Parameters
+    ----------
+    out_path : str or Path
+        Path to output file
+    metadata : dict
+        Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively
+
+    Returns
+    -------
+    None, runs ms2 spectral search and exports final results
+    """
+    # Read in the intermediate results
+    out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5"
+    parser = ReadCoreMSHDFMassSpectra(out_path_hdf5)
+    myLCMSobj = parser.get_lcms_obj()
+
+    # Process ms2 spectra, perform spectral search, and export final results
+    process_ms2(myLCMSobj, metadata, scan_translator=scan_translator)
+    export_results(myLCMSobj, str(out_path), metadata["molecular_metadata"], final=True)
+
 def run_lcms_lipidomics_workflow(
     lipidomics_workflow_paramaters_file=None,
     file_paths=None,
@@ -422,9 +526,21 @@ def run_lcms_lipidomics_workflow(
         pool.join()
     
     # Prepare metadata for searching
-    prep_metadata(mz_dicts, out_dir)
-    print("Finished processing all files")
-    # TODO KRH: Add full lipidomics workflow here
+    metadata = prep_metadata(mz_dicts, out_dir)
+    
+    # Run ms2 spectral search and export final results
+    click.echo("Starting ms2 spectral search and exporting final results")
+    if cores == 1 or len(files_list) == 1:
+        for file_out in out_paths_list:
+            mz_dicts = run_lipid_ms2(
+                file_out, metadata, scan_translator=scan_translator
+            )
+    elif cores > 1:
+        pool = Pool(cores)
+        args = [(file_out, metadata, scan_translator) for file_out in out_paths_list]
+        mz_dicts = pool.starmap(run_lipid_ms2, args)
+        pool.close()
+        pool.join()
 
 if __name__ == "__main__":
     run_lcms_lipidomics_workflow(
diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py
index 95d75e9..771c12f 100644
--- a/metaMS/lipid_metadata_prepper.py
+++ b/metaMS/lipid_metadata_prepper.py
@@ -223,7 +223,8 @@ def get_lipid_library(
     lipid_tree = pd.read_sql_query(f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", conn)
 
     # convert molecular data to dictionary of LipidMetadata objects, with mol_id as key
-    lipid_tree = lipid_tree.set_index('id')
+    lipid_tree['id_index'] = lipid_tree['id']
+    lipid_tree = lipid_tree.set_index('id_index')
     lipid_tree = lipid_tree.to_dict(orient='index')
     lipid_metadata = {
             k: _dict_to_dataclass(v, LipidMetadata)
@@ -231,8 +232,8 @@ def get_lipid_library(
         }
 
     # convert ms2 data to flashentropy library
-    mz_subset_full = mz_subset_full.to_dict(orient='records')
-    fe_lib = _to_flashentropy(mz_subset_full, normalize=normalize)
+    mz_subset_full_dict = mz_subset_full.to_dict(orient='records')
+    fe_lib = _to_flashentropy(mz_subset_full_dict, normalize=normalize, fe_kwargs=fe_kwargs)
 
     # close the connection
     conn.close()

From f6c37ba24c2c53ea9ca4e99f13e306fdf1d976d8 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Mon, 16 Dec 2024 10:15:40 -0800
Subject: [PATCH 11/16] WIP Reconfigure multiprocessing for lipid workflow

---
 metaMS/lcms_lipidomics_workflow.py | 44 ++++++++++++++++--------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index a73e943..7330b82 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -7,6 +7,7 @@
 import click
 import warnings
 import pandas as pd
+import gc
 
 from corems.mass_spectra.input.mzml import MZMLSpectraParser
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
@@ -247,7 +248,6 @@ def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False):
             warnings.simplefilter("ignore")
             exporter.report_to_csv()
 
-
 def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     time_start = datetime.datetime.now()
     myLCMSobj = instantiate_lcms_obj(file_in)           
@@ -498,7 +498,8 @@ def run_lcms_lipidomics_workflow(
     scan_translator = lipid_workflow_params.scan_translator_path
 
     click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " file(s), using " +  str(cores) + " core(s)")
-    
+    gc.collect()
+
     # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results
     if cores == 1 or len(files_list) == 1:
         mz_dicts = []
@@ -511,22 +512,23 @@ def run_lcms_lipidomics_workflow(
             )
             mz_dicts.append(mz_dict)
     elif cores > 1:
-        pool = Pool(cores)
-        args = [
-            (
-                str(file_in),
-                str(file_out),
-                params_toml,
-                scan_translator,
-            )
-            for file_in, file_out in list(zip(files_list, out_paths_list))
-        ]
-        mz_dicts = pool.starmap(run_lipid_sp_ms1, args)
-        pool.close()
-        pool.join()
-    
+        with Pool(cores) as pool:
+            args = [
+                (
+                    str(file_in),
+                    str(file_out),
+                    params_toml,
+                    scan_translator,
+                )
+                for file_in, file_out in zip(files_list, out_paths_list)
+            ]
+            mz_dicts = pool.starmap(run_lipid_sp_ms1, args)
+    gc.collect() 
+        
     # Prepare metadata for searching
     metadata = prep_metadata(mz_dicts, out_dir)
+    del mz_dicts
+    gc.collect()
     
     # Run ms2 spectral search and export final results
     click.echo("Starting ms2 spectral search and exporting final results")
@@ -536,11 +538,11 @@ def run_lcms_lipidomics_workflow(
                 file_out, metadata, scan_translator=scan_translator
             )
     elif cores > 1:
-        pool = Pool(cores)
-        args = [(file_out, metadata, scan_translator) for file_out in out_paths_list]
-        mz_dicts = pool.starmap(run_lipid_ms2, args)
-        pool.close()
-        pool.join()
+        with Pool(cores) as pool:
+            args = [(file_out, metadata, scan_translator) for file_out in out_paths_list]
+            pool.starmap(run_lipid_ms2, args)
+
+    gc.collect()
 
 if __name__ == "__main__":
     run_lcms_lipidomics_workflow(

From de72860da24d8fa1785aa26e5d2be5b3a2c0457f Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Mon, 16 Dec 2024 10:38:21 -0800
Subject: [PATCH 12/16] WIP Fix query for metadata prepper

---
 metaMS/lipid_metadata_prepper.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py
index 771c12f..af5d57f 100644
--- a/metaMS/lipid_metadata_prepper.py
+++ b/metaMS/lipid_metadata_prepper.py
@@ -199,6 +199,12 @@ def get_lipid_library(
         fe_kwargs={},
 ):
 
+    # prepare the mz_list for searching against the database
+    mz_list = pd.DataFrame(mz_list, columns=['mz_obs'])
+    mz_list = mz_list.sort_values(by='mz_obs')
+    mz_list = mz_list.reset_index(drop=True)
+    mz_obs_arr = mz_list['mz_obs'].values
+
     # connect to the database
     conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite')
 
@@ -206,10 +212,14 @@ def get_lipid_library(
     mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn)
     mz_all = mz_all.sort_values(by='precursor_mz')
 
-    # filter by polarity
+    # filter by polarity and if there are any matches within mz_tol_ppm
     mz_subset = mz_all[mz_all['polarity'] == polarity].copy()
-    mz_subset['closest_mz'] = mz_all['precursor_mz'].apply(lambda x: mz_list[find_closest(np.array(mz_list), x)])
-    mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz']) / mz_subset['closest_mz'] * 1e6
+    mz_subset = mz_subset.sort_values(by='precursor_mz')
+    mz_subset = mz_subset.reset_index(drop=True)
+    mz_subset['closest_mz_obs'] = mz_obs_arr[
+        find_closest(mz_obs_arr, mz_subset.precursor_mz.values)
+    ]
+    mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz_obs']) / mz_subset['precursor_mz'] * 1e6
     mz_subset = mz_subset[np.abs(mz_subset['ppm_error']) <= mz_tol_ppm]
 
     # get the full lipidMassSpectrumObject table for the filtered ms2 ids

From bf880558f3940705866e842a33bf9ffc37220540 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Mon, 16 Dec 2024 14:08:34 -0800
Subject: [PATCH 13/16] WIP lipid workflow working with formula searching

---
 .../emsl_lipidomics_corems_params.toml        | 30 ++++++-------
 configuration/lipidomics_metams.toml          |  2 +-
 metaMS/lcms_lipidomics_workflow.py            | 45 +++++++++++++++++--
 metaMS/lipid_metadata_prepper.py              |  1 +
 4 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
index 4d43664..647be6a 100644
--- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
+++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
@@ -17,19 +17,19 @@ peak_picking_method = "persistent homology"
 implemented_peak_picking_methods = [ "persistent homology",]
 mass_feature_cluster_mz_tolerance_rel = 5e-6
 mass_feature_cluster_rt_tolerance = 0.3
-ms1_scans_to_average = 5
+ms1_scans_to_average = 1
 ms1_deconvolution_corr_min = 0.95
 ms2_dda_rt_tolerance = 0.15
 ms2_dda_mz_tolerance = 0.05
 ms2_min_fe_score = 0.3
 search_as_lipids = true
 include_fragment_types = true
-export_profile_spectra = true
-export_eics = true
+export_profile_spectra = false
+export_eics = false
 export_unprocessed_ms1 = false
 verbose_processing = false
-ph_inten_min_rel = 0.05 #TODO KRH: Change this back to reasonable setting after dev (0.005 or 0.0005?)
-ph_persis_min_rel = 0.05
+ph_inten_min_rel = 0.0005
+ph_persis_min_rel = 0.0005
 ph_smooth_it = 0
 
 [mass_spectrum.ms1.molecular_search]
@@ -40,7 +40,7 @@ use_runtime_kendrick_filter = false
 use_min_peaks_filter = true
 min_peaks_per_class = 15
 url_database = ""
-db_jobs = 3
+db_jobs = 1
 db_chunk_size = 300
 ion_charge = -1
 min_hc_filter = 0.3
@@ -64,8 +64,8 @@ isProtonated = true
 isAdduct = false
 ion_types_excluded = []
 ionization_type = "ESI"
-min_ppm_error = -10.0
-max_ppm_error = 10.0
+min_ppm_error = -5.0
+max_ppm_error = 5.0
 min_abun_error = -100.0
 max_abun_error = 100.0
 mz_error_range = 1.5
@@ -73,13 +73,13 @@ error_method = "None"
 mz_error_average = 0.0
 
 [mass_spectrum.ms1.mass_spectrum]
-noise_threshold_method = "relative_abundance"
+noise_threshold_method = "log"
 noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",]
 noise_threshold_min_std = 6
 noise_threshold_min_s2n = 4.0
-noise_threshold_min_relative_abundance = 0.1
+noise_threshold_min_relative_abundance = 0.01
 noise_threshold_absolute_abundance = 1000000.0
-noise_threshold_log_nsigma = 6
+noise_threshold_log_nsigma = 8
 noise_threshold_log_nsigma_corr_factor = 0.463
 noise_threshold_log_nsigma_bins = 500
 noise_min_mz = 0
@@ -163,7 +163,7 @@ noise_threshold_method = "relative_abundance"
 noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",]
 noise_threshold_min_std = 6
 noise_threshold_min_s2n = 4.0
-noise_threshold_min_relative_abundance = 0.1
+noise_threshold_min_relative_abundance = 0.01
 noise_threshold_absolute_abundance = 1000000.0
 noise_threshold_log_nsigma = 6
 noise_threshold_log_nsigma_corr_factor = 0.463
@@ -283,9 +283,9 @@ legacy_resolving_power = false
 legacy_centroid_polyfit = false
 
 [mass_spectrum.ms1.molecular_search.usedAtoms]
-C = [ 10, 100,]
-H = [ 18, 200,]
-N = [ 0, 3,]
+C = [ 11, 70,]
+H = [ 19, 150,]
+N = [ 0, 2,]
 P = [ 0, 1,]
 O = [ 1, 23,]
 S = [ 0, 1,]
diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml
index 28be8c6..d14d324 100644
--- a/configuration/lipidomics_metams.toml
+++ b/configuration/lipidomics_metams.toml
@@ -3,4 +3,4 @@ output_directory = "output"
 corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml"
 metabref_token_path = "metabref.token"
 scan_translator_path = "configuration/lipid_configs/emsl_lipidomics_scan_translator.toml"
-cores = 1
\ No newline at end of file
+cores = 2
\ No newline at end of file
diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index 7330b82..c089d32 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -13,6 +13,7 @@
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
 from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra
 from corems.mass_spectra.output.export import LipidomicsExport
+from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas
 from corems.encapsulation.input.parameter_from_json import (
     load_and_set_toml_parameters_lcms,
 )
@@ -220,6 +221,45 @@ def add_mass_features(myLCMSobj, scan_translator):
             spectrum_mode="centroid", ms_params_key=param_key, scan_filter=scan_filter
         )
 
+def molecular_formula_search(myLCMSobj):
+    """Perform molecular search on ms1 spectra
+
+    Parameters
+    ----------
+    myLCMSobj : corems LCMS object
+        LCMS object to process
+
+    Returns
+    -------
+    None, processes the LCMS object
+    """
+    i = 1
+    # get df of mass features
+    mf_df = myLCMSobj.mass_features_to_df()
+
+    # search molecular formulas for each mass features that are the deconvoluted parent and have a ms2
+    mf_searched = []
+    for mf_id in mf_df.index:
+        if myLCMSobj.mass_features[mf_id].mass_spectrum_deconvoluted_parent:
+            if len(myLCMSobj.mass_features[mf_id].ms2_scan_numbers) > 0:
+                if mf_id not in mf_searched:
+                    if i > 5:  # TODO KRH: remove this when ready
+                        break
+
+                    scan = myLCMSobj.mass_features[mf_id].apex_scan
+                    # Search single spectrum for all peaks that correspond to the same scan
+                    mf_df_scan = mf_df[mf_df.apex_scan == scan]
+                    peaks_to_search = [
+                        myLCMSobj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist()
+                    ]
+                    SearchMolecularFormulas(
+                        myLCMSobj._ms[scan],
+                        first_hit=False,
+                        find_isotopologues=True,
+                    ).run_worker_ms_peaks(peaks_to_search)
+                    mf_searched.extend(mf_df_scan.index.tolist())
+                    i += 1
+
 def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False):
     """Export results to hdf5 and csv as a lipid report
 
@@ -249,13 +289,12 @@ def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False):
             exporter.report_to_csv()
 
 def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
-    time_start = datetime.datetime.now()
     myLCMSobj = instantiate_lcms_obj(file_in)           
     set_params_on_lcms_obj(myLCMSobj, params_toml)
     check_scan_translator(myLCMSobj, scan_translator)
     add_mass_features(myLCMSobj, scan_translator)
     myLCMSobj.remove_unprocessed_data()
-    #TODO KRH: add molecular search here
+    molecular_formula_search(myLCMSobj)
     export_results(myLCMSobj, out_path=out_path, final=False)
     precursor_mz_list = list(
         set(
@@ -480,7 +519,7 @@ def run_lcms_lipidomics_workflow(
             cores=cores,
         )
     
-    # Make output dir and get list of files to process
+    # Make output dir
     out_dir = Path(lipid_workflow_params.output_directory)
     out_dir.mkdir(parents=True, exist_ok=True)
 
diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py
index af5d57f..ce7cfab 100644
--- a/metaMS/lipid_metadata_prepper.py
+++ b/metaMS/lipid_metadata_prepper.py
@@ -207,6 +207,7 @@ def get_lipid_library(
 
     # connect to the database
     conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite')
+    #TODO KRH: change path to a parameter
 
     # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz
     mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn)

From 412511b991a4d535be4573cb50e952560e1be33e Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Wed, 18 Dec 2024 11:41:30 -0800
Subject: [PATCH 14/16] Modify lipid workflow to comply with corems 3.2

---
 metaMS/cli.py                      | 49 +++++++++---------
 metaMS/lcms_lipidomics_workflow.py | 79 +++++++++++++-----------------
 metaMS/lipid_metadata_prepper.py   |  9 +++-
 wdl/metaMS_lcmslipidomics.wdl      |  5 +-
 wdl/metams_input_lipidomics.json   |  2 +-
 5 files changed, 68 insertions(+), 76 deletions(-)

diff --git a/metaMS/cli.py b/metaMS/cli.py
index 32b2b6c..bcafaf8 100644
--- a/metaMS/cli.py
+++ b/metaMS/cli.py
@@ -121,23 +121,6 @@ def dump_lipidomics_toml_template(toml_file_name):
         toml.dump(LipidomicsWorkflowParameters().__dict__, workflow_param)
 
 
-@cli.command(name="dump-lipidomics-corems-toml-template")
-@click.argument("toml_file_name", required=True, type=str)
-def dump_lipidomics_corems_toml_template(toml_file_name):
-    """
-    Writes a toml file with the CoreMS parameters to be used in the lipidomics workflow
-
-    Parameters
-    ----------
-    toml_file_name : str
-        The name of the toml file to write the parameters to
-    """
-    path_obj = Path(toml_file_name).with_suffix(".toml")
-    print("dumping lipidomics corems toml template")
-    pass
-    # TODO KRH: add call for dumping lipidomics corems toml template from corems once we can import it
-
-
 @cli.command(name="run-lipidomics-workflow")
 @click.option(
     "-p",
@@ -168,7 +151,7 @@ def dump_lipidomics_corems_toml_template(toml_file_name):
     help="The path corems parameters toml file",
 )
 @click.option(
-    "-t", "--token_path", required=False, type=str, help="The path to the metabref token"
+    "-d", "--db_location", required=False, type=str, help="The path to the local database"
 )
 @click.option(
     "-s", "--scan_translator_path", required=False, type=str, help="The path to the scan translator file"
@@ -181,7 +164,7 @@ def run_lipidomics_workflow(
     file_paths, 
     output_directory, 
     corems_params, 
-    token_path, 
+    db_location, 
     scan_translator_path, 
     cores
     ):
@@ -189,15 +172,27 @@ def run_lipidomics_workflow(
 
     Parameters
     ----------
-    #TODO KRH: Add this in
+    paramaters_file : str
+        The path to the toml file with the lipidomics workflow parameters
+    file_paths : str
+        The paths to the input files, separated by commas as one string
+    output_directory : str
+        The directory where the output files will be stored
+    corems_params : str
+        The path corems parameters toml file
+    db_location : str
+        The path to the sqlite database for lipid spectra searching
+    scan_translator_path : str
+        The path to the scan translator file
+    cores : int
+        The number of cores to use for processing
     """
     if paramaters_file is not None:
         if cores is not None or file_paths is not None:
-        #TODO KRH: Add all other parameters above
             click.echo("Using parameters file, ignoring other parameters")
-        #run_lcms_lipidomics_workflow(
-        #    lipidomics_workflow_paramaters_file=paramaters_file
-        #)
+        run_lcms_lipidomics_workflow(
+            lipidomics_workflow_paramaters_file=paramaters_file
+        )
     else:
         if cores is None:
             cores = 1
@@ -213,14 +208,14 @@ def run_lipidomics_workflow(
                 "Must provide an output directory if not using a parameters file"
             )
             return
-        if token_path is None:
-            click.echo("No metabref token provided")
+        if db_location is None:
+            click.echo("No database path provided")
             return
         run_lcms_lipidomics_workflow(
             file_paths=file_paths,
             output_directory=output_directory,
             corems_toml_path=corems_params,
-            metabref_token_path=token_path,
+            db_location=db_location,
             scan_translator_path=scan_translator_path,
             cores=cores,
         )
diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py
index c089d32..10c7dc9 100644
--- a/metaMS/lcms_lipidomics_workflow.py
+++ b/metaMS/lcms_lipidomics_workflow.py
@@ -1,9 +1,7 @@
 from dataclasses import dataclass
 import toml
 from pathlib import Path
-import datetime
 from multiprocessing import Pool
-from typing import List
 import click
 import warnings
 import pandas as pd
@@ -13,13 +11,20 @@
 from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
 from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra
 from corems.mass_spectra.output.export import LipidomicsExport
-from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas
+from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulasLC
 from corems.encapsulation.input.parameter_from_json import (
     load_and_set_toml_parameters_lcms,
 )
 
 from metaMS.lipid_metadata_prepper import get_lipid_library, _to_flashentropy
 
+# Suppress specific warning from the corems.mass_spectrum.input.massList module
+warnings.filterwarnings(
+    "ignore",
+    message="No isotopologue matched the formula_dict: {formula_dict}",
+    module="corems.mass_spectrum.input.massList"
+)
+
 @dataclass
 class LipidomicsWorkflowParameters:
     """
@@ -33,9 +38,8 @@ class LipidomicsWorkflowParameters:
         The directory where the output files will be stored
     corems_toml_path : str
         The path to the corems configuration file
-    metabref_token_path : str
-        The path to the metabref token file
-        See https://metabref.emsl.pnnl.gov/api for more information for how to get a token
+    db_location : str
+        The path to the local sqlite database used for searching lipid ms2 spectra
     scan_translator_path : str
         The path to the scan translator file, optional
     cores : int
@@ -45,10 +49,9 @@ class LipidomicsWorkflowParameters:
     file_paths: tuple = ('data/...', 'data/...')
     output_directory: str = "output"
     corems_toml_path: str = None
-    metabref_token_path: str = None
+    db_location: str = None
     scan_translator_path: str = None
     cores: int = 1
-    #TODO KRH: Add a parameter for the location of the sqlite database
 
 def check_lipidomics_workflow_params(lipid_workflow_params):
     # Check that corems_toml_path exists
@@ -72,8 +75,11 @@ def check_lipidomics_workflow_params(lipid_workflow_params):
     for file_path in lipid_workflow_params.file_paths:
         if ".raw" not in file_path and ".mzML" not in file_path:
             raise ValueError(f"File path {file_path} is not a .raw or .mzML file, exiting workflow")
-
-    #TODO KRH: Add a check that we can access the metabref API with the token
+    
+    # Check that db_location exists
+    if lipid_workflow_params.db_location is not None:
+        if not Path(lipid_workflow_params.db_location).exists():
+            raise FileNotFoundError("Database location not found, exiting workflow")
 
 def instantiate_lcms_obj(file_in):
     """Instantiate a corems LCMS object from a binary file.  Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory)
@@ -233,32 +239,9 @@ def molecular_formula_search(myLCMSobj):
     -------
     None, processes the LCMS object
     """
-    i = 1
-    # get df of mass features
-    mf_df = myLCMSobj.mass_features_to_df()
-
-    # search molecular formulas for each mass features that are the deconvoluted parent and have a ms2
-    mf_searched = []
-    for mf_id in mf_df.index:
-        if myLCMSobj.mass_features[mf_id].mass_spectrum_deconvoluted_parent:
-            if len(myLCMSobj.mass_features[mf_id].ms2_scan_numbers) > 0:
-                if mf_id not in mf_searched:
-                    if i > 5:  # TODO KRH: remove this when ready
-                        break
-
-                    scan = myLCMSobj.mass_features[mf_id].apex_scan
-                    # Search single spectrum for all peaks that correspond to the same scan
-                    mf_df_scan = mf_df[mf_df.apex_scan == scan]
-                    peaks_to_search = [
-                        myLCMSobj.mass_features[x].ms1_peak for x in mf_df_scan.index.tolist()
-                    ]
-                    SearchMolecularFormulas(
-                        myLCMSobj._ms[scan],
-                        first_hit=False,
-                        find_isotopologues=True,
-                    ).run_worker_ms_peaks(peaks_to_search)
-                    mf_searched.extend(mf_df_scan.index.tolist())
-                    i += 1
+    # Perform a molecular search on all of the mass features
+    mol_form_search = SearchMolecularFormulasLC(myLCMSobj)
+    mol_form_search.run_mass_feature_search()
 
 def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False):
     """Export results to hdf5 and csv as a lipid report
@@ -281,7 +264,6 @@ def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False):
     exporter = LipidomicsExport(out_path, myLCMSobj)
     exporter.to_hdf(overwrite=True)
     if final:
-        # Do not show warnings, these are expected
         exporter.report_to_csv(molecular_metadata=molecular_metadata)
     else:
         with warnings.catch_warnings():
@@ -294,6 +276,7 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     check_scan_translator(myLCMSobj, scan_translator)
     add_mass_features(myLCMSobj, scan_translator)
     myLCMSobj.remove_unprocessed_data()
+    #Finally, perform molecular formula search on all ms1 spectra associated with mass features
     molecular_formula_search(myLCMSobj)
     export_results(myLCMSobj, out_path=out_path, final=False)
     precursor_mz_list = list(
@@ -308,7 +291,7 @@ def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator):
     mz_dict = {myLCMSobj.polarity: precursor_mz_list}
     return mz_dict
 
-def prep_metadata(mz_dicts, out_dir):
+def prep_metadata(mz_dicts, out_dir, db_location):
     """Prepare metadata for ms2 spectral search
 
     Parameters
@@ -317,6 +300,8 @@ def prep_metadata(mz_dicts, out_dir):
         List of dicts with keys "positive" and "negative" and values of lists of precursor mzs
     out_dir : Path
         Path to output directory
+    db_location : str
+        Path to lipid database
 
     Returns
     -------
@@ -339,6 +324,7 @@ def prep_metadata(mz_dicts, out_dir):
 
     if metadata["mzs"]["negative"] is not None:
         metabref_negative, lipidmetadata_negative = get_lipid_library(
+            db_location=db_location,
             mz_list=metadata["mzs"]["negative"],
             polarity="negative",
             mz_tol_ppm=5,
@@ -363,6 +349,7 @@ def prep_metadata(mz_dicts, out_dir):
     print("Preparing positive lipid library")
     if metadata["mzs"]["positive"] is not None:
         metabref_positive, lipidmetadata_positive = get_lipid_library(
+            db_location=db_location,
             mz_list=metadata["mzs"]["positive"],
             polarity="positive",
             mz_tol_ppm=5,
@@ -489,8 +476,11 @@ def run_lipid_ms2(out_path, metadata, scan_translator=None):
     """
     # Read in the intermediate results
     out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5"
-    parser = ReadCoreMSHDFMassSpectra(out_path_hdf5)
-    myLCMSobj = parser.get_lcms_obj()
+    # Catch known UserWarning from corems and ignore it
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        parser = ReadCoreMSHDFMassSpectra(out_path_hdf5)
+        myLCMSobj = parser.get_lcms_obj(load_raw=False)
 
     # Process ms2 spectra, perform spectral search, and export final results
     process_ms2(myLCMSobj, metadata, scan_translator=scan_translator)
@@ -501,7 +491,7 @@ def run_lcms_lipidomics_workflow(
     file_paths=None,
     output_directory=None,
     corems_toml_path=None,
-    metabref_token_path=None,
+    db_location=None,
     scan_translator_path=None,
     cores=None,
 ):
@@ -513,7 +503,7 @@ def run_lcms_lipidomics_workflow(
         lipid_workflow_params = LipidomicsWorkflowParameters(
             file_paths= file_paths.split(","),
             output_directory=output_directory,
-            metabref_token_path=metabref_token_path,
+            db_location=db_location,
             scan_translator_path=scan_translator_path,
             corems_toml_path=corems_toml_path,
             cores=cores,
@@ -565,7 +555,8 @@ def run_lcms_lipidomics_workflow(
     gc.collect() 
         
     # Prepare metadata for searching
-    metadata = prep_metadata(mz_dicts, out_dir)
+    click.echo("Preparing metadata for ms2 spectral search")
+    metadata = prep_metadata(mz_dicts, out_dir, lipid_workflow_params.db_location)
     del mz_dicts
     gc.collect()
     
@@ -573,7 +564,7 @@ def run_lcms_lipidomics_workflow(
     click.echo("Starting ms2 spectral search and exporting final results")
     if cores == 1 or len(files_list) == 1:
         for file_out in out_paths_list:
-            mz_dicts = run_lipid_ms2(
+            run_lipid_ms2(
                 file_out, metadata, scan_translator=scan_translator
             )
     elif cores > 1:
diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py
index ce7cfab..b33d07a 100644
--- a/metaMS/lipid_metadata_prepper.py
+++ b/metaMS/lipid_metadata_prepper.py
@@ -123,6 +123,11 @@ def _to_flashentropy(metabref_lib, normalize=True, fe_kwargs={}):
             spectrum["mz"], normalize=normalize
         )
 
+        # Cast "fragment_types" to a list (if present and not already a list)
+        if "fragment_types" in spectrum.keys():
+            if not isinstance(spectrum["fragment_types"], list):
+                spectrum["fragment_types"] = spectrum["fragment_types"].split(",")
+
         # Add spectrum to library
         fe_lib.append(spectrum)
 
@@ -191,6 +196,7 @@ def _dict_to_dataclass(metabref_lib, data_class):
     return data_class(**input_dict)
 
 def get_lipid_library(
+        db_location,
         mz_list,
         polarity,
         mz_tol_ppm,
@@ -206,8 +212,7 @@ def get_lipid_library(
     mz_obs_arr = mz_list['mz_obs'].values
 
     # connect to the database
-    conn = sqlite3.connect('/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite')
-    #TODO KRH: change path to a parameter
+    conn = sqlite3.connect(db_location)
 
     # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz
     mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn)
diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl
index 802fe2d..a5d4bba 100644
--- a/wdl/metaMS_lcmslipidomics.wdl
+++ b/wdl/metaMS_lcmslipidomics.wdl
@@ -14,7 +14,7 @@ task runMetaMSLCMSLipidomics {
         Array[File] file_paths
         String output_directory
         File corems_toml_path
-        File metabref_token_path
+        File db_location
         File scan_translator_path
         Int cores
     }
@@ -24,7 +24,7 @@ task runMetaMSLCMSLipidomics {
             -i ${sep=',' file_paths} \
             -o ${output_directory} \
             -c ${corems_toml_path} \
-            -t ${metabref_token_path} \
+            -d ${db_location} \
             -s ${scan_translator_path} \
             -j ${cores}
     }
@@ -36,5 +36,6 @@ task runMetaMSLCMSLipidomics {
 
     runtime {
         docker: "local-metams:latest"
+        #TODO KRH: Update to pushed version when available
     }
 }
\ No newline at end of file
diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json
index 4cc08d4..e6dabb6 100644
--- a/wdl/metams_input_lipidomics.json
+++ b/wdl/metams_input_lipidomics.json
@@ -4,7 +4,7 @@
     ],
     "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml",
-    "lcmsLipidomics.runMetaMSLCMSLipidomics.metabref_token_path": "./configuration/gcms_corems.toml",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1
 }
\ No newline at end of file

From c0d272046d0a47944ca59ffbcf667f5a81a243ff Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Wed, 18 Dec 2024 14:14:06 -0800
Subject: [PATCH 15/16] WIP Add functioning wdl and configs for lipid workflow
 Pending PiPy release of corems 3.2

---
 Dockerfile                                      |  7 +++++++
 .../emsl_lipidomics_corems_params.toml          | 17 +++++++++--------
 configuration/lipidomics_metams.toml            | 15 +++++++++------
 requirements.txt                                |  5 ++---
 wdl/metams_input_lipidomics.json                |  7 ++++---
 5 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9308ee2..174f5d4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,6 +22,13 @@ COPY metaMS/ /metams/metaMS/
 COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/
 COPY db/ /metams/db/
 
+#TODO KRH: Remove this section once the CoreMS package is available on PyPI and installable via the requirements.txt file
+# Copy the CoreMS tar.gz file into the Docker image
+COPY CoreMS-3.2.0.tar.gz /metams/
+
+# Install the CoreMS package from the tar.gz file
+RUN pip install /metams/CoreMS-3.2.0.tar.gz
+
 # Install the MetaMS package in editable mode
 RUN pip3 install --editable .
 
diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
index 647be6a..9620f12 100644
--- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
+++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml
@@ -28,8 +28,8 @@ export_profile_spectra = false
 export_eics = false
 export_unprocessed_ms1 = false
 verbose_processing = false
-ph_inten_min_rel = 0.0005
-ph_persis_min_rel = 0.0005
+ph_inten_min_rel = 0.01
+ph_persis_min_rel = 0.01
 ph_smooth_it = 0
 
 [mass_spectrum.ms1.molecular_search]
@@ -57,7 +57,7 @@ adduct_atoms_neg = [ "Cl", "Br",]
 adduct_atoms_pos = [ "Na", "K",]
 score_methods = [ "S_P_lowest_error", "N_S_P_lowest_error", "lowest_error", "prob_score", "air_filter_error", "water_filter_error", "earth_filter_error",]
 score_method = "prob_score"
-output_min_score = 0.1
+output_min_score = 0.3
 output_score_method = "All Candidates"
 isRadical = false
 isProtonated = true
@@ -66,11 +66,12 @@ ion_types_excluded = []
 ionization_type = "ESI"
 min_ppm_error = -5.0
 max_ppm_error = 5.0
-min_abun_error = -100.0
-max_abun_error = 100.0
+min_abun_error = -50.0
+max_abun_error = 50.0
 mz_error_range = 1.5
 error_method = "None"
 mz_error_average = 0.0
+verbose_processing = false
 
 [mass_spectrum.ms1.mass_spectrum]
 noise_threshold_method = "log"
@@ -97,7 +98,7 @@ calibration_ref_match_method_implemented = [ "legacy", "merged",]
 calibration_ref_match_tolerance = 0.003
 calibration_ref_match_std_raw_error_limit = 1.5
 do_calibration = true
-verbose_processing = true
+verbose_processing = false
 
 [mass_spectrum.ms1.ms_peak]
 kendrick_rounding_method = "floor"
@@ -183,7 +184,7 @@ calibration_ref_match_method_implemented = [ "legacy", "merged",]
 calibration_ref_match_tolerance = 0.003
 calibration_ref_match_std_raw_error_limit = 1.5
 do_calibration = true
-verbose_processing = true
+verbose_processing = false
 
 [mass_spectrum.ms2.ms_peak]
 kendrick_rounding_method = "floor"
@@ -269,7 +270,7 @@ calibration_ref_match_method_implemented = [ "legacy", "merged",]
 calibration_ref_match_tolerance = 0.003
 calibration_ref_match_std_raw_error_limit = 1.5
 do_calibration = true
-verbose_processing = true
+verbose_processing = false
 
 [mass_spectrum.ms2_cid.ms_peak]
 kendrick_rounding_method = "floor"
diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml
index d14d324..c7957f6 100644
--- a/configuration/lipidomics_metams.toml
+++ b/configuration/lipidomics_metams.toml
@@ -1,6 +1,9 @@
-file_paths = ["data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw"]
-output_directory = "output"
-corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml"
-metabref_token_path = "metabref.token"
-scan_translator_path = "configuration/lipid_configs/emsl_lipidomics_scan_translator.toml"
-cores = 2
\ No newline at end of file
+file_paths = [
+    "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_POS_23Jan18_Brandi-WCSH5801.raw",
+    "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_NEG_25Jan18_Brandi-WCSH5801.raw"
+]
+output_directory = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/_blanchard_11_8ws97026/processed_20241218"
+corems_toml_path = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_corems_params.toml"
+db_location = "/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite"
+scan_translator_path = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_scan_translator.toml"
+cores = 4
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index edeb4e1..66e74e9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-corems==3.0.2
+corems==3.2.0
 Click>=7.1.1
-requests
-nmdc-schema>=7.0.0
\ No newline at end of file
+requests
\ No newline at end of file
diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json
index e6dabb6..c9d9796 100644
--- a/wdl/metams_input_lipidomics.json
+++ b/wdl/metams_input_lipidomics.json
@@ -1,10 +1,11 @@
 {
     "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [
-        "./data/raw_data/Blanch_Nat_Lip_C_12_AB_M_17_NEG_25Jan18_Brandi-WCSH5801.raw"
+        "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_POS_23Jan18_Brandi-WCSH5801.raw",
+    "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_NEG_25Jan18_Brandi-WCSH5801.raw"
     ],
     "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output",
-    "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "./configuration/lipid_configs/emsl_lipidomics_corems_params.toml",
-    "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_corems_params.toml",
+    "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml",
     "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1
 }
\ No newline at end of file

From ca45251aa2d7ea2110c069bb70d01175cb99f3d0 Mon Sep 17 00:00:00 2001
From: Katherine Heal <katherine.heal@pnnl.gov>
Date: Thu, 19 Dec 2024 10:36:56 -0800
Subject: [PATCH 16/16] Fix biosample API check

---
 .gitignore                                    |  5 +++-
 .../api_info_retriever.py                     | 28 ++++++++++---------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore
index c742356..78f7f33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,4 +38,7 @@ share/python-wheels/
 *.egg
 
 # Raw thermo data
-*.raw
\ No newline at end of file
+*.raw
+
+# Metabref token
+*.token
\ No newline at end of file
diff --git a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
index 3b4f862..1b8f357 100644
--- a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
+++ b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
@@ -147,18 +147,20 @@ def check_if_ids_exist(self, ids: list) -> bool:
             If there's an error in making the API request.
         """
         ids_test = list(set(ids))
-        ids_test = [id.replace('"', "'") for id in ids_test]
-        ids_test_str = ", ".join(f'"{id}"' for id in ids_test)
-        filter_param = f'{{"id": {{"$in": [{ids_test_str}]}}}}'
-        og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection=id"
-
-        try:
-            resp = requests.get(og_url)
-            resp.raise_for_status()  # Raises an HTTPError for bad responses
-            data = resp.json()
-            if len(data["resources"]) != len(ids_test):
-                return False
-        except requests.RequestException as e:
-            raise requests.RequestException(f"Error making API request: {e}")
+        for id in ids_test:
+            filter_param = f'{{"id": "{id}"}}'
+            field = "id"
+
+            og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}"
+
+            try:
+                resp = requests.get(og_url)
+                resp.raise_for_status()  # Raises an HTTPError for bad responses
+                data = resp.json()
+                if len(data["resources"]) == 0:
+                    print(f"ID {id} not found")
+                    return False
+            except requests.RequestException as e:
+                raise requests.RequestException(f"Error making API request: {e}")
 
         return True