For runs w/multiple projects, manage adapter-trimmed files (#126)

* For runs w/multiple projects, manage adapter-trimmed files
* comment removed
biocore · Feb 16, 2024 · 2b15b07 · 2b15b07
1 parent dd8265e
commit 2b15b07
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 1 deletion.
diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py
@@ -92,6 +92,10 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input):
             files = [x for x in files if x.endswith('.fastq.gz') and
                      'zero_files' not in x]
 
+            # remove fastq files in the only-adapter-filtered
+            # folder from consideration if they are present.
+            files = [x for x in files if 'only-adapter-filtered' not in x]
+
             # break files up into R1, R2, I1, I2
             # assume _R1_ does not occur in the path as well.
             r1_only = [x for x in files if '_R1_' in x]

diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py
@@ -202,6 +202,26 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst):
         for fp in files_to_move:
             move(fp, dst)
 
+    @staticmethod
+    def _move_trimmed_files(project_name, output_path):
+        '''
+        Relocate every fastq.gz file found directly inside output_path
+        into a newly created subdirectory called project_name.
+        :param project_name: Name of the subdirectory to create.
+        :param output_path: Directory searched for *.fastq.gz files.
+        :return: None
+        :raises ValueError: if output_path does not exist.
+        '''
+        if not exists(output_path):
+            raise ValueError(f"'{output_path}' does not exist")
+
+        project_dir = join(output_path, project_name)
+        # makedirs raises if this subdirectory already exists.
+        makedirs(project_dir, exist_ok=False)
+
+        for fastq in glob.glob(f"{output_path}/*.fastq.gz"):
+            move(fastq, project_dir)
+
     def run(self, callback=None):
         # now a single job-script will be created to process all projects at
         # the same time, and intelligently handle adapter-trimming as needed
@@ -244,6 +264,15 @@ def run(self, callback=None):
             pattern = f"{source_dir}/*.fastq.gz"
             completed_files = list(glob.glob(pattern))
 
+            # if the 'only-adapter-filtered' directory exists, move the files
+            # into a unique location so that files from multiple projects
+            # don't overwrite each other.
+            trimmed_only_path = join(self.output_path,
+                                     'only-adapter-filtered')
+
+            if exists(trimmed_only_path):
+                NuQCJob._move_trimmed_files(project_name, trimmed_only_path)
+
             if needs_human_filtering is True:
                 filtered_directory = join(source_dir, 'filtered_sequences')
             else:

diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py
@@ -1,11 +1,12 @@
 import shutil
 import unittest
-from os.path import join, abspath, exists
+from os.path import join, abspath, exists, dirname
 from functools import partial
 from sequence_processing_pipeline.NuQCJob import NuQCJob
 from sequence_processing_pipeline.PipelineError import PipelineError
 from os import makedirs, remove
 from metapool import load_sample_sheet
+import glob
 
 
 class TestNuQCJob(unittest.TestCase):
@@ -546,6 +547,10 @@ def tearDown(self):
         if exists(self.tmp_file_path):
             remove(self.tmp_file_path)
 
+        # clean up the files created by test_move_trimmed()
+        if exists(self.path('NuQCJob')):
+            shutil.rmtree(self.path('NuQCJob'))
+
     def test_nuqcjob_creation(self):
         # use good-sample-sheet as the basis for a sample Metatranscriptomic
         with self.assertRaises(PipelineError) as e:
@@ -1204,6 +1209,41 @@ def test_regular_expressions(self):
 
         self._helper(job.json_regex, good_names, bad_names)
 
+    def test_move_trimmed(self):
+        # Note: this test does not make use of the output_dir that other
+        # tests use.
+
+        # create a small placeholder file for every path in SAMPLE_DIR.
+        for dummy_fp in SAMPLE_DIR:
+            dummy_fp = self.path(dummy_fp)
+            makedirs(dirname(dummy_fp), exist_ok=True)
+            with open(dummy_fp, 'w') as f:
+                f.write("This is a dummy file.\n")
+
+        trimmed_only_path = self.path('NuQCJob', 'only-adapter-filtered')
+
+        NuQCJob._move_trimmed_files('NPH_15288', trimmed_only_path)
+
+        new_path = join(trimmed_only_path, 'NPH_15288')
+
+        exp = [
+            ('only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001.'
+             'fastq.gz')]
+
+        # compare the full set of moved files so the test also fails when
+        # a file is missing or when nothing was moved at all.
+        obs = [x.split('NuQCJob/')[-1]
+               for x in glob.glob(f"{new_path}/*.fastq.gz")]
+        self.assertCountEqual(obs, exp)
+
     def _helper(self, regex, good_names, bad_names):
         for good_name in good_names:
             substr = regex.search(good_name)
@@ -1214,5 +1254,26 @@ def _helper(self, regex, good_names, bad_names):
             self.assertIsNone(substr, msg=f'Regex failed on {bad_name}')
 
 
+SAMPLE_DIR = [  # relative paths of the dummy files test_move_trimmed creates
+    'NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json',
+    'NuQCJob/process_all_fastq_files.sh',
+    'NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed',
+    'NuQCJob/logs/slurm-1897981_1.out',
+    'NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1']
+
 if __name__ == '__main__':
     unittest.main()