Skip to content

Commit

Permalink
For runs w/multiple projects, manage adapter-trimmed files (#126)
Browse files Browse the repository at this point in the history
* For runs w/multiple projects, manage adapter-trimmed files

* comment removed
  • Loading branch information
charles-cowart authored Feb 16, 2024
1 parent dd8265e commit 2b15b07
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 1 deletion.
4 changes: 4 additions & 0 deletions sequence_processing_pipeline/FastQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input):
files = [x for x in files if x.endswith('.fastq.gz') and
'zero_files' not in x]

# remove fastq files in the only-adapter-filtered
# folder from consideration if they are present.
files = [x for x in files if 'only-adapter-filtered' not in x]

# break files up into R1, R2, I1, I2
# assume _R1_ does not occur in the path as well.
r1_only = [x for x in files if '_R1_' in x]
Expand Down
29 changes: 29 additions & 0 deletions sequence_processing_pipeline/NuQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,26 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst):
for fp in files_to_move:
move(fp, dst)

@staticmethod
def _move_trimmed_files(project_name, output_path):
    '''
    Given output_path, move all fastqs to a new subdir named project_name.
    Only '*.fastq.gz' files located directly inside output_path are moved;
    other files and the contents of sub-directories are left in place.
    :param project_name: The name of the new folder to be created.
    :param output_path: The path to scan for fastq files.
    :return: None
    :raises ValueError: If output_path does not exist.
    :raises FileExistsError: If the project_name subdir already exists.
    '''
    if not exists(output_path):
        raise ValueError(f"'{output_path}' does not exist")

    # escape output_path so that glob metacharacters ('[', ']', '*', '?')
    # appearing in a directory name are matched literally rather than
    # being interpreted as part of the pattern.
    pattern = join(glob.escape(str(output_path)), '*.fastq.gz')

    # this directory shouldn't already exist.
    destination = join(output_path, project_name)
    makedirs(destination, exist_ok=False)

    # glob.glob() returns a fully-built list, so it is safe to move files
    # out of the directory while iterating over the results.
    for trimmed_file in glob.glob(pattern):
        move(trimmed_file, destination)

def run(self, callback=None):
# now a single job-script will be created to process all projects at
# the same time, and intelligently handle adapter-trimming as needed
Expand Down Expand Up @@ -244,6 +264,15 @@ def run(self, callback=None):
pattern = f"{source_dir}/*.fastq.gz"
completed_files = list(glob.glob(pattern))

# if the 'only-adapter-filtered' directory exists, move the files
# into a unique location so that files from multiple projects
# don't overwrite each other.
trimmed_only_path = join(self.output_path,
'only-adapter-filtered')

if exists(trimmed_only_path):
NuQCJob._move_trimmed_files(project_name, trimmed_only_path)

if needs_human_filtering is True:
filtered_directory = join(source_dir, 'filtered_sequences')
else:
Expand Down
63 changes: 62 additions & 1 deletion sequence_processing_pipeline/tests/test_NuQCJob.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import shutil
import unittest
from os.path import join, abspath, exists
from os.path import join, abspath, exists, dirname
from functools import partial
from sequence_processing_pipeline.NuQCJob import NuQCJob
from sequence_processing_pipeline.PipelineError import PipelineError
from os import makedirs, remove
from metapool import load_sample_sheet
import glob


class TestNuQCJob(unittest.TestCase):
Expand Down Expand Up @@ -546,6 +547,10 @@ def tearDown(self):
if exists(self.tmp_file_path):
remove(self.tmp_file_path)

# for test_move_trimmed_files()
if exists(self.path('NuQCJob')):
shutil.rmtree(self.path('NuQCJob'))

def test_nuqcjob_creation(self):
# use good-sample-sheet as the basis for a sample Metatranscriptomic
with self.assertRaises(PipelineError) as e:
Expand Down Expand Up @@ -1204,6 +1209,41 @@ def test_regular_expressions(self):

self._helper(job.json_regex, good_names, bad_names)

def test_move_trimmed(self):
    # Note: this test does not make use of the output_dir that other
    # tests use.

    # create an empty dummy file (and any needed parent directories) for
    # every path listed in SAMPLE_DIR.
    for dummy_fp in SAMPLE_DIR:
        dummy_fp = self.path(dummy_fp)
        makedirs(dirname(dummy_fp), exist_ok=True)
        with open(dummy_fp, 'w') as f:
            f.write("This is a dummy file.\n")

    trimmed_only_path = self.path('NuQCJob', 'only-adapter-filtered')

    NuQCJob._move_trimmed_files('NPH_15288', trimmed_only_path)

    new_path = join(trimmed_only_path, 'NPH_15288')
    pattern = f"{new_path}/*.fastq.gz"

    exp = [
        ('only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001.'
         'fastq.gz')]

    # reduce each observed path to the portion following 'NuQCJob/' so
    # that it can be compared against the relative paths in exp.
    obs = [fp.split('NuQCJob/')[-1] for fp in glob.glob(pattern)]

    # compare the complete set of moved files in both directions; a
    # per-file membership check would pass vacuously if nothing had been
    # moved at all.
    self.assertCountEqual(obs, exp)

    # no fastq files should remain directly under only-adapter-filtered.
    self.assertEqual(glob.glob(f"{trimmed_only_path}/*.fastq.gz"), [])

def _helper(self, regex, good_names, bad_names):
for good_name in good_names:
substr = regex.search(good_name)
Expand All @@ -1214,5 +1254,26 @@ def _helper(self, regex, good_names, bad_names):
self.assertIsNone(substr, msg=f'Regex failed on {bad_name}')


# Relative file paths used by test_move_trimmed() to build a dummy NuQCJob
# output tree: each entry is resolved with self.path() and created as a small
# placeholder file. The first five entries are the fastq.gz files located
# directly under 'only-adapter-filtered'; the remaining entries live outside
# that folder.
SAMPLE_DIR = [
'NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz',
'NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz',
'NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz',
'NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz',
'NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz',
'NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html',
'NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html',
'NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html',
'NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html',
'NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html',
'NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json',
'NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json',
'NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json',
'NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json',
'NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json',
'NuQCJob/process_all_fastq_files.sh',
'NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed',
'NuQCJob/logs/slurm-1897981_1.out',
'NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1']

if __name__ == '__main__':
unittest.main()

0 comments on commit 2b15b07

Please sign in to comment.