Skip to content

Commit

Permalink
Added support for kraken2 (#75)
Browse files Browse the repository at this point in the history
* Added support for kraken2

Added support for kraken2. Multiple libraries are supported for minimap2 as well.

* Remove unneeded print() statements

* Added unittest

Added unittest for performing multiple minimap2 passes and kraken2.

* flake8

* Removed hard-coded path from unittest
  • Loading branch information
charles-cowart authored Apr 20, 2022
1 parent a83fc65 commit 32d7a94
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 83 deletions.
57 changes: 37 additions & 20 deletions sequence_processing_pipeline/QCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,19 @@

class QCJob(Job):
def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
mmi_db_path, queue_name, node_count, nprocs, wall_time_limit,
jmem, fastp_path, minimap2_path, samtools_path,
modules_to_load, qiita_job_id, pool_size, max_array_length):
minimap_database_paths, kraken2_database_path, queue_name,
node_count, nprocs, wall_time_limit, jmem, fastp_path,
minimap2_path, samtools_path, modules_to_load, qiita_job_id,
pool_size, max_array_length):
"""
Submit a Torque job where the contents of fastq_root_dir are processed
using fastp, minimap, and samtools. Human-genome sequences will be
filtered out, if needed.
:param fastq_root_dir: Path to a dir of Fastq files, org. by project.
:param output_path: Path where all pipeline-generated files live.
:param sample_sheet_path: Path to a sample sheet file.
:param mmi_db_path: Path to human genome database in running env.
:param minimap_database_paths: Path to human genome databases in env.
:param kraken2_database_path: Path to human genome dat abase in env.
:param queue_name: Torque queue name to use in running env.
:param node_count: Number of nodes to use in running env.
:param nprocs: Number of processes to use in runing env.
Expand All @@ -49,7 +51,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
self.needs_trimming = metadata['needs_adapter_trimming']
self.nprocs = 16 if nprocs > 16 else nprocs
self.chemistry = metadata['chemistry']
self.mmi_db_path = mmi_db_path
self.minimap_database_paths = minimap_database_paths
self.kraken2_database_path = kraken2_database_path
self.queue_name = queue_name
self.node_count = node_count
self.wall_time_limit = wall_time_limit
Expand Down Expand Up @@ -382,15 +385,6 @@ def _gen_fastp_cmd(self, current_dir, filename1, filename2, project_dir,
def _gen_chained_cmd(self, current_dir, filename1, filename2,
fastp_reports_dir, project_name, project_dir,
adapter_a, adapter_A):
"""
Generates a command-line string for running fastp piping directly
into minimap2 piping directly into samtools. The string is based
on the parameters supplied to the object.
:param fastp_reports_dir: Path to dir for storing json and html files
:param human_phix_db_path: Path to the human_phix_db_path.mmi
:return: A string suitable for executing in Popen() and the like.
"""

read1_input_path = join(current_dir, filename1)
read2_input_path = join(current_dir, filename2)

Expand All @@ -411,18 +405,41 @@ def _gen_chained_cmd(self, current_dir, filename1, filename2,
path2 = join(partial, filename2.replace('.fastq.gz',
'.trimmed.fastq.gz'))

kraken_report_path = join(partial,
filename1.replace('.fastq.gz',
'.kraken2_report.txt'))

kraken_output_path = join(partial,
filename1.replace('.fastq.gz',
'.kraken2.trimmed.#.fastq')
)

result = self.fastp_path

if adapter_a:
# assume that if adapter_a is not None, then adapter_A is not None
# as well. We performed this check already.
result += (f' --adapter_sequence {adapter_a}'
f' --adapter_sequence_r2 {adapter_A}')

result += (f' -l 100 -i {read1_input_path} -I {read2_input_path} '
f'-w {self.nprocs} -j {json_output_path} -h '
f'{html_output_path} --stdout | {self.minimap2_path} -ax'
f' sr -t {self.nprocs} {self.mmi_db_path} - -a | '
f'{self.samtools_path} fastq -@ {self.nprocs} -f 12 -F '
f'256 -1 {path1} -2 {path2}')
result += (f' -l 100 -i {read1_input_path} -I {read2_input_path} -w '
f'{self.nprocs} -j {json_output_path} -h {html_output_path}'
' --stdout ')

# for each database specified in configuration, run the data through
# minimap using the database and pipe to samtools to tame the output.
for database in self.minimap_database_paths:
result += (f'| {self.minimap2_path} -ax sr -t {self.nprocs} '
f'{database} - -a | {self.samtools_path} fastq -@ '
f'{self.nprocs} -f 12 -F 256 ')

# append the final parameters to write out the final output to disk
result += f'-1 {path1} -2 {path2}'

# add krakken2
result += (f'\nkraken2 --threads {self.nprocs} --db '
f'{self.kraken2_database_path} --report '
f'{kraken_report_path} --unclassified-out '
f'{kraken_output_path} --paired {path1} {path2}')

return result
3 changes: 2 additions & 1 deletion sequence_processing_pipeline/configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_hours": 1,
"mmi_db": "/databases/minimap2/human-phix-db.mmi",
"minimap_databases": ["/databases/minimap2/human-phix-db.mmi"],
"kraken2_database": "/databases/minimap2/hp_kraken-db.mmi",
"modules_to_load": ["fastp_0.20.1", "samtools_1.12", " minimap2_2.18"],
"fastp_executable_path": "fastp",
"minimap2_executable_path": "minimap2",
Expand Down
1 change: 0 additions & 1 deletion sequence_processing_pipeline/tests/test_FastQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,6 @@ def setUp(self):
# write files out to disk
for file_name in file_names:
with open(file_name, 'w') as f2:
print(file_name)
f2.write("This is a file.")

def tearDown(self):
Expand Down
1 change: 0 additions & 1 deletion sequence_processing_pipeline/tests/test_Job.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def test_group_commands(self):
cmds = list(range(1, 8))
cmds = [str(x) for x in cmds]
results = job._group_commands(cmds)
print(results)
self.assertEqual(results[0], '1;3;5;7')
self.assertEqual(results[1], '2;4;6')
self.assertEqual(len(results), 2)
Expand Down
Loading

0 comments on commit 32d7a94

Please sign in to comment.