Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new phoenix qc entrypoint #83

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions bin/check_samplesheet_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python

# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

import os
import sys
import errno
import argparse
import gzip
import re

def parse_args(args=None):
Description = "Reformat cdcgov/phoenix samplesheet file and check its contents."
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("FILE_IN", help="Input samplesheet file.")
parser.add_argument("FILE_OUT", help="Output file.")
return parser.parse_args(args)


def make_dir(path):
if len(path) > 0:
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise exception


def print_error(error, context="Line", context_str=""):
error_str = "ERROR: Please check samplesheet -> {}".format(error)
if context != "" and context_str != "":
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
error, context.strip(), context_str.strip()
)
print(error_str)
sys.exit(1)


def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:

sample,fastq_1,fastq_2,fastp_pass_json,fastp_failed_json,spades,mlst,quast,amrfinderplus
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,passed_fastp.json,failed.fastp.json,spades.fasta,mlst.tsv,quast.tsv,amrfinderplus.tsv

sample A sample name for the input
fastq_1 R1 of reads run through Fastp
fastq_2 R2 of reads run through Fastp
fastp_pass_json JSON output from initial Fastp run
fastp_failed_json JSON output from rerun of Fastp on failed reads
spades Assembly created by SPAdes
mlst TSV output from mlst tool
quast TSV report generated from quast
amrfinderplus TSV report generated from amrfinderplus
"""

sample_mapping_dict = {}
with open(file_in, "r") as fin:

## Check header
MIN_COLS = 2
HEADER = ['sample', 'fastq_1', 'fastq_2', 'fastp_pass_json', 'fastp_failed_json', 'spades', 'mlst', 'quast', 'amrfinderplus']

header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
sys.exit(1)

## Check sample entries
for line in fin:
lspl = [x.strip().strip('"') for x in line.strip().split(",")]

# Check valid number of columns per row
if len(lspl) < len(HEADER):
print_error(
"Invalid number of columns (minimum = {})!".format(len(HEADER)),
"Line",
line,
)
num_cols = len([x for x in lspl if x])
if num_cols < MIN_COLS:
print_error(
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
"Line",
line,
)

## Check sample name entries
sample, fastq_1, fastq_2, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus = lspl[: len(HEADER)]
sample = sample.replace(" ", "_")
if not sample:
print_error("Sample entry has not been specified!", "Line", line)

## Check FastQ file extension
for fastq in [fastq_1, fastq_2]:
if fastq:
if fastq.find(" ") != -1:
print_error("FastQ file contains spaces!", "Line", line)
if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): # If file is not gzipped then gzip it.
fastq_gz = fastq + ".gz"
with open(fastq, "rb") as f_in:
with gzip.open(fastq_gz, 'wb') as f_out:
f_out.writelines(f_in)
print("FastQ file does not have extension '.fastq.gz' or '.fq.gz'! Zipping file.",
"Line",
line,
)

## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2]
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info = ["0", fastq_1, fastq_2, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus]
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
sample_info = ["1", fastq_1, fastq_2, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus]
else:
print_error("Invalid combination of columns provided!", "Line", line)

## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = [sample_info]
else:
if sample_info in sample_mapping_dict[sample]:
print_error("Samplesheet contains duplicate rows!", "Line", line)
else:
sample_mapping_dict[sample].append(sample_info)

## Write validated samplesheet with appropriate columns
if len(sample_mapping_dict) > 0:
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "fastp_pass_json", "fastp_failed_json", "spades", "mlst", "quast", "amrfinderplus"]) + "\n")
for sample in sorted(sample_mapping_dict.keys()):

## Check that multiple runs of the same sample are of the same datatype
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))

# for idx, val in enumerate(sample_mapping_dict[sample]):
# fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
for idx, val in enumerate(sample_mapping_dict[sample]):
if not val[1].endswith(".gz"): # check that forward read is a gzip file
val[1] = re.sub(".fastq$", ".fastq.gz", val[1])
val[1] = re.sub(".fq$", ".fq.gz", val[1])
if not val[2].endswith(".gz"): # check that reverse read is a gzip file
val[2] = re.sub(".fastq$", ".fastq.gz", val[2])
val[2] = re.sub(".fq$", ".fq.gz", val[2])
fout.write(",".join(["{}".format(sample)] + val) + "\n")
else:
print_error("No entries to process!", "Samplesheet: {}".format(file_in))


def main(args=None):
args = parse_args(args)
check_samplesheet(args.FILE_IN, args.FILE_OUT)


if __name__ == "__main__":
sys.exit(main())
9 changes: 9 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ WorkflowMain.initialise(workflow, params, log)
*/

include { PHOENIX_EXTERNAL } from './workflows/phoenix'
include { PHOENIX_QC_EXTERNAL } from './workflows/phoenix_qc'
include { PHOENIX_EXQC } from './workflows/cdc_phoenix'
include { SRA_PHOENIX } from './workflows/sra_phoenix'
include { SCAFFOLD_EXTERNAL } from './workflows/scaffolds'
Expand All @@ -51,6 +52,14 @@ workflow PHOENIX {
gamma_ar = PHOENIX_EXTERNAL.out.gamma_ar
}

//
// WORKFLOW: Run QC steps from main cdcgov/phoenix analysis pipeline
//
workflow PHOENIX_QC {
main:
PHOENIX_QC_EXTERNAL ()
}

//
// WORKFLOW: Run internal version of cdcgov/phoenix analysis pipeline that includes BUSCO, SRST2 and KRAKEN_ASMBLED
//
Expand Down
28 changes: 28 additions & 0 deletions modules/local/samplesheet_qc_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
process SAMPLESHEET_QC_CHECK {
tag "$samplesheet"
label 'process_low'
container 'quay.io/jvhagey/phoenix:base_v1.0.0'

/*container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/python:3.8.3' :
'quay.io/biocontainers/python:3.8.3' }"*/

input:
path samplesheet

output:
path '*.valid.csv' , emit: csv
path "versions.yml", emit: versions

script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/
"""
check_samplesheet_qc.py \\
$samplesheet \\
samplesheet.valid.csv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""
}
110 changes: 110 additions & 0 deletions subworkflows/local/input_qc_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
//
// Check input samplesheet and get read channels
//

include { SAMPLESHEET_QC_CHECK } from '../../modules/local/samplesheet_qc_check'

workflow INPUT_QC_CHECK {
take:
samplesheet // file: /path/to/samplesheet.csv

main:
SAMPLESHEET_QC_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
.map { create_qc_channels(it) }
.set {ch_samples }

// Create channels to match upstream processes
// FASTP_TRIMD.out.reads -> tuple val(meta), path('*.trim.fastq.gz'), optional:true, emit: reads
ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
[ [id:meta.id, single_end:true], reads]
}.set { ch_reads }

// GATHERING_READ_QC_STATS: tuple val(meta), path(fastp_trimd_json), path(fastp_singles_json)
ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
[ [id:meta.id, single_end:true], fastp_pass_json, fastp_failed_json]
}.set { ch_fastp_json }

// SPADES_WF.out.spades_ch -> SPADES.out.scaffolds.map{meta, scaffolds -> [ [id:meta.id, single_end:true], scaffolds]}
ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
[ [id:meta.id, single_end:true], spades]
}.set { ch_spades }

// MLST.out.tsv -> tuple val(meta), path("*.tsv"), emit: tsv
ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
[ [id:meta.id, single_end:true], mlst]
}.set { ch_mlst }

// QUAST.out.report_tsv -> tuple val(meta), path("*.tsv"), emit: tsv
ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
[ [id:meta.id, single_end:true], quast]
}.set { ch_quast }

// AMRFINDERPLUS_RUN.out.report -> tuple val(meta), path("${meta.id}_all_genes.tsv"), emit: report
ch_samples.map{meta, reads, fastp_pass_json, fastp_failed_json, spades, mlst, quast, amrfinderplus ->
[ [id:meta.id, single_end:true], amrfinderplus]
}.set { ch_amrfinderplus }

emit:
reads = ch_reads
fastp_json = ch_fastp_json
spades = ch_spades
mlst = ch_mlst
quast = ch_quast
amrfinderplus = ch_amrfinderplus
valid_samplesheet = SAMPLESHEET_QC_CHECK.out.csv
versions = SAMPLESHEET_QC_CHECK.out.versions // channel: [ versions.yml ]
}

// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def create_qc_channels(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample
meta.single_end = row.single_end.toBoolean()
missing_input = false

def array = []
if (!file(row.fastq_1).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}"
}

if (!meta.single_end) {
if (!file(row.fastq_2).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
}
}

// Check remaining files
if (!file(row.fastp_pass_json).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Fastp passed reads JSON file does not exist!\n${row.fastp_pass_json}"
}

if (!file(row.fastp_failed_json).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Fastp failed reads JSON file does not exist!\n${row.fastp_failed_json}"
}

if (!file(row.spades).exists()) {
exit 1, "ERROR: Please check input samplesheet -> SPAdes assembly file does not exist!\n${row.spades}"
}

if (!file(row.mlst).exists()) {
exit 1, "ERROR: Please check input samplesheet -> MLST TSV report file does not exist!\n${row.mlst}"
}

if (!file(row.quast).exists()) {
exit 1, "ERROR: Please check input samplesheet -> QUAST TSV report file does not exist!\n${row.quast}"
}

if (!file(row.amrfinderplus).exists()) {
exit 1, "ERROR: Please check input samplesheet -> AMRFinder+ report file does not exist!\n${row.amrfinderplus}"
}

if (meta.single_end) {
array = [ meta, [ file(row.fastq_1) ], file(row.fastp_pass_json), file(row.fastp_failed_json), file(row.spades), file(row.mlst), file(row.quast), file(row.amrfinderplus) ]
} else {
array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ], file(row.fastp_pass_json), file(row.fastp_failed_json), file(row.spades), file(row.mlst), file(row.quast), file(row.amrfinderplus) ]
}

return array
}
Loading