diff --git a/.gitignore b/.gitignore
index 9c9e155..45b43e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,6 @@
 setup.log
 /tests/*.dSYM
 /install
+.vstags
+.nextflow
+nextflow/.nextflow*
diff --git a/nextflow/main.nf b/nextflow/main.nf
new file mode 100644
index 0000000..a3d912e
--- /dev/null
+++ b/nextflow/main.nf
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2014-2022 Genome Research Ltd
+ *
+ * Author: CASM/Cancer IT
+ *
+ * This file is part of CaVEMan.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * 1. The usage of a range of years within a copyright statement contained within
+ * this distribution should be interpreted as being equivalent to a list of years
+ * including the first and last year specified and all consecutive years between
+ * them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007-
+ * 2009, 2011-2012’ should be interpreted as being identical to a statement that
+ * reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright
+ * statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being
+ * identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008,
+ * 2009, 2010, 2011, 2012’.
+ */
+
+nextflow.enable.dsl=2
+
+def helpMessage() {
+    log.info """
+    Usage:
+        nextflow run https://github.com/cancerit/CaVEMan -entry <entry-point> --help
+    Available entry points:
+        - caveman
+        - caveman_np
+    """.stripIndent()
+}
+
+def helpCavemanMessage() {
+    log.info """
+    Usage:
+        nextflow run https://github.com/cancerit/CaVEMan -entry caveman --help
+    Required parameters:
+        --outdir           Folder to output results to.
+        --genomefa         Path to reference genome *.fa (with co-located *.fa.fai index)
+        --mutBam           Tumour BAM/CRAM file (co-located index and bas files)
+        --controlBam       Normal BAM/CRAM file (co-located index and bas files)
+        --ignoreContigs    File listing reference contigs to exclude from analysis (one per line)
+        --ignoreRegions    Location of tsv ignore regions file
+        --species          Species name to be output in VCF
+        --assembly         Species assembly to be output in VCF
+        --analysisName     Name used as the prefix for output files
+    Optional parameters (defaults as set in nextflow.config):
+        priorMutProb = 0.000006
+        priorSnpProb = 0.000100
+        normalContamination = 0.100000
+        refBias = 0.950000
+        mutProbCutoff = 0.800000
+        snpProbCutoff = 0.950000
+        minTumCvg = 1
+        minNormCvg = 1
+        normProtocol = "WGS"
+        tumProtocol = "WGS"
+        normCNFill = 2
+        tumCNFill = 5
+        normSeqPlatform = "NO_PLAT"
+        tumSeqPlatform = "NO_PLAT"
+    Further optional parameters (see nextflow.config):
+        normcn, tumcn, includeSW, includeSE, includeDups,
+        maxSplitReadNum, mstepSplitSize, minBaseQual
+    """.stripIndent()
+}
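+
+// Illustrative invocation of the caveman entry point; the file names below are
+// placeholders, not shipped test data:
+//
+//   nextflow run main.nf -entry caveman -profile singularity \
+//     --genomefa genome.fa --mutBam tumour.bam --controlBam normal.bam \
+//     --ignoreContigs ignore_contigs.txt --ignoreRegions ignore_regions.tsv \
+//     --species Human --assembly GRCh38 --analysisName example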
+
+def getUsableContigIndices(genomeFai, ignoreContigs) {
+    // Contig names are the first column of the .fai index; Groovy's List.push()
+    // prepends, so the reverse() below restores .fai line order.
+    def fai_lines = file(genomeFai).readLines()
+    def contig_fai = []
+    for (fai_line in fai_lines){
+        contig_fai.push(fai_line.split("\\s+")[0])
+    }
+    contig_fai = contig_fai.reverse()
+    def ignore_contigs_list = file(ignoreContigs).readLines()
+    // Symmetric difference: every contig named in exactly one of the two lists
+    def unique_fai = (contig_fai + ignore_contigs_list) - contig_fai.intersect(ignore_contigs_list)
+    def indices = []
+    for(contig in unique_fai){
+        indices.add((contig_fai.findIndexOf { "$it" == "$contig" })+1)
+    }
+    return indices
+}
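+// Worked example (illustrative): for a .fai listing chr1, chr2, chr3 and an
+// ignore file containing only chr2, the function returns [1, 3] - the 1-based
+// .fai line numbers handed to `caveman split -i`. An ignore entry absent from
+// the .fai would map to index 0, so the ignore file is assumed to name only
+// contigs present in the reference.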
+
+include { setup; split; split_concat; generate_mstep_indices; mstep; merge; estep; merge_results; add_ids } from './modules/caveman-core.nf'
+
+// Print help if no workflow specified
+workflow {
+    if ( params.help ) {
+        helpMessage()
+        exit 0
+    }
+}
+
+workflow caveman {
+    //wf for running caveman
+    //Help message
+    if ( params.help ) {
+        helpCavemanMessage()
+        exit 0
+    }
+    log.info """\
+        CaVEMan:caveman - NF Pipeline
+        ------------------------------
+        analysisName:  ${params.analysisName}
+        genomefa:      ${params.genomefa}
+        mutBam:        ${params.mutBam}
+        controlBam:    ${params.controlBam}
+        ignoreContigs: ${params.ignoreContigs}
+        ignoreRegions: ${params.ignoreRegions}
+        species:       ${params.species}
+        assembly:      ${params.assembly}
+    """.stripIndent()
+
+    main:
+    genome = tuple file(params.genomefa), file("${params.genomefa}.fai")
+    mt = tuple file(params.mutBam), file("${params.mutBam}.bai")
+    wt = tuple file(params.controlBam), file("${params.controlBam}.bai")
+    ign_file = file(params.ignoreRegions)
+    results = file(params.resultsDir)
+    splitF = file(params.splitList)
+    ignoreContigFile = file(params.ignoreContigs)
+
+    subCaVEMan(
+        //setup
+        genome,
+        mt,
+        wt,
+        ign_file,
+        ignoreContigFile,
+        params.species,
+        params.assembly,
+        //Optional setup - or empty file and blanket CN
+        params.normcn,
+        params.tumcn,
+        params.configF,
+        params.algBeanF,
+        results,
+        splitF,
+        params.includeSW,
+        params.includeSE,
+        params.includeDups,
+        //Optional split
+        params.maxSplitReadNum,
+        //Optional mstep
+        params.mstepSplitSize,
+        params.minBaseQual,
+        //Optional estep
+        params.priorMutProb,
+        params.priorSnpProb,
+        params.normalContamination,
+        params.refBias,
+        params.mutProbCutoff,
+        params.snpProbCutoff,
+        params.minTumCvg,
+        params.minNormCvg,
+        params.normProtocol,
+        params.tumProtocol,
+        params.normCNFill,
+        params.tumCNFill,
+        params.normSeqPlatform,
+        params.tumSeqPlatform,
+        //Analysis name for output files
+        params.analysisName
+    )
+}
+
+workflow subCaVEMan {
+    //sub workflow, params and help in entrypoint wf
+    take:
+        //setup
+        genome
+        mt
+        wt
+        ign_file
+        ignoreContigFile
+        species
+        assembly
+        //Optional setup - or empty file and blanket CN
+        normcn
+        tumcn
+        configF
+        algBeanF
+        results
+        splitF
+        includeSW
+        includeSE
+        includeDups
+        //Optional split
+        maxSplitReadNum
+        //Optional mstep
+        mstepSplitSize
+        minBaseQual
+        //Optional estep
+        priorMutProb
+        priorSnpProb
+        normalContamination
+        refBias
+        mutProbCutoff
+        snpProbCutoff
+        minTumCvg
+        minNormCvg
+        normProtocol
+        tumProtocol
+        normCNFill
+        tumCNFill
+        normSeqPlatform
+        tumSeqPlatform
+        //Output files naming
+        analysisName
+
+    main:
+    setup(
+        genome,
+        mt,
+        wt,
+        ign_file,
+        normcn,
+        tumcn,
+        // configF,
+        // algBeanF,
+        // results,
+        // splitF,
+        includeSW,
+        includeSE,
+        includeDups
+    )
+    //Only want the indices of usable contigs - subtract the contents of the
+    //ignoreContigs file from the contigs listed in the genome.fa.fai file
+    Channel.fromList(getUsableContigIndices("${params.genomefa}.fai", ignoreContigFile)).set { contig_index }
+    split(
+        setup.out.configFile,
+        genome,
+        mt,
+        wt,
+        ign_file,
+        contig_index,
+        maxSplitReadNum
+    )
+    //Concatenate split files
+    split_concat(
+        split.out.splitFiles.collect()
+    )
+    generate_mstep_indices(
+        split_concat.out.splitList
+    )
+    mstep(
+        setup.out.configFile,
+        setup.out.algBeanFile,
+        split_concat.out.splitList,
+        genome,
+        mt,
+        wt,
+        ign_file,
+        mstepSplitSize,
+        minBaseQual,
+        split.out.rposFiles.collect(),
+        generate_mstep_indices.out.mstep_index.splitText()
+    )
+    merge(
+        setup.out.configFile,
+        setup.out.algBeanFile,
+        split_concat.out.splitList,
+        mstep.out.mstepResults.collect(),
+        "$launchDir/results"
+    )
+    estep(
+        setup.out.configFile,
+        merge.out.covArrFile,
+        merge.out.probsArrFile,
+        setup.out.algBeanFile,
+        split_concat.out.splitList,
+        "$launchDir/results",
+        genome,
+        mt,
+        wt,
+        ign_file,
+        split.out.rposFiles.collect(),
+        minBaseQual,
+        species,
+        assembly,
+        priorMutProb,
+        priorSnpProb,
+        normalContamination,
+        refBias,
+        mutProbCutoff,
+        snpProbCutoff,
+        minTumCvg,
+        minNormCvg,
+        normProtocol,
+        tumProtocol,
+        normCNFill,
+        tumCNFill,
+        normSeqPlatform,
+        tumSeqPlatform,
+        generate_mstep_indices.out.mstep_index.splitText()
+    )
+    merge_results(
+        "$launchDir/results",
+        split_concat.out.splitList,
+        estep.out.estep_idx.collect(),
+        analysisName
+    )
+    add_ids(
+        merge_results.out.mergedMutsVCF,
+        merge_results.out.mergedSnpVCF,
+        analysisName
+    )
+}
diff --git a/nextflow/modules/caveman-core.nf b/nextflow/modules/caveman-core.nf
new file mode 100644
index 0000000..df05b5d
--- /dev/null
+++ b/nextflow/modules/caveman-core.nf
@@ -0,0 +1,288 @@
+// Currently unused debugging helper
+def get_chr_count (genome, ignorecontigs) {
+    println "$genome\t$ignorecontigs"
+}
+
+process setup {
+
+    publishDir "$launchDir", mode: 'copy', pattern: 'caveman.cfg.ini', overwrite: true
+    publishDir "$launchDir", mode: 'copy', pattern: 'alg_bean', overwrite: true
+
+    input:
+    //setup
+    tuple path('genome.fa'), path('genome.fa.fai')
+    tuple path('mt.bam'), path('mt.bam.bai')
+    tuple path('wt.bam'), path('wt.bam.bai')
+    path ('ign.file')
+    //Optional setup - or empty file and blanket CN
+    val normcn
+    val tumcn
+    val includeSW
+    val includeSE
+    val includeDups
+
+    output:
+    //setup outputs
+    path "caveman.cfg.ini", emit: configFile
+    path "alg_bean", emit: algBeanFile
+
+    script:
+    def applySW = includeSW != "NO_SW" ? "-w $includeSW" : ''
+    def applySE = includeSE != "NO_SE" ? "-z $includeSE" : ''
+    def applyDup = includeDups != "NO_DUPS" ? "-u $includeDups" : ''
+    def applyNCN = normcn != "NO_NCN" ? "-j $normcn" : ''
+    def applyTCN = tumcn != "NO_TCN" ? "-e $tumcn" : ''
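+    // The "NO_*" values are sentinel defaults from nextflow.config meaning
+    // "not supplied"; each ternary above only emits the matching
+    // `caveman setup` option when the caller overrides the sentinel.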
"-e $tumcn" : '' + """ + caveman setup \ + ${applySW} \ + ${applySE} \ + ${applyDup} \ + ${applyNCN} \ + ${applyTCN} \ + -l splitList \ + -a alg_bean \ + -c caveman.cfg.ini \ + -t mt.bam \ + -n wt.bam \ + -r genome.fa.fai \ + -g ign.file \ + -f results \ + """ +} + +process split { + + input: + path 'caveman.cfg.ini' + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + each index + val maxSplitReadNum + + publishDir "$launchDir/splits/", mode: 'symlink', pattern: 'splitList.*', overwrite: true + + output: + path "splitList.*", emit: splitFiles + path "caveman.cfg.ini", emit: configFile + path "readpos.*", optional: true, emit: rposFiles + + script: + """ + caveman split \ + -f caveman.cfg.ini \ + -i $index \ + -e $maxSplitReadNum \ + """ +} + +process split_concat { + input: + path splitFileList + + publishDir "$launchDir/", mode: 'copy', pattern: 'splitList.tmp', overwrite: true + + output: + path "splitList", emit: splitList + + script: + """ + for splitFile in ${splitFileList} + do + cat \$splitFile >> splitList + done + """ +} + +process generate_mstep_indices { + input: + path 'fileForLineCount' + + output: + stdout emit: mstep_index + + script: + """ + sed -n '=' fileForLineCount + """ +} + +process mstep { + input: + path 'caveman.cfg.ini' + path 'alg_bean' + path 'splitList' + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + val mstepSplitSize + val minBaseQual + path rposFiles + each index + + publishDir "$launchDir/", mode: 'symlink', pattern: 'results/*/*.covs', overwrite: true + + output: + path "results/*/*.covs", emit: mstepResults + + script: + """ + caveman mstep \ + -f caveman.cfg.ini \ + -a $mstepSplitSize \ + -m $minBaseQual \ + -i $index + """ + +} + +process merge { + input: + path 'caveman.cfg.ini' + path 'alg_bean' + path 'splitList' + path 'mstepResults' + path 'results' + + output: + path 'covs_arr', emit: covArrFile + path 'probs_arr', emit: probsArrFile + + publishDir "$launchDir/", mode: 'copy', pattern: 'covs_arr', overwrite: true + publishDir "$launchDir/", mode: 'copy', pattern: 'probs_arr', overwrite: true + + script: + """ + caveman merge \ + -c covs_arr \ + -p probs_arr \ + -f caveman.cfg.ini + """ +} + +process estep { + + input: + path 'caveman.cfg.ini' + path 'covs_arr' + path 'probs_arr' + path 'alg_bean' + path 'splitList' + path 'results' + tuple path('genome.fa'), path('genome.fa.fai') + tuple path('mt.bam'), path('mt.bam.bai') + tuple path('wt.bam'), path('wt.bam.bai') + path ('ign.file') + path rposFiles + val minBQ + val spec + val ass + val priorMutProb + val priorSnpProb + val normalContamination + val refBias + val mutProbCutoff + val snpProbCutoff + val minTumCvg + val minNormCvg + val normProtocol + val tumProtocol + val normCNFill + val tumCNFill + val normSeqPlatform + val tumSeqPlatform + each index + + output: + path "results/*/*.vcf.gz", emit: estepVCFs + path "results/*/*.bed", emit: estepNoAnalysis + val index, emit: estep_idx + + publishDir "$launchDir/", mode: 'symlink', pattern: 'results/*/*.vcf.gz', overwrite: true + publishDir "$launchDir/", mode: 'symlink', pattern: 'results/*/*.bed', overwrite: true + + script: + def applyNormPlat = normSeqPlatform != "NO_PLAT" ? "-P $normSeqPlatform" : '' + def applyTumPlat = tumSeqPlatform != "NO_PLAT" ? 
"-T $tumSeqPlatform" : '' + """ + caveman estep \ + -f caveman.cfg.ini \ + -g covs_arr \ + -o probs_arr \ + -m $minBQ \ + -v $ass \ + -w $spec \ + -c $priorMutProb \ + -d $priorSnpProb \ + -k $normalContamination \ + -b $refBias \ + -p $mutProbCutoff \ + -q $snpProbCutoff \ + -x $minTumCvg \ + -y $minNormCvg \ + -l $normProtocol \ + -r $tumProtocol \ + -n $normCNFill \ + -t $tumCNFill \ + ${applyNormPlat} \ + ${applyTumPlat} \ + -i $index + """ + +} + +process merge_results { + input: + path 'results' + path 'splitList' + val indexes + val analysisName + + output: + path "${analysisName}.muts.raw.vcf", emit: mergedMutsVCF + path "${analysisName}.snps.raw.vcf", emit: mergedSnpVCF + path "${analysisName}.no_analysis.bed.gz", emit: mergedNoAnalysis + + publishDir "$launchDir/", mode: 'copy', pattern: '*.no_analysis.bed.gz*', overwrite: true + + script: + """ + mergeCavemanResults -s splitList -o ${analysisName}.muts.raw.vcf -f results/%/%.muts.vcf.gz && + mergeCavemanResults -s splitList -o ${analysisName}.snps.raw.vcf -f results/%/%.snps.vcf.gz && \ + mergeCavemanResults -s splitList -o ${analysisName}.no_analysis.bed -f results/%/%.no_analysis.bed && \ + bgzip ${analysisName}.no_analysis.bed && tabix -p bed ${analysisName}.no_analysis.bed.gz\ + """ + +} + +process add_ids { + input: + path 'muts.raw.vcf' + path 'snp.raw.vcf' + val analysisName + + output: + path "${analysisName}.muts.vcf.gz", emit: mutsVCFIIDs + path "${analysisName}.muts.vcf.gz.tbi", emit: mutsVCFIIDsTbx + path "${analysisName}.snps.vcf.gz", emit: snpsVCFIIDs + path "${analysisName}.snps.vcf.gz.tbi", emit: snpsVCFIIDsTbx + + publishDir "$launchDir/", mode: 'copy', pattern: '*.snps.vcf.gz*', overwrite: true + publishDir "$launchDir/", mode: 'copy', pattern: '*.muts.vcf.gz*', overwrite: true + + script: + """ + cgpAppendIdsToVcf.pl -i muts.raw.vcf -o ${analysisName}.muts.vcf && \ + cgpAppendIdsToVcf.pl -i snp.raw.vcf -o ${analysisName}.snps.vcf && \ + bgzip ${analysisName}.muts.vcf && bgzip ${analysisName}.snps.vcf && + tabix -p vcf ${analysisName}.muts.vcf.gz && tabix -p vcf ${analysisName}.snps.vcf.gz + """ +} + +/* +process flag { + cgpFlagCaVEMan.pl -i %s -o %s -s %s -m %s -n %s -b %s -g %s -umv %s -ref %s -t %s -sa %s +} +*/ diff --git a/nextflow/nextflow.config b/nextflow/nextflow.config new file mode 100644 index 0000000..d2e49d0 --- /dev/null +++ b/nextflow/nextflow.config @@ -0,0 +1,134 @@ +params { + + //Generic - help etc + help = false + + //Required + genomefa = null + mutBam = null + controlBam = null + ignoreContigs = null + ignoreRegions = null + species = null + assembly = null + analysisName = null + + //Optional setup - or empty file and blanket CN + normcn = "NO_NCN" + tumcn = "NO_TCN" + configF = "caveman.cfg.ini" + algBeanF = "alg_bean" + resultsDir = "./results" + splitList = "splitList" + includeSW = "NO_SW" + includeSE = "NO_SE" + includeDups = "NO_DUPS" + + //Optional split + maxSplitReadNum = 350000 + + //Optional mstep + mstepSplitSize = 50000 + minBaseQual = 11 + + //Optional estep + priorMutProb = 0.000006 + priorSnpProb = 0.000100 + normalContamination = 0.100000 + refBias = 0.950000 + mutProbCutoff = 0.800000 + snpProbCutoff = 0.950000 + minTumCvg = 1 + minNormCvg = 1 + normProtocol = "WGS" + tumProtocol = "WGS" + normCNFill = 2 + tumCNFill = 5 + normSeqPlatform = "NO_PLAT" + tumSeqPlatform = "NO_PLAT" + +} + +process { + + withName: 'setup' { + cpus = 1 + memory = '250 MB' + queue = 'small' + } + + withName: 'split' { + cpus = 1 + memory = '1500 MB' + queue = 'normal' + } + + withName: 
diff --git a/nextflow/nextflow.config b/nextflow/nextflow.config
new file mode 100644
index 0000000..d2e49d0
--- /dev/null
+++ b/nextflow/nextflow.config
@@ -0,0 +1,134 @@
+params {
+
+    //Generic - help etc
+    help = false
+
+    //Required
+    genomefa = null
+    mutBam = null
+    controlBam = null
+    ignoreContigs = null
+    ignoreRegions = null
+    species = null
+    assembly = null
+    analysisName = null
+
+    //Optional setup - or empty file and blanket CN
+    normcn = "NO_NCN"
+    tumcn = "NO_TCN"
+    configF = "caveman.cfg.ini"
+    algBeanF = "alg_bean"
+    resultsDir = "./results"
+    splitList = "splitList"
+    includeSW = "NO_SW"
+    includeSE = "NO_SE"
+    includeDups = "NO_DUPS"
+
+    //Optional split
+    maxSplitReadNum = 350000
+
+    //Optional mstep
+    mstepSplitSize = 50000
+    minBaseQual = 11
+
+    //Optional estep
+    priorMutProb = 0.000006
+    priorSnpProb = 0.000100
+    normalContamination = 0.100000
+    refBias = 0.950000
+    mutProbCutoff = 0.800000
+    snpProbCutoff = 0.950000
+    minTumCvg = 1
+    minNormCvg = 1
+    normProtocol = "WGS"
+    tumProtocol = "WGS"
+    normCNFill = 2
+    tumCNFill = 5
+    normSeqPlatform = "NO_PLAT"
+    tumSeqPlatform = "NO_PLAT"
+
+}
+
+process {
+
+    withName: 'setup' {
+        cpus = 1
+        memory = '250 MB'
+        queue = 'small'
+    }
+
+    withName: 'split' {
+        cpus = 1
+        memory = '1500 MB'
+        queue = 'normal'
+    }
+
+    withName: 'split_concat' {
+        cpus = 1
+        memory = '2 GB'
+        queue = 'small'
+    }
+
+    withName: 'generate_mstep_indices' {
+        cpus = 1
+        memory = '500 MB'
+        queue = 'small'
+    }
+
+    withName: 'mstep' {
+        cpus = 1
+        memory = '8 GB'
+        queue = 'normal'
+    }
+
+    withName: 'merge' {
+        cpus = 1
+        memory = '5 GB'
+        queue = 'small'
+    }
+
+    withName: 'estep' {
+        cpus = 1
+        memory = '8 GB'
+        queue = 'normal'
+    }
+
+    withName: 'merge_results' {
+        cpus = 1
+        memory = '5 GB'
+        queue = 'small'
+    }
+
+    withName: 'add_ids' {
+        cpus = 1
+        memory = '2 GB'
+        queue = 'small'
+    }
+/*
+    withName: 'flag' {
+
+    }*/
+}
+
+// the docker/singularity profiles will only work if the container image is
+// available
+profiles {
+    local {
+        process.executor = 'local'
+    }
+    lsf {
+        process.executor = 'lsf'
+        executor.perJobMemLimit = true
+    }
+    docker {
+        docker.enabled = true
+        docker.userEmulation = true
+        singularity.enabled = false
+        process.container = 'quay.io/wtsicgp/cgpcavemanwrapper:1.18.3'
+    }
+    singularity {
+        singularity.enabled = true
+        singularity.autoMounts = true
+        singularity.runOptions = '--cleanenv'
+        docker.enabled = false
+        process.container = 'quay.io/wtsicgp/cgpcavemanwrapper:1.18.3'
+    }
+}
diff --git a/src/config_file_access.c b/src/config_file_access.c
index c56b270..2c46e59 100644
--- a/src/config_file_access.c
+++ b/src/config_file_access.c
@@ -159,9 +159,9 @@ int config_file_access_write_config_file(FILE *file, char *tum_bam_file, char *n
 	check_mem(curr_wd);
 	char *chk = getcwd(curr_wd,size);
 	check(chk!=NULL,"Error retrieving cwd for config file.");
-	int res = fprintf(file,"%s=%s\n",CWD,curr_wd);
-	check(res>=0,"Error writing cwd to config file.");
-	res = fprintf(file,"%s=%s\n",MUT_TUM,tum_bam_file);
+	//int res = fprintf(file,"%s=%s\n",CWD,curr_wd);
+	//check(res>=0,"Error writing cwd to config file.");
+	int res = fprintf(file,"%s=%s\n",MUT_TUM,tum_bam_file);
 	check(res>=0,"Error writing mutant bam file loc to config file.");
 	res = fprintf(file,"%s=%s\n",NORM_TUM,norm_bam_file);
 	check(res>=0,"Error writing normal bam file loc to config file.");
diff --git a/src/split.c b/src/split.c
index 1257fc7..536b270 100644
--- a/src/split.c
+++ b/src/split.c
@@ -69,7 +69,7 @@ static char tum_cn_loc[512];
 static int idx = 0;
 
 void split_print_usage (int exit_code){
-	printf ("Usage: caveman split -i jobindex [-f path] [-c int] [-m int] [-e int] \n\n");
+	printf ("Usage: caveman split -i jobindex [-f path] [-e int] \n\n");
 	printf("-i  --index [int]        Job index (e.g. from $LSB_JOBINDEX)\n\n");
 	printf("Optional\n");
 	printf("-f  --config-file [file] Path to the config file produced by setup [default:'%s'].\n",config_file);
@@ -82,8 +82,6 @@ int split_setup_options(int argc, char *argv[]){
 	const struct option long_opts[] = {
 	{"config-file", required_argument, 0, 'f'},
-	{"increment", required_argument, 0, 'c'},
-	{"max-read-count",required_argument , 0, 'm'},
 	{"read-count", required_argument, 0, 'e'},
 	{"index", required_argument, 0, 'i'},
 	{"help", no_argument, 0, 'h'},