From a761b1c7d70d6b17feb1b8039658babcbd222965 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Mon, 16 Sep 2019 13:59:29 -0400 Subject: [PATCH] Begin documentation and code cleanup --- README.md | 34 ++++++ pChunks.R | 161 ++++++++++++++-------------- pChunks.yml | 304 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 421 insertions(+), 78 deletions(-) create mode 100644 README.md create mode 100644 pChunks.yml diff --git a/README.md b/README.md new file mode 100644 index 0000000..6f68498 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ + + * [pChunks: Plasmid identification and annotation from Illumina reads](#pChunks-Plasmid-identification-and-annotation-from-assemblies) + * [Installation](#installation) + * [Conda environments](#conda-environments) + * [Dependencies](#dependencies) + * [Software](#software) + * [R packages](#r-packages) + * [Quickstart / demo](#quickstart--demo) + * [Usage](#usage) + * [Using your own data](#using-your-own-data) + + + + + +# pChunks: Plasmid identification and annotation from Illumina reads + + + +# Installation + +## Conda environments + +## Dependencies + +### Software + +### R packages + +# Quickstart / demo + +# Usage + +# Using your own data \ No newline at end of file diff --git a/pChunks.R b/pChunks.R index 4a43b55..fa9e487 100755 --- a/pChunks.R +++ b/pChunks.R @@ -1,7 +1,7 @@ #!/bin/env Rscript library(parallel) -library(hash) +suppressMessages(library(hash)) library(stringr) library(grid) library(gridExtra) @@ -14,13 +14,13 @@ blat = function(sample) { outputFile = paste0('plasmids/hits', sample, '.psl') command = paste('blat -tileSize=18 -minIdentity=98', assemblyFile, - '/data/home/gtg075i/data/plasmidsAndArtificial/plasmidsAndArtificial.fna', + plasmidDatabase, outputFile) if (file.size(outputFile) == 0) { print(command) system(command) } -} +} printif = function(string = NULL, condition){ if (condition) { @@ -47,7 +47,15 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, } if (!file.exists(plasmidDatabase)) { argumentsGood = FALSE - message(paste('Plasmid database', plasmidDatabase, 'not found')) + message(paste('Plasmid database FASTA', plasmidDatabase, 'not found')) + } + if (!file.exists(paste0(plasmidDatabase,'.nhr'))) { + argumentsGood = FALSE + message(paste('Plasmid BLAST database', paste0(plasmidDatabase,'.nhr') ,'not found')) + message(paste('Building Plasmid BLAST database', paste0(plasmidDatabase,'.nhr'))) + command = paste('makeblastdb -in', plasmidDatabase, + '-dbtype nucl') + system(command, intern = TRUE) } if (is.na(outputDirectory)) { argumentsGood = FALSE @@ -69,12 +77,12 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, message('There is a problem with the arguments') return() } - + printif(paste('Finding plasmids in', plasmidPSLFile), verbosity > 0) ## Keep track of the total score in case we doing a grid search totalPlasmidScore = 0 - + ## Check for the existence of the output directory, remove if it exists if (file.exists(outputDirectory)) { printif(paste('Removing existing output directory', outputDirectory), verbosity > 1) @@ -83,7 +91,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, printif(paste('Making output directory', outputDirectory), verbosity > 1) dir.create(outputDirectory) outputPrefix = paste0(outputDirectory, "/plasmids") - + ## Read in and filter the BLAT plasmid hits plasmidHits = read.table(plasmidPSLFile, row.names = NULL, header = FALSE, sep = '\t', stringsAsFactors = FALSE, skip = 5) colnames(plasmidHits) = c('match', 'mismatch', 'rep_m', 'Ns', 'tgap_c', 'tgap_b', @@ -97,7 +105,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## Toss out any hits missing information plasmidHits = plasmidHits[complete.cases(plasmidHits),] - + ## Toss out very long plasmid sequences -- probably actually genome chunks labeled incorrectly veryLongHits = sum(plasmidHits[,'tlength'] >= maxTargetLength) printif(paste('Removing', veryLongHits, 'hits greater than', maxTargetLength), verbosity > 0) @@ -120,16 +128,16 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, by = list(plasmidHits[,'query'], plasmidHits[,'target']), FUN = max) printif(head(sequenceLengths), verbosity > 1) printif(paste('Sequence-plasmid pair lengths:', paste(dim(sequenceLengths), collapse = 'x')), verbosity > 1) - + matchingFractions = cbind(sequenceMatches[,c(1,2)], sequenceMatches[,3] / sequenceLengths[,3]) colnames(matchingFractions) = c('query', 'target', 'fraction') printif(head(matchingFractions), verbosity > 1) printif(paste('Sequence-plasmid pair fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) - + matchingFractions = matchingFractions[matchingFractions[,'fraction'] >= minQueryCoverage,] printif(head(matchingFractions), verbosity > 1) printif(paste('Passing sequence-plasmid pair fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) - + aboveMinCoverage = apply(matchingFractions, 1, function(i){paste0(i['query'], '|', i['target'])}) plasmidHits = plasmidHits[apply(plasmidHits, 1, function(i){paste0(i['query'], '|', i['target'])}) %in% aboveMinCoverage, ] printif(paste("Sequence-plasmid hits after removing low-coverage hits:", nrow(plasmidHits)), verbosity > 0) @@ -144,21 +152,21 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, by = list(plasmidHits[,'target']), FUN = max) printif(head(targetLengths), verbosity > 1) printif(paste('Plasmid lengths:', paste(dim(targetLengths), collapse = 'x')), verbosity > 1) - + matchingFractions = cbind(targetMatches[,1], targetMatches[,2] / targetLengths[,2]) colnames(matchingFractions) = c('target', 'fraction') printif(head(matchingFractions), verbosity > 1) printif(paste('Plasmid fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) - + matchingFractions = matchingFractions[matchingFractions[,'fraction'] >= minTargetCoverage,] printif(head(matchingFractions), verbosity > 1) printif(paste('Passing plasmid fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) - + aboveMinCoverage = matchingFractions[, 'target'] plasmidHits = plasmidHits[plasmidHits[, 'target'] %in% aboveMinCoverage, ] printif(paste("Sequence-plasmid hits after removing low-coverage hits:", nrow(plasmidHits)), verbosity > 0) - + ## If we're out of sequece-plasmid hits, then stop here if (nrow(plasmidHits) == 0) { message(paste('Not hits found')) @@ -173,7 +181,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, if (!(i %% 1000)) { printif(paste('Processing hit', i, '/', nrow(plasmidHits)), verbosity > 0) } - + query = plasmidHits[i,'query'] target = plasmidHits[i, 'target'] @@ -186,17 +194,17 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, queryCoverage[[query]][[target]] = rep(0, times = plasmidHits[i, 'qlength']) queryMismatches[[query]][[target]] = 0 } - + blockSizes = as.numeric(unlist(strsplit(x = plasmidHits[i,'block_sizes'], ','))) qBlockStarts = as.numeric(unlist(strsplit(x = plasmidHits[i,'qstarts'], ','))) - + for (j in 1:length(blockSizes)) { queryCoverage[[query]][[target]][qBlockStarts[j]:(qBlockStarts[j]+blockSizes[j])] = 1 } - queryMismatches[[query]][[target]] = queryMismatches[[query]][[target]] + plasmidHits[i,'mismatch'] + queryMismatches[[query]][[target]] = queryMismatches[[query]][[target]] + plasmidHits[i,'mismatch'] } - + ## Pull the full plasmid names from the blast database because BLAT doens't report them, just the ID's targetIDs = plasmidHits[,'target'] targetIDs = gsub("\\|$", "", targetIDs) @@ -210,13 +218,13 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, '| grep ">"') targetNames = system(command, intern = TRUE) printif(paste('Found', length(targetNames), 'target names for', length(targetIDs), 'targets.'), verbosity > 0) - + targetNames = gsub('^>.*\\| ', '', targetNames) targetNames = gsub('^>.', '', targetNames) plasmidHits = cbind(plasmidHits, targetIDs, targetNames) printif(paste('Named hits:', nrow(plasmidHits)), verbosity > 1) - - #Pull just the plasmids out of the larget set of hits, i.e, make sure it has the word 'plasmid' in the description. + + #Pull just the plasmids out of the larget set of hits, i.e, make sure it has the word 'plasmid' or 'vector' in the description. plasmidHits = plasmidHits[grep('plasmid|vector', plasmidHits[,'targetNames'], ignore.case = TRUE), ,drop = FALSE] plasmidHits = plasmidHits[!grepl('tig0000', plasmidHits[,'targetNames'], ignore.case = TRUE), , drop = FALSE] printif(paste("Sequece-plasmid hits after screening by name:", paste(dim(plasmidHits), collapse = 'x')), verbosity > 1) @@ -230,8 +238,8 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, message('Not hits found') return() } - - ## Clean up the plasmid names -- they look like crap by default. + + ## Clean up the plasmid names -- they're super ugly out of the box plasmidNames = plasmidHits[,'targetNames'] plasmidNames = gsub(', comp.*', '', plasmidNames) plasmidNames = gsub(', contig.*', '', plasmidNames) @@ -257,7 +265,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## Order hits by the plasmid ID and the query length plasmidHits = plasmidHits[order(plasmidHits[,'target'], -plasmidHits[,'qlength']), ] - + ## Iterate, finding plasmids until we run out of usable sequence-plasmid its plasmidNumber = 0 plasmidResults = c() @@ -265,14 +273,14 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## Keep track of how many plasmids we have gone over plasmidNumber = plasmidNumber + 1 - + printif(paste('Sequence-plasmid hits left:', nrow(plasmidHits)), verbosity > 1) contigToPlasmid = hash() plasmidToContig = hash() plasmidCoverage = hash() plasmidCoverageWithRepeats = hash() contigCoverage = hash() - + ##Find contig/plasmid plasmid/contig pairs if (is.null(plasmidHits)) { break @@ -284,7 +292,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## Find the coverage of each plasmid in the possible set by the contigs for (i in 1:nrow(plasmidHits)) { - + query = plasmidHits[i,'query'] target = plasmidHits[i,'target'] matches = plasmidHits[i,'match'] @@ -295,7 +303,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, queryStarts = as.numeric(unlist(strsplit(plasmidHits[i, 'qstarts'], ','))) targetStarts = as.numeric(unlist(strsplit(plasmidHits[i, 'tstarts'], ','))) - + ## Skip matches which have less than 50% of the bases from the contig on the plasmid -- probably not a good match if ((sum(queryCoverage[[query]][[target]]) - queryMismatches[[query]][[target]]) / queryLength <= minQueryCoverage) { next @@ -314,7 +322,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, } else { contigToPlasmid[[query]][[target]] = contigToPlasmid[[query]][[target]] + score } - + ## Keep track of target(plasmid) coverage by the contigs if (!has.key(target, plasmidCoverage)) { plasmidCoverage[[target]] = rep(0, targetLength) @@ -322,10 +330,10 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, plasmidToContig[[target]] = hash() plasmidMismatches[target] = 0 } - + ## Keep track of how much of each plasmid is covered by ALL of the contigs in the current set of hits - if (plasmidHits[i,'strand'] == '-') { #Flip stupid reverse strand + if (plasmidHits[i,'strand'] == '-') { #Flip reverse strand targetStarts = rev(plasmidHits[i,'tlength'] - targetStarts) - rev(blockSizes) } previousCoverage = sum(plasmidCoverage[[target]]) @@ -335,7 +343,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## Keep track of all contig alignments to this plasmid, even with repeats plasmidCoverageWithRepeats[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])] = 1 - + ## Skip if this region of the query sequence has already been assigned to this plasmid if (sum(contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0) <= 50) { printif(paste('Sequence', query, 'already used for', target, @@ -344,8 +352,6 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, } if (!penalized) { ## Penalty for every gap, only penalize once per match - ## TODO: Penalize for gap length, not just once per gap - #plasmidMismatches[target] = plasmidMismatches[target] + (length(blockSizes) - 1) * 100 plasmidMismatches[target] = plasmidMismatches[target] + mismatches * 5 penalized = TRUE } @@ -361,26 +367,24 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, } else { plasmidToContig[[target]][[query]] = plasmidToContig[[target]][[query]] + score } - + } - + ## Get the best set of plasmids out, i.e., the set with the most bases matching between the contig and plasmid plasmidScores = c() for (thisPlasmid in keys(plasmidCoverage)){ thisPlasmidScore = sum(plasmidCoverage[[thisPlasmid]]) - #thisPlasmidScore = thisPlasmidScore * log((thisPlasmidScore / length(plasmidCoverage[[thisPlasmid]])) * 100)**2 - #thisPlasmidScore = thisPlasmidScore * (thisPlasmidScore / length(plasmidCoverage[[thisPlasmid]])) plasmidScores = c(plasmidScores, thisPlasmidScore) names(plasmidScores)[length(plasmidScores)] = thisPlasmid } plasmidScores = sort(plasmidScores - plasmidMismatches[names(plasmidScores)], dec = TRUE) - + if (length(plasmidScores) > 0) { printif('Highest scoring plasmids', verbosity > 1) printif(head(cbind(plasmidScores), 20), verbosity > 1) } - + ## Stop searching for plasmids if nothing matches well or we're out of hits if (length(plasmidScores) == 0) { printif('Out of plasmids', verbosity > 0) @@ -389,7 +393,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, printif('Out of min-scoring plasmids', verbosity > 0) break } - + ## For each matching plasmid, ordered by total bases matching the assembly, find the set of corresponding contigs plasmidToUse = 1 if (!is.null(searchDepth) && plasmidNumber <= length(searchDepth)) { @@ -398,7 +402,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, plasmid = names(plasmidScores)[plasmidToUse] totalPlasmidScore = totalPlasmidScore + plasmidScores[plasmid] printif(paste("Pulling sequences for", plasmid), verbosity > 0) - + ## Find contigs what haven't already been given to another plasmid so we can assign them next round plasmidContigs = keys(plasmidToContig[[plasmid]]) unusedContigs = plasmidContigs[!(plasmidContigs %in% usedContigs)] @@ -423,13 +427,13 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## How many bases from the plasmid are uncovered? plasmidMissing = rep(sum(plasmidCoverageWithRepeats[[as.character(plasmidID)]] == 0), nrow(plasmidRows)) - + ## Create a circos plot for this plasmid if (makeCircos) { circosDirectory = paste0(outputDirectory, '/circos') dir.create(circosDirectory, showWarnings = FALSE) - + printif(paste('Drawing circos diagram for', plasmid), verbosity > 1) plasmidConfFile = paste0(circosDirectory, '/', plasmidID, '.conf') command = paste('cat plasmids/circos/plasmid.conf', @@ -437,21 +441,21 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, '>', plasmidConfFile) printif(command, verbosity > 1) system(command) - + plasmidIdeogramFile = paste0(circosDirectory, '/', plasmidID, 'Ideogram.conf') command = paste('cat plasmids/circos/plasmidIdeogram.conf', '| sed', paste('"s/PLASMID/', plasmidID, '/g"', sep = ''), '>', plasmidIdeogramFile) printif(command, verbosity > 1) - system(command) - + system(command) + plasmidKaryotypeFile = paste0(circosDirectory, '/', plasmidID, 'Karyotype.conf') command = paste('cat plasmids/circos/plasmidKaryotype.conf', '| sed', paste('"s/PLASMID/', plasmidID, '/g"', sep = ''), '>', plasmidKaryotypeFile) printif(command, verbosity > 1) system(command) - + plasmidKaryotypeDataFile = paste(circosDirectory, '/', plasmidID, 'Karyotype.txt', sep = '') plasmidKaryotypeData = rbind(c('chr', '-', 'plasmid', 1, 0, plasmidLength[1], 'plasmid')) write.table(file = plasmidKaryotypeDataFile, x = plasmidKaryotypeData, @@ -465,7 +469,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, if (thisPlasmidCoverage[1] == 1) { blockStarts = c(1, blockStarts) } - + blockStops = seq(2, length(thisPlasmidCoverage))[ thisPlasmidCoverage[2:length(thisPlasmidCoverage)] == 0 & (thisPlasmidCoverage[2:length(thisPlasmidCoverage)] != thisPlasmidCoverage[1:(length(thisPlasmidCoverage) - 1)])] @@ -474,7 +478,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, } printif(cbind(blockStarts, blockStops), verbosity >= 2) - + plasmidContigDataFile = paste(circosDirectory, '/', plasmidID, 'Contigs.txt', sep = '') plasmidContigData = cbind(blockStarts, blockStops, 1:length(blockStarts)) plasmidContigData = cbind(rep('plasmid', nrow(plasmidContigData)), plasmidContigData) @@ -482,16 +486,9 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, write.table(file = plasmidContigDataFile, x = plasmidContigData, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = '\t') plasmidGeneDataFile = paste(circosDirectory, '/', plasmidID, 'Genes.txt', sep = '') - ## print(circosGenes) - ## if (!is.null(nrow(circosGenes))) { - ## plasmidData = cbind(rep('plasmid', nrow(circosGenes)), circosGenes) - ## } else { - ## plasmidData = '' - ## } - ##write.table(file = plasmidGeneDataFile, x = plasmidData, row.names = FALSE, col.names = FALSE, quote = FALSE, sep = '\t') write.table(file = plasmidGeneDataFile, x = "", row.names = FALSE, col.names = FALSE, quote = FALSE, sep = '\t') - - command = paste('~/src/circos-0.69-3/bin/circos', + + command = paste('circos', '-conf', plasmidConfFile, '-debug_group textplace') print(command) @@ -520,20 +517,20 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, thisPlasmidQuerySizes = c(thisPlasmidQuerySizes, length(queryCoverage[[contig]][[plasmid]])) thisPlasmidMatches = c(thisPlasmidMatches, sum(queryCoverage[[contig]][[plasmid]])) } - + ## Add this plasmid's hits onto the growing list of sequence-plasmid hits thisPlasmidResults = cbind(unusedContigs, plasmidName, as.character(plasmidID), thisPlasmidQuerySizes, thisPlasmidMatches, plasmidLength, plasmidMissing) colnames(thisPlasmidResults) = c('query.name', 'plasmid.name', 'plasmid.accession', 'query.size', 'aligned.bases', 'plasmid.size', 'plasmid.missing') thisPlasmidResults = thisPlasmidResults[order(-thisPlasmidMatches), ] plasmidResults = rbind(plasmidResults, thisPlasmidResults) - + ## Remove the contigs added to this plasmid from the list of plasmid/contig BLAT hits plasmidHits = plasmidHits[!(plasmidHits[,'query'] %in% usedContigs),] plasmidHits = plasmidHits[!(plasmidHits[,'target'] == plasmid),] } rownames(plasmidResults) = plasmidResults[,1] - + ## Check for the presence of AMR genes in this file if (!noAMR) { amrBEDFile = paste0(outputDirectory, '/amrMapping.bed') @@ -567,14 +564,14 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, rownames(amrResults) = amrResults[,1] amrResults = amrResults[ , 2, drop = FALSE] print(amrResults) - + plasmidResults = cbind(plasmidResults, rep('', nrow(plasmidResults))) colnames(plasmidResults)[ncol(plasmidResults)] = 'amr' plasmidResults[rownames(plasmidResults) %in% rownames(amrResults), 'amr'] = amrResults[rownames(plasmidResults)[rownames(plasmidResults) %in% rownames(amrResults)], 1] - + } - + ## Check for the presence of incompatibility groups in this file if (!noInc) { incBEDFile = paste0(outputDirectory, '/incMapping.bed') @@ -599,7 +596,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, '| awk \'{OFS="\t";locus=$7"\t"$8"\t"$9; if($5 > s[locus]){s[locus]=$5;b[locus] = $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}}END{for(i in b){print i,b[i]}}\'', '>', incFinalBEDFile) printif(command, verbosity > 1) - system(command) + system(command) ## Read the inc group results in and add them to the plasmid contigs incResults = read.table(file = incFinalBEDFile, header = FALSE, row.names = NULL, stringsAsFactors = FALSE, quote = '') @@ -608,17 +605,17 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, rownames(incResults) = incResults[,1] incResults = incResults[ , 2, drop = FALSE] print(incResults) - + plasmidResults = cbind(plasmidResults, rep('', nrow(plasmidResults))) colnames(plasmidResults)[ncol(plasmidResults)] = 'inc' plasmidResults[rownames(plasmidResults) %in% rownames(incResults), 'inc'] = incResults[rownames(plasmidResults)[rownames(plasmidResults) %in% rownames(incResults)], 1] } - + ## Write the plasmid results to file plasmidChunksFile = paste0(outputDirectory, '/plasmids.tsv') write.table(file = plasmidChunksFile, x = plasmidResults, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE) - + ## Dump a sequence file of potential plasmid contigs plasmidSequenceFile = paste0(outputPrefix, '.fna') system(paste0('echo "" >', plasmidSequenceFile)) @@ -631,7 +628,7 @@ findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, ## Return the total score of this round, in case we are doing a search return(totalPlasmidScore) - + } pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, @@ -645,9 +642,9 @@ pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, searchDepth = c(1), threads = 1, verbosity = 2) { - + print(plasmidDatabase) - + ## Verify the arguments argumentsGood = TRUE if (!file.exists(plasmidPSLFile)) { @@ -658,6 +655,14 @@ pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, argumentsGood = FALSE message(paste('Plasmid database', plasmidDatabase, 'not found')) } + if (!file.exists(paste0(plasmidDatabase,'.nhr'))) { + argumentsGood = FALSE + message(paste('Plasmid BLAST database', paste0(plasmidDatabase,'.nhr') ,'not found')) + message(paste('Building Plasmid BLAST database', paste0(plasmidDatabase,'.nhr'))) + command = paste('makeblastdb -in', plasmidDatabase, + '-dbtype nucl') + system(command, intern = TRUE) + } if (is.na(outputDirectory)) { argumentsGood = FALSE message('Output directory not given') @@ -686,12 +691,12 @@ pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, } printif(paste('Making output directory', outputDirectory), verbosity > 1) dir.create(outputDirectory) - + ## Default to c(1) for the plasmid search depth searchDepths = lapply(searchDepth, function(i){seq(i, 1)}) searchDepths = as.matrix(expand.grid(searchDepths)) print(searchDepths) - + plasmidScores = mclapply(1:nrow(searchDepths), function(i) { findPlasmids(plasmidPSLFile = plasmidPSLFile, plasmidDatabase = plasmidDatabase, amrPSLFile = amrPSLFile, amrDatabase, noAMR = noAMR, @@ -723,10 +728,10 @@ pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, paste0(outputDirectory, '/', files)) printif(commands, verbosity >= 2) lapply(commands, system) - + } - + optionList = list( make_option('--plasmid-psl', type = 'character', default = NULL, help = 'Plasmid BLAT output', metavar = ''), diff --git a/pChunks.yml b/pChunks.yml new file mode 100644 index 0000000..1f3a0ac --- /dev/null +++ b/pChunks.yml @@ -0,0 +1,304 @@ +name: pChunks +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _r-mutex=1.0.1=anacondar_1 + - attrs=19.1.0=py_0 + - backcall=0.1.0=py_0 + - binutils_impl_linux-64=2.31.1=h6176602_1 + - binutils_linux-64=2.31.1=h6176602_8 + - bleach=3.1.0=py_0 + - bwidget=1.9.11=0 + - bzip2=1.0.8=h516909a_0 + - ca-certificates=2019.9.11=hecc5488_0 + - cairo=1.16.0=hfb77d84_1002 + - certifi=2019.9.11=py37_0 + - circos=0.69.8=0 + - curl=7.65.3=hf8cf82a_0 + - decorator=4.4.0=py_0 + - defusedxml=0.5.0=py_1 + - entrypoints=0.3=py37_1000 + - expat=2.2.5=he1b5a44_1003 + - fontconfig=2.13.1=h86ecdb6_1001 + - freetype=2.10.0=he983fc9_1 + - fribidi=1.0.5=h516909a_1002 + - gcc_impl_linux-64=7.3.0=habb00fd_1 + - gcc_linux-64=7.3.0=h553295d_8 + - gettext=0.19.8.1=hc5be6a0_1002 + - gfortran_impl_linux-64=7.3.0=hdf63c60_1 + - gfortran_linux-64=7.3.0=h553295d_8 + - giflib=5.1.9=h516909a_0 + - glib=2.58.3=h6f030ca_1002 + - graphite2=1.3.13=hf484d3e_1000 + - gsl=2.5=h294904e_1 + - gxx_impl_linux-64=7.3.0=hdf63c60_1 + - gxx_linux-64=7.3.0=h553295d_8 + - harfbuzz=2.4.0=h9f30f68_3 + - icu=64.2=he1b5a44_1 + - ipykernel=5.1.2=py37h5ca1d4c_0 + - ipython=7.8.0=py37h5ca1d4c_0 + - ipython_genutils=0.2.0=py_1 + - jedi=0.15.1=py37_0 + - jinja2=2.10.1=py_0 + - jpeg=9c=h14c3975_1001 + - jsonschema=3.0.2=py37_0 + - jupyter_client=5.3.1=py_0 + - jupyter_core=4.4.0=py_0 + - krb5=1.16.3=h05b26f9_1001 + - libblas=3.8.0=12_openblas + - libcblas=3.8.0=12_openblas + - libcurl=7.65.3=hda55be3_0 + - libedit=3.1.20170329=hf8c457e_1001 + - libffi=3.2.1=he1b5a44_1006 + - libgcc-ng=9.1.0=hdf63c60_0 + - libgd=2.2.5=h82918c6_1006 + - libgfortran-ng=7.3.0=hdf63c60_0 + - libiconv=1.15=h516909a_1005 + - liblapack=3.8.0=12_openblas + - libopenblas=0.3.7=h6e990d7_1 + - libpng=1.6.37=hed695b0_0 + - libsodium=1.0.17=h516909a_0 + - libssh2=1.8.2=h22169c7_2 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.0.10=h57b8799_1003 + - libuuid=2.32.1=h14c3975_1000 + - libwebp=1.0.2=h576950b_1 + - libxcb=1.13=h14c3975_1002 + - libxml2=2.9.9=hee79883_5 + - lz4-c=1.8.3=hf484d3e_1001 + - make=4.2.1=h14c3975_2004 + - markupsafe=1.1.1=py37h14c3975_0 + - mistune=0.8.4=py37h14c3975_1000 + - nbconvert=5.6.0=py37_1 + - nbformat=4.4.0=py_1 + - ncurses=6.1=hf484d3e_1002 + - notebook=6.0.1=py37_0 + - openssl=1.1.1c=h516909a_0 + - pandoc=2.7.3=0 + - pandocfilters=1.4.2=py_1 + - pango=1.42.4=ha030887_1 + - parso=0.5.1=py_0 + - pcre=8.41=hf484d3e_1003 + - perl=5.26.2=h516909a_1006 + - perl-autoloader=5.74=pl526_2 + - perl-carp=1.38=pl526_3 + - perl-clone=0.42=pl526h516909a_0 + - perl-config-general=2.63=pl526_0 + - perl-digest-perl-md5=1.9=pl526_1 + - perl-dynaloader=1.25=pl526_1 + - perl-exporter=5.72=pl526_1 + - perl-exporter-tiny=1.002001=pl526_0 + - perl-extutils-makemaker=7.36=pl526_1 + - perl-font-ttf=1.06=pl526_0 + - perl-gd=2.71=pl526he860b03_0 + - perl-io-string=1.08=pl526_3 + - perl-list-moreutils=0.428=pl526_1 + - perl-list-moreutils-xs=0.428=pl526_0 + - perl-math-bezier=0.01=pl526_1 + - perl-math-round=0.07=pl526_1 + - perl-math-vecstat=0.08=pl526_1 + - perl-module-implementation=0.09=pl526_2 + - perl-module-runtime=0.016=pl526_1 + - perl-number-format=1.75=pl526_3 + - perl-params-validate=1.29=pl526h14c3975_1 + - perl-pathtools=3.75=pl526h14c3975_1 + - perl-readonly=2.05=pl526_0 + - perl-regexp-common=2017060201=pl526_0 + - perl-scalar-list-utils=1.52=pl526h516909a_0 + - perl-set-intspan=1.19=pl526_1 + - perl-statistics-basic=1.6611=pl526_2 + - perl-svg=2.84=pl526_0 + - perl-text-format=0.59=pl526_2 + - perl-time-hires=1.9760=pl526h14c3975_1 + - perl-try-tiny=0.30=pl526_1 + - perl-xml-parser=2.44_01=pl526ha1d75be_1002 + - perl-xsloader=0.24=pl526_0 + - pexpect=4.7.0=py37_0 + - pickleshare=0.7.5=py37_1000 + - pip=19.2.3=py37_0 + - pixman=0.38.0=h516909a_1003 + - prometheus_client=0.7.1=py_0 + - prompt_toolkit=2.0.9=py_0 + - pthread-stubs=0.4=h14c3975_1001 + - ptyprocess=0.6.0=py_1001 + - pygments=2.4.2=py_0 + - pyrsistent=0.15.4=py37h516909a_0 + - python=3.7.3=h33d41f4_1 + - python-dateutil=2.8.0=py_0 + - pyzmq=18.0.2=py37h1768529_2 + - r-askpass=1.1=r36hcdcec82_1 + - r-assertthat=0.2.1=r36h6115d3f_1 + - r-backports=1.1.4=r36hcdcec82_1 + - r-base=3.6.1=hba50c9b_4 + - r-base64enc=0.1_3=r36hcdcec82_1003 + - r-bh=1.69.0_1=r36h6115d3f_1 + - r-boot=1.3_23=r36h6115d3f_2 + - r-broom=0.5.2=r36h6115d3f_1 + - r-callr=3.3.1=r36h6115d3f_0 + - r-caret=6.0_84=r36hcdcec82_1 + - r-cellranger=1.1.0=r36h6115d3f_1002 + - r-class=7.3_15=r36hcdcec82_1001 + - r-cli=1.1.0=r36h6115d3f_2 + - r-clipr=0.7.0=r36h6115d3f_0 + - r-cluster=2.1.0=r36h9bbef5b_2 + - r-codetools=0.2_16=r36h6115d3f_1001 + - r-colorspace=1.4_1=r36hcdcec82_1 + - r-crayon=1.3.4=r36h6115d3f_1002 + - r-curl=4.0=r36hcdcec82_0 + - r-data.table=1.12.2=r36hcdcec82_1 + - r-dbi=1.0.0=r36h6115d3f_1002 + - r-dbplyr=1.4.2=r36h6115d3f_1 + - r-digest=0.6.20=r36h0357c0b_1 + - r-dplyr=0.8.3=r36h0357c0b_2 + - r-ellipsis=0.2.0.1=r36hcdcec82_1 + - r-essentials=3.6=r36_2001 + - r-evaluate=0.14=r36h6115d3f_1 + - r-fansi=0.4.0=r36hcdcec82_1001 + - r-forcats=0.4.0=r36h6115d3f_1 + - r-foreach=1.4.7=r36h6115d3f_0 + - r-foreign=0.8_72=r36hcdcec82_0 + - r-formatr=1.7=r36h6115d3f_1 + - r-fs=1.3.1=r36h0357c0b_1 + - r-generics=0.0.2=r36h6115d3f_1002 + - r-getopt=1.20.3=r36_1 + - r-ggplot2=3.2.1=r36h6115d3f_0 + - r-gistr=0.4.2=r36h6115d3f_1002 + - r-glmnet=2.0_18=r36h9bbef5b_2 + - r-glue=1.3.1=r36hcdcec82_1 + - r-gower=0.2.1=r36hcdcec82_1 + - r-gridextra=2.3=r36h6115d3f_1002 + - r-gtable=0.3.0=r36h6115d3f_2 + - r-hash=3.0.1=r36h6115d3f_1 + - r-haven=2.1.1=r36h0357c0b_1 + - r-hexbin=1.27.3=r36h9bbef5b_2 + - r-highr=0.8=r36h6115d3f_1 + - r-hms=0.5.1=r36h6115d3f_0 + - r-htmltools=0.3.6=r36he1b5a44_1003 + - r-htmlwidgets=1.3=r36h6115d3f_1001 + - r-httpuv=1.5.1=r36h0357c0b_1 + - r-httr=1.4.1=r36h6115d3f_1 + - r-ipred=0.9_9=r36hcdcec82_1 + - r-irdisplay=0.7=r36_1001 + - r-irkernel=1.0.2=r36h6115d3f_2 + - r-iterators=1.0.12=r36h6115d3f_0 + - r-jsonlite=1.6=r36hcdcec82_1001 + - r-kernsmooth=2.23_15=r36h9bbef5b_1004 + - r-knitr=1.24=r36h6115d3f_0 + - r-labeling=0.3=r36h6115d3f_1002 + - r-later=0.8.0=r36h0357c0b_1 + - r-lattice=0.20_38=r36hcdcec82_1002 + - r-lava=1.6.6=r36h6115d3f_0 + - r-lazyeval=0.2.2=r36hcdcec82_1 + - r-lubridate=1.7.4=r36h0357c0b_1002 + - r-magrittr=1.5=r36h6115d3f_1002 + - r-maps=3.3.0=r36hcdcec82_1003 + - r-markdown=1.1=r36hcdcec82_0 + - r-mass=7.3_51.4=r36hcdcec82_1 + - r-matrix=1.2_17=r36hcdcec82_1 + - r-mgcv=1.8_28=r36hcdcec82_1 + - r-mime=0.7=r36hcdcec82_1 + - r-modelmetrics=1.2.2=r36h0357c0b_1 + - r-modelr=0.1.5=r36h6115d3f_0 + - r-munsell=0.5.0=r36h6115d3f_1002 + - r-nlme=3.1_141=r36h9bbef5b_1 + - r-nnet=7.3_12=r36hcdcec82_1003 + - r-numderiv=2016.8_1.1=r36h6115d3f_1 + - r-openssl=1.4.1=r36h9c8475f_0 + - r-optparse=1.6.2=r36h6115d3f_1 + - r-pbdzmq=0.3_3=r36h559a7a4_1002 + - r-pillar=1.4.2=r36h6115d3f_2 + - r-pkgconfig=2.0.2=r36h6115d3f_1003 + - r-plogr=0.2.0=r36h6115d3f_1002 + - r-plyr=1.8.4=r36h0357c0b_1003 + - r-prettyunits=1.0.2=r36h6115d3f_1002 + - r-processx=3.4.1=r36hcdcec82_0 + - r-prodlim=2018.04.18=r36h0357c0b_1003 + - r-progress=1.2.2=r36h6115d3f_1 + - r-promises=1.0.1=r36h0357c0b_1001 + - r-pryr=0.1.4=r36h0357c0b_1003 + - r-ps=1.3.0=r36hcdcec82_1001 + - r-purrr=0.3.2=r36hcdcec82_1 + - r-quantmod=0.4_15=r36h6115d3f_1 + - r-r6=2.4.0=r36h6115d3f_2 + - r-randomforest=4.6_14=r36h9bbef5b_1002 + - r-rbokeh=0.5.0=r36h6115d3f_1002 + - r-rcolorbrewer=1.1_2=r36h6115d3f_1002 + - r-rcpp=1.0.2=r36h0357c0b_0 + - r-readr=1.3.1=r36h0357c0b_1001 + - r-readxl=1.3.1=r36h0357c0b_1 + - r-recipes=0.1.6=r36h6115d3f_1 + - r-recommended=3.6=r36_1003 + - r-rematch=1.0.1=r36h6115d3f_1002 + - r-repr=1.0.1=r36h6115d3f_1 + - r-reprex=0.3.0=r36h6115d3f_1 + - r-reshape2=1.4.3=r36h0357c0b_1004 + - r-rlang=0.4.0=r36hcdcec82_1 + - r-rmarkdown=1.15=r36h6115d3f_0 + - r-rpart=4.1_15=r36hcdcec82_1 + - r-rstudioapi=0.10=r36h6115d3f_2 + - r-rvest=0.3.4=r36h6115d3f_1 + - r-scales=1.0.0=r36h0357c0b_1002 + - r-selectr=0.4_1=r36h6115d3f_1001 + - r-shiny=1.3.2=r36h6115d3f_1 + - r-sourcetools=0.1.7=r36he1b5a44_1001 + - r-spatial=7.3_11=r36hcdcec82_1003 + - r-squarem=2017.10_1=r36h6115d3f_1002 + - r-stringi=1.4.3=r36h0e574ca_3 + - r-stringr=1.4.0=r36h6115d3f_1 + - r-survival=2.44_1.1=r36hcdcec82_1 + - r-sys=3.3=r36hcdcec82_0 + - r-tibble=2.1.3=r36hcdcec82_1 + - r-tidyr=0.8.3=r36h0357c0b_1 + - r-tidyselect=0.2.5=r36h0357c0b_1001 + - r-tidyverse=1.2.1=r36h6115d3f_1002 + - r-timedate=3043.102=r36h6115d3f_1001 + - r-tinytex=0.15=r36h6115d3f_0 + - r-ttr=0.23_4=r36h9bbef5b_1002 + - r-utf8=1.1.4=r36hcdcec82_1001 + - r-uuid=0.1_2=r36hcdcec82_1002 + - r-vctrs=0.2.0=r36hcdcec82_1 + - r-viridislite=0.3.0=r36h6115d3f_1002 + - r-whisker=0.4=r36h6115d3f_0 + - r-withr=2.1.2=r36h6115d3f_1001 + - r-xfun=0.9=r36h6115d3f_0 + - r-xml2=1.2.2=r36h0357c0b_0 + - r-xtable=1.8_4=r36h6115d3f_2 + - r-xts=0.11_2=r36hcdcec82_1 + - r-yaml=2.2.0=r36hcdcec82_1002 + - r-zeallot=0.1.0=r36h6115d3f_1001 + - r-zoo=1.8_6=r36hcdcec82_1 + - readline=8.0=hf8c457e_0 + - sed=4.7=h1bed415_1000 + - send2trash=1.5.0=py_0 + - setuptools=41.2.0=py37_0 + - six=1.12.0=py37_1000 + - sqlite=3.29.0=hcee41ef_1 + - terminado=0.8.2=py37_0 + - testpath=0.4.2=py_1001 + - tk=8.6.9=hed695b0_1002 + - tktable=2.10=h555a92e_1 + - tornado=6.0.3=py37h516909a_0 + - traitlets=4.3.2=py37_1000 + - wcwidth=0.1.7=py_1 + - webencodings=0.5.1=py_1 + - wheel=0.33.6=py37_0 + - xorg-kbproto=1.0.7=h14c3975_1002 + - xorg-libice=1.0.10=h516909a_0 + - xorg-libsm=1.2.3=h84519dc_1000 + - xorg-libx11=1.6.8=h516909a_0 + - xorg-libxau=1.0.9=h14c3975_0 + - xorg-libxdmcp=1.1.3=h516909a_0 + - xorg-libxext=1.3.4=h516909a_0 + - xorg-libxrender=0.9.10=h516909a_1002 + - xorg-renderproto=0.11.1=h14c3975_1002 + - xorg-xextproto=7.3.0=h14c3975_1002 + - xorg-xproto=7.0.31=h14c3975_1007 + - xz=5.2.4=h14c3975_1001 + - zeromq=4.3.2=he1b5a44_2 + - zlib=1.2.11=h516909a_1005 + - zstd=1.4.0=h3b9ef0a_0 +prefix: /data/home/achande3/anaconda3/envs/pChunks +