From 95ee2ba5d25b167c7416deb5c714eed751e791ed Mon Sep 17 00:00:00 2001 From: Tony Bolger Date: Mon, 12 Oct 2020 15:42:49 +0200 Subject: [PATCH] Version 0.32 --- MANIFEST.MF | 1 + adapters/NexteraPE-PE.fa | 12 ++ adapters/TruSeq3-PE-2.fa | 12 ++ build.xml | 4 +- .../usadellab/trimmomatic/Trimmomatic.java | 14 +- .../usadellab/trimmomatic/TrimmomaticPE.java | 179 ++++++++++++++++-- .../usadellab/trimmomatic/TrimmomaticSE.java | 28 ++- .../trimmomatic/fastq/FastqParser.java | 87 ++++++++- .../trimmomatic/fastq/FastqRecord.java | 7 +- 9 files changed, 317 insertions(+), 27 deletions(-) create mode 100644 MANIFEST.MF create mode 100644 adapters/NexteraPE-PE.fa create mode 100644 adapters/TruSeq3-PE-2.fa diff --git a/MANIFEST.MF b/MANIFEST.MF new file mode 100644 index 0000000..5ff8755 --- /dev/null +++ b/MANIFEST.MF @@ -0,0 +1 @@ +Main-Class: org.usadellab.trimmomatic.Trimmomatic diff --git a/adapters/NexteraPE-PE.fa b/adapters/NexteraPE-PE.fa new file mode 100644 index 0000000..a986757 --- /dev/null +++ b/adapters/NexteraPE-PE.fa @@ -0,0 +1,12 @@ +>PrefixNX/1 +AGATGTGTATAAGAGACAG +>PrefixNX/2 +AGATGTGTATAAGAGACAG +>Trans1 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>Trans1_rc +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>Trans2 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>Trans2_rc +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC \ No newline at end of file diff --git a/adapters/TruSeq3-PE-2.fa b/adapters/TruSeq3-PE-2.fa new file mode 100644 index 0000000..b205511 --- /dev/null +++ b/adapters/TruSeq3-PE-2.fa @@ -0,0 +1,12 @@ +>PrefixPE/1 +TACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PrefixPE/2 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>PE1 +TACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PE1_rc +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA +>PE2 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>PE2_rc +AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ No newline at end of file diff --git a/build.xml b/build.xml index d93463f..87b0c8e 100644 --- a/build.xml +++ b/build.xml @@ -1,5 +1,5 @@ - + @@ -72,7 +72,7 @@ - + diff --git a/src/org/usadellab/trimmomatic/Trimmomatic.java b/src/org/usadellab/trimmomatic/Trimmomatic.java index 9e9e992..6d84923 100644 --- a/src/org/usadellab/trimmomatic/Trimmomatic.java +++ b/src/org/usadellab/trimmomatic/Trimmomatic.java @@ -6,6 +6,18 @@ public class Trimmomatic { + private static final int MAX_AUTO_THREADS=16; + + public static int calcAutoThreadCount() + { + int cpus=Runtime.getRuntime().availableProcessors(); + + if(cpus>MAX_AUTO_THREADS) + return MAX_AUTO_THREADS; + + return cpus; + } + /** * @param args */ @@ -33,7 +45,7 @@ else if(mode.equals("SE")) if(showUsage) { System.err.println("Usage: "); - System.err.println(" PE [-threads ] [-phred33|-phred64] [-trimlog ] ..."); + System.err.println(" PE [-threads ] [-phred33|-phred64] [-trimlog ] [-basein | ] [-baseout | ] ..."); System.err.println(" or: "); System.err.println(" SE [-threads ] [-phred33|-phred64] [-trimlog ] ..."); System.exit(1); diff --git a/src/org/usadellab/trimmomatic/TrimmomaticPE.java b/src/org/usadellab/trimmomatic/TrimmomaticPE.java index 0af8f83..c9a78a0 100644 --- a/src/org/usadellab/trimmomatic/TrimmomaticPE.java +++ b/src/org/usadellab/trimmomatic/TrimmomaticPE.java @@ -21,7 +21,7 @@ import org.usadellab.trimmomatic.trim.Trimmer; import org.usadellab.trimmomatic.trim.TrimmerFactory; -public class TrimmomaticPE +public class TrimmomaticPE extends Trimmomatic { /** @@ -253,6 +253,24 @@ public void process(File input1, File input2, File output1P, File output1U, File FastqParser parser2 = new FastqParser(phredOffset); parser2.parse(input2); + if(phredOffset==0) + { + int phred1=parser1.determinePhredOffset(); + int phred2=parser2.determinePhredOffset(); + + if(phred1==phred2 && phred1!=0) + { + System.err.println("Quality encoding detected as phred"+phred1); + parser1.setPhredOffset(phred1); + parser2.setPhredOffset(phred1); + } + else + { + System.err.println("Error: Unable to detect quality encoding"); + System.exit(1); + } + } + FastqSerializer serializer1P = new FastqSerializer(); serializer1P.open(output1P); @@ -286,13 +304,92 @@ public void process(File input1, File input2, File output1P, File output1U, File if (trimLogStream != null) trimLogStream.close(); } + + private static int getFileExtensionIndex(String str) + { + String extensions[]={".fq",".fastq",".txt",".gz",".bz2",".zip"}; + + String tmp=str; + boolean done=false; + + while(!done) + { + done=true; + for(String ext: extensions) + { + if(tmp.endsWith(ext)) + { + tmp=tmp.substring(0,tmp.length()-ext.length()); + done=false; + } + } + } + + return tmp.length(); + } + + private static String replaceLast(String str, String out, String in) + { + int idx1=str.lastIndexOf(out); + if(idx1==-1) + return null; + + int idx2=idx1+out.length(); + + return str.substring(0,idx1)+in+str.substring(idx2); + } + + + + private static File[] calculateTemplatedInput(String baseStr) + { + String translation[][]={{"_R1_","_R2_"},{"_f","_r"},{".f",".r"},{"_1","_2"},{".1",".2"}}; + + File fileBase=new File(baseStr); + File baseDir=fileBase.getParentFile(); + + String baseName=fileBase.getName(); + int extSplit=getFileExtensionIndex(baseName); + + String core=baseName.substring(0,extSplit); + String exts=baseName.substring(extSplit); + + for(String pair[]: translation) + { + String tmp=replaceLast(core, pair[0], pair[1]); + if(tmp!=null) + return new File[] {fileBase, new File(baseDir, tmp+exts)}; + } + + return null; + } + + + private static File[] calculateTemplatedOutput(String baseStr) + { + File fileBase=new File(baseStr); + File baseDir=fileBase.getParentFile(); + + String baseName=fileBase.getName(); + int extSplit=getFileExtensionIndex(baseName); + + String core=baseName.substring(0,extSplit); + String exts=baseName.substring(extSplit); + + return new File[] {new File(baseDir,core+"_1P"+exts),new File(baseDir,core+"_1U"+exts),new File(baseDir,core+"_2P"+exts),new File(baseDir,core+"_2U"+exts)}; + } + + public static boolean run(String[] args) throws IOException { int argIndex = 0; - int phredOffset = 64; - int threads = 1; + int phredOffset = 0; + int threads = 0; + String templateInput=null; + String templateOutput=null; + boolean badOption = false; File trimLog = null; @@ -313,6 +410,20 @@ else if (arg.equals("-trimlog")) else badOption = true; } + else if (arg.equals("-basein")) + { + if (argIndex < args.length) + templateInput = args[argIndex++]; + else + badOption = true; + } + else if (arg.equals("-baseout")) + { + if (argIndex < args.length) + templateOutput = args[argIndex++]; + else + badOption = true; + } else { System.err.println("Unknown option " + arg); @@ -320,22 +431,62 @@ else if (arg.equals("-trimlog")) } } - if (args.length - argIndex < 7 || badOption) + int additionalArgs=1+(templateInput==null?2:0)+(templateOutput==null?4:0); + + if (args.length - argIndex < additionalArgs || badOption) return false; System.err.print("TrimmomaticPE: Started with arguments:"); for (String arg : args) System.err.print(" " + arg); System.err.println(); + + if(threads==0) + { + threads=calcAutoThreadCount(); + if(threads>1) + System.err.println("Multiple cores found: Using "+threads+" threads"); + } - File input1 = new File(args[argIndex++]); - File input2 = new File(args[argIndex++]); - - File output1P = new File(args[argIndex++]); - File output1U = new File(args[argIndex++]); - - File output2P = new File(args[argIndex++]); - File output2U = new File(args[argIndex++]); + File inputs[],outputs[]; + + if(templateInput!=null) + { + inputs=calculateTemplatedInput(templateInput); + if(inputs==null) + { + System.err.println("Unable to determine input files from: "+templateInput); + System.exit(1); + } + + System.out.println("Using templated Input files: "+inputs[0]+" "+inputs[1]); + } + else + { + inputs=new File[2]; + inputs[0]=new File(args[argIndex++]); + inputs[1]=new File(args[argIndex++]); + } + + if(templateOutput!=null) + { + outputs=calculateTemplatedOutput(templateOutput); + if(outputs==null) + { + System.err.println("Unable to determine output files from: "+templateInput); + System.exit(1); + } + + System.out.println("Using templated Output files: "+outputs[0]+" "+outputs[1]+" "+outputs[2]+" "+outputs[3]); + } + else + { + outputs=new File[4]; + outputs[0]=new File(args[argIndex++]); + outputs[1]=new File(args[argIndex++]); + outputs[2]=new File(args[argIndex++]); + outputs[3]=new File(args[argIndex++]); + } TrimmerFactory fac = new TrimmerFactory(); Trimmer trimmers[] = new Trimmer[args.length - argIndex]; @@ -344,7 +495,7 @@ else if (arg.equals("-trimlog")) trimmers[i] = fac.makeTrimmer(args[i + argIndex]); TrimmomaticPE tm = new TrimmomaticPE(); - tm.process(input1, input2, output1P, output1U, output2P, output2U, trimmers, phredOffset, trimLog, threads); + tm.process(inputs[0], inputs[1], outputs[0], outputs[1], outputs[2], outputs[3], trimmers, phredOffset, trimLog, threads); System.err.println("TrimmomaticPE: Completed successfully"); return true; @@ -355,7 +506,7 @@ public static void main(String[] args) throws IOException if (!run(args)) { System.err - .println("Usage: TrimmomaticPE [-threads ] [-phred33|-phred64] [-trimlog ] ..."); + .println("Usage: TrimmomaticPE [-threads ] [-phred33|-phred64] [-trimlog ] [-basein | ] [-baseout | ] ..."); System.exit(1); } } diff --git a/src/org/usadellab/trimmomatic/TrimmomaticSE.java b/src/org/usadellab/trimmomatic/TrimmomaticSE.java index 20bbab7..4307274 100644 --- a/src/org/usadellab/trimmomatic/TrimmomaticSE.java +++ b/src/org/usadellab/trimmomatic/TrimmomaticSE.java @@ -21,7 +21,7 @@ import org.usadellab.trimmomatic.trim.Trimmer; import org.usadellab.trimmomatic.trim.TrimmerFactory; -public class TrimmomaticSE +public class TrimmomaticSE extends Trimmomatic { /** @@ -191,6 +191,21 @@ public void process(File input, File output, Trimmer trimmers[], int phredOffset FastqParser parser = new FastqParser(phredOffset); parser.parse(input); + if(phredOffset==0) + { + int phred=parser.determinePhredOffset(); + if(phred!=0) + { + System.err.println("Quality encoding detected as phred"+phred); + parser.setPhredOffset(phred); + } + else + { + System.err.println("Error: Unable to detect quality encoding"); + System.exit(1); + } + } + FastqSerializer serializer = new FastqSerializer(); serializer.open(output); @@ -212,8 +227,8 @@ public void process(File input, File output, Trimmer trimmers[], int phredOffset public static boolean run(String[] args) throws IOException { int argIndex = 0; - int phredOffset = 64; - int threads = 1; + int phredOffset = 0; + int threads = 0; boolean badOption = false; @@ -250,6 +265,13 @@ else if (arg.equals("-trimlog")) System.err.print(" " + arg); System.err.println(); + if(threads==0) + { + threads=calcAutoThreadCount(); + System.err.println("Automatically using "+threads+" threads"); + } + + File input = new File(args[argIndex++]); File output = new File(args[argIndex++]); diff --git a/src/org/usadellab/trimmomatic/fastq/FastqParser.java b/src/org/usadellab/trimmomatic/fastq/FastqParser.java index 6f51a04..1235f15 100644 --- a/src/org/usadellab/trimmomatic/fastq/FastqParser.java +++ b/src/org/usadellab/trimmomatic/fastq/FastqParser.java @@ -6,6 +6,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.util.ArrayDeque; import java.util.concurrent.atomic.AtomicBoolean; import java.util.zip.ZipInputStream; @@ -15,7 +16,12 @@ public class FastqParser { + private static final int PREREAD_COUNT=10000; + private int phredOffset; + private ArrayDeque deque; + int qualHistogram[]; + private PositionTrackingInputStream posTrackInputStream; private BufferedReader reader; private FastqRecord current; @@ -25,10 +31,21 @@ public class FastqParser { public FastqParser(int phredOffset) { this.phredOffset = phredOffset; + deque=new ArrayDeque(PREREAD_COUNT); + this.atEOF=new AtomicBoolean(); } - public void parseOne() throws IOException { + public void setPhredOffset(int phredOffset) + { + this.phredOffset=phredOffset; + + if(current!=null) + current.setPhredOffset(phredOffset); + } + + public void parseOne() throws IOException + { current = null; String name; @@ -73,6 +90,36 @@ public int getProgress() { return (int)(((float) bytesRead / fileLength) * 100); } + + private void accumulateHistogram(FastqRecord rec) + { + int quals[]=rec.getQualityAsInteger(false); + + for(int i: quals) + qualHistogram[i]++; + } + + public int determinePhredOffset() + { + int phred33Total=0; + int phred64Total=0; + + for(int i=33;i<=58;i++) + phred33Total+=qualHistogram[i]; + + for(int i=80;i<=104;i++) + phred64Total+=qualHistogram[i]; + + if(phred33Total==0 && phred64Total>0) + return 64; + + if(phred64Total==0 && phred33Total>0) + return 33; + + return 0; + } + + public void parse(File file) throws IOException { String name = file.getName(); fileLength = file.length(); @@ -90,6 +137,22 @@ public void parse(File file) throws IOException { } reader=new BufferedReader(new InputStreamReader(contentInputStream), 32768); + + if(phredOffset==0) + { + deque.clear(); + qualHistogram=new int[256]; + + for(int i=0;i