From e66991f6fdb6d175f9135f5f19a6f705d0f4d4c6 Mon Sep 17 00:00:00 2001 From: Kincekara Date: Tue, 17 Dec 2024 16:41:30 +0000 Subject: [PATCH] adds bbtools 39.13 w/ bugfix --- README.md | 2 +- bbtools/39.13/Dockerfile | 70 ++++++++++++++++++++++++ bbtools/39.13/README.md | 114 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 bbtools/39.13/Dockerfile create mode 100644 bbtools/39.13/README.md diff --git a/README.md b/README.md index c00458f4f..fe5e9731f 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ To learn more about the docker pull rate limits and the open source software pro | [Auspice](https://hub.docker.com/r/staphb/auspice)
[![docker pulls](https://badgen.net/docker/pulls/staphb/auspice)](https://hub.docker.com/r/staphb/auspice) | | https://github.com/nextstrain/auspice | | [bakta](https://hub.docker.com/r/staphb/bakta)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bakta)](https://hub.docker.com/r/staphb/bakta) | | https://github.com/oschwengers/bakta | | [bandage](https://hub.docker.com/r/staphb/bandage)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bandage)](https://hub.docker.com/r/staphb/bandage) | | https://rrwick.github.io/Bandage/ | -| [BBTools](https://hub.docker.com/r/staphb/bbtools/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bbtools)](https://hub.docker.com/r/staphb/bbtools) | | https://jgi.doe.gov/data-and-tools/bbtools/ | +| [BBTools](https://hub.docker.com/r/staphb/bbtools/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bbtools)](https://hub.docker.com/r/staphb/bbtools) | | https://jgi.doe.gov/data-and-tools/bbtools/ | | [bcftools](https://hub.docker.com/r/staphb/bcftools/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bcftools)](https://hub.docker.com/r/staphb/bcftools) | | https://github.com/samtools/bcftools | | [bedtools](https://hub.docker.com/r/staphb/bedtools/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bedtools)](https://hub.docker.com/r/staphb/bedtools) | | https://bedtools.readthedocs.io/en/latest/
https://github.com/arq5x/bedtools2 | | [berrywood-report-env](https://hub.docker.com/r/staphb/berrywood-report-env/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/berrywood-report-env)](https://hub.docker.com/r/staphb/berrywood-report-env) | | none |
diff --git a/bbtools/39.13/Dockerfile b/bbtools/39.13/Dockerfile
new file mode 100644
index 000000000..6a57f87ea
--- /dev/null
+++ b/bbtools/39.13/Dockerfile
@@ -0,0 +1,70 @@
+FROM staphb/samtools:1.21 AS samtools
+FROM staphb/htslib:1.21 AS htslib
+
+# As a reminder
+# https://github.com/StaPH-B/docker-builds/pull/925#issuecomment-2010553275
+# bbmap/docs/TableOfContents.txt lists additional dependencies
+
+FROM ubuntu:jammy AS app
+
+ARG SAMBAMBAVER=1.0.1
+ARG BBTOOLSVER=39.13
+
+LABEL base.image="ubuntu:jammy"
+LABEL dockerfile.version="1"
+LABEL software="BBTools"
+LABEL software.version=${BBTOOLSVER}
+LABEL description="A set of tools labeled as \"Bestus Bioinformaticus\""
+LABEL website="https://sourceforge.net/projects/bbmap"
+LABEL documentation="https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/"
+LABEL license="https://jgi.doe.gov/disclaimer/"
+LABEL maintainer="Abigail Shockey"
+LABEL maintainer.email="abigail.shockey@slh.wisc.edu"
+LABEL maintainer2="Padraic Fanning"
+LABEL maintainer2.email="faninnpm AT miamioh DOT edu"
+
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y \
+    openjdk-8-jre-headless \
+    pigz \
+    pbzip2 \
+    lbzip2 \
+    bzip2 \
+    libcurl4-gnutls-dev \
+    libdeflate-dev \
+    wget \
+    ca-certificates \
+    procps && \
+    rm -rf /var/lib/apt/lists/* && \
+    apt-get autoclean
+
+# copy the samtools and htslib executables from the earlier build stages
+COPY --from=samtools /usr/local/bin/* /usr/local/bin/
+COPY --from=htslib /usr/local/bin/* /usr/local/bin/
+
+# download and install sambamba
+RUN wget -q https://github.com/biod/sambamba/releases/download/v${SAMBAMBAVER}/sambamba-${SAMBAMBAVER}-linux-amd64-static.gz && \
+    gzip -d sambamba-${SAMBAMBAVER}-linux-amd64-static.gz && \
+    mv sambamba-${SAMBAMBAVER}-linux-amd64-static /usr/local/bin/sambamba && \
+    chmod +x /usr/local/bin/sambamba
+
+# download and install bbtools (the archive unpacks to /bbmap, which is put on PATH below)
+RUN wget -q https://sourceforge.net/projects/bbmap/files/BBMap_${BBTOOLSVER}.tar.gz && \
+    tar -xzf BBMap_${BBTOOLSVER}.tar.gz && \
+    rm BBMap_${BBTOOLSVER}.tar.gz && \
+    mkdir /data
+
+ENV PATH=/bbmap/:$PATH \
+    LC_ALL=C
+# default command prints the last 90 lines of the bundled table of contents; exec form, no shell features needed
+CMD ["tail", "-n", "90", "/bbmap/docs/TableOfContents.txt"]
+
+WORKDIR /data
+
+# testing
+FROM app AS test
+
+# get test data and test one thing that uses samtools/sambamba
+RUN wget -q https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.primertrim.sorted.bam && \
+    streamsam.sh in='SRR13957123.primertrim.sorted.bam' out='test_SRR13957123.primertrim.sorted.fastq.gz' && \
+    test -f test_SRR13957123.primertrim.sorted.fastq.gz
diff --git a/bbtools/39.13/README.md b/bbtools/39.13/README.md
new file mode 100644
index 000000000..e65829476
--- /dev/null
+++ b/bbtools/39.13/README.md
@@ -0,0 +1,114 @@
+# BBTools container
+
+Main tool: [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/)
+
+Code repository: https://sourceforge.net/projects/bbmap/
+
+Additional tools:
+- samtools: 1.21
+- htslib: 1.21
+- sambamba: 1.0.1
+
+Basic information on how to use this tool:
+- executable: *.sh
+- help: Program descriptions and options are shown when running the shell scripts with no parameters.
+- version: --version
+- description:
+>BBTools is a suite of fast, multithreaded bioinformatics tools designed for analysis of DNA and RNA sequence data. BBTools can handle common sequencing file formats such as fastq, fasta, sam, scarf, fasta+qual, compressed or raw, with autodetection of quality encoding and interleaving.
+ +Additional information: +| Script | Purpose | Comment | +|-------------------------|----------------------------------------------------------------------------------|------------------------------------------------------------------------| +| bbcms.sh | Performs error correction using a Count-Min Sketch | Intended for metagenome assembly | +| bbcountunique.sh | Counts unique kmers in reads | | +| bbduk.sh | Trims, filters or masks reads using kmers | | +| bbmap.sh | Splice-aware aligner for short reads | | +| bbmapskimmer.sh | BBMap version designed for high levels of multimapping | | +| bbmask.sh | Masks references based on various things, such as sequence complexity | | +| bbmerge.sh | Merges overlapping paired reads | | +| bbmerge-auto.sh | Same as bbmerge, but tries to allocate all memory on the node | Use this version for kmer operations like extend | +| bbnorm.sh | Normalizes reads based on coverage | Mainly for use prior to single-cell assembly | +| bbsplit.sh | BBMap version that maps to multiple references simultaneously | Intended for decontamination; similar to Seal | +| bbversion.sh | Prints the version of BBTools | | +| bbwrap.sh | Wraps BBMap to process many files using same reference | Saves time by loading the index only once | +| calctruequality.sh | Allows recalibration of quality scores from mapped reads | This generates the correction matrix; BBDuk does the recalibration | +| callgenes.sh | Fast prokaryotic gene caller | Integrated into BBSketch | +| callvariants.sh | Fast variant caller | | +| callvariants2.sh | Same as callvariants.sh with the "multisample" flag | | +| clumpify.sh | Shrinks compressed fastq files, and can remove duplicate reads | Also supports error correction | +| comparesketch.sh | Compares sketches locally, without using a sketch server | | +| crossblock.sh | Alias for decontaminate.sh | | +| cutgff.sh | Cuts out features defined by gff file | E.g., generates one fasta entry per gene from a gff and an assembly | 
+| cutprimers.sh | Cuts out subregions of ribosomes | Mainly for 16S analysis | +| decontaminate.sh | Pool-level decontamination for single-cell MDA-amplified genomes | | +| dedupe.sh | Removes duplicate and fully-contained sequences | Can also be used to cluster 16S sequences | +| dedupe2.sh | Version of dedupe that supports more hash keys for greater sensitivity | | +| dedupebymapping.sh | Deduplicates reads based on mapping coordinates | | +| demuxbyname.sh | Demultiplexes based on sequence headers | | +| filterbyname.sh | Filters based on sequence headers | | +| filterbytaxa.sh | Filters sequences based on taxonomic classification | Used with NCBI datasets | +| filterbytile.sh | Removes reads that are in low quality areas on flowcell | | +| filterqc.sh | Part of JGI's fastq filtering pipeline | | +| filtersam.sh | Filters sam files to remove reads with multiple unsupported mismatches | Designed for NovaSeq | +| gitable.sh | Used to process NCBI taxonomy data | | +| khist.sh | Alias for bbnorm.sh with flags for making a kmer frequency histogram | | +| kmercountexact.sh | Counts kmers and produces a histogram | Uses more memory than BBNorm but allows exact counts | +| kmercountmulti.sh | Cardinality estimation over multiple kmer lengths | Uses LogLog; does not produce a histogram | +| mapPacBio.sh | BBMap version designed for PacBio or Nanopore reads | Reads longer than 5kbp get broken into 5kbp shreds | +| mergesketch.sh | Allows multiple sketches to be combined | | +| msa.sh | Alignment tool | Used with cutprimers.sh to cut subsections out of 16S | +| mutate.sh | Generates synthetic genomes by randomly mutating the input | | +| muxbyname.sh | Multiplexes multiple files, renaming sequences based on input file name | Opposite of demuxbyname.sh | +| partition.sh | Splits a sequence file into multiple files | | +| pileup.sh | Calculates coverage from sam files | | +| plotflowcell.sh | Produces statistics about flowcell positions | | +| processhi-c.sh | Custom 
trimming for hi-C reads | In development | +| randomreads.sh | Generates synthetic data from real genome reference | Highly customizable | +| readqc.sh | Short read quality report | Alternative to fastqc | +| reformat.sh | Converts sequence files to another format | Has many additional options, includes subsampling | +| rename.sh | Renames sequences in various ways, such as adding a prefix | | +| repair.sh | Fixes broken pairing in fastq files | | +| representative.sh | Makes a smaller subset of a reference dataset by eliminating redundancy | Designed for use with BBSketch output | +| rqcfilter2.sh | Filtering pipeline used at JGI | portal.nersc.gov/dna/microbial/assembly/bushnell/RQCFilterData.tar | +| seal.sh | Counts kmer matches between query and reference sequences | | +| sendsketch.sh | Fast taxonomic classifier using webservers at JGI | | +| shred.sh | Breaks sequences into shorter, fixed-length pieces | | +| shuffle.sh | Randomly reorders input file | Crashes if input doesn't fit in memory | +| shuffle2.sh | Randomly reorders input file | Supports larger files, but output might be less random | +| sketch.sh | Makes reference sketches on a per-TaxID basis | | +| sketchblacklist.sh | Makes sketch blacklists of common kmers | | +| sortbyname.sh | Sorts sequences by name, length, quality, taxa, and other things | | +| summarizequast.sh | Generates box plots for multiple quast reports | | +| tadpipe.sh | Preprocessing and assembly pipeline using tadpole | | +| tadpole.sh | Fast short read assembler | | +| tadwrapper.sh | Runs Tadpole with multiple kmer lengths to select the best assembly | | +| taxserver.sh | Starts taxonomy and sketch servers | | +| testformat.sh | Determines if file is fasta, fastq, interleaved, etc. 
by reading first few lines | | +| testformat2.sh | Generates extensive statistics by reading the full file | | +| translate6frames.sh | Translates nucleotide sequence into amino acid sequence in all frames | | +| vcf2gff.sh | Converts vcf format to gff format | | + +Full documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/ + +## Example Usage + +(adapted from `/bbmap/pipelines/covid/processCorona.sh`) + +Interleave a pair of FASTQ files for downstream processing: + +```text +reformat.sh \ + in1=${SAMPLE}_R1.fastq.gz \ + in2=${SAMPLE}_R2.fastq.gz \ + out=${SAMPLE}.fastq.gz +``` +Split into SARS-CoV-2 and non-SARS-CoV-2 reads: + +```text +bbduk.sh ow -Xmx1g \ + in=${SAMPLE}.fastq.gz \ + ref=REFERENCE.fasta \ + outm=${SAMPLE}_viral.fq.gz \ + outu=${SAMPLE}_nonviral.fq.gz \ + k=25 +```