diff --git a/README.md b/README.md index 6d1210e11..6c3ade6f5 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ To learn more about the docker pull rate limits and the open source software pro | [NCBI table2asn](https://hub.docker.com/r/staphb/ncbi-table2asn)
[![docker pulls](https://badgen.net/docker/pulls/staphb/ncbi-table2asn)](https://hub.docker.com/r/staphb/ncbi-table2asn) | | [https://www.ncbi.nlm.nih.gov/genbank/table2asn/](https://www.ncbi.nlm.nih.gov/genbank/table2asn/)
[https://ftp.ncbi.nlm.nih.gov/asn1-converters/versions/2022-06-14/by_program/table2asn/](https://ftp.ncbi.nlm.nih.gov/asn1-converters/versions/2022-06-14/by_program/table2asn/) | | [OrthoFinder](https://hub.docker.com/r/staphb/OrthoFinder)
[![docker pulls](https://badgen.net/docker/pulls/staphb/orthofinder)](https://hub.docker.com/r/staphb/orthofinder) | | https://github.com/davidemms/OrthoFinder | | [Panaroo](https://hub.docker.com/r/staphb/panaroo)
[![docker pulls](https://badgen.net/docker/pulls/staphb/panaroo)](https://hub.docker.com/r/staphb/panaroo) | | https://github.com/gtonkinhill/panaroo | -| [Pangolin](https://hub.docker.com/r/staphb/pangolin)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pangolin)](https://hub.docker.com/r/staphb/pangolin) |
Click to see Pangolin v3 and older versions! **Pangolin version & pangoLEARN data release date**
**Pangolin version & pangolin-data version**
| https://github.com/cov-lineages/pangolin
https://github.com/cov-lineages/pangoLEARN
https://github.com/cov-lineages/pango-designation
https://github.com/cov-lineages/scorpio
https://github.com/cov-lineages/constellations
https://github.com/cov-lineages/lineages (archived)
https://github.com/hCoV-2019/pangolin (archived) | +| [Pangolin](https://hub.docker.com/r/staphb/pangolin)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pangolin)](https://hub.docker.com/r/staphb/pangolin) |
Click to see Pangolin v3 and older versions! **Pangolin version & pangoLEARN data release date**
**Pangolin version & pangolin-data version**
| https://github.com/cov-lineages/pangolin
https://github.com/cov-lineages/pangoLEARN
https://github.com/cov-lineages/pango-designation
https://github.com/cov-lineages/scorpio
https://github.com/cov-lineages/constellations
https://github.com/cov-lineages/lineages (archived)
https://github.com/hCoV-2019/pangolin (archived) | | [parallel-perl](https://hub.docker.com/r/staphb/parallel-perl)
[![docker pulls](https://badgen.net/docker/pulls/staphb/parallel-perl)](https://hub.docker.com/r/staphb/parallel-perl) | | https://www.gnu.org/software/parallel | | [pasty](https://hub.docker.com/r/staphb/pasty)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pasty)](https://hub.docker.com/r/staphb/pasty) | | https://github.com/rpetit3/pasty | | [pbptyper](https://hub.docker.com/r/staphb/pbptyper)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pbptyper)](https://hub.docker.com/r/staphb/pbptyper) | | https://github.com/rpetit3/pbptyper | diff --git a/pangolin/4.3-pdata-1.20/Dockerfile b/pangolin/4.3-pdata-1.20/Dockerfile new file mode 100644 index 000000000..b1c19159d --- /dev/null +++ b/pangolin/4.3-pdata-1.20/Dockerfile @@ -0,0 +1,127 @@ +FROM mambaorg/micromamba:1.4.3 as app + +# build and run as root users since micromamba image has 'mambauser' set as the $USER +USER root +# set workdir to default for building; set to /data at the end +WORKDIR / + +# ARG variables only persist during build time +# had to include the v for some of these due to GitHub tags. +# using pangolin-data github tag, NOT what is in the GH release title "v1.2.133" +ARG PANGOLIN_VER="v4.3" +ARG PANGOLIN_DATA_VER="v1.20" +ARG SCORPIO_VER="v0.3.17" +ARG CONSTELLATIONS_VER="v0.1.10" +ARG USHER_VER="0.6.2" + +# metadata labels +LABEL base.image="mambaorg/micromamba:1.4.3" +LABEL dockerfile.version="1" +LABEL software="pangolin" +LABEL software.version=${PANGOLIN_VER} +LABEL description="Conda environment for Pangolin. Pangolin: Software package for assigning SARS-CoV-2 genome sequences to global lineages." 
+LABEL website="https://github.com/cov-lineages/pangolin" +LABEL license="GNU General Public License v3.0" +LABEL license.url="https://github.com/cov-lineages/pangolin/blob/master/LICENSE.txt" +LABEL maintainer="Curtis Kapsak" +LABEL maintainer.email="kapsakcj@gmail.com" + +# install dependencies; cleanup apt garbage +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + git \ + procps \ + bsdmainutils && \ + apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# get the pangolin repo +RUN wget "https://github.com/cov-lineages/pangolin/archive/${PANGOLIN_VER}.tar.gz" && \ + tar -xf ${PANGOLIN_VER}.tar.gz && \ + rm -v ${PANGOLIN_VER}.tar.gz && \ + mv -v pangolin-* pangolin + +# set the environment; PATH is unnecessary here, but leaving anyways. It's reset later in dockerfile +ENV PATH="$PATH" \ + LC_ALL=C.UTF-8 + +# modify environment.yml to pin specific versions during install +# create the conda environment using modified environment.yml +RUN sed -i "s|usher.*|usher=${USHER_VER}|" /pangolin/environment.yml && \ + sed -i "s|scorpio.git|scorpio.git@${SCORPIO_VER}|" /pangolin/environment.yml && \ + sed -i "s|pangolin-data.git|pangolin-data.git@${PANGOLIN_DATA_VER}|" /pangolin/environment.yml && \ + sed -i "s|constellations.git|constellations.git@${CONSTELLATIONS_VER}|" /pangolin/environment.yml && \ + micromamba create -n pangolin -y -f /pangolin/environment.yml + +# so that mamba/conda env is active when running below commands +ENV ENV_NAME="pangolin" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +WORKDIR /pangolin + +# run pip install step; download optional pre-computed assignment hashes for UShER (useful for running on large batches of samples) +# best to skip using the assignment-cache if running on one sample for speed +# print versions +RUN pip install . 
&& \ + pangolin --add-assignment-cache && \ + micromamba clean -a -y && \ + mkdir /data && \ + pangolin --all-versions && \ + usher --version + +WORKDIR /data + +# hardcode pangolin executable into the PATH variable +ENV PATH="${PATH}:/opt/conda/envs/pangolin/bin/" + +# default command is to pull up help options for pangolin; can be overridden of course +CMD ["pangolin", "-h"] + +# new base for testing +FROM app as test + +# so that mamba/conda env is active when running below commands +ENV ENV_NAME="pangolin" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +# test on test sequences supplied with Pangolin code +RUN pangolin /pangolin/pangolin/test/test_seqs.fasta --analysis-mode usher -o /data/test_seqs-output-pusher && \ + column -t -s, /data/test_seqs-output-pusher/lineage_report.csv + +# test functionality of assignment-cache option +RUN pangolin --use-assignment-cache /pangolin/pangolin/test/test_seqs.fasta + +# download B.1.1.7 genome from Utah +ADD https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.consensus.fa /test-data/SRR13957123.consensus.fa + +# test on a B.1.1.7 genome +RUN pangolin /test-data/SRR13957123.consensus.fa --analysis-mode usher -o /test-data/SRR13957123-pusher && \ + column -t -s, /test-data/SRR13957123-pusher/lineage_report.csv + + # install unzip for unzipping zip archive from NCBI +RUN apt-get update && apt-get install -y --no-install-recommends unzip + +# install ncbi datasets tool (pre-compiled binary); place in $PATH +RUN wget https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets && \ + chmod +x datasets && \ + mv -v datasets /usr/local/bin + +# download assembly for a BA.1 from Florida (https://www.ncbi.nlm.nih.gov/biosample?term=SAMN29506515 and https://www.ncbi.nlm.nih.gov/nuccore/ON924087) +# run pangolin in usher analysis mode +RUN datasets download virus genome accession ON924087.1 --filename ON924087.1.zip && \ + unzip ON924087.1.zip && rm ON924087.1.zip && \ + mv 
-v ncbi_dataset/data/genomic.fna ON924087.1.genomic.fna && \ + rm -vr ncbi_dataset/ README.md && \ + pangolin ON924087.1.genomic.fna --analysis-mode usher -o ON924087.1-usher && \ + column -t -s, ON924087.1-usher/lineage_report.csv + +# test specific for new lineage, XBB.1.16, introduced in pangolin-data v1.19 +# using this assembly: https://www.ncbi.nlm.nih.gov/nuccore/2440446687 +# biosample here: https://www.ncbi.nlm.nih.gov/biosample?term=SAMN33060589 +# one of the samples included in initial pango-designation here: https://github.com/cov-lineages/pango-designation/issues/1723 +RUN datasets download virus genome accession OQ381818.1 --filename OQ381818.1.zip && \ + unzip OQ381818.1.zip && rm OQ381818.1.zip && \ + mv -v ncbi_dataset/data/genomic.fna OQ381818.1.genomic.fna && \ + rm -vr ncbi_dataset/ README.md && \ + pangolin OQ381818.1.genomic.fna --analysis-mode usher -o OQ381818.1-usher && \ + column -t -s, OQ381818.1-usher/lineage_report.csv diff --git a/pangolin/4.3-pdata-1.20/README.md b/pangolin/4.3-pdata-1.20/README.md new file mode 100644 index 000000000..4eb3d779e --- /dev/null +++ b/pangolin/4.3-pdata-1.20/README.md @@ -0,0 +1,53 @@ +# pangolin docker image + +Main tool : [pangolin](https://github.com/cov-lineages/pangolin) + +Full documentation: [https://cov-lineages.org/resources/pangolin.html](https://cov-lineages.org/resources/pangolin.html) + +Phylogenetic Assignment of Named Global Outbreak LINeages + +Additional tools: + +- [pangolin-data](https://github.com/cov-lineages/pangolin-data) 1.20 +- [pangolin-assignment](https://github.com/cov-lineages/pangolin-assignment) 1.20 +- [minimap2](https://github.com/lh3/minimap2) 2.26-r1175 +- [usher](https://github.com/yatisht/usher) 0.6.2 +- [faToVcf](https://github.com/yatisht/usher) 426 +- [scorpio](https://github.com/cov-lineages/scorpio) 0.3.17 +- [constellations](https://github.com/cov-lineages/constellations) 0.1.10 +- [gofasta](https://github.com/virus-evolution/gofasta) 1.2.0 +- 
[mafft](https://mafft.cbrc.jp/alignment/software/) 7.520 +- python 3.8.15 + +## pangoLEARN deprecation + +As of pangolin version 4.3, pangoLEARN mode has been deprecated. [More info can be found here on the v4.3 release page.](https://github.com/cov-lineages/pangolin/releases/tag/v4.3) + +> If `--analysis-mode fast` or `--analysis-mode pangolearn` is given, pangolin v4.3 will print out a warning and use UShER mode instead, unless `--datadir` is also given specifying a directory with pangoLEARN model files. The next release of pangolin-data (v1.20) will no longer include the model files which have not been updated since v1.18. + +This docker image contains `pangolin-data` v1.20. The pangoLEARN model has not been updated since pangolin-data version 1.18. Only the underlying UShER tree/protobuf file will be maintained for the foreseeable future. + +**Please use the UShER mode of pangolin if you want to stay up-to-date with the most recent lineages.** [See pangolin-data release notes here for more details](https://github.com/cov-lineages/pangolin-data/releases/tag/v1.20) + +## Example Usage + +```bash +# run Pangolin in the default mode (usher). 
Can optionally supply --analysis-mode usher +$ pangolin /pangolin/pangolin/test/test_seqs.fasta -o /data/test_seqs-output-pusher + +# view the output CSV +$ column -t -s, /data/test_seqs-output-pusher/lineage_report.csv +taxon lineage conflict ambiguity_score scorpio_call scorpio_support scorpio_conflict scorpio_notes version pangolin_version scorpio_version constellation_version is_designated qc_status qc_notes note +India seq B.1.617.1 0.0 B.1.617.1-like 1.0 0.0 scorpio call: Alt alleles 11; Ref alleles 0; Amb alleles 0; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.617.1(1/1) +b117 B.1.1.7 0.0 Alpha (B.1.1.7-like) 0.91 0.04 scorpio call: Alt alleles 21; Ref alleles 1; Amb alleles 1; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.1.7(2/2) +outgroup_A A 0.0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: A(1/1) +issue_57_torsten_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_has_6000_Ns_in_18000_bases Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_has_no_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_is_too_short Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.9 +This_seq_has_lots_of_Ns Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.98 +This_seq_is_literally_just_N Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +Japan_seq B 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. +USA_seq B.1.314 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. 
+Unassigned_omicron_seq BA.1 0.0 Probable Omicron (BA.1-like) 0.71 0.08 scorpio call: Alt alleles 42; Ref alleles 5; Amb alleles 9; Oth alleles 3 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.03 Usher placements: BA.1(1/1) +```