From af476527e2b5a6389cbead91454b7572f9298aad Mon Sep 17 00:00:00 2001 From: Curtis Kapsak Date: Tue, 30 Jul 2024 16:30:43 -0400 Subject: [PATCH] adds pangolin with pdata v1.29 (#1027) * dockerfile builds successfully w pdata 1.29 * adds readme for pdata 1.29 * update main readme w new pdata version & link * added test for new lineage KP.3.3.2 that was added in pango-designation 1.29 * fixed comments in test stage --- README.md | 2 +- pangolin/4.3.1-pdata-1.29/Dockerfile | 208 +++++++++++++++++++++++++++ pangolin/4.3.1-pdata-1.29/README.md | 53 +++++++ 3 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 pangolin/4.3.1-pdata-1.29/Dockerfile create mode 100644 pangolin/4.3.1-pdata-1.29/README.md diff --git a/README.md b/README.md index 5a3d42885..d8378b7d2 100644 --- a/README.md +++ b/README.md @@ -216,7 +216,7 @@ To learn more about the docker pull rate limits and the open source software pro | [ONTime](https://hub.docker.com/r/staphb/ontime)
[![docker pulls](https://badgen.net/docker/pulls/staphb/ontime)](https://hub.docker.com/r/staphb/ontime) | | https://github.com/mbhall88/ontime | | [OrthoFinder](https://hub.docker.com/r/staphb/orthofinder)
[![docker pulls](https://badgen.net/docker/pulls/staphb/orthofinder)](https://hub.docker.com/r/staphb/orthofinder) | | https://github.com/davidemms/OrthoFinder | | [Panaroo](https://hub.docker.com/r/staphb/panaroo)
[![docker pulls](https://badgen.net/docker/pulls/staphb/panaroo)](https://hub.docker.com/r/staphb/panaroo) | | (https://hub.docker.com/r/staphb/panaroo) | -| [Pangolin](https://hub.docker.com/r/staphb/pangolin)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pangolin)](https://hub.docker.com/r/staphb/pangolin) |
Click to see Pangolin v4.2 and older versions! **Pangolin version & pangoLEARN data release date** **Pangolin version & pangolin-data version**
**Pangolin version & pangolin-data version** | https://github.com/cov-lineages/pangolin
https://github.com/cov-lineages/pangoLEARN
https://github.com/cov-lineages/pango-designation
https://github.com/cov-lineages/scorpio
https://github.com/cov-lineages/constellations
https://github.com/cov-lineages/lineages (archived)
https://github.com/hCoV-2019/pangolin (archived) | +| [Pangolin](https://hub.docker.com/r/staphb/pangolin)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pangolin)](https://hub.docker.com/r/staphb/pangolin) |
Click to see Pangolin v4.2 and older versions! **Pangolin version & pangoLEARN data release date** **Pangolin version & pangolin-data version**
**Pangolin version & pangolin-data version** | https://github.com/cov-lineages/pangolin
https://github.com/cov-lineages/pangoLEARN
https://github.com/cov-lineages/pango-designation
https://github.com/cov-lineages/scorpio
https://github.com/cov-lineages/constellations
https://github.com/cov-lineages/lineages (archived)
https://github.com/hCoV-2019/pangolin (archived) | | [parallel-perl](https://hub.docker.com/r/staphb/parallel-perl)
[![docker pulls](https://badgen.net/docker/pulls/staphb/parallel-perl)](https://hub.docker.com/r/staphb/parallel-perl) | | https://www.gnu.org/software/parallel | | [parsnp](https://hub.docker.com/r/staphb/parsnp)
[![docker pulls](https://badgen.net/docker/pulls/staphb/parsnp)](https://hub.docker.com/r/staphb/parsnp) | | https://github.com/marbl/parsnp | | [pasty](https://hub.docker.com/r/staphb/pasty)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pasty)](https://hub.docker.com/r/staphb/pasty) | | https://github.com/rpetit3/pasty | diff --git a/pangolin/4.3.1-pdata-1.29/Dockerfile b/pangolin/4.3.1-pdata-1.29/Dockerfile new file mode 100644 index 000000000..b79f2a6c5 --- /dev/null +++ b/pangolin/4.3.1-pdata-1.29/Dockerfile @@ -0,0 +1,208 @@ +FROM mambaorg/micromamba:1.5.8 AS app + +# build and run as root users since micromamba image has 'mambauser' set as the $USER +USER root +# set workdir to default for building; set to /data at the end +WORKDIR / + +# ARG variables only persist during build time +# had to include the v for some of these due to GitHub tags. +# using pangolin-data github tag, NOT what is in the GH release title "v1.2.133" +ARG PANGOLIN_VER="v4.3.1" +ARG PANGOLIN_DATA_VER="v1.29" +ARG SCORPIO_VER="v0.3.19" +ARG CONSTELLATIONS_VER="v0.1.12" +ARG USHER_VER="0.6.3" + +# metadata labels +LABEL base.image="mambaorg/micromamba:1.5.8" +LABEL dockerfile.version="1" +LABEL software="pangolin" +LABEL software.version=${PANGOLIN_VER} +LABEL description="Conda environment for Pangolin. Pangolin: Software package for assigning SARS-CoV-2 genome sequences to global lineages." +LABEL website="https://github.com/cov-lineages/pangolin" +LABEL license="GNU General Public License v3.0" +LABEL license.url="https://github.com/cov-lineages/pangolin/blob/master/LICENSE.txt" +LABEL maintainer="Curtis Kapsak" +LABEL maintainer.email="kapsakcj@gmail.com" + +# install dependencies; cleanup apt garbage +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + git \ + procps \ + bsdmainutils && \ + apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# get the pangolin repo +RUN wget "https://github.com/cov-lineages/pangolin/archive/${PANGOLIN_VER}.tar.gz" && \ + tar -xf ${PANGOLIN_VER}.tar.gz && \ + rm -v ${PANGOLIN_VER}.tar.gz && \ + mv -v pangolin-* pangolin + +# set the environment; PATH is unnecessary here, but leaving anyways. It's reset later in dockerfile +ENV PATH="$PATH" \ + LC_ALL=C.UTF-8 + +# modify environment.yml to pin specific versions during install +# pin specific versions of usher, scorpio, pangolin-data, constellations, and pulp +# create the conda environment using modified environment.yml +RUN sed -i "s|usher.*|usher=${USHER_VER}|" /pangolin/environment.yml && \ + sed -i "s|scorpio.git|scorpio.git@${SCORPIO_VER}|" /pangolin/environment.yml && \ + sed -i "s|pangolin-data.git|pangolin-data.git@${PANGOLIN_DATA_VER}|" /pangolin/environment.yml && \ + sed -i "s|constellations.git|constellations.git@${CONSTELLATIONS_VER}|" /pangolin/environment.yml && \ + sed -i "12 a\ - pulp=2.7.0" /pangolin/environment.yml && \ + micromamba create -n pangolin -y -f /pangolin/environment.yml && \ + micromamba clean -a -y -f + +# so that mamba/conda env is active when running below commands +ENV ENV_NAME="pangolin" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +WORKDIR /pangolin + +# run pip install step; download optional pre-computed assignment hashes for UShER (useful for running on large batches of samples) +# best to skip using the assigment-cache if running on one sample for speed +# print versions +RUN pip install . && \ + pangolin --add-assignment-cache && \ + mkdir /data && \ + pangolin --all-versions && \ + usher --version + +# final working directory in "app" layer is /data for passing data in/out of container +WORKDIR /data + +# hardcode pangolin executable into the PATH variable +ENV PATH="${PATH}:/opt/conda/envs/pangolin/bin/" XDG_CACHE_HOME=/tmp + +# default command is to pull up help options for pangolin; can be overridden of course +CMD ["pangolin", "-h"] + +# new base for testing +FROM app AS test + +# so that mamba/conda env is active when running below commands +ENV ENV_NAME="pangolin" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +# test on test sequences supplied with Pangolin code +RUN pangolin /pangolin/pangolin/test/test_seqs.fasta -o /data/test_seqs-output-pusher && \ + column -t -s, /data/test_seqs-output-pusher/lineage_report.csv + +# test functionality of assignment-cache option +RUN pangolin --use-assignment-cache /pangolin/pangolin/test/test_seqs.fasta + +# download B.1.1.7 genome from Utah +ADD https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.consensus.fa /test-data/SRR13957123.consensus.fa + +# test on a B.1.1.7 genome +RUN pangolin /test-data/SRR13957123.consensus.fa -o /test-data/SRR13957123-pusher && \ + column -t -s, /test-data/SRR13957123-pusher/lineage_report.csv + + # install unzip for unzipping zip archive from NCBI +RUN apt-get update && apt-get install -y --no-install-recommends unzip + +# install ncbi datasets tool (pre-compiled binary); place in $PATH +RUN wget https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets && \ + chmod +x datasets && \ + mv -v datasets /usr/local/bin + +# download assembly for a BA.1 from Florida (https://www.ncbi.nlm.nih.gov/biosample?term=SAMN29506515 and https://www.ncbi.nlm.nih.gov/nuccore/ON924087) +# run pangolin in usher analysis mode +RUN datasets download virus genome accession ON924087.1 --filename ON924087.1.zip && \ + unzip ON924087.1.zip && rm ON924087.1.zip && \ + mv -v ncbi_dataset/data/genomic.fna ON924087.1.genomic.fna && \ + rm -vr ncbi_dataset/ README.md && \ + pangolin ON924087.1.genomic.fna -o ON924087.1-usher && \ + column -t -s, ON924087.1-usher/lineage_report.csv + +# test specific for new lineage, XBB.1.16, introduced in pangolin-data v1.19 +# using this assembly: https://www.ncbi.nlm.nih.gov/nuccore/2440446687 +# biosample here: https://www.ncbi.nlm.nih.gov/biosample?term=SAMN33060589 +# one of the sample included in initial pango-designation here: https://github.com/cov-lineages/pango-designation/issues/1723 +RUN datasets download virus genome accession OQ381818.1 --filename OQ381818.1.zip && \ + unzip OQ381818.1.zip && rm OQ381818.1.zip && \ + mv -v ncbi_dataset/data/genomic.fna OQ381818.1.genomic.fna && \ + rm -vr ncbi_dataset/ README.md && \ + pangolin OQ381818.1.genomic.fna -o OQ381818.1-usher && \ + column -t -s, OQ381818.1-usher/lineage_report.csv + +# testing another XBB.1.16, trying to test scorpio functionality. Want pangolin to NOT assign lineage based on pango hash match. +# this test runs as expected, uses scorpio to check for constellation of mutations, then assign using PUSHER placement +RUN datasets download virus genome accession OR177999.1 --filename OR177999.1.zip && \ +unzip OR177999.1.zip && rm OR177999.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR177999.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR177999.1.genomic.fna -o OR177999.1-usher && \ +column -t -s, OR177999.1-usher/lineage_report.csv + + ## test for BA.2.86 + # virus identified in MI: https://www.ncbi.nlm.nih.gov/nuccore/OR461132.1 +RUN datasets download virus genome accession OR461132.1 --filename OR461132.1.zip && \ +unzip OR461132.1.zip && rm OR461132.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR461132.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR461132.1.genomic.fna -o OR461132.1-usher && \ +column -t -s, OR461132.1-usher/lineage_report.csv + + ## test for JN.2 (BA.2.86 sublineage) JN.2 is an alias of B.1.1.529.2.86.1.2 + # NY CDC Quest sample: https://www.ncbi.nlm.nih.gov/nuccore/OR598183 +RUN datasets download virus genome accession OR598183.1 --filename OR598183.1.zip && \ +unzip OR598183.1.zip && rm OR598183.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR598183.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR598183.1.genomic.fna -o OR598183.1-usher && \ +column -t -s, OR598183.1-usher/lineage_report.csv + +## test for JQ.1 (BA.2.86.3 sublineage); JQ.1 is an alias of B.1.1.529.2.86.3.1 +# THANK YOU ERIN AND UPHL!! https://www.ncbi.nlm.nih.gov/nuccore/OR716684 +# this test is important due to the fact that this lineage was included in the UShER tree, despite being designated after the pangolin-designation 1.23 release +# it previously caused and error/bug in pangolin, but now is fixed +RUN datasets download virus genome accession OR716684.1 --filename OR716684.1.zip && \ +unzip OR716684.1.zip && rm OR716684.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR716684.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR716684.1.genomic.fna -o OR716684.1-usher && \ +column -t -s, OR716684.1-usher/lineage_report.csv + +## test for JN.1.22 (BA.2.86.x sublineage; full unaliased lineage is B.1.1.529.2.86.1.1.22) +# see here for commit where it was designated https://github.com/cov-lineages/pango-designation/commit/a90c8e31c154621ed86c985debfea09e17541cda +# Here's the genome on NCBI, which was used to designate JN.1.22 lineage +RUN datasets download virus genome accession PP189069.1 --filename PP189069.1.zip && \ +unzip PP189069.1.zip && rm PP189069.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna PP189069.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin PP189069.1.genomic.fna -o PP189069.1-usher && \ +column -t -s, PP189069.1-usher/lineage_report.csv + +## test for JN.1.48 (BA.2.86.x sublineage; full unaliased lineage is B.1.1.529.2.86.1.1.48) +# this lineages which was designated in pango-designation v1.27: https://github.com/cov-lineages/pango-designation/releases/tag/v1.27 +# see here for commit where it was designated https://github.com/cov-lineages/pango-designation/commit/67f48bf24283999f1940f3aee8159f404124ff3f +# Here's the genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/PP218754 +RUN datasets download virus genome accession PP218754.1 --filename PP218754.1.zip && \ +unzip PP218754.1.zip && rm PP218754.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna PP218754.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin PP218754.1.genomic.fna -o PP218754.1-usher && \ +column -t -s, PP218754.1-usher/lineage_report.csv + +# new lineage LK.1 that was introduced in pango-designation v1.28: https://github.com/cov-lineages/pango-designation/commit/922795c90de355e67200cf4d379e8e5ff22472e4 +# thank you Luis, Lorraine, Marcos & team from PR Sci Trust for sharing your data! +# genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/2728145425 +RUN datasets download virus genome accession PP770375.1 --filename PP770375.1.zip && \ +unzip PP770375.1.zip && rm PP770375.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna PP770375.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin PP770375.1.genomic.fna -o PP770375.1-usher && \ +column -t -s, PP770375.1-usher/lineage_report.csv + +# new lineage KP.3.3.2 that was introduced in pango-designation v1.29: https://github.com/cov-lineages/pango-designation/commit/7125e606818312b78f0756d7fcab6dba92dd0a9e +# genome on NCBI: https://www.ncbi.nlm.nih.gov/nuccore/PQ073669 +RUN datasets download virus genome accession PQ073669.1 --filename PQ073669.1.zip && \ +unzip PQ073669.1.zip && rm PQ073669.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna PQ073669.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin PQ073669.1.genomic.fna -o PQ073669.1-usher && \ +column -t -s, PQ073669.1-usher/lineage_report.csv \ No newline at end of file diff --git a/pangolin/4.3.1-pdata-1.29/README.md b/pangolin/4.3.1-pdata-1.29/README.md new file mode 100644 index 000000000..8542245a0 --- /dev/null +++ b/pangolin/4.3.1-pdata-1.29/README.md @@ -0,0 +1,53 @@ +# pangolin docker image + +Main tool : [pangolin](https://github.com/cov-lineages/pangolin) + +Full documentation: [https://cov-lineages.org/resources/pangolin.html](https://cov-lineages.org/resources/pangolin.html) + +Phylogenetic Assignment of Named Global Outbreak LINeages + +Additional tools: + +- [pangolin-data](https://github.com/cov-lineages/pangolin-data) 1.29 +- [pangolin-assignment](https://github.com/cov-lineages/pangolin-assignment) 1.29 +- [minimap2](https://github.com/lh3/minimap2) 2.28-r1209 +- [usher](https://github.com/yatisht/usher) 0.6.3 +- [faToVcf](https://github.com/yatisht/usher) 448 +- [scorpio](https://github.com/cov-lineages/scorpio) 0.3.19 +- [constellations](https://github.com/cov-lineages/constellations) 0.1.12 +- [gofasta](https://github.com/virus-evolution/gofasta) 1.2.1 +- [mafft](https://mafft.cbrc.jp/alignment/software/) 7.526 +- python 3.8.19 + +## pangoLEARN deprecation + +As of pangolin version 4.3, pangoLEARN mode has been deprecated. [More info can be found here on the v4.3 release page.](https://github.com/cov-lineages/pangolin/releases/tag/v4.3) + +> If `--analysis-mode fast` or `--analysis-mode pangolearn` is given, pangolin v4.3 will print out a warning and use UShER mode instead, unless `--datadir` is also given specifying a directory with pangoLEARN model files. The next release of pangolin-data (v1.20) will no longer include the model files which have not been updated since v1.18. + +The pangoLEARN model has not been updated since pangolin-data version 1.18. Only the the underlying UShER tree/protobuf file will be maintained for the forseeable future. + +**Please use the UShER mode of pangolin if you want to stay up-to-date with the most recent lineages.** [See pangolin-data release notes here for more details](https://github.com/cov-lineages/pangolin-data/releases) + +## Example Usage + +```bash +# run Pangolin in the default mode (usher). Can optionally supply --analysis-mode usher +$ pangolin /pangolin/pangolin/test/test_seqs.fasta -o /data/test_seqs-output-pusher + +# view the output CSV +$ column -t -s, /data/test_seqs-output-pusher/lineage_report.csv +taxon lineage conflict ambiguity_score scorpio_call scorpio_support scorpio_conflict scorpio_notes version pangolin_version scorpio_version constellation_version is_designated qc_status qc_notes note +India seq B.1.617.1 0.0 B.1.617.1-like 1.0 0.0 scorpio call: Alt alleles 11; Ref alleles 0; Amb alleles 0; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.617.1(1/1) +b117 B.1.1.7 0.0 Alpha (B.1.1.7-like) 0.91 0.04 scorpio call: Alt alleles 21; Ref alleles 1; Amb alleles 1; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.1.7(2/2) +outgroup_A A 0.0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: A(1/1) +issue_57_torsten_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_has_6000_Ns_in_18000_bases Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_has_no_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_is_too_short Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.9 +This_seq_has_lots_of_Ns Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.98 +This_seq_is_literally_just_N Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +Japan_seq B 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. +USA_seq B.1.314 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. +Unassigned_omicron_seq BA.1 0.0 Probable Omicron (BA.1-like) 0.71 0.08 scorpio call: Alt alleles 42; Ref alleles 5; Amb alleles 9; Oth alleles 3 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.03 Usher placements: BA.1(1/1) +```