forked from StaPH-B/docker-builds
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request StaPH-B#670 from StaPH-B/cjk-pangolin-update
add pangolin 4.3 & pdata 1.20
- Loading branch information
Showing
3 changed files
with
181 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# 'app' stage: the deliverable image (uppercase AS matches the casing of FROM) | ||
FROM mambaorg/micromamba:1.4.3 AS app | ||
|
||
# build and run as the root user since the micromamba image has 'mambauser' set as the $USER | ||
USER root | ||
# set workdir to default for building; set to /data at the end | ||
WORKDIR / | ||
|
||
# ARG variables only persist during build time | ||
# had to include the v for some of these due to GitHub tags. | ||
# using pangolin-data github tag, NOT what is in the GH release title "v1.2.133" | ||
# PANGOLIN_VER selects the source archive downloaded below and is recorded in the software.version label | ||
ARG PANGOLIN_VER="v4.3" | ||
# the remaining versions are written into /pangolin/environment.yml by the sed commands further down | ||
ARG PANGOLIN_DATA_VER="v1.20" | ||
ARG SCORPIO_VER="v0.3.17" | ||
ARG CONSTELLATIONS_VER="v0.1.10" | ||
# no leading v here: this value is substituted as usher=<version> in environment.yml | ||
ARG USHER_VER="0.6.2" | ||
|
||
# image metadata, declared with a single LABEL instruction (same keys and values) | ||
LABEL base.image="mambaorg/micromamba:1.4.3" \ | ||
      dockerfile.version="1" \ | ||
      software="pangolin" \ | ||
      software.version=${PANGOLIN_VER} \ | ||
      description="Conda environment for Pangolin. Pangolin: Software package for assigning SARS-CoV-2 genome sequences to global lineages." \ | ||
      website="https://github.com/cov-lineages/pangolin" \ | ||
      license="GNU General Public License v3.0" \ | ||
      license.url="https://github.com/cov-lineages/pangolin/blob/master/LICENSE.txt" \ | ||
      maintainer="Curtis Kapsak" \ | ||
      maintainer.email="[email protected]" | ||
|
||
# OS-level dependencies (list kept alphabetical); apt caches are removed in the same layer | ||
RUN apt-get update && \ | ||
    apt-get install -y --no-install-recommends \ | ||
    bsdmainutils \ | ||
    ca-certificates \ | ||
    git \ | ||
    procps \ | ||
    wget && \ | ||
    apt-get autoclean && \ | ||
    rm -rf /var/lib/apt/lists/* | ||
|
||
# get the pangolin repo: download the GitHub source archive for the pinned tag, | ||
# unpack it, delete the tarball in the same layer, and rename the extracted | ||
# pangolin-* directory to pangolin (WORKDIR is / here, so the result is /pangolin) | ||
RUN wget "https://github.com/cov-lineages/pangolin/archive/${PANGOLIN_VER}.tar.gz" && \ | ||
tar -xf ${PANGOLIN_VER}.tar.gz && \ | ||
rm -v ${PANGOLIN_VER}.tar.gz && \ | ||
mv -v pangolin-* pangolin | ||
|
||
# set the environment; PATH is unnecessary here, but leaving anyways. It's reset later in dockerfile | ||
# LC_ALL is set with ENV, so it applies to later build steps and to the running container | ||
ENV PATH="$PATH" \ | ||
LC_ALL=C.UTF-8 | ||
|
||
# modify environment.yml to pin specific versions during install | ||
#  - usher: everything from "usher" to the end of that line is replaced with usher=<USHER_VER> | ||
#  - scorpio, pangolin-data, constellations: "@<tag>" is appended after "<name>.git" | ||
# create the conda environment using modified environment.yml | ||
RUN sed -i "s|usher.*|usher=${USHER_VER}|" /pangolin/environment.yml && \ | ||
sed -i "s|scorpio.git|scorpio.git@${SCORPIO_VER}|" /pangolin/environment.yml && \ | ||
sed -i "s|pangolin-data.git|pangolin-data.git@${PANGOLIN_DATA_VER}|" /pangolin/environment.yml && \ | ||
sed -i "s|constellations.git|constellations.git@${CONSTELLATIONS_VER}|" /pangolin/environment.yml && \ | ||
micromamba create -n pangolin -y -f /pangolin/environment.yml | ||
|
||
# so that mamba/conda env is active when running below commands | ||
# ENV_NAME matches the environment name passed to "micromamba create -n" above | ||
ENV ENV_NAME="pangolin" | ||
ARG MAMBA_DOCKERFILE_ACTIVATE=1 | ||
|
||
# switch into the unpacked source tree; the following RUN executes "pip install ." from here | ||
WORKDIR /pangolin | ||
|
||
# run pip install step; download optional pre-computed assignment hashes for UShER (useful for running on large batches of samples) | ||
# best to skip using the assignment-cache if running on one sample for speed | ||
# --no-cache-dir: do not keep pip's cache for this install in the image layer | ||
# print versions | ||
RUN pip install --no-cache-dir . && \ | ||
pangolin --add-assignment-cache && \ | ||
micromamba clean -a -y && \ | ||
mkdir /data && \ | ||
pangolin --all-versions && \ | ||
usher --version | ||
|
||
# /data (created in the previous RUN) is the working directory from here on | ||
WORKDIR /data | ||
|
||
# hardcode pangolin executable into the PATH variable | ||
# (appends the bin directory of the "pangolin" conda environment) | ||
ENV PATH="${PATH}:/opt/conda/envs/pangolin/bin/" | ||
|
||
# default command is to pull up help options for pangolin; can be overridden of course | ||
CMD ["pangolin", "-h"] | ||
|
||
# new base for testing; builds on top of the app stage (uppercase AS matches the casing of FROM) | ||
FROM app AS test | ||
|
||
# so that mamba/conda env is active when running below commands | ||
# ENV values are inherited from the app stage, so ENV_NAME is restated here only for visibility; | ||
# ARG values are scoped to a single stage, so MAMBA_DOCKERFILE_ACTIVATE must be declared again | ||
ENV ENV_NAME="pangolin" | ||
ARG MAMBA_DOCKERFILE_ACTIVATE=1 | ||
|
||
# test on test sequences supplied with Pangolin code | ||
# run in usher mode, then pretty-print the resulting CSV so it is visible in the build log | ||
# NOTE(review): column is presumably provided by the bsdmainutils package installed in the app stage - confirm | ||
RUN pangolin /pangolin/pangolin/test/test_seqs.fasta --analysis-mode usher -o /data/test_seqs-output-pusher && \ | ||
column -t -s, /data/test_seqs-output-pusher/lineage_report.csv | ||
|
||
# test functionality of assignment-cache option | ||
RUN pangolin --use-assignment-cache /pangolin/pangolin/test/test_seqs.fasta | ||
|
||
# download B.1.1.7 genome from Utah | ||
# NOTE(review): ADD from a URL is used without a checksum and tracks the master branch, | ||
# so the fetched file is not verified and may change between builds | ||
ADD https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.consensus.fa /test-data/SRR13957123.consensus.fa | ||
|
||
# test on a B.1.1.7 genome | ||
RUN pangolin /test-data/SRR13957123.consensus.fa --analysis-mode usher -o /test-data/SRR13957123-pusher && \ | ||
column -t -s, /test-data/SRR13957123-pusher/lineage_report.csv | ||
|
||
# install unzip for unzipping zip archive from NCBI; remove the apt package lists in the same layer | ||
RUN apt-get update && apt-get install -y --no-install-recommends unzip && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# install ncbi datasets tool (pre-compiled binary); place in $PATH | ||
# NOTE(review): the LATEST URL is unpinned, so the downloaded binary can differ between builds | ||
RUN wget https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets && \ | ||
chmod +x datasets && \ | ||
mv -v datasets /usr/local/bin | ||
|
||
# download assembly for a BA.1 from Florida (https://www.ncbi.nlm.nih.gov/biosample?term=SAMN29506515 and https://www.ncbi.nlm.nih.gov/nuccore/ON924087) | ||
# run pangolin in usher analysis mode | ||
# steps: download the zip, extract it, keep only genomic.fna under an accession-specific name, | ||
# remove the rest of the extracted files, run pangolin, then print the lineage report | ||
RUN datasets download virus genome accession ON924087.1 --filename ON924087.1.zip && \ | ||
unzip ON924087.1.zip && rm ON924087.1.zip && \ | ||
mv -v ncbi_dataset/data/genomic.fna ON924087.1.genomic.fna && \ | ||
rm -vr ncbi_dataset/ README.md && \ | ||
pangolin ON924087.1.genomic.fna --analysis-mode usher -o ON924087.1-usher && \ | ||
column -t -s, ON924087.1-usher/lineage_report.csv | ||
|
||
# test specific for new lineage, XBB.1.16, introduced in pangolin-data v1.19 | ||
# using this assembly: https://www.ncbi.nlm.nih.gov/nuccore/2440446687 | ||
# biosample here: https://www.ncbi.nlm.nih.gov/biosample?term=SAMN33060589 | ||
# one of the samples included in initial pango-designation here: https://github.com/cov-lineages/pango-designation/issues/1723 | ||
# same download / extract / run / print sequence as the BA.1 test above | ||
RUN datasets download virus genome accession OQ381818.1 --filename OQ381818.1.zip && \ | ||
unzip OQ381818.1.zip && rm OQ381818.1.zip && \ | ||
mv -v ncbi_dataset/data/genomic.fna OQ381818.1.genomic.fna && \ | ||
rm -vr ncbi_dataset/ README.md && \ | ||
pangolin OQ381818.1.genomic.fna --analysis-mode usher -o OQ381818.1-usher && \ | ||
column -t -s, OQ381818.1-usher/lineage_report.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# pangolin docker image | ||
|
||
Main tool : [pangolin](https://github.com/cov-lineages/pangolin) | ||
|
||
Full documentation: [https://cov-lineages.org/resources/pangolin.html](https://cov-lineages.org/resources/pangolin.html) | ||
|
||
Phylogenetic Assignment of Named Global Outbreak LINeages | ||
|
||
Additional tools: | ||
|
||
- [pangolin-data](https://github.com/cov-lineages/pangolin-data) 1.20 | ||
- [pangolin-assignment](https://github.com/cov-lineages/pangolin-assignment) 1.20 | ||
- [minimap2](https://github.com/lh3/minimap2) 2.26-r1175 | ||
- [usher](https://github.com/yatisht/usher) 0.6.2 | ||
- [faToVcf](https://github.com/yatisht/usher) 426 | ||
- [scorpio](https://github.com/cov-lineages/scorpio) 0.3.17 | ||
- [constellations](https://github.com/cov-lineages/constellations) 0.1.10 | ||
- [gofasta](https://github.com/virus-evolution/gofasta) 1.2.0 | ||
- [mafft](https://mafft.cbrc.jp/alignment/software/) 7.520 | ||
- python 3.8.15 | ||
|
||
## pangoLEARN deprecation | ||
|
||
As of pangolin version 4.3, pangoLEARN mode has been deprecated. [More info can be found here on the v4.3 release page.](https://github.com/cov-lineages/pangolin/releases/tag/v4.3) | ||
|
||
> If `--analysis-mode fast` or `--analysis-mode pangolearn` is given, pangolin v4.3 will print out a warning and use UShER mode instead, unless `--datadir` is also given specifying a directory with pangoLEARN model files. The next release of pangolin-data (v1.20) will no longer include the model files which have not been updated since v1.18. | ||
This docker image contains `pangolin-data` v1.20. The pangoLEARN model has not been updated since pangolin-data version 1.18. Only the underlying UShER tree/protobuf file will be maintained for the foreseeable future. | ||
|
||
**Please use the UShER mode of pangolin if you want to stay up-to-date with the most recent lineages.** [See pangolin-data release notes here for more details](https://github.com/cov-lineages/pangolin-data/releases/tag/v1.20) | ||
|
||
## Example Usage | ||
|
||
```bash | ||
# run Pangolin in the default mode (usher). Can optionally supply --analysis-mode usher | ||
$ pangolin /pangolin/pangolin/test/test_seqs.fasta -o /data/test_seqs-output-pusher | ||
|
||
# view the output CSV | ||
$ column -t -s, /data/test_seqs-output-pusher/lineage_report.csv | ||
taxon lineage conflict ambiguity_score scorpio_call scorpio_support scorpio_conflict scorpio_notes version pangolin_version scorpio_version constellation_version is_designated qc_status qc_notes note | ||
India seq B.1.617.1 0.0 B.1.617.1-like 1.0 0.0 scorpio call: Alt alleles 11; Ref alleles 0; Amb alleles 0; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.617.1(1/1) | ||
b117 B.1.1.7 0.0 Alpha (B.1.1.7-like) 0.91 0.04 scorpio call: Alt alleles 21; Ref alleles 1; Amb alleles 1; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.1.7(2/2) | ||
outgroup_A A 0.0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: A(1/1) | ||
issue_57_torsten_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map | ||
This_seq_has_6000_Ns_in_18000_bases Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map | ||
This_seq_has_no_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map | ||
This_seq_is_too_short Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.9 | ||
This_seq_has_lots_of_Ns Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.98 | ||
This_seq_is_literally_just_N Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map | ||
Japan_seq B 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. | ||
USA_seq B.1.314 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. | ||
Unassigned_omicron_seq BA.1 0.0 Probable Omicron (BA.1-like) 0.71 0.08 scorpio call: Alt alleles 42; Ref alleles 5; Amb alleles 9; Oth alleles 3 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.03 Usher placements: BA.1(1/1) | ||
``` |