From accee56a7a24b103f3a9f81c8fd7b5ee831e216e Mon Sep 17 00:00:00 2001 From: Shane Giles <62901608+bsgiles73@users.noreply.github.com> Date: Tue, 21 May 2024 16:07:50 -0600 Subject: [PATCH] feat(IPVC-2464): Add a seqrepo-pull compose service (#40) --- README.md | 20 +++--------- docker-compose.yml | 28 ++++++++-------- etc/global.conf | 2 +- .../docker-compose-splign-manual.yml | 16 ++++++++++ .../splign-manual}/uta-splign-manual | 32 +++++++++++-------- sbin/seqrepo-load | 14 ++++---- sbin/seqrepo-pull | 22 +++++++++++++ 7 files changed, 82 insertions(+), 52 deletions(-) create mode 100644 misc/splign-manual/docker-compose-splign-manual.yml rename {sbin => misc/splign-manual}/uta-splign-manual (56%) create mode 100755 sbin/seqrepo-pull diff --git a/README.md b/README.md index 8e229cc..8f9c455 100644 --- a/README.md +++ b/README.md @@ -306,11 +306,10 @@ mkdir -p $(pwd)/output/logs Set variables: ``` -export UTA_ETL_OLD_SEQREPO_VERSION=2024-02-20 export UTA_ETL_OLD_UTA_IMAGE_TAG=uta_20210129b -export UTA_ETL_OLD_UTA_VERSION=uta_20210129b +export UTA_ETL_OLD_UTA_VERSION=$UTA_ETL_OLD_UTA_IMAGE_TAG +export UTA_ETL_NEW_UTA_VERSION=uta_20240512 export UTA_ETL_NCBI_DIR=./ncbi-data -export UTA_ETL_SEQREPO_DIR=./seqrepo-data export UTA_ETL_WORK_DIR=./output/artifacts export UTA_ETL_LOG_DIR=./output/logs ``` @@ -322,16 +321,7 @@ docker build --target uta -t uta-update . ### 1. Download SeqRepo data ``` -docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION - -# download seqrepo. can skip if container already exists. 
-docker run --name seqrepo biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION - -# copy seqrepo data into a local directory -docker run -v $UTA_ETL_SEQREPO_DIR:/output-dir --volumes-from seqrepo ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir' - -# allow seqrepo to be modified -docker run -it -v $UTA_ETL_SEQREPO_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir' +docker compose run seqrepo-pull ``` Note: pulling data takes ~30 minutes and requires ~13 GB. @@ -348,14 +338,14 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts. docker compose run ncbi-download docker compose run uta-extract docker compose run seqrepo-load -UTA_ETL_NEW_UTA_VERSION=uta_20240512 docker compose run uta-load +docker compose run uta-load ``` #### 2B. Mitochondrial transcripts ``` docker compose -f docker-compose.yml -f misc/mito-transcripts/docker-compose-mito-extract.yml run mito-extract docker compose run seqrepo-load -UTA_ETL_NEW_UTA_VERSION=uta_20240512 docker compose run uta-load +docker compose run uta-load ``` #### 2C. 
Manual splign transcripts diff --git a/docker-compose.yml b/docker-compose.yml index 2141e7d..84b043a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,13 @@ version: '3' services: + seqrepo-pull: + user: root + image: uta-update + command: sbin/seqrepo-pull + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + network_mode: host ncbi-download: image: uta-update command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir @@ -22,9 +29,9 @@ services: network_mode: host seqrepo-load: image: uta-update - command: sbin/seqrepo-load /usr/local/share/seqrepo ${UTA_ETL_OLD_SEQREPO_VERSION} /seqrepo-load/work /seqrepo-load/logs + command: sbin/seqrepo-load /seqrepo-load/work /seqrepo-load/logs volumes: - - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs working_dir: /opt/repos/uta @@ -46,19 +53,10 @@ services: uta: condition: service_healthy volumes: - - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo - ${UTA_ETL_WORK_DIR}:/uta-load/work - ${UTA_ETL_LOG_DIR}:/uta-load/logs network_mode: host - splign-manual: - image: uta-update - command: sbin/uta-splign-manual ${UTA_ETL_OLD_SEQREPO_VERSION} ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs - depends_on: - uta: - condition: service_healthy - volumes: - - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo - - ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input - - ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work - - ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs - network_mode: host + +volumes: + seqrepo-volume: diff --git a/etc/global.conf b/etc/global.conf index 1aece93..0d3cfcc 100644 --- a/etc/global.conf +++ b/etc/global.conf @@ -16,7 +16,7 @@ aligner = utaaa fasta_directories = aux/sequences2 aux/sequences -seqrepo = /usr/local/share/seqrepo/2024-02-20 +seqrepo 
= /biocommons/dl.biocommons.org/seqrepo/master #data/manual #data/bic/sequences.fasta.bgz diff --git a/misc/splign-manual/docker-compose-splign-manual.yml b/misc/splign-manual/docker-compose-splign-manual.yml new file mode 100644 index 0000000..7ef0e87 --- /dev/null +++ b/misc/splign-manual/docker-compose-splign-manual.yml @@ -0,0 +1,16 @@ +# docker compose file for the splign-manual uta update procedure + +version: '3' + +services: + splign-manual: + image: uta-update + command: misc/splign-manual/uta-splign-manual ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs + depends_on: + uta: + condition: service_healthy + volumes: + - ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input + - ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work + - ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs + network_mode: host diff --git a/sbin/uta-splign-manual b/misc/splign-manual/uta-splign-manual similarity index 56% rename from sbin/uta-splign-manual rename to misc/splign-manual/uta-splign-manual index 2bc3e9b..cb1e9ee 100755 --- a/sbin/uta-splign-manual +++ b/misc/splign-manual/uta-splign-manual @@ -4,15 +4,14 @@ set -euxo pipefail -seqrepo_version=$1 -source_uta_v=$2 -input_dir=$3 -working_dir=$4 -log_dir=$5 +source_uta_v=${1:-} +input_dir=${2:-} +working_dir=${3:-} +log_dir=${4:-} -if [ -z "$seqrepo_version" ] || [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +if [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] then - echo 'Usage: sbin/uta-splign-manual ' + echo 'Usage: misc/splign-manual/uta-splign-manual <source_uta_version> <input_dir> <working_dir> <log_dir>' exit 1 fi @@ -24,22 +23,29 @@ mkdir -p "$log_dir" mkdir -p "$working_dir" # Generate txinfo.gz and exonset.gz files -python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml --output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log" +python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml \ +
--output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log" # Generate fasta files -seqrepo export $(gzip -cdq $working_dir/txinfo.gz | cut -f2 | tail +2) --instance-name "$seqrepo_version" | gzip -c > $working_dir/seqs.fa.gz 2>&1 | tee "$log_dir/seqrepo-export.log" +seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \ + export $(gzip -cdq "$working_dir/txinfo.gz" | cut -f2 | tail -n +2) \ + --instance-name "master" 2> >(tee "$log_dir/seqrepo-export.log" >&2) | gzip -c > "$working_dir/seqs.fa.gz" # Generate seqinfo.gz file -sbin/fasta-to-seqinfo -o NCBI $working_dir/seqs.fa.gz | gzip -c > $working_dir/seqinfo.gz 2>&1 | tee "$log_dir/fasta-to-seqinfo.log" +sbin/fasta-to-seqinfo -o NCBI "$working_dir/seqs.fa.gz" 2> >(tee "$log_dir/fasta-to-seqinfo.log" >&2) | \ + gzip -c > "$working_dir/seqinfo.gz" # Load seqinfo -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo $working_dir/seqinfo.gz 2>&1 | tee "$log_dir/load-seqinfo.log" +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo $working_dir/seqinfo.gz 2>&1 | \ + tee "$log_dir/load-seqinfo.log" # Load txinfo -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo $working_dir/txinfo.gz 2>&1 | tee "$log_dir/load-txinfo.log" +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo $working_dir/txinfo.gz 2>&1 | \ + tee "$log_dir/load-txinfo.log" # Load exonset -uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset $working_dir/exonset.gz 2>&1 | tee "$log_dir/load-exonset.log" +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset $working_dir/exonset.gz 2>&1 | \ + tee "$log_dir/load-exonset.log" # Align exons uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$log_dir/align-exons.log" diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load index 02197ec..95b662f 100755 --- a/sbin/seqrepo-load +++ b/sbin/seqrepo-load @@ -2,14 +2,12 @@ set -euxo pipefail 
-seqrepo_root=$1 -seqrepo_version=$2 -sequence_dir=$3 -log_dir=$4 +sequence_dir=${1:-} +log_dir=${2:-} -if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$log_dir" ] +if [ -z "$sequence_dir" ] || [ -z "$log_dir" ] then - echo 'Usage: sbin/seqrepo-load ' + echo 'Usage: sbin/seqrepo-load <sequence_dir> <log_dir>' exit 1 fi @@ -17,7 +15,7 @@ fi mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*") # Load SeqRepo with new sequences -seqrepo --root-directory "$seqrepo_root" \ - load -n NCBI --instance-name "$seqrepo_version" \ +seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \ + load -n NCBI --instance-name "master" \ "${FASTA_FILES[@]}" 2>&1 | \ tee "$log_dir/seqrepo-load.log" diff --git a/sbin/seqrepo-pull b/sbin/seqrepo-pull new file mode 100755 index 0000000..894dffa --- /dev/null +++ b/sbin/seqrepo-pull @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +SEQREPO_DIR="/biocommons/dl.biocommons.org/seqrepo" + +# pull the latest seqrepo version from biocommons (abort if no remote version was listed) +latest_version=$(seqrepo list-remote-instances | tail -n 1 | xargs) +cd "$SEQREPO_DIR" +rsync -rtHP --no-motd dl.biocommons.org::seqrepo/"${latest_version:?no remote seqrepo instance found}" . + +# set up the seqrepo build directory ("master"): copy the sqlite DBs, hard-link the 2??? sequence dirs +mkdir -p master/sequences +cd "$latest_version" +cp -av aliases.sqlite3 "$SEQREPO_DIR"/master/ +chmod u+w "$SEQREPO_DIR"/master/aliases.sqlite3 +cd sequences +cp -av db.sqlite3 "$SEQREPO_DIR"/master/sequences/ +chmod u+w "$SEQREPO_DIR"/master/sequences/db.sqlite3 +for d in 2???; do + cp -alv "$d" "$SEQREPO_DIR"/master/sequences/ +done