Skip to content

Commit

Permalink
Merge branch 'main' into IPVC-2445-update-schema-version
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 committed May 21, 2024
2 parents 9af4f7f + accee56 commit e8f0b0f
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 52 deletions.
20 changes: 5 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,10 @@ mkdir -p $(pwd)/output/logs

Set variables:
```
export UTA_ETL_OLD_SEQREPO_VERSION=2024-02-20
export UTA_ETL_OLD_UTA_IMAGE_TAG=uta_20210129b
export UTA_ETL_OLD_UTA_VERSION=uta_20210129b
export UTA_ETL_OLD_UTA_VERSION=$UTA_ETL_OLD_UTA_IMAGE_TAG
export UTA_ETL_NEW_UTA_VERSION=uta_20240512
export UTA_ETL_NCBI_DIR=./ncbi-data
export UTA_ETL_SEQREPO_DIR=./seqrepo-data
export UTA_ETL_WORK_DIR=./output/artifacts
export UTA_ETL_LOG_DIR=./output/logs
```
Expand All @@ -322,16 +321,7 @@ docker build --target uta -t uta-update .

### 1. Download SeqRepo data
```
docker pull biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION
# download seqrepo. can skip if container already exists.
docker run --name seqrepo biocommons/seqrepo:$UTA_ETL_OLD_SEQREPO_VERSION
# copy seqrepo data into a local directory
docker run -v $UTA_ETL_SEQREPO_DIR:/output-dir --volumes-from seqrepo ubuntu bash -c 'cp -R /usr/local/share/seqrepo/* /output-dir'
# allow seqrepo to be modified
docker run -it -v $UTA_ETL_SEQREPO_DIR:/output-dir ubuntu bash -c 'chmod -R +w /output-dir'
docker compose run seqrepo-pull
```

Note: pulling data takes ~30 minutes and requires ~13 GB.
Expand All @@ -348,14 +338,14 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
docker compose run ncbi-download
docker compose run uta-extract
docker compose run seqrepo-load
UTA_ETL_NEW_UTA_VERSION=uta_20240512 docker compose run uta-load
docker compose run uta-load
```

#### 2B. Mitochondrial transcripts
```
docker compose -f docker-compose.yml -f misc/mito-transcripts/docker-compose-mito-extract.yml run mito-extract
docker compose run seqrepo-load
UTA_ETL_NEW_UTA_VERSION=uta_20240512 docker compose run uta-load
docker compose run uta-load
```

#### 2C. Manual splign transcripts
Expand Down
28 changes: 13 additions & 15 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
version: '3'

services:
seqrepo-pull:
user: root
image: uta-update
command: sbin/seqrepo-pull
volumes:
- seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo
network_mode: host
ncbi-download:
image: uta-update
command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir
Expand All @@ -22,9 +29,9 @@ services:
network_mode: host
seqrepo-load:
image: uta-update
command: sbin/seqrepo-load /usr/local/share/seqrepo ${UTA_ETL_OLD_SEQREPO_VERSION} /seqrepo-load/work /seqrepo-load/logs
command: sbin/seqrepo-load /seqrepo-load/work /seqrepo-load/logs
volumes:
- ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
- seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo
- ${UTA_ETL_WORK_DIR}:/seqrepo-load/work
- ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs
working_dir: /opt/repos/uta
Expand All @@ -46,19 +53,10 @@ services:
uta:
condition: service_healthy
volumes:
- ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
- seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo
- ${UTA_ETL_WORK_DIR}:/uta-load/work
- ${UTA_ETL_LOG_DIR}:/uta-load/logs
network_mode: host
splign-manual:
image: uta-update
command: sbin/uta-splign-manual ${UTA_ETL_OLD_SEQREPO_VERSION} ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs
depends_on:
uta:
condition: service_healthy
volumes:
- ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
- ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input
- ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work
- ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs
network_mode: host

volumes:
seqrepo-volume:
2 changes: 1 addition & 1 deletion etc/global.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ aligner = utaaa
fasta_directories =
aux/sequences2
aux/sequences
seqrepo = /usr/local/share/seqrepo/2024-02-20
seqrepo = /biocommons/dl.biocommons.org/seqrepo/master

#data/manual
#data/bic/sequences.fasta.bgz
Expand Down
16 changes: 16 additions & 0 deletions misc/splign-manual/docker-compose-splign-manual.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Compose overlay for the manual splign UTA update procedure.
# NOTE(review): the `uta` service named under depends_on is not defined in
# this file; presumably this file is passed with -f together with the base
# docker-compose.yml -- confirm.

version: '3'

services:
  splign-manual:
    image: uta-update
    network_mode: host
    # Do not start until the uta service reports healthy.
    depends_on:
      uta:
        condition: service_healthy
    # Host directories (taken from the environment) mounted at the container
    # paths that are passed to the command below.
    volumes:
      - ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input
      - ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work
      - ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs
    # Arguments: <source_uta_version> <input_dir> <working_dir> <log_dir>
    command: sbin/uta-splign-manual ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs
32 changes: 19 additions & 13 deletions sbin/uta-splign-manual → misc/splign-manual/uta-splign-manual
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@

set -euxo pipefail

seqrepo_version=$1
source_uta_v=$2
input_dir=$3
working_dir=$4
log_dir=$5
source_uta_v=$1
input_dir=$2
working_dir=$3
log_dir=$4

if [ -z "$seqrepo_version" ] || [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
if [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
then
echo 'Usage: sbin/uta-splign-manual <source_uta_version> <input_dir> <working_dir> <log_dir>'
echo 'Usage: misc/uta-splign-manual <source_uta_version> <input_dir> <working_dir> <log_dir>'
exit 1
fi

Expand All @@ -24,22 +23,29 @@ mkdir -p "$log_dir"
mkdir -p "$working_dir"

# Generate txinfo.gz and exonset.gz files
python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml --output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log"
python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml \
--output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log"

# Generate fasta files
seqrepo export $(gzip -cdq $working_dir/txinfo.gz | cut -f2 | tail +2) --instance-name "$seqrepo_version" | gzip -c > $working_dir/seqs.fa.gz 2>&1 | tee "$log_dir/seqrepo-export.log"
# Export the transcript accessions listed in column 2 of txinfo.gz (header row
# skipped) from the "master" seqrepo instance into a gzipped fasta file.
# The export|gzip pipeline is grouped so that gzip's stdout goes only to the
# .gz file while stderr from both commands is sent to tee; redirecting inside
# the pipeline instead would leave tee with no input and an empty log.
{ seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \
    export $(gzip -cdq $working_dir/txinfo.gz | cut -f2 | tail -n +2) \
    --instance-name "master" | gzip -c > $working_dir/seqs.fa.gz; } 2>&1 | \
    tee "$log_dir/seqrepo-export.log"

# Generate seqinfo.gz file
sbin/fasta-to-seqinfo -o NCBI $working_dir/seqs.fa.gz | gzip -c > $working_dir/seqinfo.gz 2>&1 | tee "$log_dir/fasta-to-seqinfo.log"
# Convert the exported fasta into seqinfo.gz. The conversion|gzip pipeline is
# grouped so that gzip's stdout goes only to the .gz file while stderr from
# both commands is sent to tee; redirecting inside the pipeline instead would
# leave tee with no input and an empty log.
{ sbin/fasta-to-seqinfo -o NCBI $working_dir/seqs.fa.gz | gzip -c > $working_dir/seqinfo.gz; } 2>&1 | \
    tee "$log_dir/fasta-to-seqinfo.log"

# Load seqinfo
uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo $working_dir/seqinfo.gz 2>&1 | tee "$log_dir/load-seqinfo.log"
uta --conf=etc/global.conf --conf=etc/[email protected] load-seqinfo $working_dir/seqinfo.gz 2>&1 | \
tee "$log_dir/load-seqinfo.log"

# Load txinfo
uta --conf=etc/global.conf --conf=etc/[email protected] load-txinfo $working_dir/txinfo.gz 2>&1 | tee "$log_dir/load-txinfo.log"
uta --conf=etc/global.conf --conf=etc/[email protected] load-txinfo $working_dir/txinfo.gz 2>&1 | \
tee "$log_dir/load-txinfo.log"

# Load exonset
uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset $working_dir/exonset.gz 2>&1 | tee "$log_dir/load-exonset.log"
uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset $working_dir/exonset.gz 2>&1 | \
tee "$log_dir/load-exonset.log"

# Align exons
uta --conf=etc/global.conf --conf=etc/[email protected] align-exons 2>&1 | tee "$log_dir/align-exons.log"
Expand Down
14 changes: 6 additions & 8 deletions sbin/seqrepo-load
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,20 @@

set -euxo pipefail

seqrepo_root=$1
seqrepo_version=$2
sequence_dir=$3
log_dir=$4
sequence_dir=$1
log_dir=$2

if [ -z "$seqrepo_root" ] || [ -z "$seqrepo_version" ] || [ -z "$sequence_dir" ] || [ -z "$log_dir" ]
if [ -z "$sequence_dir" ] || [ -z "$log_dir" ]
then
echo 'Usage: sbin/seqrepo-load <seqrepo_root> <seqrepo_version> <sequence_dir> <log_dir>'
echo 'Usage: sbin/seqrepo-load <sequence_dir> <log_dir>'
exit 1
fi

# find all fasta files in the working directory
mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*")

# Load SeqRepo with new sequences
seqrepo --root-directory "$seqrepo_root" \
load -n NCBI --instance-name "$seqrepo_version" \
seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \
load -n NCBI --instance-name "master" \
"${FASTA_FILES[@]}" 2>&1 | \
tee "$log_dir/seqrepo-load.log"
22 changes: 22 additions & 0 deletions sbin/seqrepo-pull
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
#
# Pull a SeqRepo snapshot from dl.biocommons.org into SEQREPO_DIR and set up
# a "master" instance directory beside it.
#
# Usage: sbin/seqrepo-pull   (no arguments)

set -euxo pipefail

SEQREPO_DIR="/biocommons/dl.biocommons.org/seqrepo"

# pull the latest seqrepo version from biocommons
# (the last entry printed by list-remote-instances is taken to be the latest;
# xargs trims surrounding whitespace)
latest_version=$(seqrepo list-remote-instances | tail -n 1 | xargs)
if [ -z "$latest_version" ]
then
    # With an empty name the rsync source below would be the whole seqrepo
    # module rather than a single snapshot, so stop here instead.
    echo 'Error: could not determine the latest remote seqrepo instance' >&2
    exit 1
fi
cd "$SEQREPO_DIR"
rsync -rtHP --no-motd dl.biocommons.org::seqrepo/"$latest_version" .

# setup seqrepo build directory
mkdir -p master/sequences
cd "$latest_version"

# The two sqlite databases are copied and made user-writable.
# NOTE(review): presumably so later load steps can modify them -- confirm.
cp -av aliases.sqlite3 "$SEQREPO_DIR"/master/
chmod u+w "$SEQREPO_DIR"/master/aliases.sqlite3
cd sequences
cp -av db.sqlite3 "$SEQREPO_DIR"/master/sequences/
chmod u+w "$SEQREPO_DIR"/master/sequences/db.sqlite3

# Entries matching 2??? (four characters starting with "2"; presumably the
# per-year sequence directories) are hard-linked (-l) into master instead of
# being copied.
for d in 2???; do
    cp -alv "$d" "$SEQREPO_DIR"/master/sequences/
done

0 comments on commit e8f0b0f

Please sign in to comment.