Skip to content

Commit

Permalink
remove Mar databases from kaiju-makedb
Browse files Browse the repository at this point in the history
  • Loading branch information
pmenzel committed Nov 25, 2023
1 parent b2cf640 commit 0e0216a
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 270 deletions.
192 changes: 0 additions & 192 deletions util/kaiju-convertMAR.py

This file was deleted.

79 changes: 1 addition & 78 deletions util/kaiju-makedb
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ usage() {
echo
echo " nr_euk: nr and additionally including fungi and microbial eukaryotes"
echo
# echo " mar_ref, mar_db: individual marine reference databases or assembled genomes from the Marine Metagenomics Portal"
# echo " mar: combination of both MAR databases"
# echo
echo " fungi: All fungi genomes from NCBI RefSeq (any assembly status)."
echo
echo " viruses: Viral genomes from NCBI RefSeq"
Expand Down Expand Up @@ -133,18 +130,11 @@ command -v kaiju-mkbwt >/dev/null 2>/dev/null || { echo Error: kaiju-mkbwt not f
command -v kaiju-convertNR >/dev/null 2>/dev/null || { echo Error: kaiju-convertNR not found in $PATH; exit 1; }

[ -z "$DB" ] && { echo Error: Use option -s to select a source database; usage; exit 1; }
[ "$DB" = "fungi" -o "$DB" = "mar" -o "$DB" = "mar_ref" -o "$DB" = "mar_db" -o "$DB" = "nr" -o "$DB" = "nr_euk" -o "$DB" = "refseq" -o "$DB" = "refseq_ref" -o "$DB" = "refseq_nr" -o "$DB" = "progenomes" -o "$DB" = "viruses" -o "$DB" = "plasmids" -o "$DB" = "rvdb" ] || { echo Error: $DB is not a valid source database; usage; exit 1; }
[ "$DB" = "fungi" -o "$DB" = "nr" -o "$DB" = "nr_euk" -o "$DB" = "refseq" -o "$DB" = "refseq_ref" -o "$DB" = "refseq_nr" -o "$DB" = "progenomes" -o "$DB" = "viruses" -o "$DB" = "plasmids" -o "$DB" = "rvdb" ] || { echo Error: $DB is not a valid source database; usage; exit 1; }

if [ "$DB" = "mar" -o "$DB" = "mar_ref" -o "$DB" = "mar_db" ]
then
command -v python >/dev/null 2>/dev/null || { echo Error: python not found; exit 1; }
jq --help >/dev/null 2>/dev/null || { echo jq is not installed; exit 1; }
python -c 'from collections import Counter' >/dev/null 2>/dev/null || { echo Error: Python version too low for using Counter; exit 1; }
fi

[ -r $SCRIPTDIR/kaiju-taxonlistEuk.tsv ] || { echo Error: File kaiju-taxonlistEuk.tsv not found in $SCRIPTDIR; exit 1; }
[ -r $SCRIPTDIR/kaiju-excluded-accessions.txt ] || { echo Error: File kaiju-excluded-accessions.txt not found in $SCRIPTDIR; exit 1; }
[ -r $SCRIPTDIR/kaiju-convertMAR.py ] || { echo Error: File kaiju-convertMAR.py not found in $SCRIPTDIR; exit 1; }

#test AnyUncompress usable in perl, used by kaiju-gbk2faa.pl
`perl -e 'use IO::Uncompress::AnyUncompress qw(anyuncompress $AnyUncompressError);'`
Expand All @@ -163,73 +153,6 @@ fi
echo "${GREEN}Extracting taxdump.tar.gz${NC}"
tar xf taxdump.tar.gz nodes.dmp names.dmp merged.dmp

#----------------------------------------------------------------------------------------------------------------------------------
if [ "$DB" = "mar" -o "$DB" = "mar_ref" -o "$DB" = "mar_db" ]
then
mkdir -p $DB/source
if [ $index_only -eq 0 ]
then
if [ $DL -eq 1 ]
then
if [ "$DB" = "mar" -o "$DB" = "mar_ref" ]
then
echo "${GREEN}Downloading MarRef metadata from MMP (databasesapi.sfb.uit.no)${NC}"
MARREF_VERSION=$(curl -Ls -o /dev/null -w %{url_effective} https://databasesapi.sfb.uit.no/rest/v1/MarRef/records | grep -Po 'ver=\K\d+\.\d+')
echo "${GREEN}Current MarRef version is: ${MARREF_VERSION}${NC}"
curl "https://databasesapi.sfb.uit.no/rpc/v1/MarRef/graphs?x%5Basmbl%3Asequences%5D=each&y_yName%5Btax%3Aorganism%5D=setR" -o $DB/MarRef.json -L
[ -r $DB/MarRef.json ] || { echo -e "${RED}Missing file MarRef.json${NC}"; exit 1; }
MARREF_COUNT=$(jq .graph[].x $DB/MarRef.json | wc -l)
echo "${GREEN}Downloading MarRef reference genomes from the Marine Metagenomics Portal using $parallelDL threads${NC}"
jq .graph[].x $DB/MarRef.json | tr -d '"' | xargs -I{} -P $parallelDL wget -P $DB/source -q -np --recursive https://public.sfb.uit.no/MarRef/genomes/{}/protein.faa || true
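# Some genomes might be part of both DBs, causing name clashes in the target directory; mv -n below does not overwrite an existing entry (presumably the intent -- TODO confirm)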
mv -n $DB/source/public.sfb.uit.no/MarRef/genomes/* $DB/source
rm -rf $DB/source/public.sfb.uit.no
echo "${GREEN}Converting MarRef data to Kaiju format${NC}"
python $SCRIPTDIR/kaiju-convertMAR.py --ref $DB/MarRef.json --genomes $DB/source >> $DB/kaiju_db_tmp.faa
fi
if [ "$DB" = "mar" -o "$DB" = "mar_db" ]
then
echo "${GREEN}Downloading MarDB metadata from MMP (databasesapi.sfb.uit.no)${NC}"
MARDB_VERSION=$(curl -Ls -o /dev/null -w %{url_effective} https://databasesapi.sfb.uit.no/rest/v1/MarDB/records | grep -Po 'ver=\K\d+\.\d+')
echo "${GREEN}Current MarDB version is: ${MARDB_VERSION}${NC}"
curl "https://databasesapi.sfb.uit.no/rpc/v1/MarDB/graphs?x%5Basmbl%3Asequences%5D=each&y_yName%5Btax%3Aorganism%5D=setR" -o $DB/MarDB.json -L
[ -r $DB/MarDB.json ] || { echo -e "${RED}Missing file MarDB.json${NC}"; exit 1; }
MARDB_COUNT=$(jq .graph[].x $DB/MarDB.json | wc -l)
echo "${GREEN}Downloading MarDB complete genomes from the Marine Metagenomics Portal using $parallelDL threads${NC}"
jq .graph[].x $DB/MarDB.json | tr -d '"' | xargs -I{} -P $parallelDL wget -P $DB/source -q -np --recursive https://public.sfb.uit.no/MarDB/genomes/{}/protein.faa || true
mv -n $DB/source/public.sfb.uit.no/MarDB/genomes/* $DB/source
rm -rf $DB/source/public.sfb.uit.no
echo "${GREEN}Converting MarRef data to Kaiju format${NC}"
python $SCRIPTDIR/kaiju-convertMAR.py --ref $DB/MarDB.json --genomes $DB/source >> $DB/kaiju_db_tmp.faa
fi
fi
fi
echo "${GREEN}Performing Perl oneliner-wizardry${NC}"
cat $DB/kaiju_db_tmp.faa | perl -lsne 'BEGIN{open(F,$m);while(<F>){@F=split(/[\|\s]+/);$h{$F[0]}=$F[1]}}if(/(>.+)_(\d+)/){print $1,"_",defined($h{$2})?$h{$2}:$2;}else{print}' -- -m=merged.dmp > $DB/kaiju_db_$DB.faa
rm $DB/kaiju_db_tmp.faa
echo "${GREEN}Creating Borrows-Wheeler transform${NC}"
kaiju-mkbwt -n $threadsBWT -e $exponentSA -a ACDEFGHIKLMNPQRSTVWY -o $DB/kaiju_db_$DB $DB/kaiju_db_$DB.faa
echo "${GREEN}Creating FM-Index${NC}"
kaiju-mkfmi $DB/kaiju_db_$DB
if [ "$DB" = "mar" ]
then
echo "${GREEN}Added MarRef v${MARREF_VERSION}\n--Metadata contains ${MARREF_COUNT} entries${NC}"
echo "${GREEN}Added MarDB v${MARDB_VERSION}\n--Metadata contains ${MARDB_COUNT} entries${NC}"
MARREF_MARDB_COUNT=`expr ${MARREF_COUNT} + ${MARDB_COUNT}`
echo "${GREEN}Combined\n--Metadata contains: ${MARREF_MARDB_COUNT} entries"
fi
if [ "$DB" = "mar_ref" ]
then
echo "${GREEN}Added MarRef v${MARREF_VERSION}\n--Metadata contains ${MARREF_COUNT} entries${NC}"
fi
if [ "$DB" = "mar_db" ]
then
echo "${GREEN}Added MarDB v${MARDB_VERSION}\n--Metadata contains ${MARDB_COUNT} entries${NC}"
fi
echo "${GREEN}\nCreated database ${DB}/ has sequences from `ls -1 $DB/source|wc -l` genomes.\n(This number should add up to total metadata entries. If not, some genomes have missing sequence data either from NCBI or from local MMP backend processing for various reasons and/or criteria)${NC}"
echo "${GREEN}\nYou should keep this information${NC}"
echo "${GREEN}Read more about the Mar databases here: https://mmp2.sfb.uit.no/databases/${NC}"
fi
#----------------------------------------------------------------------------------------------------------------------------------
if [ "$DB" = "nr_euk" ]
then
Expand Down

0 comments on commit 0e0216a

Please sign in to comment.