remove Mar databases from kaiju-makedb

bioinformatics-centre · Nov 25, 2023 · 0e0216a · 0e0216a
1 parent b2cf640
commit 0e0216a
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 270 deletions.
diff --git a/util/kaiju-convertMAR.py b/util/kaiju-convertMAR.py
diff --git a/util/kaiju-makedb b/util/kaiju-makedb
@@ -42,9 +42,6 @@ usage() {
 	echo
 	echo  " nr_euk: nr and additionally including fungi and microbial eukaryotes"
 	echo
-#	echo  " mar_ref, mar_db: individual marine reference databases or assembled genomes from the Marine Metagenomics Portal"
-#	echo  " mar: combination of both MAR databases"
-#	echo
 	echo  " fungi: All fungi genomes from NCBI RefSeq (any assembly status)."
 	echo
 	echo  " viruses: Viral genomes from NCBI RefSeq"
@@ -133,18 +130,11 @@ command -v kaiju-mkbwt >/dev/null 2>/dev/null || { echo Error: kaiju-mkbwt not f
 command -v kaiju-convertNR >/dev/null 2>/dev/null || { echo Error: kaiju-convertNR not found in $PATH; exit 1; }
 
 [ -z "$DB" ] && { echo Error: Use option -s to select a source database; usage; exit 1; }
-[ "$DB" = "fungi" -o "$DB" = "mar" -o "$DB" = "mar_ref" -o "$DB" = "mar_db" -o "$DB" = "nr" -o "$DB" = "nr_euk" -o "$DB" = "refseq" -o "$DB" = "refseq_ref" -o "$DB" = "refseq_nr" -o "$DB" = "progenomes" -o "$DB" = "viruses" -o "$DB" = "plasmids" -o "$DB" = "rvdb" ] || { echo Error: $DB is not a valid source database; usage; exit 1; }
+[ "$DB" = "fungi" -o "$DB" = "nr" -o "$DB" = "nr_euk" -o "$DB" = "refseq" -o "$DB" = "refseq_ref" -o "$DB" = "refseq_nr" -o "$DB" = "progenomes" -o "$DB" = "viruses" -o "$DB" = "plasmids" -o "$DB" = "rvdb" ] || { echo Error: $DB is not a valid source database; usage; exit 1; }
 
-if [ "$DB" = "mar" -o "$DB" = "mar_ref" -o "$DB" = "mar_db" ]
-then
-	command -v python >/dev/null 2>/dev/null || { echo Error: python not found; exit 1; }
-	jq --help >/dev/null 2>/dev/null || { echo jq is not installed; exit 1; }
-	python -c 'from collections import Counter' >/dev/null 2>/dev/null || { echo Error: Python version too low for using Counter; exit 1; }
-fi
 
 [ -r $SCRIPTDIR/kaiju-taxonlistEuk.tsv ] || { echo Error: File kaiju-taxonlistEuk.tsv not found in $SCRIPTDIR; exit 1; }
 [ -r $SCRIPTDIR/kaiju-excluded-accessions.txt ] || { echo Error: File kaiju-excluded-accessions.txt not found in $SCRIPTDIR; exit 1; }
-[ -r $SCRIPTDIR/kaiju-convertMAR.py ] || { echo Error: File kaiju-convertMAR.py not found in $SCRIPTDIR; exit 1; }
 
 #test AnyUncompress usable in perl, used by kaiju-gbk2faa.pl
 `perl -e 'use IO::Uncompress::AnyUncompress qw(anyuncompress $AnyUncompressError);'`
@@ -163,73 +153,6 @@ fi
 echo "${GREEN}Extracting taxdump.tar.gz${NC}"
 tar xf taxdump.tar.gz nodes.dmp names.dmp merged.dmp
 
-#----------------------------------------------------------------------------------------------------------------------------------
-if [ "$DB" = "mar" -o "$DB" = "mar_ref" -o "$DB" = "mar_db" ]
-then
-	mkdir -p $DB/source
-	if [ $index_only -eq 0 ]
-	then
-		if [ $DL -eq 1 ]
-		then
-			if [ "$DB" = "mar" -o "$DB" = "mar_ref" ]
-			then
-				echo "${GREEN}Downloading MarRef metadata from MMP (databasesapi.sfb.uit.no)${NC}"
-				MARREF_VERSION=$(curl -Ls -o /dev/null -w %{url_effective} https://databasesapi.sfb.uit.no/rest/v1/MarRef/records | grep -Po 'ver=\K\d+\.\d+')
-				echo "${GREEN}Current MarRef version is: ${MARREF_VERSION}${NC}"
-				curl "https://databasesapi.sfb.uit.no/rpc/v1/MarRef/graphs?x%5Basmbl%3Asequences%5D=each&y_yName%5Btax%3Aorganism%5D=setR" -o $DB/MarRef.json -L
-				[ -r $DB/MarRef.json ] || { echo -e "${RED}Missing file MarRef.json${NC}"; exit 1; }
-				MARREF_COUNT=$(jq .graph[].x $DB/MarRef.json | wc -l)
-				echo "${GREEN}Downloading MarRef reference genomes from the Marine Metagenomics Portal using $parallelDL threads${NC}"
-				jq .graph[].x $DB/MarRef.json | tr -d '"' | xargs -I{} -P $parallelDL wget -P $DB/source -q -np --recursive https://public.sfb.uit.no/MarRef/genomes/{}/protein.faa || true
-				# Some genomes might be part of both DBs, causing 
-				mv -n $DB/source/public.sfb.uit.no/MarRef/genomes/* $DB/source
-				rm -rf $DB/source/public.sfb.uit.no
-				echo "${GREEN}Converting MarRef data to Kaiju format${NC}"
-				python $SCRIPTDIR/kaiju-convertMAR.py --ref $DB/MarRef.json --genomes $DB/source >> $DB/kaiju_db_tmp.faa
-			fi
-			if [ "$DB" = "mar" -o "$DB" = "mar_db" ]
-			then
-				echo "${GREEN}Downloading MarDB metadata from MMP (databasesapi.sfb.uit.no)${NC}"
-				MARDB_VERSION=$(curl -Ls -o /dev/null -w %{url_effective} https://databasesapi.sfb.uit.no/rest/v1/MarDB/records | grep -Po 'ver=\K\d+\.\d+')
-				echo "${GREEN}Current MarDB version is: ${MARDB_VERSION}${NC}"
-				curl "https://databasesapi.sfb.uit.no/rpc/v1/MarDB/graphs?x%5Basmbl%3Asequences%5D=each&y_yName%5Btax%3Aorganism%5D=setR" -o $DB/MarDB.json -L
-				[ -r $DB/MarDB.json ] || { echo -e "${RED}Missing file MarDB.json${NC}"; exit 1; }
-				MARDB_COUNT=$(jq .graph[].x $DB/MarDB.json | wc -l)
-				echo "${GREEN}Downloading MarDB complete genomes from the Marine Metagenomics Portal using $parallelDL threads${NC}"
-				jq .graph[].x $DB/MarDB.json | tr -d '"' | xargs -I{} -P $parallelDL wget -P $DB/source -q -np --recursive https://public.sfb.uit.no/MarDB/genomes/{}/protein.faa || true
-				mv -n $DB/source/public.sfb.uit.no/MarDB/genomes/* $DB/source
-				rm -rf $DB/source/public.sfb.uit.no
-				echo "${GREEN}Converting MarRef data to Kaiju format${NC}"
-				python $SCRIPTDIR/kaiju-convertMAR.py --ref $DB/MarDB.json --genomes $DB/source >> $DB/kaiju_db_tmp.faa
-			fi
-		fi
-	fi
-	echo "${GREEN}Performing Perl oneliner-wizardry${NC}"
-	cat $DB/kaiju_db_tmp.faa | perl -lsne 'BEGIN{open(F,$m);while(<F>){@F=split(/[\|\s]+/);$h{$F[0]}=$F[1]}}if(/(>.+)_(\d+)/){print $1,"_",defined($h{$2})?$h{$2}:$2;}else{print}' -- -m=merged.dmp > $DB/kaiju_db_$DB.faa
-	rm $DB/kaiju_db_tmp.faa
-	echo "${GREEN}Creating Borrows-Wheeler transform${NC}"
-	kaiju-mkbwt -n $threadsBWT -e $exponentSA -a ACDEFGHIKLMNPQRSTVWY -o $DB/kaiju_db_$DB $DB/kaiju_db_$DB.faa
-	echo "${GREEN}Creating FM-Index${NC}"
-	kaiju-mkfmi $DB/kaiju_db_$DB
-	if [ "$DB" = "mar" ]
-	then
-		echo "${GREEN}Added MarRef v${MARREF_VERSION}\n--Metadata contains ${MARREF_COUNT} entries${NC}"
-		echo "${GREEN}Added MarDB v${MARDB_VERSION}\n--Metadata contains ${MARDB_COUNT} entries${NC}"
-		MARREF_MARDB_COUNT=`expr ${MARREF_COUNT} + ${MARDB_COUNT}`
-		echo "${GREEN}Combined\n--Metadata contains: ${MARREF_MARDB_COUNT} entries"
-	fi
-	if [ "$DB" = "mar_ref" ]
-	then
-		echo "${GREEN}Added MarRef v${MARREF_VERSION}\n--Metadata contains ${MARREF_COUNT} entries${NC}"
-	fi
-	if [ "$DB" = "mar_db" ]
-	then
-		echo "${GREEN}Added MarDB v${MARDB_VERSION}\n--Metadata contains ${MARDB_COUNT} entries${NC}"
-	fi
-	echo "${GREEN}\nCreated database ${DB}/ has sequences from `ls -1 $DB/source|wc -l` genomes.\n(This number should add up to total metadata entries. If not, some genomes have missing sequence data either from NCBI or from local MMP backend processing for various reasons and/or criteria)${NC}"
-	echo "${GREEN}\nYou should keep this information${NC}"
-	echo "${GREEN}Read more about the Mar databases here: https://mmp2.sfb.uit.no/databases/${NC}"
-fi
 #----------------------------------------------------------------------------------------------------------------------------------
 if [ "$DB" = "nr_euk" ]
 then