Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dada2 #2483

Merged
merged 2 commits into from
Nov 8, 2019
Merged

dada2 #2483

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ script:
planemo test --galaxy_branch "$GALAXY_RELEASE" --galaxy_source "$GALAXY_REPO" $(cat changed_tools_chunk.list)
elif [ -s changed_repositories_chunk.list ]; then
while read -r DIR; do
planemo test --galaxy_branch "$GALAXY_RELEASE" --galaxy_source "$GALAXY_REPO" "$DIR"
if [[ "$DIR" =~ ^data_managers.* ]]; then
TESTPATH=$(planemo ci_find_tools "$DIR")
else
TESTPATH="$DIR"
fi
planemo test --galaxy_branch "$GALAXY_RELEASE" --galaxy_source "$GALAXY_REPO" "$TESTPATH"
done < changed_repositories_chunk.list
fi
10 changes: 10 additions & 0 deletions data_managers/data_manager_dada2/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Tool Shed metadata for the DADA2 reference-database data manager.
name: data_manager_dada2
owner: iuc
description: Data manager to download DADA2 reference databases
homepage_url: https://benjjneb.github.io/dada2/index.html
# Block scalar content must be indented relative to its key.
long_description: |
  "DADA2: Fast and accurate sample inference from amplicon data with single-nucleotide resolution"
remote_repository_url: "https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_dada2"
type: unrestricted
categories:
  - Data Managers
181 changes: 181 additions & 0 deletions data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
<?xml version="1.0"?>
<!--
    Galaxy data manager tool that downloads DADA2 reference databases.

    The selected database and version are joined with "_" to form the
    dataset key passed to data_manager.py (e.g. "silva" + "132" gives
    "silva_132"); every option combination offered here must therefore have
    a matching key in the lookup tables of data_manager.py.
-->
<tool id="dada2_fetcher" name="dada2 data manager" tool_type="manage_data" version="0.0.7">
    <description>Download reference databases</description>
    <command detect_errors="exit_code"><![CDATA[
        #set dataset = str($db_cond.db_select) + '_' + str($db_cond.version_select)
        python '$__tool_directory__/data_manager.py'
            --out '$out_file'
            --dataset '$dataset'
    ]]></command>
    <inputs>
        <!-- Each <when> offers only the versions available for that database. -->
        <conditional name="db_cond">
            <param name="db_select" type="select" label="Taxonomic database">
                <option value="silva">Silva</option>
                <option value="rdp">RDP</option>
                <option value="greengenes">GreenGenes</option>
                <option value="unite">UNITE Fungi: General Fasta</option>
                <!-- UNITE Eukaryotes not yet supported https://github.com/benjjneb/dada2/issues/702 -->
                <option value="RefSeq_RDP">NCBI RefSeq 16S rRNA database supplemented by RDP</option>
                <option value="gtdb">GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea)</option>
                <option value="hitdb">HitDB (Human InTestinal 16S)</option>
                <option value="silva_euk_18S">Silva Eukaryotic 18S</option>
                <option value="PR2">Protist Ribosomal Reference database (PR2)</option>
            </param>
            <when value="silva">
                <param name="version_select" type="select" label="Database version">
                    <option value="132">132</option>
                    <option value="128">128</option>
                </param>
            </when>
            <when value="rdp">
                <param name="version_select" type="select" label="Database version">
                    <option value="16">16</option>
                    <option value="14">14</option>
                </param>
            </when>
            <when value="greengenes">
                <param name="version_select" type="select" label="Database version">
                    <option value="13.84">13.84</option>
                </param>
            </when>
            <when value="unite">
                <param name="version_select" type="select" label="Database version">
                    <option value="8.0_fungi">release 8.0 for Fungi</option>
                    <option value="8.0_fungi_singletons">release 8.0 for Fungi including global and 97% singletons</option>
                </param>
            </when>
            <when value="RefSeq_RDP">
                <param name="version_select" type="select" label="Database version">
                    <option value="2018_05">05/2018</option>
                </param>
            </when>
            <when value="gtdb">
                <param name="version_select" type="select" label="Database version">
                    <option value="2018_11">11/2018</option>
                </param>
            </when>
            <when value="hitdb">
                <param name="version_select" type="select" label="Database version">
                    <option value="1">1</option>
                </param>
            </when>
            <when value="silva_euk_18S">
                <param name="version_select" type="select" label="Database version">
                    <option value="132">132</option>
                </param>
            </when>
            <when value="PR2">
                <param name="version_select" type="select" label="Database version">
                    <option value="4.11.1">4.11.1</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <tests>
        <!-- One test per database/version combination offered above. -->
        <test>
            <param name="db_cond|db_select" value="silva"/>
            <param name="db_cond|version_select" value="132"/>
            <output name="out_file" file="silva132_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="rdp"/>
            <param name="db_cond|version_select" value="16"/>
            <output name="out_file" file="rdp16_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="greengenes"/>
            <param name="db_cond|version_select" value="13.84"/>
            <output name="out_file" file="greengenes13.84_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="unite"/>
            <param name="db_cond|version_select" value="8.0_fungi"/>
            <output name="out_file" file="unite8fungi_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="unite"/>
            <param name="db_cond|version_select" value="8.0_fungi_singletons"/>
            <output name="out_file" file="unite8fungisingletons_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="RefSeq_RDP"/>
            <param name="db_cond|version_select" value="2018_05"/>
            <output name="out_file" file="RefSeq_RDP2018_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="gtdb"/>
            <param name="db_cond|version_select" value="2018_11"/>
            <output name="out_file" file="gtdb2018_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="hitdb"/>
            <param name="db_cond|version_select" value="1"/>
            <output name="out_file" file="hitdb1_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="silva_euk_18S"/>
            <param name="db_cond|version_select" value="132"/>
            <output name="out_file" file="silvaeuk132_json"/>
        </test>
        <test>
            <param name="db_cond|db_select" value="PR2"/>
            <param name="db_cond|version_select" value="4.11.1"/>
            <output name="out_file" file="PR24.11.1_json"/>
        </test>
    </tests>
    <!-- The help is reStructuredText: keep it at column 0, indentation is markup. -->
    <help><![CDATA[
Public Reference databases maintained by the DADA2 project
..........................................................

The following reference databases, which are described as maintained by the DADA2 project (https://benjjneb.github.io/dada2/training.html), are available:

- Silva (https://www.arb-silva.de/)
- RDP (http://rdp.cme.msu.edu/)
- GreenGenes (http://greengenes.secondgenome.com/)
- UNITE general FASTA (https://unite.ut.ee/repository.php)

While Silva and RDP contain reference databases for taxonomy and species assignment, the GreenGenes and UNITE databases only contain a reference database for taxonomy assignment.

For the Silva databases check the license information: http://www.arb-silva.de/silva-license-information.

Except for UNITE, all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. The UNITE databases are taken from the links provided on the UNITE website.

More detailed information on the reference databases can be found on the DADA2 website and the links contained there: https://benjjneb.github.io/dada2/training.html.

Further public Reference databases listed by the DADA2 project
..............................................................

Several contributed reference databases are listed on the DADA2 project website (https://benjjneb.github.io/dada2/training.html):

- RefSeq + RDP (NCBI RefSeq 16S rRNA database supplemented by RDP)
- GTDB: Genome Taxonomy Database (More info: http://gtdb.ecogenomic.org/)
- HitDB version 1 (Human InTestinal 16S rRNA) (https://github.com/microbiome/HITdb)
- RDP fungi LSU
- Silva Eukaryotic 18S
- PR2 (https://github.com/pr2database/pr2database)

Except for PR2, all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. The PR2 database is taken from their github page.

More detailed information on the reference databases can be found on the DADA2 website and the links contained there: https://benjjneb.github.io/dada2/training.html.
    ]]></help>
    <citations>
        <!-- silva -->
        <citation type="doi">10.1093/nar/gks1219</citation>
        <!-- rdp -->
        <citation type="doi">10.1093/nar/gkt1244</citation>
        <!-- greengenes -->
        <citation type="doi">10.1128/AEM.03006-05</citation>
        <!-- unite -->
        <citation type="doi">10.15156/BIO/786343</citation>
        <!-- TODO gtdb ??? -->
        <!-- hitdb -->
        <citation type="doi">10.1186/s12864-015-2265-y</citation>
        <!-- PR2 -->
        <citation type="doi">10.1093/nar/gks1160</citation>
    </citations>
</tool>

133 changes: 133 additions & 0 deletions data_managers/data_manager_dada2/data_manager/data_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import argparse
import json
import os
try:
# For Python 3.0 and later
from urllib.request import Request, urlopen
except ImportError:
# Fall back to Python 2 imports
from urllib2 import Request, urlopen

# Lookup tables for the supported reference databases. Every table is keyed
# by the dataset identifier received via --dataset, which has the form
# "<database>_<version>" (e.g. "silva_132").

# Taxonomic ranks reported for a dataset that has no entry in FILE2TAXLEVELS.
DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species"

# Dataset identifier -> human readable name stored in the data table entry.
# NOTE(review): the "gtdb_2018_11" name contains a literal XML entity
# ("&amp;") although this is a Python string, not XML; it looks like a
# copy-paste from the tool XML. Confirm how the name is rendered before
# changing it, since expected test output may contain the same text.
FILE2NAME = {
"silva_132": "Silva version 132",
"silva_128": "Silva version 128",
"rdp_16": "RDP trainset 16",
"rdp_14": "RDP trainset 14",
"greengenes_13.84": "GreenGenes version 13.84",
"unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi",
"unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons",
"RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)",
"gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea) (11/2018)",
"hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)",
"silva_euk_18S_132": "Silva version 132 Eukaryotic 18S",
"PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1"
}

# Dataset identifier -> download URL of the taxonomy training set.
# The UNITE entries point to zip archives (handled specially by
# url_download); all other entries point to gzipped fasta files.
FILE2TAXURL = {
"silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
"silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
"rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
"rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
"unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip",
"unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip",
"greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
"RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1",
"gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1",
"hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1",
"silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1",
"PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz"
}

# Dataset identifier -> download URL of the species assignment file.
# Only datasets listed here get a "dada2_species" data table entry.
FILE2SPECIESURL = {
"silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
"silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
"rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
"rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1"
}

# Dataset identifier -> taxonomic ranks, for datasets whose ranks differ
# from DEFAULT_TAXLEVELS.
FILE2TAXLEVELS = {
"PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"
}


def url_download(url, fname, workdir):
    """
    Download url to workdir/fname.

    workdir is created if it does not exist yet. For UNITE downloads (fname
    starting with "unite") the downloaded zip archive is unpacked into
    workdir and workdir/fname is then overwritten with a gzipped copy of the
    single top level fasta file of the archive.

    :param url: URL to fetch
    :param fname: name of the file to create inside workdir
    :param workdir: directory receiving the download
    :raises Exception: if a UNITE archive does not contain exactly one top
        level fasta file
    """
    file_path = os.path.join(workdir, fname)
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    src = None
    try:
        src = urlopen(Request(url))
        with open(file_path, 'wb') as dst:
            # stream in chunks so large databases are never held in memory
            while True:
                chunk = src.read(2**16)
                if not chunk:
                    break
                dst.write(chunk)
    finally:
        # the response object is not a context manager on Python 2
        if src:
            src.close()

    # special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
    if fname.startswith("unite"):
        import glob
        import gzip
        import shutil
        import zipfile
        # unzip download; the context manager closes the archive even if
        # extraction fails
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(workdir)
        # gzip top level fasta file (the copy below developer/ is not
        # matched because the pattern only covers workdir itself)
        fastas = glob.glob("%s/*fasta" % workdir)
        if len(fastas) != 1:
            msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas))
            raise Exception(msg)
        with open(fastas[0], 'rb') as f_in:
            with gzip.open(file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)


def remote_dataset(dataset, outjson):
    """
    Download a reference dataset and write the data manager JSON.

    The file at outjson is first read as JSON to obtain the directory the
    downloads go to (output_data[0].extra_files_path) and is afterwards
    overwritten with the data table entries: always one "dada2_taxonomy"
    entry, plus one "dada2_species" entry if the dataset has a species
    assignment file (i.e. a key in FILE2SPECIESURL).

    :param dataset: dataset identifier, a key of FILE2TAXURL / FILE2NAME
    :param outjson: path of the JSON file that is read and then rewritten
    :raises KeyError: if dataset is not a known identifier
    """
    with open(outjson) as jf:
        params = json.load(jf)

    workdir = params['output_data'][0]['extra_files_path']
    # tolerate an already existing directory instead of failing in os.mkdir
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    url_download(FILE2TAXURL[dataset], dataset + ".taxonomy", workdir)

    data_manager_json = {"data_tables": {}}
    data_manager_entry = {}
    data_manager_entry['value'] = dataset
    data_manager_entry['name'] = FILE2NAME[dataset]
    data_manager_entry['path'] = dataset + ".taxonomy"
    data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS)
    data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry

    # only some databases ship a separate species assignment file
    species_url = FILE2SPECIESURL.get(dataset)
    if species_url:
        url_download(species_url, dataset + ".species", workdir)
        data_manager_entry = {}
        data_manager_entry['value'] = dataset
        data_manager_entry['name'] = FILE2NAME[dataset]
        data_manager_entry['path'] = dataset + ".species"
        data_manager_json["data_tables"]["dada2_species"] = data_manager_entry

    # open() instead of the Python 2 only builtin file(), which raises a
    # NameError on Python 3
    with open(outjson, 'w') as jf:
        jf.write(json.dumps(data_manager_json))


if __name__ == '__main__':
    # Command line entry point: both options are mandatory, so a missing one
    # is reported as a usage error by argparse instead of surfacing later as
    # a TypeError/KeyError inside remote_dataset.
    parser = argparse.ArgumentParser(description='Create data manager json.')
    parser.add_argument('--out', action='store', required=True, help='JSON filename')
    parser.add_argument('--dataset', action='store', required=True, help='Download data set name')
    args = parser.parse_args()

    remote_dataset(args.dataset, args.out)
34 changes: 34 additions & 0 deletions data_managers/data_manager_dada2/data_manager_conf.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<!--
    Data manager configuration for the dada2_fetcher tool.

    Two data tables are filled from the tool output "out_file":
    dada2_taxonomy (with an additional taxlevels column) and dada2_species.
    In both tables the "path" column refers to a downloaded file that is
    moved below ${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/ and then rewritten
    to the absolute location of the moved file.
-->
<data_managers>
    <data_manager tool_file="data_manager/dada2_fetcher.xml" id="dada2_fetcher">

        <!-- reference databases for taxonomy assignment -->
        <data_table name="dada2_taxonomy">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="out_file">
                    <move type="file" relativize_symlinks="True">
                        <source>${path}</source>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
                <column name="taxlevels" />
            </output>
        </data_table>

        <!-- reference databases for species assignment -->
        <data_table name="dada2_species">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="out_file">
                    <move type="file" relativize_symlinks="True">
                        <source>${path}</source>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>

    </data_manager>
</data_managers>
1 change: 1 addition & 0 deletions data_managers/data_manager_dada2/test-data/PR24.11.1_json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "PR2_4.11.1.taxonomy", "name": "Protist Ribosomal Reference database (PR2) 4.11.1", "value": "PR2_4.11.1", "taxlevels": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "RefSeq_RDP_2018_05.taxonomy", "name": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "value": "RefSeq_RDP_2018_05", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}}
9 changes: 9 additions & 0 deletions data_managers/data_manager_dada2/test-data/dada2_species.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This is a sample file distributed with Galaxy that is used to define a
# list of dada2 reference data sets for species assignment, using three
# tab separated columns:
#
# <unique_build_id> <display_name> <fasta_file_path>
#
# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html
Loading