From f327c1241b5d4b20990ac1fd5fc2080d4306c941 Mon Sep 17 00:00:00 2001 From: hadley Date: Mon, 17 Dec 2018 14:49:52 -0500 Subject: [PATCH] add examples --- examples/HCV1a.json | 348 ++++++ examples/HIVE_metagenomics.json | 270 +++++ examples/LICENSE | 25 + examples/README.md | 13 + examples/UVP.json | 1146 +++++++++++++++++++ examples/glycosylation-sites-UniCarbKB.json | 260 +++++ 6 files changed, 2062 insertions(+) create mode 100644 examples/HCV1a.json create mode 100644 examples/HIVE_metagenomics.json create mode 100644 examples/LICENSE create mode 100644 examples/README.md create mode 100644 examples/UVP.json create mode 100644 examples/glycosylation-sites-UniCarbKB.json diff --git a/examples/HCV1a.json b/examples/HCV1a.json new file mode 100644 index 0000000..7042769 --- /dev/null +++ b/examples/HCV1a.json @@ -0,0 +1,348 @@ +{ + "bco_id": "https://w3id.org/biocompute/1.3.0/examples/HCV1a.json", + "checksum": "06DACE70679F35BA87A3DD6FFFED4ED24A4F5B8C2571264C37E5F1B3ADE04A31", + "bco_spec_version" : "https://w3id.org/biocompute/1.3.0/", + "provenance_domain": { + "name": "HCV1a ledipasvir resistance SNP detection", + "version": "2.9", + "review": [ + { + "status": "approved", + "reviewer_comment": "Approved by GW staff. 
Waiting for approval from FDA Reviewer", + "date": "2017-11-12T12:30:48-0400", + "reviewer": { + "name": "Charles Hadley King", + "affiliation": "George Washington University", + "email": "hadley_king@gwu.edu", + "contribution": ["curatedBy"], + "orcid": "https://orcid.org/0000-0003-1409-4549" + } + }, + { + "status": "approved", + "reviewer_comment": "The revised BCO looks fine", + "date": "2017-12-12T12:30:48-0400", + "reviewer": { + "name": "Eric Donaldson", + "affiliation": "FDA", + "email": "Eric.Donaldson@fda.hhs.gov", + "contribution": ["curatedBy"] + } + } + ], + "obsolete_after" : "2118-09-26T14:43:43-0400", + "embargo" : { + "start_time": "2000-09-26T14:43:43-0400", + "end_time": "2000-09-26T14:43:45-0400" + }, + "created": "2017-01-24T09:40:17-0500", + "modified": "2018-09-21T14:06:14-0400", + "contributors": [ + { + "name": "Charles Hadley King", + "affiliation": "George Washington University", + "email": "hadley_king@gwu.edu", + "contribution": ["createdBy", "curatedBy"], + "orcid": "https://orcid.org/0000-0003-1409-4549" + }, + { + "name": "Eric Donaldson", + "affiliation": "FDA", + "email": "Eric.Donaldson@fda.hhs.gov", + "contribution": ["authoredBy"] + } + ], + "license": "https://spdx.org/licenses/CC-BY-4.0.html" + }, + "usability_domain": [ + "Identify baseline single nucleotide polymorphisms (SNPs)[SO:0000694], (insertions)[SO:0000667], and (deletions)[SO:0000045] that correlate with reduced (ledipasvir)[pubchem.compound:67505836] antiviral drug efficacy in (Hepatitis C virus subtype 1)[taxonomy:31646]", + "Identify treatment emergent amino acid (substitutions)[SO:1000002] that correlate with antiviral drug treatment failure", + "Determine whether the treatment emergent amino acid (substitutions)[SO:1000002] identified correlate with treatment failure involving other drugs against the same virus", + "GitHub CWL example: https://github.com/mr-c/hive-cwl-examples/blob/master/workflow/hive-viral-mutation-detection.cwl#L20" + ], + 
"extension_domain":{ + "fhir_extension": [ + { + "fhir_endpoint": "http://fhirtest.uhn.ca/baseDstu3", + "fhir_version": "3", + "fhir_resources": [ + { + "fhir_resource": "Sequence", + "fhir_id": "21376" + }, + { + "fhir_resource": "DiagnosticReport", + "fhir_id": "6288583" + }, + { + "fhir_resource": "ProcedureRequest", + "fhir_id": "25544" + }, + { + "fhir_resource": "Observation", + "fhir_id": "92440" + }, + { + "fhir_resource": "FamilyMemberHistory", + "fhir_id": "4588936" + } + ] + } + ], + "scm_extension": { + "scm_repository": "https://github.com/example/repo1", + "scm_type": "git", + "scm_commit": "c9ffea0b60fa3bcf8e138af7c99ca141a6b8fb21", + "scm_path": "workflow/hive-viral-mutation-detection.cwl", + "scm_preview": "https://github.com/example/repo1/blob/c9ffea0b60fa3bcf8e138af7c99ca141a6b8fb21/workflow/hive-viral-mutation-detection.cwl" + } + }, + "description_domain": { + "keywords": [ + "HCV1a", + "Ledipasvir", + "antiviral resistance", + "SNP", + "amino acid substitutions" + ], + "xref": [ + { + "namespace": "pubchem.compound", + "name": "PubChem-compound", + "ids": ["67505836"], + "access_time": "2018-13-02T10:15-05:00" + }, + { + "namespace": "pubmed", + "name": "PubMed", + "ids": ["26508693"], + "access_time": "2018-13-02T10:15-05:00" + }, + { + "namespace": "so", + "name": "Sequence Ontology", + "ids": ["SO:000002", "SO:0000694", "SO:0000667", "SO:0000045"], + "access_time": "2018-13-02T10:15-05:00" + }, + { + "namespace": "taxonomy", + "name": "Taxonomy", + "ids": ["31646"], + "access_time": "2018-13-02T10:15-05:00" + } + ], + "platform": ["HIVE"], + "pipeline_steps": [ + { + "step_number": 1, + "name": "HIVE-hexagon", + "description": "Alignment of reads to a set of references", + "version": "1.3", + "prerequisite": [ + { + "name": "Hepatitis C virus genotype 1", + "uri": { + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/22129792", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "name": "Hepatitis C virus type 1b complete genome", + "uri": 
{ + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/5420376", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "name": "Hepatitis C virus (isolate JFH-1) genomic RNA", + "uri": { + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/13122261", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "name": "Hepatitis C virus clone J8CF, complete genome", + "uri": { + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/386646758", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "name": "Hepatitis C virus S52 polyprotein gene", + "uri": { + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/295311559", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ], + "input_list": [ + { + "uri": "http://example.com/dna.cgi?cmd=objFile&ids=514683", + "access_time": "2017-01-24T09:40:17-0500" + }, + { + "uri": "http://example.com/dna.cgi?cmd=objFile&ids=514682", + "access_time": "2017-01-24T09:40:17-0500" + } + ], + "output_list": [ + { + "uri": "http://example.com/data/514769/allCount-aligned.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + ] + }, + { + "step_number": 2, + "name": "HIVE-heptagon", + "description": "variant calling", + "version": "1.3", + "input_list": [ + { + "uri": "http://example.com/data/514769/dnaAccessionBased.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + ], + "output_list": [ + { + "uri": "http://example.com/data/514801/SNPProfile.csv", + "access_time": "2017-01-24T09:40:17-0500" + }, + { + "uri": "http://example.com/data/14769/allCount-aligned.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + ] + } + ] + }, + "execution_domain": { + "script":[ + { + "uri": { + "uri": "https://example.com/workflows/antiviral_resistance_detection_hive.py" + } + } + ], + "script_driver": "shell", + "software_prerequisites": [ + { + "name": "HIVE-hexagon", + "version": "babajanian.1", + "uri": { + "uri": "http://example.com/dna.cgi?cmd=dna-hexagon&cmdMode=-", + "access_time": "2017-01-24T09:40:17-0500", + "sha1_chksum": 
"d60f506cddac09e9e816531e7905ca1ca6641e3c" + } + }, + { + "name": "HIVE-heptagon", + "version": "albinoni.2", + "uri": { + "uri": "http://example.com/dna.cgi?cmd=dna-heptagon&cmdMode=-", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ], + "external_data_endpoints": [ + { + "name": "HIVE", + "url": "http://example.com/dna.cgi?cmd=login" + }, + { + "name": "access to e-utils", + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + } + ], + "environment_variables": { + "HOSTTYPE": "x86_64-linux", + "EDITOR": "vim" + } + }, + "parametric_domain": [ + {"param": "seed", "value": "14", "step": "1"}, + {"param":"minimum_match_len", "value": "66", "step": "1"}, + {"param": "divergence_threshold_percent", "value": "0.30", "step": "1"}, + {"param": "minimum_coverage", "value": "15", "step": "2"}, + {"param": "freq_cutoff", "value": "0.10", "step": "2"} + ], + "io_domain": { + "input_subdomain": [ + { + "uri": { + "filename": "Hepatitis C virus genotype 1", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/22129792", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus type 1b complete genome", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/5420376", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus (isolate JFH-1) genomic RNA", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/13122261", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/386646758", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus S52 polyprotein gene", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/295311559", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "HCV1a_drug_resistant_sample0001-01", + "uri": "http://example.com/nuc-read/514682", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + + "uri": { + "filename": "HCV1a_drug_resistant_sample0001-02", + 
"uri": "http://example.com/nuc-read/514683", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ], + "output_subdomain": [ + { + "mediatype": "text/csv", + "uri": { + "uri": "http://example.com/data/514769/dnaAccessionBased.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://example.com/data/514801/SNPProfile*.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ] + }, + "error_domain": { + "empirical_error": { + "false_negative_alignment_hits": "<0.0010", + "false_discovery": "<0.05" + }, + "algorithmic_error": { + "false_positive_mutation_calls_discovery": "<0.00005", + "false_discovery": "0.005" + } + } +} diff --git a/examples/HIVE_metagenomics.json b/examples/HIVE_metagenomics.json new file mode 100644 index 0000000..fcecc6d --- /dev/null +++ b/examples/HIVE_metagenomics.json @@ -0,0 +1,270 @@ +{ + "bco_id": "https://w3id.org/biocompute/1.3.0/examples/HIVE_metagenomics.json", + "checksum": "ECD541AE0F61AAAAA1FAC14B2B08ABE18F610E1AA4677D54E89B292550F5058A", + "bco_spec_version" : "https://w3id.org/biocompute/1.3.0/", + "provenance_domain": { + "name": "Healthy human fecal metagenomic diversity", + "version": "1.0.0", + "review": [ + { + "status": "approved", + "reviewer_comment": "Approved by GW staff.", + "reviewer": { + "name": "Charles Hadley King", + "affiliation": "George Washington University", + "email": "hadley_king@gwu.edu", + "contribution": ["curatedBy"], + "orcid": "https://orcid.org/0000-0003-1409-4549" + } + } + ], + "obsolete_after" : "2118-09-26T14:43:43-0400", + "embargo" : { + "start_time": "2000-09-26T14:43:43-0400", + "end_time": "2000-09-26T14:43:45-0400" + }, + "created": "2018-11-29T11:29:08-0500", + "modified": "2018-11-30T11:29:08-0500", + "contributors": [ + { + "name": "Charles Hadley King", + "affiliation": "George Washington University", + "email": "hadley_king@gwu.edu", + "contribution": ["createdBy", "curatedBy", "authoredBy"], + "orcid": 
"https://orcid.org/0000-0003-1409-4549" + }, + { + "name": "Raja Mazumder", + "affiliation": "George Washington University", + "email": "mazumder@gwu.edu", + "contribution": ["createdBy", "curatedBy", "authoredBy"], + "orcid": "https://orcid.org/0000-0001-88238-9945" + } + ], + "license": "https://spdx.org/licenses/CC-BY-4.0.html" + }, + "usability_domain": [ + "Identify the most common organism present in a human [taxID:9606] fecal [UBERON:0001988] sample, ", + "Identify the general community composition of organisms in a human [taxID:9606] fecal [UBERON:0001988] sample, ", + "CensuScope is used to do a census of the composition of the read files. Based on a user-defined threshold, organisms identified are used for alignment in the Hexagon alignment." + ], + "extension_domain":{ + "scm_extension": { + "scm_repository": "https://github.com/biocompute-objects/HIVE_metagenomics", + "scm_type": "git", + "scm_commit": "e4620f642fb20557f6c679397696614305ed07b1", + "scm_path": "biocompute-objects/HIVE_metagenomics", + "scm_preview": "https://github.com/example/repo1/blob/c9ffea0b60fa3bcf8e138af7c99ca141a6b8fb21/workflow/hive-viral-mutation-detection.cwl" + } + }, + "description_domain": { + "keywords": [ + "metagenome", + "metagenomic analysis", + "fecal" + ], + "xref": [ + { + "namespace": "uberon", + "name": "Uber Anatomy Ontology", + "ids": ["0001988"], + "access_time": "2016-11-30T06:46-0500" + }, + { + "namespace": "taxonomy", + "name": "Taxonomy", + "ids": ["9606"], + "access_time": "2016-11-30T06:46-0500" + } + ], + "platform": ["hive"], + "pipeline_steps": [ + { + "step_number": 1, + "name": "CensuScope", + "description": "Detect taxonomic composition of a metagenomic data set.", + "version": "1.3", + "prerequisite": [ + { + "name": "Filtered_NT_feb18_2016", + "uri": { + "uri": "https://hive.biochemistry.gwu.edu/genome/513957", + "access_time": "2016-11-30T06:46-0500" + } + } + ], + "input_list": [ + { + "uri": "https://hive.biochemistry.gwu.edu/nuc-read/545722", 
+ "access_time": "2016-11-30T06:46-0500" + }, + { + "uri": "https://hive.biochemistry.gwu.edu/nuc-read/545721", + "access_time": "2016-11-30T06:46-0500" + } + ], + "output_list": [ + { + "uri": "https://hive.biochemistry.gwu.edu/546223/dnaAccessionBasedResult.csv", + "access_time": "2016-11-30T06:46-0500" + } + ] + }, + { + "step_number": 2, + "name": "HIVE-hexagon", + "description": "Alignment of reads to a set of references", + "version": "1.3", + "input_list": [ + { + "uri": "http://example.com/data/546223/dnaAccessionBased.csv", + "access_time": "2016-11-30T06:46-0500" + }, + { + "uri": "https://hive.biochemistry.gwu.edu/nuc-read/545722", + "access_time": "2016-11-30T06:46-0500" + }, + { + "uri": "https://hive.biochemistry.gwu.edu/nuc-read/545721", + "access_time": "2016-11-30T06:46-0500" + } + ], + "output_list": [ + { + "uri": "https://hive.biochemistry.gwu.edu/546232/alCount-Unalignedo524569-alCount--1.csv", + "access_time": "2016-11-30T06:46-0500" + } + ] + } + ] + }, + "execution_domain": { + "script": [ + { + "uri": { + "uri": "https://github.com/biocompute-objects/HIVE_metagenomics/blob/master/driverHIVEmetagenomic.py" + } + } + ], + "script_driver": "shell", + "software_prerequisites": [ + { + "name": "CensuScope", + "version": "albinoni.2", + "uri": { + "uri": "http://example.com/dna.cgi?cmd=dna-screening&cmdMode=-", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "name": "HIVE-hexagon", + "version": "babajanian.1", + "uri": { + "uri": "http://example.com/dna.cgi?cmd=dna-hexagon&cmdMode=-", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ], + "external_data_endpoints": [ + { + "name": "HIVE", + "url": "https://hive.biochemistry.gwu.edu/dna.cgi?cmd=login" + }, + { + "name": "access to e-utils", + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + } + ], + "environment_variables": { + "key": "HOSTTYPE", + "value" : "x86_64-linux" + } + }, + "parametric_domain": [ + {"param": "seed", "value": "14", "step": "2"}, + 
{"param":"minimum_match_len", "value": "66", "step": "2"}, + {"param": "divergence_threshold_percent", "value": "0.30", "step": "2"}, + {"param": "minimum_coverage", "value": "15", "step": "2"}, + {"param": "freq_cutoff", "value": "0.10", "step": "2"} + ], + "io_domain": { + "input_subdomain": [ + { + "uri": { + "filename": "Hepatitis C virus genotype 1", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/22129792", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus type 1b complete genome", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/5420376", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus (isolate JFH-1) genomic RNA", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/13122261", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus clone J8CF, complete genome", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/386646758", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "Hepatitis C virus S52 polyprotein gene", + "uri": "http://www.ncbi.nlm.nih.gov/nuccore/295311559", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "uri": { + "filename": "HCV1a_drug_resistant_sample0001-01", + "uri": "http://example.com/nuc-read/514682", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + + "uri": { + "filename": "HCV1a_drug_resistant_sample0001-02", + "uri": "http://example.com/nuc-read/514683", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ], + "output_subdomain": [ + { + "mediatype": "text/csv", + "uri": { + "uri": "http://example.com/data/514769/dnaAccessionBased.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://example.com/data/514801/SNPProfile*.csv", + "access_time": "2017-01-24T09:40:17-0500" + } + } + ] + }, + "error_domain": { + "empirical_error": { + "false_negative_alignment_hits": "<0.0010", + 
"false_discovery": "<0.05" + }, + "algorithmic_error": { + "false_positive_mutation_calls_discovery": "<0.00005", + "false_discovery": "0.005" + } + } +} diff --git a/examples/LICENSE b/examples/LICENSE new file mode 100644 index 0000000..d4c4219 --- /dev/null +++ b/examples/LICENSE @@ -0,0 +1,25 @@ +BSD 2-Clause License + +Copyright (c) 2018, BioCompute +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..118415d --- /dev/null +++ b/examples/README.md @@ -0,0 +1,13 @@ +# BCO Examples +A repository for BCO example flat files. 
+ +## Table of Contents: + +* [HCV1a](HCV1a.json) - This BCO was developed with the [Reproducibility and Interpretation use case](https://github.com/biocompute-objects/BCO_Specification/blob/master/introduction.md#reproducibility-and-interpretation-use-case) in mind. This is the archetypal BCO example and is in the [BCO Specification](https://github.com/biocompute-objects/BCO_Specification) repository. + +* [glycosylation-sites-UniCarbKB](glycosylation-sites-UniCarbKB.json) - This BCO was developed with the [Data integration use case](https://github.com/biocompute-objects/BCO_Specification/blob/master/introduction.md#data-integration-use-case) in mind. The full repository is available [here](https://github.com/biocompute-objects/Dataset-BCO) + + +* [UVP](UVP.json) - This BCO was developed with the [Accountability use case](https://github.com/biocompute-objects/BCO_Specification/blob/master/introduction.md#accountability-use-case) in mind. The full repository is available [here](https://github.com/biocompute-objects/UVP-BCO) + +* [HIVE_metagenomics](HIVE_metagenomics.json) - This BCO was developed with the [Reusability Use Case](https://github.com/biocompute-objects/BCO_Specification/blob/master/introduction.md#reusability-use-case) in mind. The full repository is available [here](https://github.com/biocompute-objects/HIVE_metagenomics) \ No newline at end of file diff --git a/examples/UVP.json b/examples/UVP.json new file mode 100644 index 0000000..5161eed --- /dev/null +++ b/examples/UVP.json @@ -0,0 +1,1146 @@ +{ + "bco_id": "https://w3id.org/biocompute/1.3.0/examples/UVP_BCO.json", + "checksum": "8098B0E9BF2D8D98A0F3C200774A8BD8E228064F56BFB81DEEE432BB2252B014", + "bco_spec_version": "https://w3id.org/biocompute/1.3.0/", + "provenance_domain": { + "name": "Lineage assignment for an isolate of M. 
tuberculosis based on its single nucleotide polymorphism (SNP) profile based on UVC v1.0.", + "version": "v1.0", + "review": [ + { + "status": "approved", + "reviewer_comment": "Approved by GW staff.", + "date": "2017-11-12T12:30:48-0400", + "reviewer": { + "name": "Anjan Purkayastha", + "affiliation": "George Washington University", + "email": "anjan.purkayastha@gmail.com", + "contribution": [ + "curatedBy" + ] + } + }, + { + "status": "approved", + "reviewer_comment": "Approved by Critical Path Institute staff.", + "date": "2017-11-12T12:30:48-0400", + "reviewer": { + "name": "Marco Schito", + "affiliation": "Critical Path Institute", + "email": "mschito@c-path.org", + "contribution": [ + "curatedBy" + ] + } + }, + { + "status": "approved", + "date": "2017-11-12T12:30:48-0400", + "reviewer_comment": "Approved by Critical Path Institute staff.", + "reviewer": { + "name": "Kenneth Ramey", + "affiliation": "Critical Path Institute", + "email": "kramey@c-path.org", + "contribution": [ + "curatedBy" + ] + } + } + ], + "obsolete_after": "2118-09-26T14:43:43-0400", + "embargo": { + "start_time": "2000-09-26T14:43:43-0400", + "end_time": "2018-10-08T18:02:33-0400" + }, + "created": "2017-11-12T12:30:48-0400", + "modified": "2018-10-08T18:35:33-0400", + "contributors": [ + { + "name": "Matthew Ezewudo", + "affiliation": "Critical Path Institute", + "email": "mezewudo@c-path.org", + "contribution": [ + "authoredBy" + ] + }, + { + "name": "Jamie Posie", + "affiliation": "CDC Atlanta, GA", + "contribution": [ + "authoredBy" + ] + }, + { + "name": "Anjan Purkayastha", + "affiliation": "George Washington University", + "email": "anjan.purkayastha@gmail.com", + "contribution": [ + "authoredBy", + "curatedBy" + ] + }, + { + "name": "Marco Schito", + "affiliation": "Critical Path Institute", + "email": "mschito@c-path.org", + "contribution": [ + "authoredBy" + ] + }, + { + "name": "Charles Hadley King", + "affiliation": "George Washington University", + "email": 
"hadley_king@gwu.edu", + "contribution": [ + "authoredBy", + "curatedBy" + ], + "orcid": "https://orcid.org/0000-0003-1409-4549" + }, + { + "name": "ReseqTB Consortium", + "affiliation": "Critical Path Institute", + "email": "info@c-path.org", + "contribution": [ + "createdAt" + ] + } + ], + "license": "https://spdx.org/licenses/CC-BY-4.0.html" + }, + "usability_domain": [ + "Lineage assignment for an isolate of M. tuberculosis[taxonomy:1773] based on its single nucleotide polymorphism [so:0000694] (SNP) profile." + ], + "extension_domain": { + "scm_extension": { + "scm_repository": "https://github.com/CPTR-ReSeqTB/UVP", + "scm_type": "git", + "scm_commit": "9e8f588b3cd3f5eebde29f7d2879e1a1e1c1aed3", + "scm_path": "UVP/scripts/UVP.py" + } + }, + "description_domain": { + "keywords": [ + "Mycobacterium tuberculosis", + "Phylogenetics", + "Bacterial lineage analysis", + "Single Nucleotide Polymorphism", + "SNP" + ], + "xref": [ + { + "namespace": "pubmed", + "name": "PubMed", + "ids": [ + "00000" + ], + "access_time": "2018-13-02T10:15-05:00" + }, + { + "namespace": "so", + "name": "Sequence Ontology", + "ids": [ + "0000694" + ], + "access_time": "2018-13-02T10:15-05:00" + }, + { + "namespace": "taxonomy", + "name": "Taxonomy", + "ids": [ + "1773" + ], + "access_time": "2018-13-02T10:15-05:00" + } + ], + "platform": [ + "Linux" + ], + "pipeline_steps": [ + { + "step_number": 1, + "name": "FastQValidator", + "description": "To verify if input file is in fastq format", + "version": "1.0.5", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_1.fastq.gz" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_2.fastq.gz" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/validation/Validation_report.txt" + } + ] + }, + { + "step_number": 2, + "name": "FastQC", + "description": "assess Quality of raw sequence reads", + 
"version": "0.11.5", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_1.fastq.gz" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_2.fastq.gz" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/fastqc/ERR552106_1_fastqc.html" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/fastqc/ERR552106_1_fastqc.zip" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/fastqc/ERR552106_2_fastqc.html" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/fastqc/ERR552106_2_fastqc.zip" + } + ] + }, + { + "step_number": 3, + "name": "Kraken", + "description": "Assesses species specificity of sequence reads", + "version": "0.10.5", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_1.fastq.gz" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_2.fastq.gz" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/kraken/final_report.txt" + } + ] + }, + { + "step_number": 4, + "name": "BWA", + "description": "Aligns sequence reads to reference genome", + "version": "0.7.12", + "prerequisite": [ + { + "name": "M. 
tuberculosis H37Rv genome reference file", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.fa" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_1.fastq.gz" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_2.fastq.gz" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.bam" + } + ] + }, + { + "step_number": 5, + "name": "Qualimap", + "description": "Assess mapping quality of aligned reads", + "version": "2.1.1", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/agogo.css" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/ajax-loader.gif" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/basic.css" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/bgfooter.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/bgtop.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/comment-bright.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/comment-close.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/comment.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/doctools.js" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/down-pressed.png" + }, + { + "uri": 
"http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/down.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/file.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/jquery.js" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/minus.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/plus.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/pygments.css" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/qualimap_logo_small.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/report.css" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/searchtools.js" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/underscore.js" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/up-pressed.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/up.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/css/websupport.js" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_coverage_0to50_histogram.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_coverage_across_reference.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_coverage_histogram.png" + }, + { + "uri": 
"http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_coverage_quotes.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_gc_content_per_window.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_homopolymer_indels.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_insert_size_across_reference.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_insert_size_histogram.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_mapping_quality_across_reference.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_mapping_quality_histogram.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_reads_clipping_profile.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_reads_content_per_read_position.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/images_qualimapReport/genome_uniq_read_starts_histogram.png" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/coverage_across_reference.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/coverage_histogram.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/duplication_rate_histogram.txt" + }, + { + "uri": 
"http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/genome_fraction_coverage.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/homopolymer_indels.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/insert_size_across_reference.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/insert_size_histogram.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/mapped_reads_clipping_profile.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/mapped_reads_gc-content_distribution.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/mapped_reads_nucleotide_content.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/mapping_quality_across_reference.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/raw_data_qualimapReport/mapping_quality_histogram.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/genome_results.txt" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/qualimap/qualimapReport.html" + } + ] + }, + { + "step_number": 6, + "name": "MarkDuplicates", + "description": "Removes duplicate reads from alignment", + "version": "1.134", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.no_dups.bam" + } + ] + }, + { + 
"step_number": 7, + "name": "IndelRealigner", + "description": "Performs re-alignment around insertions and deletions", + "version": "3.4.0", + "prerequisite": [ + { + "name": "M. tuberculosis H37Rv genome reference file", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.fa" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.no_dups.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.realigned.bam" + } + ] + }, + { + "step_number": 8, + "name": "BaseRecalibrator", + "description": "Recalibrates base quality scores", + "version": "3.4.0", + "prerequisite": [ + { + "name": "M. tuberculosis H37Rv genome reference file", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.fa" + } + }, + { + "name": "Variation sites file", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/snps.vcf" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.realigned.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.recalibrated.bam" + } + ] + }, + { + "step_number": 9, + "name": "BuildBamIndex", + "description": "Indexes sorted BAM files for variant calling", + "version": "1.134", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.recalibrated.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.recalibrated.bai" + } + ] + }, + { + "step_number": 10, + "name": "UnifiedGenotyper", + "description": "Calls variant positions 
in alignment", + "version": "3.4.0", + "prerequisite": [ + { + "name": "M. tuberculosis H37Rv genome reference file", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.fa" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.recalibrated.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK.vcf" + }, + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK.mpileup" + } + ] + }, + { + "step_number": 11, + "name": "VCFtools", + "description": "Filters raw VCF to exclude poor quality variants", + "version": "0.1.12b", + "prerequisite": [ + { + "name": "Excluded list file", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/excluded_loci.txt" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK.vcf" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK_filtered.vcf" + } + ] + }, + { + "step_number": 12, + "name": "SnpEff", + "description": "Annotates variants in VCF file", + "version": "4.1", + "prerequisite": [ + { + "name": "M. 
tuberculosis H37Rv GenBank File", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.gbk" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK_filtered.vcf" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK_annotated.vcf" + } + ] + }, + { + "step_number": 13, + "name": "parse_annotation.py", + "description": "Parses annotated VCF to create annotation text file", + "version": "", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK_annotated.vcf" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_Final_annotation.txt" + } + ] + }, + { + "step_number": 14, + "name": "lineage_parser.py", + "description": "Assigns Mycobacterium tuberculosis Complex lineage to isolate", + "version": "", + "prerequisite": [ + { + "name": "Lineage Markers File", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/lineage_markers.txt" + } + } + ], + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_Final_annotation.txt" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106.lineage_report.txt" + } + ] + }, + { + "step_number": 15, + "name": "BEDtools", + "description": "Creates loci based coverage statistics of genome coverage", + "version": "2.17.0", + "input_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/bam_files/ERR552106.recalibrated.bam" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_genome_region_coverage.txt" + } + ] + }, + { + "step_number": 16, + "name": 
"resis_parser.py", + "description": "Creates a coverage depth and width table of all loci in isolate genome", + "version": "", + "input_list": [ + { + "uri": "[path_to_genome_loci_text_file]" + }, + { + "uri": "[path_to_per_position_depth_text_file]" + } + ], + "output_list": [ + { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_Coverage.txt" + } + ] + } + ] + }, + "execution_domain": { + "script": [ + { + "uri": { + "uri": "https://github.com/CPTR-ReSeqTB/UVP/commit/9e8f588b3cd3f5eebde29f7d2879e1a1e1c1aed3" + } + } + ], + "script_driver": "Python", + "software_prerequisites": [ + { + "name": "BEDtools", + "version": "2.17.0", + "uri": { + "uri": "https://github.com/arq5x/bedtools/releases/tag/v2.17.0", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "5e4507c54355a4a38c6d3e7497a2836a123c6655" + } + }, + { + "name": "Bcftools", + "version": "1.2", + "uri": { + "uri": "https://github.com/samtools/bcftools/releases/download/1.2/bcftools-1.2.tar.bz2", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "352908143497da0640b928248165e83212dc4298" + } + }, + { + "name": "BWA", + "version": "0.7.12", + "uri": { + "uri": "https://sourceforge.net/projects/bio-bwa/files/bwa-0.7.12.tar.bz2/download", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "6389ca75328bae6d946bfdd58ff4beb0feebaedd" + } + }, + { + "name": "FastQC", + "version": "0.11.5", + "uri": { + "uri": "https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/fastq_screen_v0.13.0.tar.gz", + "access_time": "2018-10-08T18:35:33-0400" + } + }, + { + "name": "GATK", + "version": "3.4.0", + "uri": { + "uri": "https://github.com/broadgsa/gatk-protected/releases/tag/3.4", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "f19618653a0d23baaf147efe7f14aeb4eeb0cbb8" + } + }, + { + "name": "Kraken", + "version": "0.10.5", + "uri": { + "uri": "https://ccb.jhu.edu/software/kraken/dl/kraken-0.10.5-beta.tgz", + "access_time": 
"2018-10-08T18:35:33-0400" + } + }, + { + "name": "Picard", + "version": "1.134", + "uri": { + "uri": "https://github.com/broadinstitute/picard/releases/tag/1.134", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "a7a08c474e4d99346eec7a9956a8fe71943b5d80" + } + }, + { + "name": "Pigz", + "version": "2.3.3", + "uri": { + "uri": "http://springdale.math.ias.edu/data/puias/unsupported/7/SRPMS/pigz-2.3.3-1.sdl7.src.rpm", + "access_time": "2018-10-08T18:35:33-0400" + } + }, + { + "name": "Qualimap", + "version": "2.11", + "uri": { + "uri": "https://bitbucket.org/kokonech/qualimap/downloads/qualimap_v2.1.1.zip", + "access_time": "2018-10-08T18:35:33-0400" + } + }, + { + "name": "Samtools", + "version": "1.2", + "uri": { + "uri": "https://github.com/samtools/samtools/archive/1.2.zip", + "access_time": "2018-10-08T18:35:33-0400" + } + }, + { + "name": "SnpEff", + "version": "4.1", + "uri": { + "uri": "https://sourceforge.net/projects/snpeff/files/snpEff_v4_1l_core.zip/download", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "c96e21564b05d6a7912e4dd35f9ef6fe2e094fbb" + } + }, + { + "name": "Vcftools", + "version": "0.1.12b", + "uri": { + "uri": "https://sourceforge.net/projects/vcftools/files/vcftools_0.1.12.tar.gz/download", + "access_time": "2018-10-08T18:35:33-0400", + "sha1_chksum": "29a1ab67786e39be57cbb1ef4e0f6682110b7516" + } + } + ], + "external_data_endpoints": [ + { + "name": "BCOReSeqTB", + "url": "https://github.com/CPTR-ReSeqTB/UVP/" + } + ], + "environment_variables": { + "CORE": "8" + } + }, + "io_domain": { + "input_subdomain": [ + { + "uri": { + "filename": "Mycobacterium tuberculosis H37Rv, complete genome", + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.fa" + } + }, + { + "uri": { + "filename": "Mycobacterium tuberculosis H37Rv, complete genome", + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/NC_000962.gbk" + } + }, + { + "uri": { + "filename": "excluded_loci", + "uri": 
"http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/excluded_loci.txt" + } + }, + { + "uri": { + "filename": "lineage_markers", + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/lineage_markers.txt" + } + }, + { + "uri": { + "filename": "variation sites", + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_reference_files/snps.vcf" + } + }, + { + "uri": { + "filename": "ERR552106_2.fastq.gz", + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_2.fastq.gz" + } + }, + { + "uri": { + "filename": "ERR552106_1.fastq.gz", + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_input_fastq_files/ERR552106_1.fastq.gz" + } + } + ], + "output_subdomain": [ + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106.lineage_report.txt" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106.log" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_Coverage.txt" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_Final_annotation.txt" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK.vcf" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_GATK_filtered.vcf" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_Lineage.txt" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": "http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_deleted_loci.txt" + } + }, + { + "mediatype": "text/csv", + "uri": { + "uri": 
"http://bco.reseqtb.org/UVP-BCO/UVPv2.4.1_sample_results/ERR552106/ERR552106_genome_region_coverage.txt" + } + } + ] + }, + "error_domain": { + "empirical_error": { + "description": [ + "This test object represents tests done with single lineage sequences to establish the sensitivity of UVP to detect lineage and antibiotic resistant variants", + "Test objective was to evaluate the ability of UVP to identify strain lineage and antibiotic resistant variants from samples of high, medium, low sequence qualities and depths of coverage of 10, 15, 20, 25 and 30-fold. Simulated reads developed from 12 lineage-specific M. tuberculosis (Mtb) genome sequences were used to test UVP." + ], + "parameters": { + "sample_type": "single Mtb lineages (n = 12) with antibiotic resistant variants introduced in silico", + "total_sample_size": "180", + "platform": "Illumina HiSeq 2000", + "paired_end": true, + "length": "100", + "simulated": true, + "program": "ART", + "simulator_parameters": [ + { + "ss": "hs20" + }, + { + "l": "100" + }, + { + "m": "500" + }, + { + "qU": "45" + }, + { + "s": "100" + } + ], + "sequence_quality_level_parameters": { + "description": "these correspond to the ART parameters: qs, qs2, ir, ir2, dr, dr2.", + "sequence_quality_high": { + "substitution_error_rate_R1": "0.0004", + "substitution_error_rate_R2": "0.0007", + "insertion_error_rate_R1": "0.00009", + "insertion_error_rate_R2": "0.00015", + "deletion_error_rate_R1": "0.00011", + "deletion_error_rate_R2": "0.00023", + "units": "errors per sequenced base" + }, + "sequence_quality_medium": { + "substitution_error_rate_R1": "0.004", + "substitution_error_rate_R2": "0.007", + "insertion_error_rate_R1": "0.0009", + "insertion_error_rate_R2": "0.0015", + "deletion_error_rate_R1": "0.0011", + "deletion_error_rate_R2": "0.0023", + "units": "errors per sequenced base" + }, + "sequence_quality_low": { + "substitution_error_rate_R1": "0.04", + "substitution_error_rate_R2": "0.07", + "insertion_error_rate_R1": 
"0.009", + "insertion_error_rate_R2": "0.015", + "deletion_error_rate_R1": "0.011", + "deletion_error_rate_R2": "0.023", + "units": "errors per sequenced base" + } + } + }, + "summary results": { + "sequence_quality_high": { + "sample size": "60", + "result": { + "lineage_assignment_rate": "93.33", + "mean_AR_identification_rate": "86.72", + "Units": "Percentage" + } + }, + "sequence_quality_medium": { + "sample size": "60", + "result": { + "lineage_assignment_rate": "90.00", + "mean_AR_identification_rate": "81.00", + "Units": "Percentage" + } + }, + "sequence_quality_low": { + "sample size": "60", + "result": { + "lineage_assignment_rate": "0.00", + "mean_AR_identification_rate": "0.00", + "Units": "Percentage" + } + }, + "coverage_10": { + "sample size": "36", + "result": { + "lineage_assignment_rate": "41.67", + "mean_AR_identification_rate": "22.42", + "Units": "Percentage" + } + }, + "coverage_15": { + "sample size": "36", + "result": { + "lineage_assignment_rate": "63.89", + "mean_AR_identification_rate": "57.14", + "Units": "Percentage" + } + }, + "coverage_20": { + "sample size": "36", + "result": { + "lineage_assignment_rate": "66.67", + "mean_AR_identification_rate": "66.46", + "Units": "Percentage" + } + }, + "coverage_25": { + "sample size": "36", + "result": { + "lineage_assignment_rate": "66.67", + "mean_AR_identification_rate": "66.66", + "Units": "Percentage" + } + }, + "coverage_30": { + "sample size": "36", + "result": { + "lineage_assignment_rate": "66.67", + "mean_AR_identification_rate": "66.66", + "Units": "Percentage" + } + } + }, + "detailed results": [ + { + "sequence_quality_high": { + "coverage_10": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "66.67", + "mean_AR_identification_rate": "40.75", + "Units": "Percentage" + } + }, + "coverage_15": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "92.85", + "Units": "Percentage" + } + }, + "coverage_20": { + 
"sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "100.00", + "Units": "Percentage" + } + }, + "coverage_25": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "100.00", + "Units": "Percentage" + } + }, + "coverage_30": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "100.00", + "Units": "Percentage" + } + } + } + }, + { + "sequence_quality_medium": { + "coverage_10": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "58.34", + "mean_AR_identification_rate": "26.50", + "Units": "Percentage" + } + }, + "coverage_15": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "91.66", + "mean_AR_identification_rate": "78.57", + "Units": "Percentage" + } + }, + "coverage_20": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "99.40", + "Units": "Percentage" + } + }, + "coverage_25": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "100.00", + "Units": "Percentage" + } + }, + "coverage_30": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "100.00", + "mean_AR_identification_rate": "100.00", + "Units": "Percentage" + } + } + } + }, + { + "sequence_quality_low": { + "coverage_10": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "0.00", + "mean_AR_identification_rate": "0.00", + "Units": "Percentage" + } + }, + "coverage_15": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "0.00", + "mean_AR_identification_rate": "0.00", + "Units": "Percentage" + } + }, + "coverage_20": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "0.00", + "mean_AR_identification_rate": "0.00", + "Units": "Percentage" + } + }, + "coverage_25": { + "sample size": "12", + "result": { + 
"lineage_assignment_rate": "0.00", + "mean_AR_identification_rate": "0.00", + "Units": "Percentage" + } + }, + "coverage_30": { + "sample size": "12", + "result": { + "lineage_assignment_rate": "0.00", + "mean_AR_identification_rate": "0.00", + "Units": "Percentage" + } + } + } + } + ] + }, + "algorithmic_error": { + "placeholder": "for algorithmic error domain" + } + } +} \ No newline at end of file diff --git a/examples/glycosylation-sites-UniCarbKB.json b/examples/glycosylation-sites-UniCarbKB.json new file mode 100644 index 0000000..1e6ae5a --- /dev/null +++ b/examples/glycosylation-sites-UniCarbKB.json @@ -0,0 +1,260 @@ +{ + "bco_id": "https://w3id.org/biocompute/1.3.0/examples/glycosylation-sites-UniCarbKB", + "checksum": "D231C92C660CD1DD818D412E10F86F07338BA730FBE6898EF8F7DF1B1ECBFD3C", + "bco_spec_version" : "https://w3id.org/biocompute/1.3.0/", + "provenance_domain":{ + "name": "glycosylation-sites-UniCarbKB", + "version": "1.0", + "review":[ + { + "status": "approved", + "reviewer_comment": "The dataset has passed the manual and automated QC steps and the readme has also been reviewed", + "reviewer":{ + "name": "Rahi Navelkar", + "affiliation": "The George Washington University", + "email": "rsn13@gwu.edu", + "contribution":["curatedBy"] + } + } + ], + "created": "2018-02-21T14:46:55-05:00", + "modified": "2018-10-10T11:34:02-05:00", + "contributors":[ + { + "name": "Matthew Campbell", + "affiliation": "Institute for Glycomics, Griffith University, Gold Coast, Queensland, Australia", + "email": "m.campbell2@griffith.edu.au", + "contribution":["contributedBy"] + }, + { + "name": "Rahi Navelkar", + "affiliation": "The George Washington University", + "email": "rsn13@gwu.edu", + "contribution":["curatedBy"] + }, + { + "name": "Robel Kahsay", + "affiliation": "The George Washington University", + "email": "hadley_king@gwu.edu", + "contribution":["createdBy"] + } + ], + "license": "https://creativecommons.org/licenses/by/4.0/" + }, + "usability_domain":[ + 
"List of human [taxid:9606] proteins with information on glycosylation sites from UniCarbKB database [https://academic.oup.com/nar/article/42/D1/D215/1052197, https://doi.org/10.1093/nar/gkt1128]" + ], + "extension_domain":{ + "license":{ + "data_license": "https://creativecommons.org/licenses/by/4.0/", + "scripts_license": "https://www.gnu.org/licenses/gpl-3.0.en.html" + }, + "scm_extension":{ + "scm_repository": "https://github.com/GW-HIVE/glygen-backend-integration/", + "scm_type": "git", + "scm_commit": "d34b85553e775dd5452005d786fe6e47d6048ee0", + "scm_path": "/data/projects/glygen/generated/datasets/reviewed/human_proteoform_glycosylation_sites_unicarbkb_glytoucan.readme.txt" + } + }, + "description_domain":{ + "keywords":[ + "protein", + "canonical", + "glycosylation", + "glycan" + ], + "xref":[ + { + "namespace": "taxonomy", + "name": "Taxonomy", + "ids": ["9606"], + "access_time": "2018-02-21T14:46:55-05:00" + } + ], + "platform": ["centos7"], + "pipeline_steps":[ + { + "step_number":1, + "name": "ac2canonical.py", + "description": "Python script for mapping the UniProtKB accessions in the input file to the UniProtKB canonical accessions ", + "version": "", + "input_list":[ + { + "uri": "/human_protein_position_pmid_id_aminoacid_glytoucan_2018_09_04_07_51_27.txt" + } + ], + "output_list":[ + { + "uri": "human_protein_position_pmid_id_aminoacid_glytoucan_2018_09_04_07_51_27.txt" + } + ] + }, + { + "step_number":2, + "name": "make-proteoform_glycosylation_sites_unicarbkb_glytoucan-csv-step2b.py", + "description": "Python scripts for retrieving glycosylation type or linkage type through UniCarbKB structure webpage ", + + "input_list":[ + {"uri": "human_protein_position_pmid_id_aminoacid_glytoucan_2018_09_04_07_51_27.txt"} + ], + "output_list":[ + {"uri": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.csv"} + ] + }, + { + "step_number":2, + "name": "make-proteoform_glycosylation_sites_unicarbkb_glytoucan-csv-step2b.py", + "description": "Python 
scripts for retrieving glycosylation type or linkage type through UniCarbKB structure webpage ", + "input_list":[ + {"uri": "human_protein_position_pmid_id_aminoacid_glytoucan_2018_09_04_07_51_27.txt"} + ], + "output_list":[ + {"uri": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.csv"} + ] + }, + { + "step_number":3, + "name": "make-proteoform_glycosylation_sites_unicarbkb_glytoucan-csv-step3.py", + "description": "Python script for quality check of the processed file. Records that fall under one or more of the following criteria are flagged and eliminated and can be accessed using the log file. The elimination steps include - a. If the protein accession is not included in UniProtKB protein list - UniProtKB Nov-2017 Release b. If the amino acid position does not match to the amino acid on the associated position on fasta sequence - UniProtKB Nov-2017 Release c. If the id (UnicarbKB structure id) is not present in input file d. If the glycosylation type (linkage type) is not retrieved through step 3 e. If a serine or threonine is reported for an N-linked glycan structure f. 
If an asparagine is reported for an O-linked glycan structure", + "input_list":[ + {"uri": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.csv"}, + {"uri": "human_protein_all.fasta"} + ], + "output_list":[ + {"uri": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.csv"}, + {"uri": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.log"} + ] + } + ] + }, + "execution_domain":{ + "script":[ + { + "uri": { + "uri": "https://github.com/glygener/glygen-backend-integration/blob/master/integration/ac2canonical.py" + } + }, + { + "uri": { + "uri": "https://github.com/glygener/glygen-backend-integration/blob/master/integration/make-proteoform_glycosylation_sites_unicarbkb_glytoucan-csv-step2a.py" + } + }, + { + "uri": { + "uri": "https://github.com/glygener/glygen-backend-integration/blob/master/integration/make-proteoform_glycosylation_sites_unicarbkb_glytoucan-csv-step2b.py" + } + }, + { + "uri": { + "uri": "https://github.com/glygener/glygen-backend-integration/blob/master/integration/make-proteoform_glycosylation_sites_unicarbkb_glytoucan-csv-step3.py" + } + } + ], + "script_driver": "manual", + "software_prerequisites":[ + { + "name": "Python", + "version": "2.7.13", + "uri": { + "uri": "https://www.python.org/downloads/release/python-2713/", + "access_time": "2017-01-24T09:40:17-0500", + "sha1_chksum": "17add4bf0ad0ec2f08e0cae6d205c700" + } + } + ], + "external_data_endpoints": [ + { + "name": "UniCarbKB", + "url": "http://www.unicarbkb.org/" + }, + { + "name": "access glygen-backend-integration", + "url": "https://github.com/glygener/glygen-backend-integration" + } + ], + "environment_variables":{ + + } + }, + "io_domain":{ + "input_subdomain":[ + { + "uri":{ + "filename": "human_protein_position_pmid_id_aminoacid_glytoucan_2018_09_04_07_51_27.txt", + "uri": "http://data.glygen.org/datasets/source/human_protein_position_pmid_id_aminoacid_glytoucan_2018_09_04_07_51_27.txt", + "access_time": "2018-10-10T11:34:02-05:00" + } + }, + { + "uri":{ + 
"filename": "human_protein_all.fasta", + "uri": "http://data.glygen.org/GLYDS00053", + "access_time": "2018-10-10T11:34:02-05:00" + } + } + ], + "output_subdomain":[ + { + "mediatype": "text/csv", + "uri":{ + "filename": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.log", + "uri": "http://data.glygen.org/datasets/logs/human_proteoform_glycosylation_sites_unicarbkb_glytoucan.log", + "access_time": "2018-10-10T11:37:02-05:00" + } + }, + { + "mediatype": "text/csv", + "uri":{ + "filename": "human_proteoform_glycosylation_sites_unicarbkb_glytoucan.csv", + "uri": "http://data.glygen.org/GLYDS00040", + "access_time": "2018-10-10T11:37:02-05:00" + } + } + ] + }, + "error_domain":{ + "empirical_error":{ + "statistics":[ + { + "comment": "Unique value statistics for the dataset" + }, + { + "key": "uniprotkb_canonical_ac", + "value":92, + "description": "Accession assigned to the protein isoform chosen to be the canonical sequence in UniProtKB database" + }, + { + "key": "glycosylation_site", + "value":223, + "description": "Site on the protein sequence where glycosylation is observed" + }, + { + "key": "evidence", + "value":163, + "description": "NCBI PubMed Id (PMID) as evidence for the entry" + }, + { + "key": "unicarbkb_id", + "value":984, + "description": "UnicarbKB data structure identifier" + }, + { + "key": "glytoucan_ac", + "value":824, + "description": "Unique accession assigned to the registered glycan structure in GlyTouCan database" + }, + { + "key": "amino_acid", + "value":3, + "description": "Three letter code abbreviation of the amino acid" + }, + { + "key": "glycosylation_type", + "value":3, + "description": "Type of glycosylation [linkage type]" + } + ] + }, + "algorithmic_error":{} + } +}