From d58134fc484bfc80455e8f06540358ab8c8db630 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 14 Jun 2016 11:27:34 +0100 Subject: [PATCH 01/40] Initial version of JSON convertor --- tools/sphinx/protobuf-json-docs.py | 147 +++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100755 tools/sphinx/protobuf-json-docs.py diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py new file mode 100755 index 00000000..d9363064 --- /dev/null +++ b/tools/sphinx/protobuf-json-docs.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +import sys +import collections + +from google.protobuf.compiler import plugin_pb2 as plugin +import itertools +import json +from google.protobuf.descriptor_pb2 import DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto + +def convert_protodef_to_editable(proto): + class Editable(object): + def __init__(self, prot): + self.kind = type(prot) + self.name = prot.name + self.comment = "" + if isinstance(prot, EnumDescriptorProto): + self.value = [convert_protodef_to_editable(x) for x in prot.value] + elif isinstance(prot, DescriptorProto): + self.field = [convert_protodef_to_editable(x) for x in prot.field] + elif isinstance(prot, EnumValueDescriptorProto): + self.number = prot.number + elif isinstance(prot, FieldDescriptorProto): + self.type = prot.type + else: + raise Exception, type(prot) + + return Editable(proto) + +def traverse(proto_file): + + def _collapse_comments(comments): + return comments["leading_comments"] + comments["trailing_comments"] + + def _traverse(package, items, tree): + for item_index, item in enumerate(items): + item = convert_protodef_to_editable(item) + if item_index in tree: + comments = tree[item_index] + if "leading_comments" in comments or "trailing_comments" in comments: + item.comments = _collapse_comments(comments) + del comments["leading_comments"] + del comments["trailing_comments"] + if item.kind is EnumDescriptorProto: + if 2 in comments: # value in EnumDescriptorProto + for k in comments[2]: + value_comment = comments[2][k] + if value_comment != {}: + item.value[k].comment = _collapse_comments(value_comment) + elif item.kind is DescriptorProto: + if 2 in comments: # field in DescriptorProto + for k in comments[2]: + field_comment = comments[2][k] + if field_comment != {}: + item.field[k].comment = _collapse_comments(field_comment) + else: + raise Exception, item.kind + + yield item, package + + if isinstance(item, DescriptorProto): + for enum in item.enum_type: + yield enum, package + + for nested in item.nested_type: + nested_package = package + item.name + + for nested_item in _traverse(nested, nested_package): + yield nested_item, nested_package + + tree = collections.defaultdict(collections.defaultdict) + for loc in proto_file.source_code_info.location: + if loc.leading_comments or loc.trailing_comments: + place = tree + for p in loc.path: + if not place.has_key(p): + place[p] = collections.defaultdict(collections.defaultdict) + place = place[p] + place["leading_comments"] = loc.leading_comments + place["trailing_comments"] = loc.trailing_comments + + return itertools.chain( + _traverse(proto_file.package, proto_file.enum_type, tree[5]), # 5 is enum_type in FileDescriptorProto + _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is enum_type in FileDescriptorProto + ) + +def generate_code(request, response): + for proto_file in request.proto_file: + output = [] + + # Parse request + for item, package in traverse(proto_file): + data = { 
+ 'package': proto_file.package or '<root>', + 'filename': proto_file.name, + 'name': item.name, + 'doc': item.comment + } + + if item.kind == DescriptorProto: + data.update({ + 'type': 'Message', + 'properties': [{ + 'name': f.name, + 'type': int(f.type), + 'doc': f.comment + } + for f in item.field] + }) + + elif item.kind == EnumDescriptorProto: + data.update({ + 'type': 'Enum', + 'values': [{ + 'name': v.name, + 'value': v.number, + 'doc': v.comment} + for v in item.value] + }) + + output.append(data) + + # Fill response + f = response.file.add() + f.name = proto_file.name + '.json' + f.content = json.dumps(output, indent=2) + + +if __name__ == '__main__': + # Read request message from stdin + data = sys.stdin.read() + + # Parse request + request = plugin.CodeGeneratorRequest() + request.ParseFromString(data) + + # Create response + response = plugin.CodeGeneratorResponse() + + # Generate code + generate_code(request, response) + + # Serialise response message + output = response.SerializeToString() + + # Write to stdout + sys.stdout.write(output) From 4940ac8ee517f98ea95c25944a78c4fbca456ab1 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 14 Jun 2016 13:37:55 +0100 Subject: [PATCH 02/40] Mostly Avro-compliant JSON output from the Protobuf parser --- tools/sphinx/protobuf-json-docs.py | 89 +++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index d9363064..cc882ab9 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -30,7 +30,7 @@ def __init__(self, prot): def traverse(proto_file): def _collapse_comments(comments): - return comments["leading_comments"] + comments["trailing_comments"] + return (comments["leading_comments"] + comments["trailing_comments"]).strip() def _traverse(package, items, tree): for item_index, item in enumerate(items): @@ -38,7 +38,8 @@ def _traverse(package, items, tree): if item_index in tree: comments = tree[item_index] if "leading_comments" in comments or "trailing_comments" in comments: - item.comments = _collapse_comments(comments) + item.comment = _collapse_comments(comments) + #raise Exception, item.__dict__ del comments["leading_comments"] del comments["trailing_comments"] if item.kind is EnumDescriptorProto: @@ -68,9 +69,12 @@ def _traverse(package, items, tree): for nested_item in _traverse(nested, nested_package): yield nested_item, nested_package + import pprint + open("dump", "w").write(pprint.pformat(proto_file.source_code_info)) + tree = collections.defaultdict(collections.defaultdict) for loc in proto_file.source_code_info.location: - if loc.leading_comments or loc.trailing_comments: + if loc.leading_comments or loc.trailing_comments or loc.leading_detached_comments: place = tree for p in loc.path: if not place.has_key(p): @@ -78,47 +82,80 @@ def _traverse(package, items, tree): place = place[p] place["leading_comments"] = loc.leading_comments place["trailing_comments"] = loc.trailing_comments + place["leading_detached_comments"] = loc.leading_detached_comments + + if set(tree.keys()).difference(set([4,5,12])) != set(): + raise Exception, sorted(tree.keys()) - return itertools.chain( - _traverse(proto_file.package, proto_file.enum_type, tree[5]), # 5 is enum_type in FileDescriptorProto - _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is enum_type in FileDescriptorProto - ) + return {"types": + itertools.chain( + _traverse(proto_file.package, proto_file.enum_type, 
tree[5]), # 5 is enum_type in FileDescriptorProto + _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is enum_type in FileDescriptorProto + ), + "file": tree[12] + } def generate_code(request, response): for proto_file in request.proto_file: - output = [] + types = [] - # Parse request - for item, package in traverse(proto_file): + results = traverse(proto_file) + for item, package in results["types"]: data = { - 'package': proto_file.package or '<root>', - 'filename': proto_file.name, 'name': item.name, 'doc': item.comment } if item.kind == DescriptorProto: data.update({ - 'type': 'Message', - 'properties': [{ + 'type': 'message', + 'fields': [] + }) + for f in item.field: + if f.type in [1]: + kind = "double" + elif f.type in [3]: + kind = "long" + elif f.type in [5]: + kind = "integer" + elif f.type in [8]: + kind = "boolean" + elif f.type in [9]: + kind = "string" + elif f.type in [11]: + kind = "message" + elif f.type in [12]: + kind = "bytes" + elif f.type in [14]: + kind = "enum" + else: + raise Exception, f.type + data["fields"].append({ 'name': f.name, - 'type': int(f.type), + 'type': kind, 'doc': f.comment - } - for f in item.field] - }) + }) elif item.kind == EnumDescriptorProto: + comments = ["\n* `%s`: %s"%(v.name, v.comment) for v in item.value] data.update({ - 'type': 'Enum', - 'values': [{ - 'name': v.name, - 'value': v.number, - 'doc': v.comment} - for v in item.value] + 'type': 'enum', + 'symbols': [v.name for v in item.value] }) - - output.append(data) + data["doc"] += " ".join(comments) + + types.append(data) + + if results["file"].has_key("leading_detached_comments"): + comments = "".join(results["file"]["leading_detached_comments"]) + else: + comments = "" + output = { + "types": types, + "protocol": proto_file.name.split("/")[-1].split(".")[0], + 'doc': comments, + "namespace": proto_file.package, + } # Fill response f = response.file.add() From 8e1171eecc6d632802fe3c6680fbad3fc6a14d25 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 14 Jun 2016 13:49:44 +0100 Subject: [PATCH 03/40] Append all the leading_detached_comments to the file-related info --- tools/sphinx/protobuf-json-docs.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index cc882ab9..396aa3df 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -74,7 +74,7 @@ def _traverse(package, items, tree): tree = collections.defaultdict(collections.defaultdict) for loc in proto_file.source_code_info.location: - if loc.leading_comments or loc.trailing_comments or loc.leading_detached_comments: + if loc.leading_comments or loc.trailing_comments: place = tree for p in loc.path: if not place.has_key(p): @@ -82,9 +82,8 @@ def _traverse(package, items, tree): place = place[p] place["leading_comments"] = loc.leading_comments place["trailing_comments"] = loc.trailing_comments - place["leading_detached_comments"] = loc.leading_detached_comments - if set(tree.keys()).difference(set([4,5,12])) != set(): + if set(tree.keys()).difference(set([4,5])) != set(): raise Exception, sorted(tree.keys()) return {"types": @@ -92,7 +91,7 @@ def _traverse(package, items, tree): _traverse(proto_file.package, proto_file.enum_type, tree[5]), # 5 is enum_type in FileDescriptorProto _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is enum_type in FileDescriptorProto ), - "file": tree[12] + "file": ["".join(x.leading_detached_comments) for x in 
proto_file.source_code_info.location if len(x.leading_detached_comments) > 0] } def generate_code(request, response): @@ -146,10 +145,7 @@ def generate_code(request, response): types.append(data) - if results["file"].has_key("leading_detached_comments"): - comments = "".join(results["file"]["leading_detached_comments"]) - else: - comments = "" + comments = "".join(results["file"]).strip() output = { "types": types, "protocol": proto_file.name.split("/")[-1].split(".")[0], From 23ed0f8ca563915743aab33bad40055ae0615f1b Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 14 Jun 2016 14:02:14 +0100 Subject: [PATCH 04/40] Add service support --- tools/sphinx/protobuf-json-docs.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 396aa3df..0b435420 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -6,7 +6,7 @@ from google.protobuf.compiler import plugin_pb2 as plugin import itertools import json -from google.protobuf.descriptor_pb2 import DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto +from google.protobuf.descriptor_pb2 import DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto, ServiceDescriptorProto, MethodDescriptorProto def convert_protodef_to_editable(proto): class Editable(object): @@ -22,6 +22,11 @@ def __init__(self, prot): self.number = prot.number elif isinstance(prot, FieldDescriptorProto): self.type = prot.type + elif isinstance(prot, ServiceDescriptorProto): + self.method = [convert_protodef_to_editable(x) for x in prot.method] + elif isinstance(prot, MethodDescriptorProto): + self.input_type = prot.input_type + self.output_type = prot.output_type else: raise Exception, type(prot) @@ -54,6 +59,12 @@ def _traverse(package, items, tree): field_comment = comments[2][k] if field_comment != {}: item.field[k].comment = _collapse_comments(field_comment) + elif item.kind is ServiceDescriptorProto: + if 2 in comments: # method in ServiceDescriptorProto + for k in comments[2]: + method_comment = comments[2][k] + if method_comment != {}: + item.method[k].comment = _collapse_comments(method_comment) else: raise Exception, item.kind @@ -83,13 +94,14 @@ def _traverse(package, items, tree): place["leading_comments"] = loc.leading_comments place["trailing_comments"] = loc.trailing_comments - if set(tree.keys()).difference(set([4,5])) != set(): + if set(tree.keys()).difference(set([4,5,6])) != set(): raise Exception, sorted(tree.keys()) return {"types": itertools.chain( + _traverse(proto_file.package, proto_file.service, tree[6]), # 5 is enum_type in FileDescriptorProto _traverse(proto_file.package, proto_file.enum_type, tree[5]), # 5 is enum_type in FileDescriptorProto - _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is enum_type in FileDescriptorProto + _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is message_type in FileDescriptorProto ), "file": ["".join(x.leading_detached_comments) for x in proto_file.source_code_info.location if len(x.leading_detached_comments) > 0] } @@ -110,9 +122,11 @@ def generate_code(request, response): 'type': 'message', 'fields': [] }) - for f in item.field: + for f in item.field: # types from FieldDescriptorProto if f.type in [1]: kind = "double" + elif f.type in [2]: + kind = "float" elif f.type in [3]: kind = "long" elif f.type in [5]: @@ -142,6 +156,13 @@ def generate_code(request, 
response): 'symbols': [v.name for v in item.value] }) data["doc"] += " ".join(comments) + elif item.kind == ServiceDescriptorProto: + data.update({ + 'type': 'service', + 'methods': [{"name": m.name, "input": m.input_type[1:], "output": m.output_type[1:]} for m in item.method] + }) + else: + raise Exception, item.kind types.append(data) From 509807fdbf6725aa23bae875ea0507689d1fb321 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 14 Jun 2016 15:05:45 +0100 Subject: [PATCH 05/40] Append "messages" separately rather than merging with types --- tools/sphinx/protobuf-json-docs.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 0b435420..2974d1d9 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -109,6 +109,7 @@ def _traverse(package, items, tree): def generate_code(request, response): for proto_file in request.proto_file: types = [] + messages = {} results = traverse(proto_file) for item, package in results["types"]: @@ -148,7 +149,7 @@ def generate_code(request, response): 'type': kind, 'doc': f.comment }) - + types.append(data) elif item.kind == EnumDescriptorProto: comments = ["\n* `%s`: %s"%(v.name, v.comment) for v in item.value] data.update({ @@ -156,19 +157,26 @@ def generate_code(request, response): 'symbols': [v.name for v in item.value] }) data["doc"] += " ".join(comments) + types.append(data) elif item.kind == ServiceDescriptorProto: - data.update({ - 'type': 'service', - 'methods': [{"name": m.name, "input": m.input_type[1:], "output": m.output_type[1:]} for m in item.method] - }) + for m in item.method: + messages[m.name] = { + "doc": m.comment, + "request": { + "name": "request", + "type": m.input_type[1:], + }, + "response": m.output_type[1:], + "errors" : [ "GAException" ] + } else: raise Exception, item.kind - types.append(data) comments = "".join(results["file"]).strip() output = { "types": types, + "messages": messages, "protocol": proto_file.name.split("/")[-1].split(".")[0], 'doc': comments, "namespace": proto_file.package, From 8c642cbced29e15bfa11884e36a6f421a8769987 Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Tue, 14 Jun 2016 16:44:28 +0100 Subject: [PATCH 06/40] Update the Makefile to use the protoc json plugin instead of avro-tools. 
Remove generated *.rst files from git --- .gitignore | 3 +- doc/source/schemas/Makefile | 38 +- .../schemas/alleleAnnotationmethods.rst | 688 ------------------ doc/source/schemas/alleleAnnotations.rst | 561 -------------- doc/source/schemas/common.rst | 107 --- doc/source/schemas/index.rst | 27 +- doc/source/schemas/metadata.rst | 243 ------- doc/source/schemas/metadatamethods.rst | 237 ------ doc/source/schemas/methods.rst | 7 - doc/source/schemas/readmethods.rst | 612 ---------------- doc/source/schemas/reads.rst | 465 ------------ doc/source/schemas/referencemethods.rst | 379 ---------- doc/source/schemas/references.rst | 199 ----- .../schemas/sequenceAnnotationmethods.rst | 457 ------------ doc/source/schemas/sequenceAnnotations.rst | 342 --------- doc/source/schemas/variantmethods.rst | 475 ------------ doc/source/schemas/variants.rst | 297 -------- tools/sphinx/avpr2rest.py | 23 +- tools/sphinx/avrodomain.py | 12 +- tools/sphinx/protobuf-json-docs.py | 4 +- 20 files changed, 48 insertions(+), 5128 deletions(-) delete mode 100644 doc/source/schemas/alleleAnnotationmethods.rst delete mode 100644 doc/source/schemas/alleleAnnotations.rst delete mode 100644 doc/source/schemas/common.rst delete mode 100644 doc/source/schemas/metadata.rst delete mode 100644 doc/source/schemas/metadatamethods.rst delete mode 100644 doc/source/schemas/methods.rst delete mode 100644 doc/source/schemas/readmethods.rst delete mode 100644 doc/source/schemas/reads.rst delete mode 100644 doc/source/schemas/referencemethods.rst delete mode 100644 doc/source/schemas/references.rst delete mode 100644 doc/source/schemas/sequenceAnnotationmethods.rst delete mode 100644 doc/source/schemas/sequenceAnnotations.rst delete mode 100644 doc/source/schemas/variantmethods.rst delete mode 100644 doc/source/schemas/variants.rst diff --git a/.gitignore b/.gitignore index fded6d94..979ea845 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ target *~ #* -doc/source/schemas/*.avpr +doc/source/schemas/*.proto.rst build #********** windows template********** @@ -73,3 +73,4 @@ target/ #********** IntelliJ files ****** *.iml + diff --git a/doc/source/schemas/Makefile b/doc/source/schemas/Makefile index e81bb4fe..1e06308a 100644 --- a/doc/source/schemas/Makefile +++ b/doc/source/schemas/Makefile @@ -1,10 +1,10 @@ -# avdl-to-rst Makefile +# proto-to-rst Makefile # -# GA4GH schema docs are generated from avdl comments. The process is +# GA4GH schema docs are generated from proto comments. The process is # coordinated by this Makefile in these steps: -# * All .avdl files are converted to .avpr using the avro-tools -# package, which is downloaded if needed. -# * All .avpr files are converted to .rst using a python script in +# * All .proto files are converted to .json using the +# protoc json plugin `tools/sphinx/my-plugin.py`. +# * All .json files are converted to .rst using a python script in # schemas/tools. 
@@ -13,30 +13,26 @@ .DELETE_ON_ERROR: CACHE_DIR:=${HOME}/.cache/ga4gh -AVPR_DIR:=/tmp/ga4gh-${UID}/avpr -AVDL_DIR:=../../../src/main/resources/avro +JSON_DIR:=/tmp/ga4gh-${UID}/json +PROTO_BASE_DIR:=../../../src/main/proto +PROTO_DIR:=${PROTO_BASE_DIR}/ga4gh AVPR2REST_PATH:=../../../tools/sphinx/avpr2rest.py -AVRO_JAR_PATH:=${CACHE_DIR}/avro-tools.jar -AVRO_JAR_URL:=http://www.us.apache.org/dist/avro/avro-1.7.7/java/avro-tools-1.7.7.jar +PROTOC_PLUGIN_PATH:=../../../tools/sphinx/protobuf-json-docs.py -AVDL_BASENAMES:=$(subst ${AVDL_DIR}/,,$(wildcard ${AVDL_DIR}/*.avdl)) -AVPR_BASENAMES:=${AVDL_BASENAMES:.avdl=.avpr} -RST_BASENAMES:=${AVDL_BASENAMES:.avdl=.rst} +PROTO_BASENAMES:=$(subst ${PROTO_DIR}/,,$(wildcard ${PROTO_DIR}/*.proto)) +AVPR_BASENAMES:=${PROTO_BASENAMES:.proto=.proto.json} +RST_BASENAMES:=${PROTO_BASENAMES:.proto=.rst} default: ${RST_BASENAMES} +%.proto.json: ${PROTO_DIR}/%.proto + mkdir -p ${JSON_DIR} + protoc --proto_path ${PROTO_BASE_DIR} --plugin=protoc-gen-custom=${PROTOC_PLUGIN_PATH} --custom_out=${JSON_DIR} $< -${AVRO_JAR_PATH}: - mkdir -p ${@D} - curl -o $@ ${AVRO_JAR_URL} - -%.avpr: ${AVDL_DIR}/%.avdl ${AVRO_JAR_PATH} - java -jar ${AVRO_JAR_PATH} idl $< $@ - -%.rst: %.avpr - python ${AVPR2REST_PATH} $< . +%.rst: %.proto.json + python ${AVPR2REST_PATH} ${JSON_DIR}/ga4gh/$< . .PHONY: clean cleaner cleanest clean: diff --git a/doc/source/schemas/alleleAnnotationmethods.rst b/doc/source/schemas/alleleAnnotationmethods.rst deleted file mode 100644 index 67192528..00000000 --- a/doc/source/schemas/alleleAnnotationmethods.rst +++ /dev/null @@ -1,688 +0,0 @@ -AlleleAnnotationMethods -*********************** - - .. function:: searchVariantAnnotationSets(request) - - :param request: SearchVariantAnnotationSetsRequest: This request maps to the body of `POST /variantannotationsets/search` as JSON - :return type: SearchVariantAnnotationSetsResponse - :throws: GAException - -Returns a list of available variant annotation sets -`POST /variantannotationsets/search` must accept a JSON version of -`SearchVariantAnnotationSetsRequest` as the post body and will return a JSON -version of `SearchVariantAnnotationSetsResponse`. - - .. function:: searchVariantAnnotations(request) - - :param request: SearchVariantAnnotationsRequest: This request maps to the body of `POST /variantannotations/search` as JSON. - :return type: SearchVariantAnnotationsResponse - :throws: GAException - -Gets a list of `VariantAnnotations` matching the search criteria. - -`POST /variantannotations/search` must accept a JSON version of -`SearchVariantAnnotationsRequest` as the post body and will return a -JSON version of `SearchVariantAnnotationsResponse`. - - .. function:: getVariantAnnotationSet(id) - - :param id: string: The ID of the `VariantAnnotationSet`. - :return type: org.ga4gh.models.VariantAnnotationSet - :throws: GAException - -Gets an `VariantAnnotationSet` by ID. -`GET /variantannotationsets/{id}` will return a JSON version of -`VariantAnnotationSet`. - -.. avro:error:: GAException - - A general exception type. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. 
- Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. 
- :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: VariantSetMetadata - - :field key: - The top-level key. - :type key: string - :field value: - The value field for simple metadata. - :type value: string - :field id: - User-provided ID field, not enforced by this API. - Two or more pieces of structured metadata with identical - id and key fields are considered equivalent. - `FIXME: If it's not enforced, then why can't it be null?` - :type id: string - :field type: - The type of data. - :type type: string - :field number: - The number of values that can be included in a field described by this - metadata. - :type number: string - :field description: - A textual description of this metadata. - :type description: string - :field info: - Remaining structured metadata key-value pairs. - :type info: map> - - Optional metadata associated with a variant set. - -.. avro:record:: VariantSet - - :field id: - The variant set ID. - :type id: string - :field name: - The variant set name. - :type name: null|string - :field datasetId: - The ID of the dataset this variant set belongs to. - :type datasetId: string - :field referenceSetId: - The ID of the reference set that describes the sequences used by the variants in this set. - :type referenceSetId: string - :field metadata: - Optional metadata associated with this variant set. - This array can be used to store information about the variant set, such as information found - in VCF header fields, that isn't already available in first class fields such as "name". - :type metadata: array - - A VariantSet is a collection of variants and variant calls intended to be analyzed together. - -.. avro:record:: CallSet - - :field id: - The call set ID. - :type id: string - :field name: - The call set name. - :type name: null|string - :field sampleId: - The sample this call set's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the sampleId - field in the read groups used to generate this call set. - :type sampleId: null|string - :field variantSetIds: - The IDs of the variant sets this call set has calls in. - :type variantSetIds: array - :field created: - The date this call set was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this call set was last updated in - milliseconds from the epoch. - :type updated: null|long - :field info: - A map of additional call set information. - :type info: map> - - A CallSet is a collection of calls that were generated by the same analysis of the same sample. - -.. avro:record:: Call - - :field callSetName: - The name of the call set this variant call belongs to. - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. 
- :type callSetName: null|string - :field callSetId: - The ID of the call set this variant call belongs to. - - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - :type callSetId: null|string - :field genotype: - The genotype of this variant call. - - A 0 value represents the reference allele of the associated `Variant`. Any - other value is a 1-based index into the alternate alleles of the associated - `Variant`. - - If a variant had a referenceBases field of "T", an alternateBases - value of ["A", "C"], and the genotype was [2, 1], that would mean the call - represented the heterozygous value "CA" for this variant. If the genotype - was instead [0, 1] the represented value would be "TA". Ordering of the - genotype values is important if the phaseset field is present. - :type genotype: array - :field phaseset: - If this field is not null, this variant call's genotype ordering implies - the phase of the bases and is consistent with any other variant calls on - the same contig which have the same phaseset string. - :type phaseset: null|string - :field genotypeLikelihood: - The genotype likelihoods for this variant call. Each array entry - represents how likely a specific genotype is for this call as - log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The - value ordering is defined by the GL tag in the VCF spec. - :type genotypeLikelihood: array - :field info: - A map of additional variant call information. - :type info: map> - - A `Call` represents the determination of genotype with respect to a - particular `Variant`. - - It may include associated information such as quality - and phasing. For example, a call might assign a probability of 0.32 to - the occurrence of a SNP named rs1234 in a call set with the name NA12345. - -.. avro:record:: Variant - - :field id: - The variant ID. - :type id: string - :field variantSetId: - The ID of the `VariantSet` this variant belongs to. This transitively defines - the `ReferenceSet` against which the `Variant` is to be interpreted. - :type variantSetId: string - :field names: - Names for the variant, for example a RefSNP ID. - :type names: array - :field created: - The date this variant was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this variant was last updated in - milliseconds from the epoch. - :type updated: null|long - :field referenceName: - The reference on which this variant occurs. - (e.g. `chr20` or `X`) - :type referenceName: string - :field start: - The start position at which this variant occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Variants spanning the join of circular genomes are represented as - two variants one on each side of the join (position 0). - :type start: long - :field end: - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - :type end: long - :field referenceBases: - The reference bases for this variant. They start at the given start position. - :type referenceBases: string - :field alternateBases: - The bases that appear instead of the reference bases. Multiple alternate - alleles are possible. 
- :type alternateBases: array - :field info: - A map of additional variant information. - :type info: map> - :field calls: - The variant calls for this particular variant. Each one represents the - determination of genotype with respect to this variant. `Call`s in this array - are implicitly associated with this `Variant`. - :type calls: array - - A `Variant` represents a change in DNA sequence relative to some reference. - For example, a variant could represent a SNP or an insertion. - Variants belong to a `VariantSet`. - This is equivalent to a row in VCF. - -.. avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. - :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. 
- :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - -.. avro:record:: AnalysisResult - - :field analysisId: - The ID of the analysis record for this result - :type analysisId: string - :field result: - The text-based result for this analysis - :type result: null|string - :field score: - The numeric score for this analysis - :type score: null|int - - An AnalysisResult record holds the output of a prediction package such - as SIFT on a specific allele. - -.. avro:record:: AlleleLocation - - :field start: - Relative start position of the allele in this coordinate system - :type start: int - :field end: - Relative end position of the allele in this coordinate system - :type end: null|int - :field referenceSequence: - Reference sequence in feature (this should be the codon at CDS level) - :type referenceSequence: null|string - :field alternateSequence: - Alternate sequence in feature (this should be the codon at CDS level) - :type alternateSequence: null|string - - An allele location record holds the location of an allele relative to a - non-genomic coordinate system such as a CDS or protein and holds the - reference and alternate sequence where appropriate - -.. avro:record:: VariantAnnotationSet - - :field id: - The ID of the variant annotation set record - :type id: string - :field variantSetId: - The ID of the variant set to which this annotation set belongs - :type variantSetId: string - :field name: - The variant annotation set name. - :type name: null|string - :field analysis: - Analysis details. It is essential to supply versions for all software and - reference data used. - :type analysis: Analysis - - A VariantAnnotationSet record groups VariantAnnotation records. 
It is derived - from a VariantSet and holds information describing the software and reference - data used in the annotation. - -.. avro:record:: HGVSAnnotation - - :field genomic: - :type genomic: null|string - :field transcript: - :type transcript: null|string - :field protein: - :type protein: null|string - - A HGVSAnnotation record holds Human Genome Variation Society descriptions - of the sequence change with respect to genomic, transcript and protein - sequences. See: http://www.hgvs.org/mutnomen/recs.html. - Descriptions should be provided at genomic level. Descriptions at transcript - level should be provided when the allele lies within a transcript. Descriptions - at protein level should be provided when the allele lies within the translated - sequence or stop codon. - -.. avro:record:: TranscriptEffect - - :field id: - The ID of the transcript effect record - :type id: string - :field featureId: - The id of the transcript feature the annotation is relative to - :type featureId: string - :field alternateBases: - Alternate allele - a variant may have more than one alternate allele, - each of which will have distinct annotation. - :type alternateBases: null|string - :field effects: - Effect of variant on this feature - :type effects: array - :field hgvsAnnotation: - Human Genome Variation Society variant descriptions - :type hgvsAnnotation: HGVSAnnotation - :field cDNALocation: - Change relative to cDNA - :type cDNALocation: null|AlleleLocation - :field CDSLocation: - :type CDSLocation: null|AlleleLocation - :field proteinLocation: - Change relative to protein - :type proteinLocation: null|AlleleLocation - :field analysisResults: - Output from prediction packages such as SIFT - :type analysisResults: array - - A transcript effect record is a set of information describing the - effect of an allele on a transcript - -.. avro:record:: VariantAnnotation - - :field id: - The ID of this VariantAnnotation. - :type id: string - :field variantId: - The variant ID. - :type variantId: string - :field variantAnnotationSetId: - The ID of the variant annotation set this record belongs to. - :type variantAnnotationSetId: string - :field createDateTime: - The :ref:`ISO 8601 ` time at which this record was created. - :type createDateTime: null|string - :field transcriptEffects: - The transcript effect annotation for the alleles of this variant. Each one - represents the effect of a single allele on a single transcript. - :type transcriptEffects: array - :field info: - Additional annotation data in key-value pairs. - :type info: map> - - A `VariantAnnotation` record represents the result of comparing a variant - to a set of reference data. - -.. avro:record:: SearchVariantAnnotationsRequest - - :field variantAnnotationSetId: - Required. The ID of the variant annotation set to search over. - :type variantAnnotationSetId: string - :field referenceName: - Only return variants with reference alleles on the reference with this - name. One of this field or `referenceId` or `features` is required. - (case-sensitive, exact match) - :type referenceName: null|string - :field referenceId: - Only return variants with reference alleles on the reference with this - ID. One of this field or `referenceName` or `features` is required. - :type referenceId: null|string - :field start: - Required if referenceName or referenceId supplied. - The beginning of the window (0-based, inclusive) for which variants with - overlapping reference alleles should be returned. 
- Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - :type start: long - :field end: - Required if referenceName or referenceId supplied. - The end of the window (0-based, exclusive) for which variants with - overlapping reference alleles should be returned. - :type end: long - :field effects: - This filter allows variant, transcript combinations to be extracted by effect - type(s). - Only return variant annotations including any of these effects and only return - transcript effects including any of these effects. Exact matching across all - fields of the Sequence Ontology OntologyTerm is required. - (A transcript effect may have multiple SO effects which will all be reported.) - If null, return all variant annotations. - :type effects: null|array - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /variantannotations/search` as JSON - -.. avro:record:: SearchVariantAnnotationsResponse - - :field variantAnnotations: - The list of matching variant annotations. - :type variantAnnotations: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /variantannotations/search` expressed as JSON. - -.. avro:record:: SearchVariantAnnotationSetsRequest - - :field variantSetId: - Required. The `VariantSet` to search. - :type variantSetId: string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /variantannotationsets/search` as JSON - -.. avro:record:: SearchVariantAnnotationSetsResponse - - :field variantAnnotationSets: - The list of matching variant annotation sets. - :type variantAnnotationSets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /variantannotationsets/search` expressed - as JSON. - diff --git a/doc/source/schemas/alleleAnnotations.rst b/doc/source/schemas/alleleAnnotations.rst deleted file mode 100644 index c821c98c..00000000 --- a/doc/source/schemas/alleleAnnotations.rst +++ /dev/null @@ -1,561 +0,0 @@ -AlleleAnnotations -***************** - -This protocol defines types used by the GA4GH Allele Annotation API. - -.. 
avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. 
- * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. - :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. 
- :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. - :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - -.. avro:record:: VariantSetMetadata - - :field key: - The top-level key. - :type key: string - :field value: - The value field for simple metadata. - :type value: string - :field id: - User-provided ID field, not enforced by this API. - Two or more pieces of structured metadata with identical - id and key fields are considered equivalent. - `FIXME: If it's not enforced, then why can't it be null?` - :type id: string - :field type: - The type of data. - :type type: string - :field number: - The number of values that can be included in a field described by this - metadata. - :type number: string - :field description: - A textual description of this metadata. - :type description: string - :field info: - Remaining structured metadata key-value pairs. - :type info: map> - - Optional metadata associated with a variant set. - -.. avro:record:: VariantSet - - :field id: - The variant set ID. - :type id: string - :field name: - The variant set name. - :type name: null|string - :field datasetId: - The ID of the dataset this variant set belongs to. - :type datasetId: string - :field referenceSetId: - The ID of the reference set that describes the sequences used by the variants in this set. - :type referenceSetId: string - :field metadata: - Optional metadata associated with this variant set. 
- This array can be used to store information about the variant set, such as information found - in VCF header fields, that isn't already available in first class fields such as "name". - :type metadata: array - - A VariantSet is a collection of variants and variant calls intended to be analyzed together. - -.. avro:record:: CallSet - - :field id: - The call set ID. - :type id: string - :field name: - The call set name. - :type name: null|string - :field sampleId: - The sample this call set's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the sampleId - field in the read groups used to generate this call set. - :type sampleId: null|string - :field variantSetIds: - The IDs of the variant sets this call set has calls in. - :type variantSetIds: array - :field created: - The date this call set was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this call set was last updated in - milliseconds from the epoch. - :type updated: null|long - :field info: - A map of additional call set information. - :type info: map> - - A CallSet is a collection of calls that were generated by the same analysis of the same sample. - -.. avro:record:: Call - - :field callSetName: - The name of the call set this variant call belongs to. - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - :type callSetName: null|string - :field callSetId: - The ID of the call set this variant call belongs to. - - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - :type callSetId: null|string - :field genotype: - The genotype of this variant call. - - A 0 value represents the reference allele of the associated `Variant`. Any - other value is a 1-based index into the alternate alleles of the associated - `Variant`. - - If a variant had a referenceBases field of "T", an alternateBases - value of ["A", "C"], and the genotype was [2, 1], that would mean the call - represented the heterozygous value "CA" for this variant. If the genotype - was instead [0, 1] the represented value would be "TA". Ordering of the - genotype values is important if the phaseset field is present. - :type genotype: array - :field phaseset: - If this field is not null, this variant call's genotype ordering implies - the phase of the bases and is consistent with any other variant calls on - the same contig which have the same phaseset string. - :type phaseset: null|string - :field genotypeLikelihood: - The genotype likelihoods for this variant call. Each array entry - represents how likely a specific genotype is for this call as - log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The - value ordering is defined by the GL tag in the VCF spec. - :type genotypeLikelihood: array - :field info: - A map of additional variant call information. - :type info: map> - - A `Call` represents the determination of genotype with respect to a - particular `Variant`. - - It may include associated information such as quality - and phasing. 
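To make the genotype encoding described above concrete, here is a minimal sketch in plain Python (not part of the schema) that expands a call's genotype indices into allele strings; it reproduces the worked example from the `genotype` field documentation::

    def genotype_to_alleles(reference_bases, alternate_bases, genotype):
        # 0 denotes the reference allele; any other value is a 1-based
        # index into the associated Variant's alternate alleles.
        alleles = [reference_bases] + list(alternate_bases)
        return [alleles[g] for g in genotype]

    # The example from the field description:
    assert genotype_to_alleles("T", ["A", "C"], [2, 1]) == ["C", "A"]  # "CA"
    assert genotype_to_alleles("T", ["A", "C"], [0, 1]) == ["T", "A"]  # "TA"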
For example, a call might assign a probability of 0.32 to - the occurrence of a SNP named rs1234 in a call set with the name NA12345. - -.. avro:record:: Variant - - :field id: - The variant ID. - :type id: string - :field variantSetId: - The ID of the `VariantSet` this variant belongs to. This transitively defines - the `ReferenceSet` against which the `Variant` is to be interpreted. - :type variantSetId: string - :field names: - Names for the variant, for example a RefSNP ID. - :type names: array - :field created: - The date this variant was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this variant was last updated in - milliseconds from the epoch. - :type updated: null|long - :field referenceName: - The reference on which this variant occurs. - (e.g. `chr20` or `X`) - :type referenceName: string - :field start: - The start position at which this variant occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Variants spanning the join of circular genomes are represented as - two variants one on each side of the join (position 0). - :type start: long - :field end: - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - :type end: long - :field referenceBases: - The reference bases for this variant. They start at the given start position. - :type referenceBases: string - :field alternateBases: - The bases that appear instead of the reference bases. Multiple alternate - alleles are possible. - :type alternateBases: array - :field info: - A map of additional variant information. - :type info: map> - :field calls: - The variant calls for this particular variant. Each one represents the - determination of genotype with respect to this variant. `Call`s in this array - are implicitly associated with this `Variant`. - :type calls: array - - A `Variant` represents a change in DNA sequence relative to some reference. - For example, a variant could represent a SNP or an insertion. - Variants belong to a `VariantSet`. - This is equivalent to a row in VCF. - -.. avro:record:: AnalysisResult - - :field analysisId: - The ID of the analysis record for this result - :type analysisId: string - :field result: - The text-based result for this analysis - :type result: null|string - :field score: - The numeric score for this analysis - :type score: null|int - - An AnalysisResult record holds the output of a prediction package such - as SIFT on a specific allele. - -.. avro:record:: AlleleLocation - - :field start: - Relative start position of the allele in this coordinate system - :type start: int - :field end: - Relative end position of the allele in this coordinate system - :type end: null|int - :field referenceSequence: - Reference sequence in feature (this should be the codon at CDS level) - :type referenceSequence: null|string - :field alternateSequence: - Alternate sequence in feature (this should be the codon at CDS level) - :type alternateSequence: null|string - - An allele location record holds the location of an allele relative to a - non-genomic coordinate system such as a CDS or protein and holds the - reference and alternate sequence where appropriate - -.. 
avro:record:: VariantAnnotationSet - - :field id: - The ID of the variant annotation set record - :type id: string - :field variantSetId: - The ID of the variant set to which this annotation set belongs - :type variantSetId: string - :field name: - The variant annotation set name. - :type name: null|string - :field analysis: - Analysis details. It is essential to supply versions for all software and - reference data used. - :type analysis: Analysis - - A VariantAnnotationSet record groups VariantAnnotation records. It is derived - from a VariantSet and holds information describing the software and reference - data used in the annotation. - -.. avro:record:: HGVSAnnotation - - :field genomic: - :type genomic: null|string - :field transcript: - :type transcript: null|string - :field protein: - :type protein: null|string - - A HGVSAnnotation record holds Human Genome Variation Society descriptions - of the sequence change with respect to genomic, transcript and protein - sequences. See: http://www.hgvs.org/mutnomen/recs.html. - Descriptions should be provided at genomic level. Descriptions at transcript - level should be provided when the allele lies within a transcript. Descriptions - at protein level should be provided when the allele lies within the translated - sequence or stop codon. - -.. avro:record:: TranscriptEffect - - :field id: - The ID of the transcript effect record - :type id: string - :field featureId: - The id of the transcript feature the annotation is relative to - :type featureId: string - :field alternateBases: - Alternate allele - a variant may have more than one alternate allele, - each of which will have distinct annotation. - :type alternateBases: null|string - :field effects: - Effect of variant on this feature - :type effects: array - :field hgvsAnnotation: - Human Genome Variation Society variant descriptions - :type hgvsAnnotation: HGVSAnnotation - :field cDNALocation: - Change relative to cDNA - :type cDNALocation: null|AlleleLocation - :field CDSLocation: - :type CDSLocation: null|AlleleLocation - :field proteinLocation: - Change relative to protein - :type proteinLocation: null|AlleleLocation - :field analysisResults: - Output from prediction packages such as SIFT - :type analysisResults: array - - A transcript effect record is a set of information describing the - effect of an allele on a transcript - -.. avro:record:: VariantAnnotation - - :field id: - The ID of this VariantAnnotation. - :type id: string - :field variantId: - The variant ID. - :type variantId: string - :field variantAnnotationSetId: - The ID of the variant annotation set this record belongs to. - :type variantAnnotationSetId: string - :field createDateTime: - The :ref:`ISO 8601 ` time at which this record was created. - :type createDateTime: null|string - :field transcriptEffects: - The transcript effect annotation for the alleles of this variant. Each one - represents the effect of a single allele on a single transcript. - :type transcriptEffects: array - :field info: - Additional annotation data in key-value pairs. - :type info: map> - - A `VariantAnnotation` record represents the result of comparing a variant - to a set of reference data. - diff --git a/doc/source/schemas/common.rst b/doc/source/schemas/common.rst deleted file mode 100644 index 99605783..00000000 --- a/doc/source/schemas/common.rst +++ /dev/null @@ -1,107 +0,0 @@ -Common -****** - -This file defines common types used in other parts of the schema. -There are no directly associated methods. - -.. 
avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. 
- * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - diff --git a/doc/source/schemas/index.rst b/doc/source/schemas/index.rst index c343b0b7..f1640056 100644 --- a/doc/source/schemas/index.rst +++ b/doc/source/schemas/index.rst @@ -4,17 +4,16 @@ Schemas !!!!!!! .. toctree:: - common - metadata - metadatamethods - methods - readmethods - reads - referencemethods - references - variantmethods - variants - alleleAnnotationmethods - alleleAnnotations - sequenceAnnotations - sequenceAnnotationmethods + common.proto.rst + metadata.proto.rst + metadata_service.proto.rst + reads.proto.rst + read_service.proto.rst + references.proto.rst + reference_service.proto.rst + variants.proto.rst + variant_service.proto.rst + allele_annotations.proto.rst + allele_annotation_service.proto.rst + sequence_annotations.proto.rst + sequence_annotation_service.proto.rst diff --git a/doc/source/schemas/metadata.rst b/doc/source/schemas/metadata.rst deleted file mode 100644 index 7694731f..00000000 --- a/doc/source/schemas/metadata.rst +++ /dev/null @@ -1,243 +0,0 @@ -Metadata -******** - -This protocol defines metadata used in the other GA4GH protocols. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. 
The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. 
- :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. - :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. 
- :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - diff --git a/doc/source/schemas/metadatamethods.rst b/doc/source/schemas/metadatamethods.rst deleted file mode 100644 index 703c69cb..00000000 --- a/doc/source/schemas/metadatamethods.rst +++ /dev/null @@ -1,237 +0,0 @@ -MetadataMethods -*************** - - .. function:: searchDatasets(request) - - :param request: SearchDatasetsRequest: This request maps to the body of `POST /datasets/search` as JSON. - :return type: SearchDatasetsResponse - :throws: GAException - -Gets a list of datasets accessible through the API. - -TODO: Reads and variants both want to have datasets. Are they the same object? - -`POST /datasets/search` must accept a JSON version of -`SearchDatasetsRequest` as the post body and will return a JSON version -of `SearchDatasetsResponse`. - - .. function:: getDataset(id) - - :param id: string: The ID of the `Dataset`. - :return type: org.ga4gh.models.Dataset - :throws: GAException - -Gets a `Dataset` by ID. -`GET /datasets/{id}` will return a JSON version of `Dataset`. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. 
Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field recordCreateTime: - The time at which this record was created. - Format: ISO 8601, YYYY-MM-DDTHH:MM:SS.SSS (e.g. 2015-02-10T00:03:42.123Z) - :type recordCreateTime: string - :field recordUpdateTime: - The time at which this record was last updated. - Format: ISO 8601, YYYY-MM-DDTHH:MM:SS.SSS (e.g. 2015-02-10T00:03:42.123Z) - :type recordUpdateTime: string - :field runTime: - The time at which this experiment was performed. 
- Granularity here is variable (e.g. date only). - Format: ISO 8601, YYYY-MM-DDTHH:MM:SS (e.g. 2015-02-10T00:03:42) - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. - :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:error:: GAException - - A general exception type. - -.. avro:record:: SearchDatasetsRequest - - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /datasets/search` as JSON. - -.. avro:record:: SearchDatasetsResponse - - :field datasets: - The list of datasets. - :type datasets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /datasets/search` expressed as JSON. - diff --git a/doc/source/schemas/methods.rst b/doc/source/schemas/methods.rst deleted file mode 100644 index e02a4cb3..00000000 --- a/doc/source/schemas/methods.rst +++ /dev/null @@ -1,7 +0,0 @@ -RPC -*** - -.. avro:error:: GAException - - A general exception type. 
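The `pageToken`/`nextPageToken` contract described above is shared by all of the search methods in these schemas. A minimal client-side paging loop, sketched in Python (the `post_json` helper and the exact endpoint path are assumptions for illustration, not part of the specification)::

    def search_all_datasets(post_json, page_size=100):
        # post_json(path, body) is an assumed helper that POSTs a JSON body
        # and returns the decoded JSON response.
        page_token = None
        while True:
            request = {"pageSize": page_size, "pageToken": page_token}
            response = post_json("/datasets/search", request)
            for dataset in response.get("datasets", []):
                yield dataset
            page_token = response.get("nextPageToken")
            if not page_token:  # empty token signals the last page
                break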
- diff --git a/doc/source/schemas/readmethods.rst b/doc/source/schemas/readmethods.rst deleted file mode 100644 index 431504c8..00000000 --- a/doc/source/schemas/readmethods.rst +++ /dev/null @@ -1,612 +0,0 @@ -ReadMethods -*********** - - .. function:: searchReads(request) - - :param request: SearchReadsRequest: This request maps to the body of `POST /reads/search` as JSON. - :return type: SearchReadsResponse - :throws: GAException - -Gets a list of `ReadAlignment`s for one or more `ReadGroup`s. - -`searchReads` operates over a genomic coordinate space of reference sequence -and position defined by the `Reference`s to which the requested `ReadGroup`s are -aligned. - -If a target positional range is specified, search returns all reads whose -alignment to the reference genome *overlap* the range. A query which specifies -only read group IDs yields all reads in those read groups, including unmapped -reads. - -All reads returned (including reads on subsequent pages) are ordered by genomic -coordinate (by reference sequence, then position). Reads with equivalent genomic -coordinates are returned in an unspecified order. This order must be consistent -for a given repository, such that two queries for the same content (regardless -of page size) yield reads in the same order across their respective streams of -paginated responses. - -`POST /reads/search` must accept a JSON version of `SearchReadsRequest` as -the post body and will return a JSON version of `SearchReadsResponse`. - - .. function:: searchReadGroupSets(request) - - :param request: SearchReadGroupSetsRequest: This request maps to the body of `POST /readgroupsets/search` as JSON. - :return type: SearchReadGroupSetsResponse - :throws: GAException - -Gets a list of `ReadGroupSet` matching the search criteria. - -`POST /readgroupsets/search` must accept a JSON version of -`SearchReadGroupSetsRequest` as the post body and will return a JSON -version of `SearchReadGroupSetsResponse`. - - .. function:: getReadGroupSet(id) - - :param id: string: The ID of the `ReadGroupSet`. - :return type: org.ga4gh.models.ReadGroupSet - :throws: GAException - -Gets a `org.ga4gh.models.ReadGroupSet` by ID. -`GET /readgroupsets/{id}` will return a JSON version of `ReadGroupSet`. - - .. function:: getReadGroup(id) - - :param id: string: The ID of the `ReadGroup`. - :return type: org.ga4gh.models.ReadGroup - :throws: GAException - -Gets a `org.ga4gh.models.ReadGroup` by ID. -`GET /readgroups/{id}` will return a JSON version of `ReadGroup`. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. 
`ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:error:: GAException - - A general exception type. - -.. 
avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. - :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. - :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. 
- :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - -.. avro:record:: Program - - :field commandLine: - The command line used to run this program. - :type commandLine: null|string - :field id: - The user specified ID of the program. - :type id: null|string - :field name: - The name of the program. - :type name: null|string - :field prevProgramId: - The ID of the program run before this one. - :type prevProgramId: null|string - :field version: - The version of the program run. - :type version: null|string - - Program can be used to track the provenance of how read data was generated. - -.. avro:record:: ReadStats - - :field alignedReadCount: - The number of aligned reads. - :type alignedReadCount: null|long - :field unalignedReadCount: - The number of unaligned reads. - :type unalignedReadCount: null|long - :field baseCount: - The total number of bases. - This is equivalent to the sum of `alignedSequence.length` for all reads. - :type baseCount: null|long - - ReadStats can be used to provide summary statistics about read data. - -.. avro:record:: ReadGroup - - :field id: - The read group ID. - :type id: string - :field datasetId: - The ID of the dataset this read group belongs to. - :type datasetId: null|string - :field name: - The read group name. - :type name: null|string - :field description: - The read group description. - :type description: null|string - :field sampleId: - The sample this read group's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the SM tag in a - BAM file. - :type sampleId: null|string - :field experiment: - The experiment used to generate this read group. - :type experiment: null|Experiment - :field predictedInsertSize: - The predicted insert size of this read group. - :type predictedInsertSize: null|int - :field created: - The time at which this read group was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this read group was last updated in milliseconds - from the epoch. - :type updated: null|long - :field stats: - Statistical data on reads in this read group. - :type stats: null|ReadStats - :field programs: - The programs used to generate this read group. 
- :type programs: array - :field referenceSetId: - The ID of the reference set to which the reads in this read group are aligned. - Required if there are any read alignments. - :type referenceSetId: null|string - :field info: - A map of additional read group information. - :type info: map> - - A ReadGroup is a set of reads derived from one physical sequencing process. - -.. avro:record:: ReadGroupSet - - :field id: - The read group set ID. - :type id: string - :field datasetId: - The ID of the dataset this read group set belongs to. - :type datasetId: null|string - :field name: - The read group set name. - :type name: null|string - :field stats: - Statistical data on reads in this read group set. - :type stats: null|ReadStats - :field readGroups: - The read groups in this set. - :type readGroups: array - - A ReadGroupSet is a logical collection of ReadGroups. Typically one ReadGroupSet - represents all the reads from one experimental sample. - -.. avro:record:: LinearAlignment - - :field position: - The position of this alignment. - :type position: Position - :field mappingQuality: - The mapping quality of this alignment, meaning the likelihood that the read - maps to this position. - - Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to the - nearest integer. - :type mappingQuality: null|int - :field cigar: - Represents the local alignment of this sequence (alignment matches, indels, etc) - versus the reference. - :type cigar: array - - A linear alignment describes the alignment of a read to a Reference, using a - position and CIGAR array. - -.. avro:record:: ReadAlignment - - :field id: - The read alignment ID. This ID is unique within the read group this - alignment belongs to. - - For performance reasons, this field may be omitted by a backend. - If provided, its intended use is to make caching and UI display easier for - genome browsers and other lightweight clients. - :type id: null|string - :field readGroupId: - The ID of the read group this read belongs to. - (Every read must belong to exactly one read group.) - :type readGroupId: string - :field fragmentName: - The fragment name. Equivalent to QNAME (query template name) in SAM. - :type fragmentName: string - :field properPlacement: - The orientation and the distance between reads from the fragment are - consistent with the sequencing protocol (equivalent to SAM flag 0x2) - :type properPlacement: null|boolean - :field duplicateFragment: - The fragment is a PCR or optical duplicate (SAM flag 0x400). - :type duplicateFragment: null|boolean - :field numberReads: - The number of reads in the fragment (extension to SAM flag 0x1) - :type numberReads: null|int - :field fragmentLength: - The observed length of the fragment, equivalent to TLEN in SAM. - :type fragmentLength: null|int - :field readNumber: - The read ordinal in the fragment, 0-based and less than numberReads. This - field replaces SAM flag 0x40 and 0x80 and is intended to more cleanly - represent multiple reads per fragment. - :type readNumber: null|int - :field failedVendorQualityChecks: - The read fails platform or vendor quality checks (SAM flag 0x200). - :type failedVendorQualityChecks: null|boolean - :field alignment: - The alignment for this alignment record. This field will be null if the read - is unmapped. - :type alignment: null|LinearAlignment - :field secondaryAlignment: - Whether this alignment is secondary. Equivalent to SAM flag 0x100. - A secondary alignment represents an alternative to the primary alignment - for this read. 
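Several of the `ReadAlignment` fields above are documented as replacements for individual SAM FLAG bits, and `mappingQuality` as a Phred-scaled probability. A partial sketch of how a client might reconstruct those values (assuming a dict-like `ReadAlignment`; the strand, mate-related and supplementary bits are omitted for brevity)::

    import math

    def mapping_quality(p_wrong):
        # -10 * log10(P(mapping position is wrong)), rounded to nearest integer.
        return int(round(-10 * math.log10(p_wrong)))

    def sam_flag(read):
        number_reads = read.get("numberReads") or 1
        flag = 0
        if number_reads > 1:
            flag |= 0x1    # template has multiple segments
        if read.get("properPlacement"):
            flag |= 0x2    # each segment properly aligned
        if read.get("alignment") is None:
            flag |= 0x4    # segment unmapped
        if number_reads > 1 and read.get("readNumber") == 0:
            flag |= 0x40   # first segment in the template
        if number_reads > 1 and read.get("readNumber") == number_reads - 1:
            flag |= 0x80   # last segment in the template
        if read.get("secondaryAlignment"):
            flag |= 0x100  # secondary alignment
        if read.get("failedVendorQualityChecks"):
            flag |= 0x200  # not passing quality controls
        if read.get("duplicateFragment"):
            flag |= 0x400  # PCR or optical duplicate
        return flag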
Aligners may return secondary alignments if a read can map - ambiguously to multiple coordinates in the genome. - - By convention, each read has one and only one alignment where both - secondaryAlignment and supplementaryAlignment are false. - :type secondaryAlignment: null|boolean - :field supplementaryAlignment: - Whether this alignment is supplementary. Equivalent to SAM flag 0x800. - Supplementary alignments are used in the representation of a chimeric - alignment. In a chimeric alignment, a read is split into multiple - linear alignments that map to different reference contigs. The first - linear alignment in the read will be designated as the representative alignment; - the remaining linear alignments will be designated as supplementary alignments. - These alignments may have different mapping quality scores. - - In each linear alignment in a chimeric alignment, the read will be hard clipped. - The `alignedSequence` and `alignedQuality` fields in the alignment record will - only represent the bases for its respective linear alignment. - :type supplementaryAlignment: null|boolean - :field alignedSequence: - The bases of the read sequence contained in this alignment record (equivalent - to SEQ in SAM). - - `alignedSequence` and `alignedQuality` may be shorter than the full read sequence - and quality. This will occur if the alignment is part of a chimeric alignment, - or if the read was trimmed. When this occurs, the CIGAR for this read will - begin/end with a hard clip operator that will indicate the length of the - excised sequence. - :type alignedSequence: null|string - :field alignedQuality: - The quality of the read sequence contained in this alignment record - (equivalent to QUAL in SAM). - - `alignedSequence` and `alignedQuality` may be shorter than the full read sequence - and quality. This will occur if the alignment is part of a chimeric alignment, - or if the read was trimmed. When this occurs, the CIGAR for this read will - begin/end with a hard clip operator that will indicate the length of the excised sequence. - :type alignedQuality: array - :field nextMatePosition: - The mapping of the primary alignment of the `(readNumber+1)%numberReads` - read in the fragment. It replaces mate position and mate strand in SAM. - :type nextMatePosition: null|Position - :field info: - A map of additional read alignment information. - :type info: map> - - Each read alignment describes an alignment with additional information - about the fragment and the read. A read alignment object is equivalent to a - line in a SAM file. - -.. avro:record:: SearchReadsRequest - - :field readGroupIds: - The ReadGroups to search. At least one id must be specified. - :type readGroupIds: array - :field referenceId: - The reference to query. Leaving blank returns results from all - references, including unmapped reads - this could be very large. - :type referenceId: null|string - :field start: - The start position (0-based) of this query. - If a reference is specified, this defaults to 0. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - :type start: null|long - :field end: - The end position (0-based, exclusive) of this query. - If a reference is specified, this defaults to the - reference's length. - :type end: null|long - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. 
- :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /reads/search` as JSON. - - If a reference is specified, all queried `ReadGroup`s must be aligned - to `ReferenceSet`s containing that same `Reference`. If no reference is - specified, all queried `ReadGroup`s must be aligned to the same `ReferenceSet`. - -.. avro:record:: SearchReadsResponse - - :field alignments: - The list of matching alignment records, sorted by position. - Unmapped reads, which have no position, are returned last. - :type alignments: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /reads/search` expressed as JSON. - -.. avro:record:: SearchReadGroupSetsRequest - - :field datasetId: - The dataset to search. - :type datasetId: string - :field name: - Only return read group sets with this name (case-sensitive, exact match). - :type name: null|string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /readgroupsets/search` as JSON. - - TODO: Factor this out to a common API patterns section. - - If searching by a resource ID, and that resource is not found, the method - will return a `404` HTTP status code (`NOT_FOUND`). - - If searching by other attributes, e.g. `name`, and no matches are found, the - method will return a `200` HTTP status code (`OK`) with an empty result list. - -.. avro:record:: SearchReadGroupSetsResponse - - :field readGroupSets: - The list of matching read group sets. - :type readGroupSets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /readgroupsets/search` expressed as JSON. - diff --git a/doc/source/schemas/reads.rst b/doc/source/schemas/reads.rst deleted file mode 100644 index 7bcc1f6b..00000000 --- a/doc/source/schemas/reads.rst +++ /dev/null @@ -1,465 +0,0 @@ -Reads -***** - -This file defines the objects used to represent a reads and alignments, most importantly -ReadGroupSet, ReadGroup, and ReadAlignment. -See {TODO: LINK TO READS OVERVIEW} for more information. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. 
- :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. 
avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. - :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. 
- :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - -.. avro:record:: Program - - :field commandLine: - The command line used to run this program. - :type commandLine: null|string - :field id: - The user specified ID of the program. - :type id: null|string - :field name: - The name of the program. - :type name: null|string - :field prevProgramId: - The ID of the program run before this one. - :type prevProgramId: null|string - :field version: - The version of the program run. - :type version: null|string - - Program can be used to track the provenance of how read data was generated. - -.. avro:record:: ReadStats - - :field alignedReadCount: - The number of aligned reads. - :type alignedReadCount: null|long - :field unalignedReadCount: - The number of unaligned reads. - :type unalignedReadCount: null|long - :field baseCount: - The total number of bases. - This is equivalent to the sum of `alignedSequence.length` for all reads. - :type baseCount: null|long - - ReadStats can be used to provide summary statistics about read data. - -.. avro:record:: ReadGroup - - :field id: - The read group ID. - :type id: string - :field datasetId: - The ID of the dataset this read group belongs to. - :type datasetId: null|string - :field name: - The read group name. - :type name: null|string - :field description: - The read group description. - :type description: null|string - :field sampleId: - The sample this read group's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the SM tag in a - BAM file. - :type sampleId: null|string - :field experiment: - The experiment used to generate this read group. 
- :type experiment: null|Experiment - :field predictedInsertSize: - The predicted insert size of this read group. - :type predictedInsertSize: null|int - :field created: - The time at which this read group was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this read group was last updated in milliseconds - from the epoch. - :type updated: null|long - :field stats: - Statistical data on reads in this read group. - :type stats: null|ReadStats - :field programs: - The programs used to generate this read group. - :type programs: array - :field referenceSetId: - The ID of the reference set to which the reads in this read group are aligned. - Required if there are any read alignments. - :type referenceSetId: null|string - :field info: - A map of additional read group information. - :type info: map> - - A ReadGroup is a set of reads derived from one physical sequencing process. - -.. avro:record:: ReadGroupSet - - :field id: - The read group set ID. - :type id: string - :field datasetId: - The ID of the dataset this read group set belongs to. - :type datasetId: null|string - :field name: - The read group set name. - :type name: null|string - :field stats: - Statistical data on reads in this read group set. - :type stats: null|ReadStats - :field readGroups: - The read groups in this set. - :type readGroups: array - - A ReadGroupSet is a logical collection of ReadGroups. Typically one ReadGroupSet - represents all the reads from one experimental sample. - -.. avro:record:: LinearAlignment - - :field position: - The position of this alignment. - :type position: Position - :field mappingQuality: - The mapping quality of this alignment, meaning the likelihood that the read - maps to this position. - - Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to the - nearest integer. - :type mappingQuality: null|int - :field cigar: - Represents the local alignment of this sequence (alignment matches, indels, etc) - versus the reference. - :type cigar: array - - A linear alignment describes the alignment of a read to a Reference, using a - position and CIGAR array. - -.. avro:record:: ReadAlignment - - :field id: - The read alignment ID. This ID is unique within the read group this - alignment belongs to. - - For performance reasons, this field may be omitted by a backend. - If provided, its intended use is to make caching and UI display easier for - genome browsers and other lightweight clients. - :type id: null|string - :field readGroupId: - The ID of the read group this read belongs to. - (Every read must belong to exactly one read group.) - :type readGroupId: string - :field fragmentName: - The fragment name. Equivalent to QNAME (query template name) in SAM. - :type fragmentName: string - :field properPlacement: - The orientation and the distance between reads from the fragment are - consistent with the sequencing protocol (equivalent to SAM flag 0x2) - :type properPlacement: null|boolean - :field duplicateFragment: - The fragment is a PCR or optical duplicate (SAM flag 0x400). - :type duplicateFragment: null|boolean - :field numberReads: - The number of reads in the fragment (extension to SAM flag 0x1) - :type numberReads: null|int - :field fragmentLength: - The observed length of the fragment, equivalent to TLEN in SAM. - :type fragmentLength: null|int - :field readNumber: - The read ordinal in the fragment, 0-based and less than numberReads. 
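As a small illustration of the numbering scheme (a hypothetical helper, not part of the API): for a paired-end fragment `numberReads` is 2, the two reads carry `readNumber` 0 and 1, and the mate of a read is found with the `(readNumber+1)%numberReads` rule used by `nextMatePosition`::

    def next_mate_read_number(read_number, number_reads):
        # Index of the read whose primary alignment nextMatePosition refers to,
        # following the (readNumber + 1) % numberReads rule described in this schema.
        return (read_number + 1) % number_reads

    # Paired-end fragment: next_mate_read_number(0, 2) == 1 and next_mate_read_number(1, 2) == 0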
This - field replaces SAM flag 0x40 and 0x80 and is intended to more cleanly - represent multiple reads per fragment. - :type readNumber: null|int - :field failedVendorQualityChecks: - The read fails platform or vendor quality checks (SAM flag 0x200). - :type failedVendorQualityChecks: null|boolean - :field alignment: - The alignment for this alignment record. This field will be null if the read - is unmapped. - :type alignment: null|LinearAlignment - :field secondaryAlignment: - Whether this alignment is secondary. Equivalent to SAM flag 0x100. - A secondary alignment represents an alternative to the primary alignment - for this read. Aligners may return secondary alignments if a read can map - ambiguously to multiple coordinates in the genome. - - By convention, each read has one and only one alignment where both - secondaryAlignment and supplementaryAlignment are false. - :type secondaryAlignment: null|boolean - :field supplementaryAlignment: - Whether this alignment is supplementary. Equivalent to SAM flag 0x800. - Supplementary alignments are used in the representation of a chimeric - alignment. In a chimeric alignment, a read is split into multiple - linear alignments that map to different reference contigs. The first - linear alignment in the read will be designated as the representative alignment; - the remaining linear alignments will be designated as supplementary alignments. - These alignments may have different mapping quality scores. - - In each linear alignment in a chimeric alignment, the read will be hard clipped. - The `alignedSequence` and `alignedQuality` fields in the alignment record will - only represent the bases for its respective linear alignment. - :type supplementaryAlignment: null|boolean - :field alignedSequence: - The bases of the read sequence contained in this alignment record (equivalent - to SEQ in SAM). - - `alignedSequence` and `alignedQuality` may be shorter than the full read sequence - and quality. This will occur if the alignment is part of a chimeric alignment, - or if the read was trimmed. When this occurs, the CIGAR for this read will - begin/end with a hard clip operator that will indicate the length of the - excised sequence. - :type alignedSequence: null|string - :field alignedQuality: - The quality of the read sequence contained in this alignment record - (equivalent to QUAL in SAM). - - `alignedSequence` and `alignedQuality` may be shorter than the full read sequence - and quality. This will occur if the alignment is part of a chimeric alignment, - or if the read was trimmed. When this occurs, the CIGAR for this read will - begin/end with a hard clip operator that will indicate the length of the excised sequence. - :type alignedQuality: array - :field nextMatePosition: - The mapping of the primary alignment of the `(readNumber+1)%numberReads` - read in the fragment. It replaces mate position and mate strand in SAM. - :type nextMatePosition: null|Position - :field info: - A map of additional read alignment information. - :type info: map> - - Each read alignment describes an alignment with additional information - about the fragment and the read. A read alignment object is equivalent to a - line in a SAM file. - diff --git a/doc/source/schemas/referencemethods.rst b/doc/source/schemas/referencemethods.rst deleted file mode 100644 index e287d4df..00000000 --- a/doc/source/schemas/referencemethods.rst +++ /dev/null @@ -1,379 +0,0 @@ -ReferenceMethods -**************** - - .. 
function:: getReferenceSet(id) - - :param id: string: The ID of the `ReferenceSet`. - :return type: org.ga4gh.models.ReferenceSet - :throws: GAException - -Gets a `ReferenceSet` by ID. -`GET /referencesets/{id}` will return a JSON version of `ReferenceSet`. - - .. function:: getReference(id) - - :param id: string: The ID of the `Reference`. - :return type: org.ga4gh.models.Reference - :throws: GAException - -Gets a `Reference` by ID. -`GET /references/{id}` will return a JSON version of `Reference`. - - .. function:: searchReferences(request) - - :param request: SearchReferencesRequest: This request maps to the body of `POST /references/search` - as JSON. - :return type: SearchReferencesResponse - :throws: GAException - -Gets a list of `Reference` matching the search criteria. - -`POST /references/search` must accept a JSON version of -`SearchReferencesRequest` as the post body and will return a JSON -version of `SearchReferencesResponse`. - - .. function:: getReferenceBases(id, request) - - :param id: string: The ID of the `Reference`. - :param request: ListReferenceBasesRequest: Additional request parameters to restrict the query. - :return type: ListReferenceBasesResponse - :throws: GAException - -Lists `Reference` bases by ID and optional range. -`GET /references/{id}/bases` will return a JSON version of -`ListReferenceBasesResponse`. - - .. function:: searchReferenceSets(request) - - :param request: SearchReferenceSetsRequest: This request maps to the body of `POST /referencesets/search` - as JSON. - :return type: SearchReferenceSetsResponse - :throws: GAException - -Gets a list of `ReferenceSet` matching the search criteria. - -`POST /referencesets/search` must accept a JSON version of -`SearchReferenceSetsRequest` as the post body and will return a JSON -version of `SearchReferenceSetsResponse`. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. 
Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:error:: GAException - - A general exception type. - -.. avro:record:: Reference - - :field id: - The reference ID. Unique within the repository. - :type id: string - :field length: - The length of this reference's sequence. - :type length: long - :field md5checksum: - The MD5 checksum uniquely representing this `Reference` as a lower-case - hexadecimal string, calculated as the MD5 of the upper-case sequence - excluding all whitespace characters (this is equivalent to SQ:M5 in SAM). - :type md5checksum: string - :field name: - The name of this reference. (e.g. '22'). - :type name: string - :field sourceURI: - The URI from which the sequence was obtained. 
Specifies a FASTA format - file/string with one name, sequence pair. In most cases, clients should call - the `getReferenceBases()` method to obtain sequence bases for a `Reference` - instead of attempting to retrieve this URI. - :type sourceURI: null|string - :field sourceAccessions: - All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) which must include - a version number, e.g. `GCF_000001405.26`. - :type sourceAccessions: array - :field isDerived: - A sequence X is said to be derived from source sequence Y, if X and Y - are of the same length and the per-base sequence divergence at A/C/G/T bases - is sufficiently small. Two sequences derived from the same official - sequence share the same coordinates and annotations, and - can be replaced with the official sequence for certain use cases. - :type isDerived: boolean - :field sourceDivergence: - The `sourceDivergence` is the fraction of non-indel bases that do not match the - reference this record was derived from. - :type sourceDivergence: null|float - :field ncbiTaxonId: - ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human). - :type ncbiTaxonId: null|int - - A `Reference` is a canonical assembled contig, intended to act as a - reference coordinate space for other genomic annotations. A single - `Reference` might represent the human chromosome 1, for instance. - - `Reference`s are designed to be immutable. - -.. avro:record:: ReferenceSet - - :field id: - The reference set ID. Unique in the repository. - :type id: string - :field name: - The reference set name. - :type name: null|string - :field md5checksum: - Order-independent MD5 checksum which identifies this `ReferenceSet`. - - To compute this checksum, make a list of `Reference.md5checksum` for all - `Reference`s in this set. Then sort that list, and take the MD5 hash of - all the strings concatenated together. Express the hash as a lower-case - hexadecimal string. - :type md5checksum: string - :field ncbiTaxonId: - ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human) indicating - the species which this assembly is intended to model. Note that contained - `Reference`s may specify a different `ncbiTaxonId`, as assemblies may - contain reference sequences which do not belong to the modeled species, e.g. - EBV in a human reference genome. - :type ncbiTaxonId: null|int - :field description: - Optional free text description of this reference set. - :type description: null|string - :field assemblyId: - Public id of this reference set, such as `GRCh37`. - :type assemblyId: null|string - :field sourceURI: - Specifies a FASTA format file/string. - :type sourceURI: null|string - :field sourceAccessions: - All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally - with a version number, e.g. `NC_000001.11`. - :type sourceAccessions: array - :field isDerived: - A reference set may be derived from a source if it contains - additional sequences, or some of the sequences within it are derived - (see the definition of `isDerived` in `Reference`). - :type isDerived: boolean - - A `ReferenceSet` is a set of `Reference`s which typically comprise a - reference assembly, such as `GRCh38`. A `ReferenceSet` defines a common - coordinate space for comparing reference-aligned experimental data. - -.. avro:record:: SearchReferenceSetsRequest - - :field md5checksum: - If not null, return the reference sets for which the - `md5checksum` matches this string (case-sensitive, exact match). - See `ReferenceSet::md5checksum` for details. 
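The order-independent checksum referred to here is the one defined on `ReferenceSet::md5checksum`; a minimal sketch of that rule (an illustration, not a function defined by this API)::

    import hashlib

    def reference_set_md5(reference_md5checksums):
        # Sort the per-Reference md5checksum strings, concatenate them, and take
        # the MD5 of the result, expressed as a lower-case hexadecimal string.
        concatenated = "".join(sorted(reference_md5checksums))
        return hashlib.md5(concatenated.encode("ascii")).hexdigest()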
- :type md5checksum: null|string - :field accession: - If not null, return the reference sets for which the `accession` - matches this string (case-sensitive, exact match). - :type accession: null|string - :field assemblyId: - If not null, return the reference sets for which the `assemblyId` - matches this string (case-sensitive, exact match). - :type assemblyId: null|string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /referencesets/search` - as JSON. - -.. avro:record:: SearchReferenceSetsResponse - - :field referenceSets: - The list of matching reference sets. - :type referenceSets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /referencesets/search` - expressed as JSON. - -.. avro:record:: SearchReferencesRequest - - :field referenceSetId: - The `ReferenceSet` to search. - :type referenceSetId: string - :field md5checksum: - If not null, return the references for which the - `md5checksum` matches this string (case-sensitive, exact match). - See `ReferenceSet::md5checksum` for details. - :type md5checksum: null|string - :field accession: - If not null, return the references for which the `accession` - matches this string (case-sensitive, exact match). - :type accession: null|string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /references/search` - as JSON. - -.. avro:record:: SearchReferencesResponse - - :field references: - The list of matching references. - :type references: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /references/search` expressed as JSON. - -.. avro:record:: ListReferenceBasesRequest - - :field start: - The start position (0-based) of this query. Defaults to 0. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - :type start: long - :field end: - The end position (0-based, exclusive) of this query. Defaults - to the length of this `Reference`. - :type end: null|long - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. 
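Taken together, `pageToken` and `nextPageToken` give the paging loop used by all of the search methods. A rough client-side sketch follows; the HTTP library and helper name are assumptions, while the endpoint and field names come from this schema::

    import requests

    def search_all_references(base_url, reference_set_id, page_size=100):
        # Iterate every matching Reference by POSTing to /references/search and
        # following nextPageToken until the server returns an empty token.
        body = {"referenceSetId": reference_set_id, "pageSize": page_size}
        while True:
            response = requests.post(base_url + "/references/search", json=body).json()
            for reference in response.get("references", []):
                yield reference
            token = response.get("nextPageToken")
            if not token:
                break
            body["pageToken"] = token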
- :type pageToken: null|string - - The query parameters for a request to `GET /references/{id}/bases`, for - example: - - `GET /references/{id}/bases?start=100&end=200` - -.. avro:record:: ListReferenceBasesResponse - - :field offset: - The offset position (0-based) of the given sequence from the start of this - `Reference`. This value will differ for each page in a paginated request. - :type offset: long - :field sequence: - A substring of the bases that make up this reference. Bases are represented - as IUPAC-IUB codes; this string matches the regexp `[ACGTMRWSYKVHDBN]*`. - :type sequence: string - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - The response from `GET /references/{id}/bases` expressed as JSON. - diff --git a/doc/source/schemas/references.rst b/doc/source/schemas/references.rst deleted file mode 100644 index 9473784d..00000000 --- a/doc/source/schemas/references.rst +++ /dev/null @@ -1,199 +0,0 @@ -References -********** - -Defines types used by the GA4GH References API. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. 
- * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: Reference - - :field id: - The reference ID. Unique within the repository. - :type id: string - :field length: - The length of this reference's sequence. - :type length: long - :field md5checksum: - The MD5 checksum uniquely representing this `Reference` as a lower-case - hexadecimal string, calculated as the MD5 of the upper-case sequence - excluding all whitespace characters (this is equivalent to SQ:M5 in SAM). - :type md5checksum: string - :field name: - The name of this reference. (e.g. '22'). - :type name: string - :field sourceURI: - The URI from which the sequence was obtained. Specifies a FASTA format - file/string with one name, sequence pair. In most cases, clients should call - the `getReferenceBases()` method to obtain sequence bases for a `Reference` - instead of attempting to retrieve this URI. - :type sourceURI: null|string - :field sourceAccessions: - All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) which must include - a version number, e.g. `GCF_000001405.26`. - :type sourceAccessions: array - :field isDerived: - A sequence X is said to be derived from source sequence Y, if X and Y - are of the same length and the per-base sequence divergence at A/C/G/T bases - is sufficiently small. 
Two sequences derived from the same official - sequence share the same coordinates and annotations, and - can be replaced with the official sequence for certain use cases. - :type isDerived: boolean - :field sourceDivergence: - The `sourceDivergence` is the fraction of non-indel bases that do not match the - reference this record was derived from. - :type sourceDivergence: null|float - :field ncbiTaxonId: - ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human). - :type ncbiTaxonId: null|int - - A `Reference` is a canonical assembled contig, intended to act as a - reference coordinate space for other genomic annotations. A single - `Reference` might represent the human chromosome 1, for instance. - - `Reference`s are designed to be immutable. - -.. avro:record:: ReferenceSet - - :field id: - The reference set ID. Unique in the repository. - :type id: string - :field name: - The reference set name. - :type name: null|string - :field md5checksum: - Order-independent MD5 checksum which identifies this `ReferenceSet`. - - To compute this checksum, make a list of `Reference.md5checksum` for all - `Reference`s in this set. Then sort that list, and take the MD5 hash of - all the strings concatenated together. Express the hash as a lower-case - hexadecimal string. - :type md5checksum: string - :field ncbiTaxonId: - ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human) indicating - the species which this assembly is intended to model. Note that contained - `Reference`s may specify a different `ncbiTaxonId`, as assemblies may - contain reference sequences which do not belong to the modeled species, e.g. - EBV in a human reference genome. - :type ncbiTaxonId: null|int - :field description: - Optional free text description of this reference set. - :type description: null|string - :field assemblyId: - Public id of this reference set, such as `GRCh37`. - :type assemblyId: null|string - :field sourceURI: - Specifies a FASTA format file/string. - :type sourceURI: null|string - :field sourceAccessions: - All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally - with a version number, e.g. `NC_000001.11`. - :type sourceAccessions: array - :field isDerived: - A reference set may be derived from a source if it contains - additional sequences, or some of the sequences within it are derived - (see the definition of `isDerived` in `Reference`). - :type isDerived: boolean - - A `ReferenceSet` is a set of `Reference`s which typically comprise a - reference assembly, such as `GRCh38`. A `ReferenceSet` defines a common - coordinate space for comparing reference-aligned experimental data. - diff --git a/doc/source/schemas/sequenceAnnotationmethods.rst b/doc/source/schemas/sequenceAnnotationmethods.rst deleted file mode 100644 index 01815cc6..00000000 --- a/doc/source/schemas/sequenceAnnotationmethods.rst +++ /dev/null @@ -1,457 +0,0 @@ -SequenceAnnotationMethods -************************* - - .. function:: searchFeatureSets(request) - - :param request: SearchFeatureSetsRequest: This request maps to the body of `POST /featuresets/search` as JSON. - :return type: SearchFeatureSetsResponse - :throws: GAException - -Gets a list of `FeatureSet` matching the search criteria. - - `POST /featuresets/search` must accept a JSON version of - `SearchFeatureSetsRequest` as the post body and will return a JSON version - of `SearchFeatureSetsResponse`. - - .. function:: getFeatureSet(id) - - :param id: string: The ID of the `FeatureSet`. 
- :return type: org.ga4gh.models.FeatureSet - :throws: GAException - -Gets a `FeatureSet` by ID. - `GET /featuresets/{id}` will return a JSON version of `FeatureSet`. - - .. function:: getFeature(id) - - :param id: string: The ID of the `Feature`. - :return type: org.ga4gh.models.Feature - :throws: GAException - -Gets a `org.ga4gh.models.Feature` by ID. - `GET /features/{id}` will return a JSON version of `Feature`. - - .. function:: searchFeatures(request) - - :param request: SearchFeaturesRequest: This request maps to the body of `POST /features/search` as JSON. - :return type: SearchFeaturesResponse - :throws: GAException - -Gets a list of `Feature` matching the search criteria. - - `POST /features/search` must accept a JSON version of - `SearchFeaturesRequest` as the post body and will return a JSON version of - `SearchFeaturesResponse`. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. 
This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:error:: GAException - - A general exception type. - -.. avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. - :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. 
- Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. - :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - -.. avro:record:: Attributes - - :field vals: - :type vals: map> - - Type defining a collection of attributes associated with various protocol - records. Each attribute is a name that maps to an array of one or more - values. Values can be strings, external identifiers, or ontology terms. - Values should be split into the array elements instead of using a separator - syntax that needs to parsed. - -.. avro:record:: Feature - - :field id: - Id of this annotation node. 
- :type id: string - :field parentId: - Parent Id of this node. Set to empty string if node has no parent. - :type parentId: string - :field childIds: - Ordered array of Child Ids of this node. - Since not all child nodes are ordered by genomic coordinates, - this can't always be reconstructed from parentId's of the children alone. - :type childIds: array - :field featureSetId: - Identifier for the containing feature set. - :type featureSetId: string - :field referenceName: - The reference on which this feature occurs. - (e.g. `chr20` or `X`) - :type referenceName: string - :field start: - The start position at which this feature occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Features spanning the join of circular genomes are represented as - two features one on each side of the join (position 0). - :type start: long - :field end: - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - :type end: long - :field strand: - The strand on which the feature is present. - :type strand: Strand - :field featureType: - Feature that is annotated by this region. Normally, this will be a term in - the Sequence Ontology. - :type featureType: OntologyTerm - :field attributes: - Name/value attributes of the annotation. Attribute names follow the GFF3 - naming convention of reserved names starting with an upper cases - character, and user-define names start with lower-case. Most GFF3 - pre-defined attributes apply, the exceptions are ID and Parent, which are - defined as fields. Additional, the following attributes are added: - * Score - the GFF3 score column - * Phase - the GFF3 phase column for CDS features. - :type attributes: Attributes - - Node in the annotation graph that annotates a contiguous region of a - sequence. - -.. avro:record:: FeatureSet - - :field id: - The ID of this annotation set. - :type id: string - :field datasetId: - The ID of the dataset this annotation set belongs to. - :type datasetId: null|string - :field referenceSetId: - The ID of the reference set which defines the coordinate-space for this - set of annotations. - :type referenceSetId: null|string - :field name: - The display name for this annotation set. - :type name: null|string - :field sourceURI: - The source URI describing the file from which this annotation set was - generated, if any. - :type sourceURI: null|string - :field info: - Remaining structured metadata key-value pairs. - :type info: map> - -.. avro:record:: SearchFeatureSetsRequest - - :field datasetId: - The `Dataset` to search. - :type datasetId: string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /featuresets/search` as JSON. - -.. avro:record:: SearchFeatureSetsResponse - - :field featureSets: - The list of matching feature sets. - :type featureSets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. 
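The `Feature.attributes` convention described above (reserved GFF3 names start with an upper-case character, user-defined names with lower-case, and every attribute maps to an array of values rather than a delimited string) can be illustrated with a minimal, invented example::

    example_attributes = {
        "vals": {
            "Score": ["0.9"],   # the GFF3 score column
            "Phase": ["0"],     # the GFF3 phase column, CDS features only
            "note": ["hypothetical user-defined attribute"],
        }
    }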
This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /featuresets/search` expressed as JSON. - -.. avro:record:: SearchFeaturesRequest - - :field featureSetId: - The annotation set to search within. Either `featureSetId` or - `parentId` must be non-empty. - :type featureSetId: null|string - :field parentId: - Restricts the search to direct children of the given parent `feature` - ID. Either `featureSetId` or `parentId` must be non-empty. - :type parentId: null|string - :field referenceName: - Only return features on the reference with this name - (matched to literal reference name as imported from the GFF3). - :type referenceName: string - :field start: - Required. The beginning of the window (0-based, inclusive) for which - overlapping features should be returned. Genomic positions are - non-negative integers less than reference length. Requests spanning the - join of circular genomes are represented as two requests one on each side - of the join (position 0). - :type start: long - :field end: - Required. The end of the window (0-based, exclusive) for which overlapping - features should be returned. - :type end: long - :field featureTypes: - If specified, this query matches only annotations whose `featureType` - matches one of the provided ontology terms. - :type featureTypes: array - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /features/search` as JSON. - -.. avro:record:: SearchFeaturesResponse - - :field features: - The list of matching annotations, sorted by start position. Annotations which - share a start position are returned in a deterministic order. - :type features: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /features/search` expressed as JSON. - diff --git a/doc/source/schemas/sequenceAnnotations.rst b/doc/source/schemas/sequenceAnnotations.rst deleted file mode 100644 index a50b12d3..00000000 --- a/doc/source/schemas/sequenceAnnotations.rst +++ /dev/null @@ -1,342 +0,0 @@ -SequenceAnnotations -******************* - -This protocol defines annotations on GA4GH genomic sequences. It includes two -types of annotations: continuous and discrete hierarchical. - -The discrete hierarchical annotations are derived from the Sequence Ontology -(SO) and GFF3 work - - http://www.sequenceontology.org/gff3.shtml - -The goal is to be able to store annotations using the GFF3 and SO conceptual -model, although there is not necessarily a one-to-one mapping in Avro records -to GFF3 records. - -The minimum requirement is to be able to accurately represent the current -state-of-the-art annotation data and the full SO model. Feature is the -core generic record which corresponds to a GFF3 record. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associated with some data item.
- * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. 
- * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: OntologyTerm - - :field id: - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - :type id: string - :field term: - Ontology term - the representation the id is pointing to. - :type term: null|string - :field sourceName: - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - :type sourceName: null|string - :field sourceVersion: - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - :type sourceVersion: null|string - - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - -.. avro:record:: Experiment - - :field id: - The experiment UUID. This is globally unique. - :type id: string - :field name: - The name of the experiment. - :type name: null|string - :field description: - A description of the experiment. - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field runTime: - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - :type runTime: null|string - :field molecule: - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - :type molecule: null|string - :field strategy: - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - :type strategy: null|string - :field selection: - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - :type selection: null|string - :field library: - The name of the library used as part of this experiment. - :type library: null|string - :field libraryLayout: - The configuration of sequenced reads. (e.g. Single or Paired) - :type libraryLayout: null|string - :field instrumentModel: - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. 
- :type instrumentModel: null|string - :field instrumentDataFile: - The data file generated by the instrument. - TODO: This isn't actually a file is it? - Should this be `instrumentData` instead? - :type instrumentDataFile: null|string - :field sequencingCenter: - The sequencing center used as part of this experiment. - :type sequencingCenter: null|string - :field platformUnit: - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - :type platformUnit: null|string - :field info: - A map of additional experiment information. - :type info: map> - - An experimental preparation of a sample. - -.. avro:record:: Dataset - - :field id: - The dataset's id, locally unique to the server instance. - :type id: string - :field name: - The name of the dataset. - :type name: null|string - :field description: - Additional, human-readable information on the dataset. - :type description: null|string - - A Dataset is a collection of related data of multiple types. - Data providers decide how to group data into datasets. - See [Metadata API](../api/metadata.html) for a more detailed discussion. - -.. avro:record:: Analysis - - :field id: - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - :type id: string - :field name: - :type name: null|string - :field description: - :type description: null|string - :field createDateTime: - The time at which this record was created. - Format: :ref:`ISO 8601 ` - :type createDateTime: null|string - :field updateDateTime: - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - :type updateDateTime: string - :field type: - The type of analysis. - :type type: null|string - :field software: - The software run to generate this analysis. - :type software: array - :field info: - A map of additional analysis information. - :type info: map> - - An analysis contains an interpretation of one or several experiments. - (e.g. SNVs, copy number variations, methylation status) together with - information about the methodology used. - -.. avro:record:: Attributes - - :field vals: - :type vals: map> - - Type defining a collection of attributes associated with various protocol - records. Each attribute is a name that maps to an array of one or more - values. Values can be strings, external identifiers, or ontology terms. - Values should be split into the array elements instead of using a separator - syntax that needs to parsed. - -.. avro:record:: Feature - - :field id: - Id of this annotation node. - :type id: string - :field parentId: - Parent Id of this node. Set to empty string if node has no parent. - :type parentId: string - :field childIds: - Ordered array of Child Ids of this node. - Since not all child nodes are ordered by genomic coordinates, - this can't always be reconstructed from parentId's of the children alone. - :type childIds: array - :field featureSetId: - Identifier for the containing feature set. - :type featureSetId: string - :field referenceName: - The reference on which this feature occurs. - (e.g. `chr20` or `X`) - :type referenceName: string - :field start: - The start position at which this feature occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Features spanning the join of circular genomes are represented as - two features one on each side of the join (position 0). 
- :type start: long - :field end: - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - :type end: long - :field strand: - The strand on which the feature is present. - :type strand: Strand - :field featureType: - Feature that is annotated by this region. Normally, this will be a term in - the Sequence Ontology. - :type featureType: OntologyTerm - :field attributes: - Name/value attributes of the annotation. Attribute names follow the GFF3 - naming convention of reserved names starting with an upper cases - character, and user-define names start with lower-case. Most GFF3 - pre-defined attributes apply, the exceptions are ID and Parent, which are - defined as fields. Additional, the following attributes are added: - * Score - the GFF3 score column - * Phase - the GFF3 phase column for CDS features. - :type attributes: Attributes - - Node in the annotation graph that annotates a contiguous region of a - sequence. - -.. avro:record:: FeatureSet - - :field id: - The ID of this annotation set. - :type id: string - :field datasetId: - The ID of the dataset this annotation set belongs to. - :type datasetId: null|string - :field referenceSetId: - The ID of the reference set which defines the coordinate-space for this - set of annotations. - :type referenceSetId: null|string - :field name: - The display name for this annotation set. - :type name: null|string - :field sourceURI: - The source URI describing the file from which this annotation set was - generated, if any. - :type sourceURI: null|string - :field info: - Remaining structured metadata key-value pairs. - :type info: map> - diff --git a/doc/source/schemas/variantmethods.rst b/doc/source/schemas/variantmethods.rst deleted file mode 100644 index a3641a85..00000000 --- a/doc/source/schemas/variantmethods.rst +++ /dev/null @@ -1,475 +0,0 @@ -VariantMethods -************** - - .. function:: searchVariants(request) - - :param request: SearchVariantsRequest: This request maps to the body of `POST /variants/search` as JSON. - :return type: SearchVariantsResponse - :throws: GAException - -Gets a list of `Variant` matching the search criteria. - -`POST /variants/search` must accept a JSON version of `SearchVariantsRequest` -as the post body and will return a JSON version of `SearchVariantsResponse`. - - .. function:: getCallSet(id) - - :param id: string: The ID of the `CallSet`. - :return type: org.ga4gh.models.CallSet - :throws: GAException - -Gets a `CallSet` by ID. -`GET /callsets/{id}` will return a JSON version of `CallSet`. - - .. function:: searchVariantSets(request) - - :param request: SearchVariantSetsRequest: This request maps to the body of `POST /variantsets/search` as JSON. - :return type: SearchVariantSetsResponse - :throws: GAException - -Gets a list of `VariantSet` matching the search criteria. - -`POST /variantsets/search` must accept a JSON version of -`SearchVariantSetsRequest` as the post body and will return a JSON version -of `SearchVariantSetsResponse`. - - .. function:: getVariantSet(id) - - :param id: string: The ID of the `VariantSet`. - :return type: org.ga4gh.models.VariantSet - :throws: GAException - -Gets a `VariantSet` by ID. -`GET /variantsets/{id}` will return a JSON version of `VariantSet`. - - .. function:: getVariant(id) - - :param id: string: The ID of the `Variant`. - :return type: org.ga4gh.models.Variant - :throws: GAException - -Gets a `Variant` by ID. -`GET /variants/{id}` will return a JSON version of `Variant`. 
- - .. function:: searchCallSets(request) - - :param request: SearchCallSetsRequest: This request maps to the body of `POST /callsets/search` as JSON. - :return type: SearchCallSetsResponse - :throws: GAException - -Gets a list of `CallSet` matching the search criteria. - -`POST /callsets/search` must accept a JSON version of `SearchCallSetsRequest` -as the post body and will return a JSON version of `SearchCallSetsResponse`. - -.. avro:error:: GAException - - A general exception type. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. 
This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. - :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: VariantSetMetadata - - :field key: - The top-level key. - :type key: string - :field value: - The value field for simple metadata. - :type value: string - :field id: - User-provided ID field, not enforced by this API. - Two or more pieces of structured metadata with identical - id and key fields are considered equivalent. - `FIXME: If it's not enforced, then why can't it be null?` - :type id: string - :field type: - The type of data. - :type type: string - :field number: - The number of values that can be included in a field described by this - metadata. - :type number: string - :field description: - A textual description of this metadata. - :type description: string - :field info: - Remaining structured metadata key-value pairs. - :type info: map> - - Optional metadata associated with a variant set. - -.. avro:record:: VariantSet - - :field id: - The variant set ID. - :type id: string - :field name: - The variant set name. - :type name: null|string - :field datasetId: - The ID of the dataset this variant set belongs to. - :type datasetId: string - :field referenceSetId: - The ID of the reference set that describes the sequences used by the variants in this set. - :type referenceSetId: string - :field metadata: - Optional metadata associated with this variant set. - This array can be used to store information about the variant set, such as information found - in VCF header fields, that isn't already available in first class fields such as "name". - :type metadata: array - - A VariantSet is a collection of variants and variant calls intended to be analyzed together. - -.. avro:record:: CallSet - - :field id: - The call set ID. - :type id: string - :field name: - The call set name. - :type name: null|string - :field sampleId: - The sample this call set's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the sampleId - field in the read groups used to generate this call set. 
- :type sampleId: null|string - :field variantSetIds: - The IDs of the variant sets this call set has calls in. - :type variantSetIds: array - :field created: - The date this call set was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this call set was last updated in - milliseconds from the epoch. - :type updated: null|long - :field info: - A map of additional call set information. - :type info: map> - - A CallSet is a collection of calls that were generated by the same analysis of the same sample. - -.. avro:record:: Call - - :field callSetName: - The name of the call set this variant call belongs to. - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - :type callSetName: null|string - :field callSetId: - The ID of the call set this variant call belongs to. - - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - :type callSetId: null|string - :field genotype: - The genotype of this variant call. - - A 0 value represents the reference allele of the associated `Variant`. Any - other value is a 1-based index into the alternate alleles of the associated - `Variant`. - - If a variant had a referenceBases field of "T", an alternateBases - value of ["A", "C"], and the genotype was [2, 1], that would mean the call - represented the heterozygous value "CA" for this variant. If the genotype - was instead [0, 1] the represented value would be "TA". Ordering of the - genotype values is important if the phaseset field is present. - :type genotype: array - :field phaseset: - If this field is not null, this variant call's genotype ordering implies - the phase of the bases and is consistent with any other variant calls on - the same contig which have the same phaseset string. - :type phaseset: null|string - :field genotypeLikelihood: - The genotype likelihoods for this variant call. Each array entry - represents how likely a specific genotype is for this call as - log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The - value ordering is defined by the GL tag in the VCF spec. - :type genotypeLikelihood: array - :field info: - A map of additional variant call information. - :type info: map> - - A `Call` represents the determination of genotype with respect to a - particular `Variant`. - - It may include associated information such as quality - and phasing. For example, a call might assign a probability of 0.32 to - the occurrence of a SNP named rs1234 in a call set with the name NA12345. - -.. avro:record:: Variant - - :field id: - The variant ID. - :type id: string - :field variantSetId: - The ID of the `VariantSet` this variant belongs to. This transitively defines - the `ReferenceSet` against which the `Variant` is to be interpreted. - :type variantSetId: string - :field names: - Names for the variant, for example a RefSNP ID. - :type names: array - :field created: - The date this variant was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this variant was last updated in - milliseconds from the epoch. - :type updated: null|long - :field referenceName: - The reference on which this variant occurs. - (e.g. 
`chr20` or `X`) - :type referenceName: string - :field start: - The start position at which this variant occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Variants spanning the join of circular genomes are represented as - two variants one on each side of the join (position 0). - :type start: long - :field end: - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - :type end: long - :field referenceBases: - The reference bases for this variant. They start at the given start position. - :type referenceBases: string - :field alternateBases: - The bases that appear instead of the reference bases. Multiple alternate - alleles are possible. - :type alternateBases: array - :field info: - A map of additional variant information. - :type info: map> - :field calls: - The variant calls for this particular variant. Each one represents the - determination of genotype with respect to this variant. `Call`s in this array - are implicitly associated with this `Variant`. - :type calls: array - - A `Variant` represents a change in DNA sequence relative to some reference. - For example, a variant could represent a SNP or an insertion. - Variants belong to a `VariantSet`. - This is equivalent to a row in VCF. - -.. avro:record:: SearchVariantSetsRequest - - :field datasetId: - The `Dataset` to search. - :type datasetId: string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /variantsets/search` as JSON. - -.. avro:record:: SearchVariantSetsResponse - - :field variantSets: - The list of matching variant sets. - :type variantSets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /variantsets/search` expressed as JSON. - -.. avro:record:: SearchVariantsRequest - - :field variantSetId: - The `VariantSet` to search. - :type variantSetId: string - :field callSetIds: - Only return variant calls which belong to call sets with these IDs. - If an empty array, returns variants without any call objects. - If null, returns all variant calls. - :type callSetIds: null|array - :field referenceName: - Required. Only return variants on this reference. - :type referenceName: string - :field start: - Required. The beginning of the window (0-based, inclusive) for - which overlapping variants should be returned. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - :type start: long - :field end: - Required. The end of the window (0-based, exclusive) for which overlapping - variants should be returned. - :type end: long - :field pageSize: - Specifies the maximum number of results to return in a single page. 
- If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /variants/search` as JSON. - -.. avro:record:: SearchVariantsResponse - - :field variants: - The list of matching variants. - If the `callSetId` field on the returned calls is not present, - the ordering of the call sets from a `SearchCallSetsRequest` - over the parent `VariantSet` is guaranteed to match the ordering - of the calls on each `Variant`. The number of results will also be - the same. - :type variants: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /variants/search` expressed as JSON. - -.. avro:record:: SearchCallSetsRequest - - :field variantSetId: - The VariantSet to search. - :type variantSetId: string - :field name: - Only return call sets with this name (case-sensitive, exact match). - :type name: null|string - :field pageSize: - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - :type pageSize: null|int - :field pageToken: - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - :type pageToken: null|string - - This request maps to the body of `POST /callsets/search` as JSON. - -.. avro:record:: SearchCallSetsResponse - - :field callSets: - The list of matching call sets. - :type callSets: array - :field nextPageToken: - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - :type nextPageToken: null|string - - This is the response from `POST /callsets/search` expressed as JSON. - diff --git a/doc/source/schemas/variants.rst b/doc/source/schemas/variants.rst deleted file mode 100644 index f51ccd9c..00000000 --- a/doc/source/schemas/variants.rst +++ /dev/null @@ -1,297 +0,0 @@ -Variants -******** - -This file defines the objects used to represent variant calls, most importantly -VariantSet, Variant, and Call. -See {TODO: LINK TO VARIANTS OVERVIEW} for more information. - -.. avro:enum:: Strand - - :symbols: NEG_STRAND|POS_STRAND - Indicates the DNA strand associate for some data item. - * `NEG_STRAND`: The negative (-) strand. - * `POS_STRAND`: The postive (+) strand. - -.. avro:record:: Position - - :field referenceName: - The name of the `Reference` on which the `Position` is located. - :type referenceName: string - :field position: - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - :type position: long - :field strand: - Strand the position is associated with. - :type strand: Strand - - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - -.. 
avro:record:: ExternalIdentifier - - :field database: - The source of the identifier. - (e.g. `Ensembl`) - :type database: string - :field identifier: - The ID defined by the external database. - (e.g. `ENST00000000000`) - :type identifier: string - :field version: - The version of the object or the database - (e.g. `78`) - :type version: string - - Identifier from a public database - -.. avro:enum:: CigarOperation - - :symbols: ALIGNMENT_MATCH|INSERT|DELETE|SKIP|CLIP_SOFT|CLIP_HARD|PAD|SEQUENCE_MATCH|SEQUENCE_MISMATCH - An enum for the different types of CIGAR alignment operations that exist. - Used wherever CIGAR alignments are used. The different enumerated values - have the following usage: - - * `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. - * `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. - * `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. - * `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. - * `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. - * `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment. This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. - * `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. - * `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. - * `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. - -.. avro:record:: CigarUnit - - :field operation: - The operation type. - :type operation: CigarOperation - :field operationLength: - The number of bases that the operation runs for. - :type operationLength: long - :field referenceSequence: - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. - If the relevant information is not available, leave this field as `null`. 
- :type referenceSequence: null|string - - A structure for an instance of a CIGAR operation. - `FIXME: This belongs under Reads (only readAlignment refers to this)` - -.. avro:record:: VariantSetMetadata - - :field key: - The top-level key. - :type key: string - :field value: - The value field for simple metadata. - :type value: string - :field id: - User-provided ID field, not enforced by this API. - Two or more pieces of structured metadata with identical - id and key fields are considered equivalent. - `FIXME: If it's not enforced, then why can't it be null?` - :type id: string - :field type: - The type of data. - :type type: string - :field number: - The number of values that can be included in a field described by this - metadata. - :type number: string - :field description: - A textual description of this metadata. - :type description: string - :field info: - Remaining structured metadata key-value pairs. - :type info: map> - - Optional metadata associated with a variant set. - -.. avro:record:: VariantSet - - :field id: - The variant set ID. - :type id: string - :field name: - The variant set name. - :type name: null|string - :field datasetId: - The ID of the dataset this variant set belongs to. - :type datasetId: string - :field referenceSetId: - The ID of the reference set that describes the sequences used by the variants in this set. - :type referenceSetId: string - :field metadata: - Optional metadata associated with this variant set. - This array can be used to store information about the variant set, such as information found - in VCF header fields, that isn't already available in first class fields such as "name". - :type metadata: array - - A VariantSet is a collection of variants and variant calls intended to be analyzed together. - -.. avro:record:: CallSet - - :field id: - The call set ID. - :type id: string - :field name: - The call set name. - :type name: null|string - :field sampleId: - The sample this call set's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the sampleId - field in the read groups used to generate this call set. - :type sampleId: null|string - :field variantSetIds: - The IDs of the variant sets this call set has calls in. - :type variantSetIds: array - :field created: - The date this call set was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this call set was last updated in - milliseconds from the epoch. - :type updated: null|long - :field info: - A map of additional call set information. - :type info: map> - - A CallSet is a collection of calls that were generated by the same analysis of the same sample. - -.. avro:record:: Call - - :field callSetName: - The name of the call set this variant call belongs to. - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - :type callSetName: null|string - :field callSetId: - The ID of the call set this variant call belongs to. - - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. 
- :type callSetId: null|string - :field genotype: - The genotype of this variant call. - - A 0 value represents the reference allele of the associated `Variant`. Any - other value is a 1-based index into the alternate alleles of the associated - `Variant`. - - If a variant had a referenceBases field of "T", an alternateBases - value of ["A", "C"], and the genotype was [2, 1], that would mean the call - represented the heterozygous value "CA" for this variant. If the genotype - was instead [0, 1] the represented value would be "TA". Ordering of the - genotype values is important if the phaseset field is present. - :type genotype: array - :field phaseset: - If this field is not null, this variant call's genotype ordering implies - the phase of the bases and is consistent with any other variant calls on - the same contig which have the same phaseset string. - :type phaseset: null|string - :field genotypeLikelihood: - The genotype likelihoods for this variant call. Each array entry - represents how likely a specific genotype is for this call as - log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The - value ordering is defined by the GL tag in the VCF spec. - :type genotypeLikelihood: array - :field info: - A map of additional variant call information. - :type info: map> - - A `Call` represents the determination of genotype with respect to a - particular `Variant`. - - It may include associated information such as quality - and phasing. For example, a call might assign a probability of 0.32 to - the occurrence of a SNP named rs1234 in a call set with the name NA12345. - -.. avro:record:: Variant - - :field id: - The variant ID. - :type id: string - :field variantSetId: - The ID of the `VariantSet` this variant belongs to. This transitively defines - the `ReferenceSet` against which the `Variant` is to be interpreted. - :type variantSetId: string - :field names: - Names for the variant, for example a RefSNP ID. - :type names: array - :field created: - The date this variant was created in milliseconds from the epoch. - :type created: null|long - :field updated: - The time at which this variant was last updated in - milliseconds from the epoch. - :type updated: null|long - :field referenceName: - The reference on which this variant occurs. - (e.g. `chr20` or `X`) - :type referenceName: string - :field start: - The start position at which this variant occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Variants spanning the join of circular genomes are represented as - two variants one on each side of the join (position 0). - :type start: long - :field end: - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - :type end: long - :field referenceBases: - The reference bases for this variant. They start at the given start position. - :type referenceBases: string - :field alternateBases: - The bases that appear instead of the reference bases. Multiple alternate - alleles are possible. - :type alternateBases: array - :field info: - A map of additional variant information. - :type info: map> - :field calls: - The variant calls for this particular variant. Each one represents the - determination of genotype with respect to this variant. `Call`s in this array - are implicitly associated with this `Variant`. 
- :type calls: array - - A `Variant` represents a change in DNA sequence relative to some reference. - For example, a variant could represent a SNP or an insertion. - Variants belong to a `VariantSet`. - This is equivalent to a row in VCF. - diff --git a/tools/sphinx/avpr2rest.py b/tools/sphinx/avpr2rest.py index 143fd018..0f0df1c4 100644 --- a/tools/sphinx/avpr2rest.py +++ b/tools/sphinx/avpr2rest.py @@ -56,16 +56,13 @@ def cleanup_doc(doc,indent=0): # process formal parameters ('request') request = message_def['request'] # collect the names - param_names = [] - for param in request: - param_names.append(param['name']) + response = message_def['response'] errors = message_def['errors'] output += " .. function:: %s(%s)\n\n" % (message_name, - ', '.join(param_names)) - for param in request: - output += " :param %s: %s: %s\n" % (param['name'], param['type'], - param['doc']) + ', '.join([request['name']])) + output += " :param %s: %s: %s\n" % (request['name'], request['type'], + request['doc']) output += " :return type: %s\n" % response output += " :throws: %s\n\n" % ', '.join(errors) output += cleanup_doc(doc) @@ -74,7 +71,7 @@ def cleanup_doc(doc,indent=0): for item in data['types']: output += '.. avro:%s:: %s\n\n' % (item['type'], item['name']) - if item['type'] == 'record': + if item['type'] == 'message': for field in item['fields']: output += ' :field %s:\n' % field['name'] if 'doc' in field: @@ -142,16 +139,14 @@ def typename(typeobject): # process formal parameters ('request') request = message_def['request'] # collect the names - param_names = [] - for param in request: - param_names.append(param['name']) + param_names = [request['name']] response = message_def['response'] errors = message_def['errors'] output += " .. function:: %s(%s)\n\n" % (message_name, ', '.join(param_names)) for param in request: - output += " :param %s: %s: %s\n" % (param['name'], param['type'], - param['doc']) + output += " :param %s: %s: %s\n" % (request['name'], request['type'], + request['doc']) output += " :return type: %s\n" % response output += " :throws: %s\n\n" % ', '.join(errors) output += cleanup_doc(doc) @@ -160,7 +155,7 @@ def typename(typeobject): for item in data['types']: output += '.. 
avro:%s:: %s\n\n' % (item['type'], item['name']) - if item['type'] == 'record': + if item['type'] == 'message': for field in item['fields']: output += ' :field %s:\n' % field['name'] if 'doc' in field: diff --git a/tools/sphinx/avrodomain.py b/tools/sphinx/avrodomain.py index f0453541..23a9c238 100644 --- a/tools/sphinx/avrodomain.py +++ b/tools/sphinx/avrodomain.py @@ -71,7 +71,7 @@ def get_index_text(self,name): return _('%s (Avro fixed-width value)') % name if self.objtype == 'enum': return _('%s (Avro enum)') % name - if self.objtype == 'record': + if self.objtype == 'message': return _('%s (Avro record)') % name if self.objtype == 'error': return _('%s (Avro error)') % name @@ -109,12 +109,12 @@ class AvroEnum(AvroObject): ] class AvroRecord(AvroObject): - prefix = 'record' + prefix = 'message' doc_field_types = [ TypedField('fields', label=l_('Fields'), names=('field','member'), typenames=('type',), - typerolename='record') + typerolename='message') ] class AvroError(AvroRecord): @@ -139,7 +139,7 @@ class AvroDomain(Domain): object_types = { 'fixed': ObjType(l_('fixed'), 'fixed'), 'enum': ObjType(l_('enum'), 'enum'), - 'record': ObjType(l_('record'), 'record'), + 'message': ObjType(l_('message'), 'message'), 'error': ObjType(l_('error'), 'error'), 'rpc': ObjType(l_('rpc'), 'rpc'), } @@ -147,7 +147,7 @@ class AvroDomain(Domain): directives = { 'fixed': AvroFixedField, 'enum': AvroEnum, - 'record': AvroRecord, + 'message': AvroRecord, 'error': AvroError, 'rpc': AvroRPCMessage } @@ -155,7 +155,7 @@ class AvroDomain(Domain): roles = { 'fixed': XRefRole(), 'enum': XRefRole(), - 'record': XRefRole(), + 'message': XRefRole(), 'error': XRefRole(), 'rpc': XRefRole() } diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 2974d1d9..f2943888 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -80,9 +80,6 @@ def _traverse(package, items, tree): for nested_item in _traverse(nested, nested_package): yield nested_item, nested_package - import pprint - open("dump", "w").write(pprint.pformat(proto_file.source_code_info)) - tree = collections.defaultdict(collections.defaultdict) for loc in proto_file.source_code_info.location: if loc.leading_comments or loc.trailing_comments: @@ -165,6 +162,7 @@ def generate_code(request, response): "request": { "name": "request", "type": m.input_type[1:], + "doc": '' }, "response": m.output_type[1:], "errors" : [ "GAException" ] From ed3f56591119e91478352a3217be15c36d8f52a4 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 14 Jun 2016 17:26:58 +0100 Subject: [PATCH 07/40] Request args for services should be a list --- tools/sphinx/protobuf-json-docs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index f2943888..e5bf7940 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -159,11 +159,11 @@ def generate_code(request, response): for m in item.method: messages[m.name] = { "doc": m.comment, - "request": { + "request": [{ "name": "request", "type": m.input_type[1:], "doc": '' - }, + }], "response": m.output_type[1:], "errors" : [ "GAException" ] } From bf975a72c369212988a3a96d2e18c9598c9abac1 Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Tue, 14 Jun 2016 17:39:07 +0100 Subject: [PATCH 08/40] Undo previous change to avpr2rest.py now that request args is a list --- tools/sphinx/avpr2rest.py | 19 ++++++++++++------- 1 file changed, 12 
insertions(+), 7 deletions(-) diff --git a/tools/sphinx/avpr2rest.py b/tools/sphinx/avpr2rest.py index 0f0df1c4..4cb86e0d 100644 --- a/tools/sphinx/avpr2rest.py +++ b/tools/sphinx/avpr2rest.py @@ -56,13 +56,16 @@ def cleanup_doc(doc,indent=0): # process formal parameters ('request') request = message_def['request'] # collect the names - + param_names = [] + for param in request: + param_names.append(param['name']) response = message_def['response'] errors = message_def['errors'] output += " .. function:: %s(%s)\n\n" % (message_name, - ', '.join([request['name']])) - output += " :param %s: %s: %s\n" % (request['name'], request['type'], - request['doc']) + ', '.join(param_names)) + for param in request: + output += " :param %s: %s: %s\n" % (param['name'], param['type'], + param['doc']) output += " :return type: %s\n" % response output += " :throws: %s\n\n" % ', '.join(errors) output += cleanup_doc(doc) @@ -139,14 +142,16 @@ def typename(typeobject): # process formal parameters ('request') request = message_def['request'] # collect the names - param_names = [request['name']] + param_names = [] + for param in request: + param_names.append(param['name']) response = message_def['response'] errors = message_def['errors'] output += " .. function:: %s(%s)\n\n" % (message_name, ', '.join(param_names)) for param in request: - output += " :param %s: %s: %s\n" % (request['name'], request['type'], - request['doc']) + output += " :param %s: %s: %s\n" % (param['name'], param['type'], + param['doc']) output += " :return type: %s\n" % response output += " :throws: %s\n\n" % ', '.join(errors) output += cleanup_doc(doc) From 7c7979487ada1be6a4b57fa52c215e7c5bb54c38 Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Tue, 14 Jun 2016 18:44:57 +0100 Subject: [PATCH 09/40] Show the bullet list for enum values docs --- tools/sphinx/protobuf-json-docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index e5bf7940..a6792505 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -153,7 +153,7 @@ def generate_code(request, response): 'type': 'enum', 'symbols': [v.name for v in item.value] }) - data["doc"] += " ".join(comments) + data["doc"] += "\n" + " ".join(comments) types.append(data) elif item.kind == ServiceDescriptorProto: for m in item.method: From 883982d1ae2aed74dc2ebc23b73ddaa487d3cbb9 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:06:13 +0100 Subject: [PATCH 10/40] Add Conda environment and RTD config --- environment.yml | 17 +++++++++++++++++ readthedocs.yml | 2 ++ 2 files changed, 19 insertions(+) create mode 100644 environment.yml create mode 100644 readthedocs.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..edf5ea0b --- /dev/null +++ b/environment.yml @@ -0,0 +1,17 @@ +name: ga4gh +dependencies: +- ioos::protobuf=3.0.0b2.post2=py27_3 +- openssl=1.0.2h=1 +- pip=8.1.2=py27_0 +- python=2.7.11=0 +- readline=6.2=2 +- setuptools=23.0.0=py27_0 +- six=1.10.0=py27_0 +- sqlite=3.13.0=0 +- tk=8.5.18=0 +- wheel=0.29.0=py27_0 +- zlib=1.2.8=3 +- pip: + - protobuf==3.0.0b2 +prefix: /home/palfrey/.miniconda2/envs/ga4gh + diff --git a/readthedocs.yml b/readthedocs.yml new file mode 100644 index 00000000..5d3b36c7 --- /dev/null +++ b/readthedocs.yml @@ -0,0 +1,2 @@ +conda: + file: environment.yml From cc632f8906234522caa99e8d0089a821bb90f12f Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:08:57 +0100 
Subject: [PATCH 11/40] Remove name from environment.yml --- environment.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/environment.yml b/environment.yml index edf5ea0b..850c674a 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,3 @@ -name: ga4gh dependencies: - ioos::protobuf=3.0.0b2.post2=py27_3 - openssl=1.0.2h=1 @@ -14,4 +13,3 @@ dependencies: - pip: - protobuf==3.0.0b2 prefix: /home/palfrey/.miniconda2/envs/ga4gh - From d71d6236d526f6a2346cdb474a18c10870859479 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:15:17 +0100 Subject: [PATCH 12/40] Correct channel data for Conda (split out "ioos") --- environment.yml | 3 +-- readthedocs.yml | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 850c674a..69cab5a9 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,5 @@ dependencies: -- ioos::protobuf=3.0.0b2.post2=py27_3 +- protobuf=3.0.0b2.post2=py27_3 - openssl=1.0.2h=1 - pip=8.1.2=py27_0 - python=2.7.11=0 @@ -12,4 +12,3 @@ dependencies: - zlib=1.2.8=3 - pip: - protobuf==3.0.0b2 -prefix: /home/palfrey/.miniconda2/envs/ga4gh diff --git a/readthedocs.yml b/readthedocs.yml index 5d3b36c7..b259c80c 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,2 +1,4 @@ conda: file: environment.yml + channels: + - ioos From 6a5bef3c381b0a0d198473d80bc063187c987e39 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:17:50 +0100 Subject: [PATCH 13/40] Fix spacing issues in readthedocs.yml --- readthedocs.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/readthedocs.yml b/readthedocs.yml index b259c80c..997c5553 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,4 +1,4 @@ conda: - file: environment.yml - channels: - - ioos + file: environment.yml + channels: + - ioos From bb70a42dc125aa998ca83eaf46eead519479f9e8 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:19:23 +0100 Subject: [PATCH 14/40] Conda channels need to be in conda environment, not RTD config --- environment.yml | 4 ++-- readthedocs.yml | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 69cab5a9..f6d88421 100644 --- a/environment.yml +++ b/environment.yml @@ -1,3 +1,5 @@ +channels: +- ioos dependencies: - protobuf=3.0.0b2.post2=py27_3 - openssl=1.0.2h=1 @@ -10,5 +12,3 @@ dependencies: - tk=8.5.18=0 - wheel=0.29.0=py27_0 - zlib=1.2.8=3 -- pip: - - protobuf==3.0.0b2 diff --git a/readthedocs.yml b/readthedocs.yml index 997c5553..aebb9e03 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,4 +1,2 @@ conda: file: environment.yml - channels: - - ioos From 199c1a3c85c5bc150bfc033b52a1fc2b3fe4a816 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:31:18 +0100 Subject: [PATCH 15/40] Test running protoc --- doc/source/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/conf.py b/doc/source/conf.py index a393c1e3..7d0a3b13 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -14,6 +14,7 @@ import sys import os +import subprocess # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -36,6 +37,8 @@ 'avrodomain', ] +subprocess.check_call("protoc") + # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] From bb8a7ee51432a87fd12e310c08ff25f9f4e1350b Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:49:56 +0100 Subject: [PATCH 16/40] Build protobuf files before running the rest of Sphinx --- doc/source/conf.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 7d0a3b13..72e76ef7 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -19,7 +19,8 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('../../tools/sphinx')) +sphinx_path = '../../tools/sphinx' +sys.path.insert(0, os.path.abspath(sphinx_path)) # -- General configuration ------------------------------------------------ @@ -37,7 +38,18 @@ 'avrodomain', ] -subprocess.check_call("protoc") +base_dir = "../../src/main/proto" +json_dir = os.path.join(base_dir, "json") +schema_dir = os.path.join(base_dir, "ga4gh") +for protofile in os.listdir(schema_dir): + fullpath = os.path.join(schema_dir, protofile) + json_file = protofile.replace(".proto", ".json") + cmd = "protoc --proto_path %s --plugin=protoc-gen-custom=%s --custom_out=%s %s" % (base_dir, os.path.join(sphinx_path, "protobuf-json-docs.py"), json_dir, fullpath) + print cmd + subprocess.check_call(cmd, shell=True) + cmd = "python %s %s/ga4gh/%s %s" %(os.path.join(sphinx_path, "avpr2rest.py"), json_dir, json_file, "schemas/%s.rst" % protofile) + print cmd + subprocess.check_call(cmd, shell=True) # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] From 1ecb071443c51d2774f6090c93a9e30c0be92e1a Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:52:57 +0100 Subject: [PATCH 17/40] Use requirements to pull in Protobuf python bits --- readthedocs.yml | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/readthedocs.yml b/readthedocs.yml index aebb9e03..1a9d1cf9 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,2 +1,3 @@ conda: file: environment.yml +requirements_file: requirements.txt diff --git a/requirements.txt b/requirements.txt index 9eff163c..2effd325 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ PyYAML -avro +protobuf==3.0.0b3 flake8 humanize nose From cf6aaeb166c32a55984f20a47a9653c8e8baba98 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:55:15 +0100 Subject: [PATCH 18/40] Add Protobuf to Conda deps --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index f6d88421..8bb8e716 100644 --- a/environment.yml +++ b/environment.yml @@ -12,3 +12,5 @@ dependencies: - tk=8.5.18=0 - wheel=0.29.0=py27_0 - zlib=1.2.8=3 +- pip: + - protobuf==3.0.0b3 From a0621a7441f86e902933e5b86ddd7e272339a087 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 11:59:46 +0100 Subject: [PATCH 19/40] Make json dir before using it --- doc/source/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/conf.py b/doc/source/conf.py index 72e76ef7..bb3114c5 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -40,6 +40,8 @@ base_dir = "../../src/main/proto" json_dir = os.path.join(base_dir, "json") +if not os.path.exists(json_dir): + os.mkdir(json_dir) schema_dir = os.path.join(base_dir, "ga4gh") for protofile in 
os.listdir(schema_dir): fullpath = os.path.join(schema_dir, protofile) From 54685e3772e500b6f9f263cab512c118cc047c10 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 12:05:47 +0100 Subject: [PATCH 20/40] Fix avpr2rest command line --- doc/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index bb3114c5..b5aad21a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -45,11 +45,11 @@ schema_dir = os.path.join(base_dir, "ga4gh") for protofile in os.listdir(schema_dir): fullpath = os.path.join(schema_dir, protofile) - json_file = protofile.replace(".proto", ".json") + json_file = protofile + ".json" cmd = "protoc --proto_path %s --plugin=protoc-gen-custom=%s --custom_out=%s %s" % (base_dir, os.path.join(sphinx_path, "protobuf-json-docs.py"), json_dir, fullpath) print cmd subprocess.check_call(cmd, shell=True) - cmd = "python %s %s/ga4gh/%s %s" %(os.path.join(sphinx_path, "avpr2rest.py"), json_dir, json_file, "schemas/%s.rst" % protofile) + cmd = "python %s %s/ga4gh/%s schemas/" %(os.path.join(sphinx_path, "avpr2rest.py"), json_dir, json_file) print cmd subprocess.check_call(cmd, shell=True) From bb5e16f2000247fd206f7a2cb7941e676cb4d0ec Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 12:20:17 +0100 Subject: [PATCH 21/40] Tidy up the JSON temp directory, and put environment.yml under docs --- doc/source/conf.py | 2 +- environment.yml => doc/source/environment.yml | 0 readthedocs.yml | 3 +-- 3 files changed, 2 insertions(+), 3 deletions(-) rename environment.yml => doc/source/environment.yml (100%) diff --git a/doc/source/conf.py b/doc/source/conf.py index b5aad21a..54ab0d79 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -39,7 +39,7 @@ ] base_dir = "../../src/main/proto" -json_dir = os.path.join(base_dir, "json") +json_dir = os.path.join("_build", "json-temp") if not os.path.exists(json_dir): os.mkdir(json_dir) schema_dir = os.path.join(base_dir, "ga4gh") diff --git a/environment.yml b/doc/source/environment.yml similarity index 100% rename from environment.yml rename to doc/source/environment.yml diff --git a/readthedocs.yml b/readthedocs.yml index 1a9d1cf9..80e30687 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,3 +1,2 @@ conda: - file: environment.yml -requirements_file: requirements.txt + file: doc/source/environment.yml From f6525553d689b441fd2a942d802ae9a955c1c2d0 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 14:11:38 +0100 Subject: [PATCH 22/40] Use makedirs not mkdir to make the whole temporary json path --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 54ab0d79..29c341e1 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -41,7 +41,7 @@ base_dir = "../../src/main/proto" json_dir = os.path.join("_build", "json-temp") if not os.path.exists(json_dir): - os.mkdir(json_dir) + os.makedirs(json_dir) schema_dir = os.path.join(base_dir, "ga4gh") for protofile in os.listdir(schema_dir): fullpath = os.path.join(schema_dir, protofile) From 60f33eb01cabe7efdd2888c968987b70d2107aee Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Wed, 15 Jun 2016 15:20:00 +0100 Subject: [PATCH 23/40] Handle multiline comments --- tools/sphinx/protobuf-json-docs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index a6792505..b35f17fb 100755 --- 
a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -35,7 +35,8 @@ def __init__(self, prot): def traverse(proto_file): def _collapse_comments(comments): - return (comments["leading_comments"] + comments["trailing_comments"]).strip() + return '\n'.join( + [c.strip() for c in (comments["leading_comments"] + comments["trailing_comments"]).split('\n')]) def _traverse(package, items, tree): for item_index, item in enumerate(items): @@ -170,8 +171,7 @@ def generate_code(request, response): else: raise Exception, item.kind - - comments = "".join(results["file"]).strip() + comments = "\n".join(results["file"]) output = { "types": types, "messages": messages, From 72c9a7c5313e03f649653838cad2a82083682382 Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Wed, 15 Jun 2016 15:43:33 +0100 Subject: [PATCH 24/40] Remove no longer used doc field --- .gitignore | 1 + tools/sphinx/avpr2rest.py | 6 ++---- tools/sphinx/protobuf-json-docs.py | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 979ea845..cc03585d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ target *~ #* doc/source/schemas/*.proto.rst +doc/source/schemas/build.rst build #********** windows template********** diff --git a/tools/sphinx/avpr2rest.py b/tools/sphinx/avpr2rest.py index 4cb86e0d..23bb7287 100644 --- a/tools/sphinx/avpr2rest.py +++ b/tools/sphinx/avpr2rest.py @@ -64,8 +64,7 @@ def cleanup_doc(doc,indent=0): output += " .. function:: %s(%s)\n\n" % (message_name, ', '.join(param_names)) for param in request: - output += " :param %s: %s: %s\n" % (param['name'], param['type'], - param['doc']) + output += " :param %s: %s\n" % (param['name'], param['type']) output += " :return type: %s\n" % response output += " :throws: %s\n\n" % ', '.join(errors) output += cleanup_doc(doc) @@ -150,8 +149,7 @@ def typename(typeobject): output += " .. 
function:: %s(%s)\n\n" % (message_name, ', '.join(param_names)) for param in request: - output += " :param %s: %s: %s\n" % (param['name'], param['type'], - param['doc']) + output += " :param %s: %s\n" % (param['name'], param['type']) output += " :return type: %s\n" % response output += " :throws: %s\n\n" % ', '.join(errors) output += cleanup_doc(doc) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index b35f17fb..6251914d 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -163,7 +163,6 @@ def generate_code(request, response): "request": [{ "name": "request", "type": m.input_type[1:], - "doc": '' }], "response": m.output_type[1:], "errors" : [ "GAException" ] From b25826742d977f475c9aef7d76e86ea7c04d1172 Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Wed, 15 Jun 2016 16:02:45 +0100 Subject: [PATCH 25/40] Fix rst errors --- doc/source/api/reads.rst | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/doc/source/api/reads.rst b/doc/source/api/reads.rst index ccb52fc7..fe05f4f9 100644 --- a/doc/source/api/reads.rst +++ b/doc/source/api/reads.rst @@ -20,35 +20,35 @@ The model has the following data types: ============================== ============================================ ================== Record | Description SAM/BAM rough equivalent ============================== ============================================ ================== -:avro:record:`ReadAlignment` | One alignment for one read A single line in a file -:avro:record:`ReadGroup` | A group of read alignments A single RG tag -:avro:record:`ReadGroupSet` | Collecton of ReadGroups that map to the Single SAM/BAM file +:avro:message:`ReadAlignment` | One alignment for one read A single line in a file +:avro:message:`ReadGroup` | A group of read alignments A single RG tag +:avro:message:`ReadGroupSet` | Collecton of ReadGroups that map to the Single SAM/BAM file | same genome -:avro:record:`Program` | Software version and parameters that were PN, CL tags in SAM header +:avro:message:`Program` | Software version and parameters that were PN, CL tags in SAM header | used to align reads to the genome -:avro:record:`ReadStats` | Counts of aligned and unaligned reads Samtools flagstats on a file +:avro:message:`ReadStats` | Counts of aligned and unaligned reads Samtools flagstats on a file | for a ReadGroup or ReadGroupSet ============================== ============================================ ================== The relationships are mostly one to many (e.g. each -:avro:record:`ReadAlignment` is part of exactly one -:avro:record:`ReadGroup`), with the exception that a -:avro:record:`ReadGroup` is allowed to be part of more than one -:avro:record:`ReadGroupSet`. +:avro:message:`ReadAlignment` is part of exactly one +:avro:message:`ReadGroup`), with the exception that a +:avro:message:`ReadGroup` is allowed to be part of more than one +:avro:message:`ReadGroupSet`. -:avro:record:`Dataset` --< :avro:record:`ReadGroupSet` >--< :avro:record:`ReadGroup` --< :avro:record:`ReadAlignment` +:avro:message:`Dataset` --< :avro:message:`ReadGroupSet` >--< :avro:message:`ReadGroup` --< :avro:message:`ReadAlignment` -* A :avro:record:`Dataset` is a general-purpose container, defined in +* A :avro:message:`Dataset` is a general-purpose container, defined in metadata.avdl. 
-* A :avro:record:`ReadGroupSet` is a logical collection of ReadGroups, +* A :avro:message:`ReadGroupSet` is a logical collection of ReadGroups, as determined by the data owner. Typically one - :avro:record:`ReadGroupSet` represents all the Reads from one + :avro:message:`ReadGroupSet` represents all the Reads from one experimental sample, which traditionally would be stored in a single BAM file. -* A :avro:record:`ReadGroup` is all the data that's processed the same +* A :avro:message:`ReadGroup` is all the data that's processed the same way by the sequencer. There are typically 1-10 ReadGroups in a - :avro:record:`ReadGroupSet`. -* A :avro:record:`ReadAlignment` object is a flattened representation + :avro:message:`ReadGroupSet`. +* A :avro:message:`ReadAlignment` object is a flattened representation of several layers of bioinformatics hierarchy, including fragments, reads, and alignments, stored in one object for easy access. @@ -56,9 +56,9 @@ The relationships are mostly one to many (e.g. each ReadAlignment: detailed discussion @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -One :avro:record:`ReadAlignment` object represents the following +One :avro:message:`ReadAlignment` object represents the following logical hierarchy. See the field definitions in the -:avro:record:`ReadAlignment` object for more details. +:avro:message:`ReadAlignment` object for more details. .. image:: /_static/read_alignment_diagrams.png From d48bec991e19298d53d73068a44027293fa84bf9 Mon Sep 17 00:00:00 2001 From: Irene Papakonstantinou Date: Wed, 15 Jun 2016 16:30:46 +0100 Subject: [PATCH 26/40] Add doc/source/_build to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cc03585d..b04b71f2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ target #* doc/source/schemas/*.proto.rst doc/source/schemas/build.rst +doc/source/_build/ build #********** windows template********** From 3e523808c54f8c3e4dbde503d89088f45a491374 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 16:55:56 +0100 Subject: [PATCH 27/40] Link enum/message references --- tools/sphinx/protobuf-json-docs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 6251914d..f6dc9fac 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -21,6 +21,8 @@ def __init__(self, prot): elif isinstance(prot, EnumValueDescriptorProto): self.number = prot.number elif isinstance(prot, FieldDescriptorProto): + if prot.type in [11, 14]: + self.ref_type = prot.type_name.replace(".ga4gh.", "") self.type = prot.type elif isinstance(prot, ServiceDescriptorProto): self.method = [convert_protodef_to_editable(x) for x in prot.method] @@ -134,12 +136,10 @@ def generate_code(request, response): kind = "boolean" elif f.type in [9]: kind = "string" - elif f.type in [11]: - kind = "message" + elif f.type in [11, 14]: + kind = ":avro:message:`%s`" % f.ref_type elif f.type in [12]: kind = "bytes" - elif f.type in [14]: - kind = "enum" else: raise Exception, f.type data["fields"].append({ From 051ffb5523b03c4b12b5eae2fd1c419fdddb9c33 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 16:58:46 +0100 Subject: [PATCH 28/40] Also link service request/response types --- tools/sphinx/protobuf-json-docs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index f6dc9fac..27a6b317 100755 
--- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -162,10 +162,10 @@ def generate_code(request, response): "doc": m.comment, "request": [{ "name": "request", - "type": m.input_type[1:], + "type": ":avro:message:`%s`" % m.input_type.replace(".ga4gh.", ""), }], - "response": m.output_type[1:], - "errors" : [ "GAException" ] + "response": ":avro:message:`%s`" % m.output_type.replace(".ga4gh.", ""), + "errors" : [ ":avro:message:`GAException`" ] } else: raise Exception, item.kind From 03dc144b24ec2689f038aced2eecf77ee2c581c7 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 17:23:47 +0100 Subject: [PATCH 29/40] Correct nested item support --- tools/sphinx/protobuf-json-docs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 27a6b317..7a7a9fdb 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -18,6 +18,8 @@ def __init__(self, prot): self.value = [convert_protodef_to_editable(x) for x in prot.value] elif isinstance(prot, DescriptorProto): self.field = [convert_protodef_to_editable(x) for x in prot.field] + self.enum_type = [convert_protodef_to_editable(x) for x in prot.enum_type] + self.nested_type = prot.nested_type elif isinstance(prot, EnumValueDescriptorProto): self.number = prot.number elif isinstance(prot, FieldDescriptorProto): @@ -32,6 +34,7 @@ def __init__(self, prot): else: raise Exception, type(prot) + return Editable(proto) def traverse(proto_file): @@ -73,15 +76,15 @@ def _traverse(package, items, tree): yield item, package - if isinstance(item, DescriptorProto): + if item.kind is DescriptorProto: for enum in item.enum_type: yield enum, package for nested in item.nested_type: - nested_package = package + item.name + nested_package = package + "." + item.name - for nested_item in _traverse(nested, nested_package): - yield nested_item, nested_package + for nested_item, np in _traverse(nested_package, [nested], tree[item_index]): + yield nested_item, np tree = collections.defaultdict(collections.defaultdict) for loc in proto_file.source_code_info.location: @@ -114,7 +117,7 @@ def generate_code(request, response): results = traverse(proto_file) for item, package in results["types"]: data = { - 'name': item.name, + 'name': (package + "." 
+ item.name).replace("ga4gh.", ""), 'doc': item.comment } From dbfadb3920a4e2e3edec42288beeafd271a83fb6 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 17:45:27 +0100 Subject: [PATCH 30/40] Add "list of" to repeated fields --- tools/sphinx/protobuf-json-docs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 7a7a9fdb..402055a5 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -26,6 +26,7 @@ def __init__(self, prot): if prot.type in [11, 14]: self.ref_type = prot.type_name.replace(".ga4gh.", "") self.type = prot.type + self.label = prot.label elif isinstance(prot, ServiceDescriptorProto): self.method = [convert_protodef_to_editable(x) for x in prot.method] elif isinstance(prot, MethodDescriptorProto): @@ -141,6 +142,8 @@ def generate_code(request, response): kind = "string" elif f.type in [11, 14]: kind = ":avro:message:`%s`" % f.ref_type + if f.label == 3: # LABEL_REPEATED + kind = "list of " + kind elif f.type in [12]: kind = "bytes" else: From a56dd953c0fd6105fda9284ca3fe9ca4635d8038 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Wed, 15 Jun 2016 17:46:42 +0100 Subject: [PATCH 31/40] Rename avpr2rest -> protodoc2rst --- doc/source/conf.py | 2 +- doc/source/schemas/Makefile | 2 +- tools/sphinx/{avpr2rest.py => protodoc2rst.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename tools/sphinx/{avpr2rest.py => protodoc2rst.py} (100%) diff --git a/doc/source/conf.py b/doc/source/conf.py index 29c341e1..91b3ae4b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -49,7 +49,7 @@ cmd = "protoc --proto_path %s --plugin=protoc-gen-custom=%s --custom_out=%s %s" % (base_dir, os.path.join(sphinx_path, "protobuf-json-docs.py"), json_dir, fullpath) print cmd subprocess.check_call(cmd, shell=True) - cmd = "python %s %s/ga4gh/%s schemas/" %(os.path.join(sphinx_path, "avpr2rest.py"), json_dir, json_file) + cmd = "python %s %s/ga4gh/%s schemas/" %(os.path.join(sphinx_path, "protodoc2rst.py"), json_dir, json_file) print cmd subprocess.check_call(cmd, shell=True) diff --git a/doc/source/schemas/Makefile b/doc/source/schemas/Makefile index 1e06308a..292ecb69 100644 --- a/doc/source/schemas/Makefile +++ b/doc/source/schemas/Makefile @@ -17,7 +17,7 @@ JSON_DIR:=/tmp/ga4gh-${UID}/json PROTO_BASE_DIR:=../../../src/main/proto PROTO_DIR:=${PROTO_BASE_DIR}/ga4gh -AVPR2REST_PATH:=../../../tools/sphinx/avpr2rest.py +AVPR2REST_PATH:=../../../tools/sphinx/protodoc2rst.py PROTOC_PLUGIN_PATH:=../../../tools/sphinx/protobuf-json-docs.py PROTO_BASENAMES:=$(subst ${PROTO_DIR}/,,$(wildcard ${PROTO_DIR}/*.proto)) diff --git a/tools/sphinx/avpr2rest.py b/tools/sphinx/protodoc2rst.py similarity index 100% rename from tools/sphinx/avpr2rest.py rename to tools/sphinx/protodoc2rst.py From 5f9c4d820d9ba2ca8134b37d3b62055863216bf6 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Thu, 16 Jun 2016 10:20:05 +0100 Subject: [PATCH 32/40] Fix one-of support --- tools/sphinx/protobuf-json-docs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 402055a5..239481a8 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -20,6 +20,7 @@ def __init__(self, prot): self.field = [convert_protodef_to_editable(x) for x in prot.field] self.enum_type = [convert_protodef_to_editable(x) for x in prot.enum_type] self.nested_type = prot.nested_type + 
self.oneof_decl = prot.oneof_decl elif isinstance(prot, EnumValueDescriptorProto): self.number = prot.number elif isinstance(prot, FieldDescriptorProto): @@ -51,7 +52,6 @@ def _traverse(package, items, tree): comments = tree[item_index] if "leading_comments" in comments or "trailing_comments" in comments: item.comment = _collapse_comments(comments) - #raise Exception, item.__dict__ del comments["leading_comments"] del comments["trailing_comments"] if item.kind is EnumDescriptorProto: @@ -153,6 +153,13 @@ def generate_code(request, response): 'type': kind, 'doc': f.comment }) + if len(item.oneof_decl) > 0: + data["fields"] = [ + { + "name": item.oneof_decl[0].name, + "type": [" %s "% x["type"] for x in data["fields"]], + "doc": ", ".join([x["doc"] for x in data["fields"] if x["doc"] != ""]) + }] types.append(data) elif item.kind == EnumDescriptorProto: comments = ["\n* `%s`: %s"%(v.name, v.comment) for v in item.value] From 734717b8d79ec2596aea4975e041c1b98a842c50 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Thu, 16 Jun 2016 11:56:12 +0100 Subject: [PATCH 33/40] Add proper map type support --- tools/sphinx/protobuf-json-docs.py | 71 ++++++++++++++++++++---------- tools/sphinx/protodoc2rst.py | 4 +- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index 239481a8..b7e9134e 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -14,6 +14,7 @@ def __init__(self, prot): self.kind = type(prot) self.name = prot.name self.comment = "" + self.options = dict([(key.name, value) for (key, value) in prot.options.ListFields()]) if isinstance(prot, EnumDescriptorProto): self.value = [convert_protodef_to_editable(x) for x in prot.value] elif isinstance(prot, DescriptorProto): @@ -102,23 +103,66 @@ def _traverse(package, items, tree): raise Exception, sorted(tree.keys()) return {"types": - itertools.chain( + list(itertools.chain( _traverse(proto_file.package, proto_file.service, tree[6]), # 5 is enum_type in FileDescriptorProto _traverse(proto_file.package, proto_file.enum_type, tree[5]), # 5 is enum_type in FileDescriptorProto _traverse(proto_file.package, proto_file.message_type, tree[4]), # 4 is message_type in FileDescriptorProto - ), + )), "file": ["".join(x.leading_detached_comments) for x in proto_file.source_code_info.location if len(x.leading_detached_comments) > 0] } +def type_to_string(f, package, map_types): + if f.type in [1]: + return "double" + elif f.type in [2]: + return "float" + elif f.type in [3]: + return "long" + elif f.type in [5]: + return "integer" + elif f.type in [8]: + return "boolean" + elif f.type in [9]: + return "string" + elif f.type in [11, 14]: + ref_name = (package + "." 
+ f.ref_type) + if ref_name in map_types: + ref_fields = map_types[ref_name] + return { + "type": "map", + "key": " %s "% type_to_string(ref_fields["key"], package, map_types), + "value": " %s "% type_to_string(ref_fields["value"], package, map_types) + } + else: + kind = ":avro:message:`%s`" % f.ref_type + if f.label == 3: # LABEL_REPEATED + return "list of " + kind + else: + return kind + elif f.type in [12]: + return "bytes" + else: + raise Exception, f.type + def generate_code(request, response): for proto_file in request.proto_file: types = [] messages = {} results = traverse(proto_file) + map_types = {} + def full_name(package, item): + return "%s.%s" % (package, item.name) + for item, package in results["types"]: + if item.options.has_key("map_entry"): + map_types[full_name(package, item)] = dict([(x.name,x) for x in item.field]) for item, package in results["types"]: + name = full_name(package, item) + if name in map_types: + continue + pass data = { - 'name': (package + "." + item.name).replace("ga4gh.", ""), + 'name': name.replace("ga4gh.", ""), 'doc': item.comment } @@ -128,26 +172,7 @@ def generate_code(request, response): 'fields': [] }) for f in item.field: # types from FieldDescriptorProto - if f.type in [1]: - kind = "double" - elif f.type in [2]: - kind = "float" - elif f.type in [3]: - kind = "long" - elif f.type in [5]: - kind = "integer" - elif f.type in [8]: - kind = "boolean" - elif f.type in [9]: - kind = "string" - elif f.type in [11, 14]: - kind = ":avro:message:`%s`" % f.ref_type - if f.label == 3: # LABEL_REPEATED - kind = "list of " + kind - elif f.type in [12]: - kind = "bytes" - else: - raise Exception, f.type + kind = type_to_string(f, package, map_types) data["fields"].append({ 'name': f.name, 'type': kind, diff --git a/tools/sphinx/protodoc2rst.py b/tools/sphinx/protodoc2rst.py index 23bb7287..bb7f0c9b 100644 --- a/tools/sphinx/protodoc2rst.py +++ b/tools/sphinx/protodoc2rst.py @@ -21,7 +21,9 @@ def typename(typeobject): if typeobject['type'] == 'array': return 'array<%s>' % typename(typeobject['items']) elif typeobject['type'] == 'map': - return 'map<%s>' % typename(typeobject['values']) + return 'map<%s, %s>' % (typename(typeobject['key']), typename(typeobject['value'])) + else: + raise Exception, "Unsupported type object: %s" %(typeobject['type']) elif isinstance(typeobject, basestring): return typeobject From e64628276de7cf4d08e82f2e9e04ac46ce755aee Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Thu, 16 Jun 2016 11:56:31 +0100 Subject: [PATCH 34/40] Remove duplicate code in protodoc2rst --- tools/sphinx/protodoc2rst.py | 91 ++---------------------------------- 1 file changed, 3 insertions(+), 88 deletions(-) diff --git a/tools/sphinx/protodoc2rst.py b/tools/sphinx/protodoc2rst.py index bb7f0c9b..7cd0a919 100644 --- a/tools/sphinx/protodoc2rst.py +++ b/tools/sphinx/protodoc2rst.py @@ -5,96 +5,9 @@ import re import argparse -def get_file_locations(): - parser = argparse.ArgumentParser() - parser.add_argument('input', help='Input AVPR filename(s)', nargs='+') - parser.add_argument('output', help='Output directory') - args = parser.parse_args() - return (args.input, args.output) - -def typename(typeobject): - if isinstance(typeobject, list): - union_names = [typename(item) for item in typeobject] - return '|'.join(union_names) - - elif isinstance(typeobject, dict): - if typeobject['type'] == 'array': - return 'array<%s>' % typename(typeobject['items']) - elif typeobject['type'] == 'map': - return 'map<%s, %s>' % (typename(typeobject['key']), 
typename(typeobject['value'])) - else: - raise Exception, "Unsupported type object: %s" %(typeobject['type']) - - elif isinstance(typeobject, basestring): - return typeobject - - raise ValueError - def cleanup_doc(doc,indent=0): return '\n'.join([' '*indent + line for line in doc.split('\n')]) -if __name__ == '__main__': - - avpr_filenames, rest_directory = get_file_locations() - - for avpr_filename in avpr_filenames: - base_filename = os.path.basename(avpr_filename) - name = os.path.splitext(base_filename)[0] - - rest_filename = os.path.join(rest_directory, name+'.rst') - - with open(avpr_filename,'r') as f: - data = json.load(f) - - output = data['protocol'] + '\n' - output += '*' * len(data['protocol']) + '\n\n' - - if 'doc' in data: - output += cleanup_doc(data['doc']) + '\n\n' - - for message_name in data['messages']: - message_def = data['messages'][message_name] - doc = message_def['doc'] - # process formal parameters ('request') - request = message_def['request'] - # collect the names - param_names = [] - for param in request: - param_names.append(param['name']) - response = message_def['response'] - errors = message_def['errors'] - output += " .. function:: %s(%s)\n\n" % (message_name, - ', '.join(param_names)) - for param in request: - output += " :param %s: %s\n" % (param['name'], param['type']) - output += " :return type: %s\n" % response - output += " :throws: %s\n\n" % ', '.join(errors) - output += cleanup_doc(doc) - output += "\n\n" - - for item in data['types']: - output += '.. avro:%s:: %s\n\n' % (item['type'], item['name']) - - if item['type'] == 'message': - for field in item['fields']: - output += ' :field %s:\n' % field['name'] - if 'doc' in field: - output += cleanup_doc(field['doc'],indent=4) + '\n' - output += ' :type %s: %s\n' % (field['name'], typename(field['type'])) - output += '\n' - - if item['type'] == 'enum': - output += ' :symbols: %s\n' % '|'.join(item['symbols']) - - if item['type'] == 'fixed': - output += ' :size: %s\n' % item['size'] - - if 'doc' in item: - output += cleanup_doc(item['doc'],indent=2) + '\n\n' - - with open(rest_filename,'w') as f: - f.write(output) - def get_file_locations(): parser = argparse.ArgumentParser() parser.add_argument('input', help='Input AVPR filename(s)', nargs='+') @@ -111,7 +24,9 @@ def typename(typeobject): if typeobject['type'] == 'array': return 'array<%s>' % typename(typeobject['items']) elif typeobject['type'] == 'map': - return 'map<%s>' % typename(typeobject['values']) + return 'map<%s, %s>' % (typename(typeobject['key']), typename(typeobject['value'])) + else: + raise Exception, "Unsupported type object: %s" %(typeobject['type']) elif isinstance(typeobject, basestring): return typeobject From df7b4a7a3b67fdc4f229ac37e598f48dd5667f6b Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Thu, 16 Jun 2016 14:04:35 +0100 Subject: [PATCH 35/40] Fix refs to Protobuf internal items --- doc/source/conf.py | 24 ++++++++++++++---------- doc/source/schemas/index.rst | 1 + tools/sphinx/protobuf-json-docs.py | 29 ++++++++++++++++------------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 91b3ae4b..a3edc911 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -42,16 +42,20 @@ json_dir = os.path.join("_build", "json-temp") if not os.path.exists(json_dir): os.makedirs(json_dir) -schema_dir = os.path.join(base_dir, "ga4gh") -for protofile in os.listdir(schema_dir): - fullpath = os.path.join(schema_dir, protofile) - json_file = protofile + ".json" - cmd = "protoc 
--proto_path %s --plugin=protoc-gen-custom=%s --custom_out=%s %s" % (base_dir, os.path.join(sphinx_path, "protobuf-json-docs.py"), json_dir, fullpath) - print cmd - subprocess.check_call(cmd, shell=True) - cmd = "python %s %s/ga4gh/%s schemas/" %(os.path.join(sphinx_path, "protodoc2rst.py"), json_dir, json_file) - print cmd - subprocess.check_call(cmd, shell=True) +schema_dir = base_dir +for root, dirs, files in os.walk(schema_dir): + for f in files: + fullpath = os.path.join(root, f) + json_file = f + ".json" + cmd = "protoc --proto_path %s --plugin=protoc-gen-custom=%s --custom_out=%s %s" % (base_dir, os.path.join(sphinx_path, "protobuf-json-docs.py"), json_dir, fullpath) + print cmd + subprocess.check_call(cmd, shell=True) + +for root, dirs, files in os.walk(json_dir): + for f in files: + cmd = "python %s %s/%s schemas/" %(os.path.join(sphinx_path, "protodoc2rst.py"), root, f) + print cmd + subprocess.check_call(cmd, shell=True) # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/doc/source/schemas/index.rst b/doc/source/schemas/index.rst index f1640056..d9d47360 100644 --- a/doc/source/schemas/index.rst +++ b/doc/source/schemas/index.rst @@ -17,3 +17,4 @@ Schemas allele_annotation_service.proto.rst sequence_annotations.proto.rst sequence_annotation_service.proto.rst + struct.proto.rst diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index b7e9134e..c0ed7386 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -8,6 +8,9 @@ import json from google.protobuf.descriptor_pb2 import DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto, ServiceDescriptorProto, MethodDescriptorProto +def simplify_name(name): + return name.split(".")[-1] + def convert_protodef_to_editable(proto): class Editable(object): def __init__(self, prot): @@ -26,7 +29,7 @@ def __init__(self, prot): self.number = prot.number elif isinstance(prot, FieldDescriptorProto): if prot.type in [11, 14]: - self.ref_type = prot.type_name.replace(".ga4gh.", "") + self.ref_type = prot.type_name[1:] self.type = prot.type self.label = prot.label elif isinstance(prot, ServiceDescriptorProto): @@ -37,7 +40,6 @@ def __init__(self, prot): else: raise Exception, type(prot) - return Editable(proto) def traverse(proto_file): @@ -111,7 +113,7 @@ def _traverse(package, items, tree): "file": ["".join(x.leading_detached_comments) for x in proto_file.source_code_info.location if len(x.leading_detached_comments) > 0] } -def type_to_string(f, package, map_types): +def type_to_string(f, map_types): if f.type in [1]: return "double" elif f.type in [2]: @@ -125,16 +127,18 @@ def type_to_string(f, package, map_types): elif f.type in [9]: return "string" elif f.type in [11, 14]: - ref_name = (package + "." 
+ f.ref_type) + ref_name = f.ref_type if ref_name in map_types: ref_fields = map_types[ref_name] return { "type": "map", - "key": " %s "% type_to_string(ref_fields["key"], package, map_types), - "value": " %s "% type_to_string(ref_fields["value"], package, map_types) - } + "key": " %s "% type_to_string(ref_fields["key"], map_types), + "value": " %s "% type_to_string(ref_fields["value"], map_types) + } + elif ref_name.find("InfoEntry") != -1: + raise Exception, (f.__dict__, ref_name) else: - kind = ":avro:message:`%s`" % f.ref_type + kind = ":avro:message:`%s`" % simplify_name(f.ref_type) if f.label == 3: # LABEL_REPEATED return "list of " + kind else: @@ -160,9 +164,8 @@ def full_name(package, item): name = full_name(package, item) if name in map_types: continue - pass data = { - 'name': name.replace("ga4gh.", ""), + 'name': simplify_name(name), 'doc': item.comment } @@ -172,7 +175,7 @@ def full_name(package, item): 'fields': [] }) for f in item.field: # types from FieldDescriptorProto - kind = type_to_string(f, package, map_types) + kind = type_to_string(f, map_types) data["fields"].append({ 'name': f.name, 'type': kind, @@ -200,9 +203,9 @@ def full_name(package, item): "doc": m.comment, "request": [{ "name": "request", - "type": ":avro:message:`%s`" % m.input_type.replace(".ga4gh.", ""), + "type": ":avro:message:`%s`" % simplify_name(m.input_type), }], - "response": ":avro:message:`%s`" % m.output_type.replace(".ga4gh.", ""), + "response": ":avro:message:`%s`" % simplify_name(m.output_type), "errors" : [ ":avro:message:`GAException`" ] } else: From f2f33afe66946b3a7611f33995dc7a11da7c85f7 Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Thu, 16 Jun 2016 14:42:56 +0100 Subject: [PATCH 36/40] Remove remaining internal refs to Avro --- doc/source/api/metadata.rst | 2 +- doc/source/api/reads.rst | 51 +++++----- doc/source/api/variants.rst | 14 +-- doc/source/conf.py | 2 +- .../proto/ga4gh/sequence_annotations.proto | 2 +- tools/sphinx/protobuf-json-docs.py | 8 +- .../{avrodomain.py => protobufdomain.py} | 92 +++++++++---------- tools/sphinx/protodoc2rst.py | 2 +- 8 files changed, 86 insertions(+), 87 deletions(-) rename tools/sphinx/{avrodomain.py => protobufdomain.py} (73%) diff --git a/doc/source/api/metadata.rst b/doc/source/api/metadata.rst index 4aa50d94..6075fe25 100644 --- a/doc/source/api/metadata.rst +++ b/doc/source/api/metadata.rst @@ -24,7 +24,7 @@ data-provider-specified collection of related data of multiple types. Logically, it's akin to a folder, where it's up to the provider what goes into the folder. Individual data objects are linked by `datasetId` fields to `Dataset objects -<../schemas/metadata.html#avro.Dataset>`_. +<../schemas/metadata.proto.html#protobuf.Dataset>`_. Since the grouping of content in a dataset is determined by the data provider, users should not make semantic assumptions about that data. diff --git a/doc/source/api/reads.rst b/doc/source/api/reads.rst index fe05f4f9..c337465a 100644 --- a/doc/source/api/reads.rst +++ b/doc/source/api/reads.rst @@ -17,38 +17,38 @@ specific genomic regions instead. 
The model has the following data types: -============================== ============================================ ================== -Record | Description SAM/BAM rough equivalent -============================== ============================================ ================== -:avro:message:`ReadAlignment` | One alignment for one read A single line in a file -:avro:message:`ReadGroup` | A group of read alignments A single RG tag -:avro:message:`ReadGroupSet` | Collecton of ReadGroups that map to the Single SAM/BAM file - | same genome -:avro:message:`Program` | Software version and parameters that were PN, CL tags in SAM header - | used to align reads to the genome -:avro:message:`ReadStats` | Counts of aligned and unaligned reads Samtools flagstats on a file - | for a ReadGroup or ReadGroupSet -============================== ============================================ ================== +==================================== =========================================== ======================== +Record Description SAM/BAM rough equivalent +==================================== =========================================== ======================== +:protobuf:message:`ReadAlignment` One alignment for one read A single line in a file +:protobuf:message:`ReadGroup` A group of read alignments A single RG tag +:protobuf:message:`ReadGroupSet` Collecton of ReadGroups that map to the Single SAM/BAM file + same genome +:protobuf:message:`Program` Software version and parameters that were PN, CL tags in SAM header + used to align reads to the genome +:protobuf:message:`ReadStats` Counts of aligned and unaligned reads Samtools flagstats on a file + for a ReadGroup or ReadGroupSet +==================================== =========================================== ======================== The relationships are mostly one to many (e.g. each -:avro:message:`ReadAlignment` is part of exactly one -:avro:message:`ReadGroup`), with the exception that a -:avro:message:`ReadGroup` is allowed to be part of more than one -:avro:message:`ReadGroupSet`. +:protobuf:message:`ReadAlignment` is part of exactly one +:protobuf:message:`ReadGroup`), with the exception that a +:protobuf:message:`ReadGroup` is allowed to be part of more than one +:protobuf:message:`ReadGroupSet`. -:avro:message:`Dataset` --< :avro:message:`ReadGroupSet` >--< :avro:message:`ReadGroup` --< :avro:message:`ReadAlignment` +:protobuf:message:`Dataset` --< :protobuf:message:`ReadGroupSet` >--< :protobuf:message:`ReadGroup` --< :protobuf:message:`ReadAlignment` -* A :avro:message:`Dataset` is a general-purpose container, defined in +* A :protobuf:message:`Dataset` is a general-purpose container, defined in metadata.avdl. -* A :avro:message:`ReadGroupSet` is a logical collection of ReadGroups, +* A :protobuf:message:`ReadGroupSet` is a logical collection of ReadGroups, as determined by the data owner. Typically one - :avro:message:`ReadGroupSet` represents all the Reads from one + :protobuf:message:`ReadGroupSet` represents all the Reads from one experimental sample, which traditionally would be stored in a single BAM file. -* A :avro:message:`ReadGroup` is all the data that's processed the same +* A :protobuf:message:`ReadGroup` is all the data that's processed the same way by the sequencer. There are typically 1-10 ReadGroups in a - :avro:message:`ReadGroupSet`. -* A :avro:message:`ReadAlignment` object is a flattened representation + :protobuf:message:`ReadGroupSet`. 
+* A :protobuf:message:`ReadAlignment` object is a flattened representation of several layers of bioinformatics hierarchy, including fragments, reads, and alignments, stored in one object for easy access. @@ -56,9 +56,9 @@ The relationships are mostly one to many (e.g. each ReadAlignment: detailed discussion @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -One :avro:message:`ReadAlignment` object represents the following +One :protobuf:message:`ReadAlignment` object represents the following logical hierarchy. See the field definitions in the -:avro:message:`ReadAlignment` object for more details. +:protobuf:message:`ReadAlignment` object for more details. .. image:: /_static/read_alignment_diagrams.png @@ -88,4 +88,3 @@ identified by that ID. Records are represented by blue rectangles; dotted lines indicate records defined in other schemas. .. image:: /_static/reads_schema.png - diff --git a/doc/source/api/variants.rst b/doc/source/api/variants.rst index abb25149..cfa4eb46 100644 --- a/doc/source/api/variants.rst +++ b/doc/source/api/variants.rst @@ -24,20 +24,20 @@ constitute the genotype matrix. The lowest-level entity is a Call: - * a :avro:record:`Call` encodes the genotype of an individual with + * a :protobuf:message:`Call` encodes the genotype of an individual with respect to a variant, as determined by some analysis of experimental data. The other entities can be thought of as collections of Calls that have something in common: - * a :avro:record:`VariantSet` supports working with a collection + * a :protobuf:message:`VariantSet` supports working with a collection of Calls intended to be analyzed together. - * a :avro:record:`Variant` supports working with the subset of + * a :protobuf:message:`Variant` supports working with the subset of Calls in a VariantSet that are at the same site and are described using the same set of alleles. The Variant entity contains: - + * a variant description: a potential difference between experimental DNA and a reference sequence, including the site (position of the difference) and alleles (how the bases @@ -46,7 +46,7 @@ something in common: evidence for actual instances of that difference, as seen in analyses of experimental data - * a :avro:record:`CallSet` supports working with the subset of + * a :protobuf:message:`CallSet` supports working with the subset of Calls in a VariantSet that were generated by the same analysis of the same sample. The CallSet includes information about which sample was analyzed and how it was analyzed, and is linked to @@ -54,9 +54,9 @@ something in common: The following diagram shows the relationship of these four entities to each other and to other GA4GH API entities. It shows which entities -contain other entities (such as :avro:record:`VariantSetMetadata`), +contain other entities (such as :protobuf:message:`VariantSetMetadata`), and which contain IDs that can be used to get information from other -entities (such as :avro:record:`Variant`'s ``variantSetId``). The +entities (such as :protobuf:message:`Variant`'s ``variantSetId``). The arrow points *from* the entity that contains the ID *to* the entity that can be identified by that ID. 
diff --git a/doc/source/conf.py b/doc/source/conf.py index a3edc911..146cf1e5 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -35,7 +35,7 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', - 'avrodomain', + 'protobufdomain', ] base_dir = "../../src/main/proto" diff --git a/src/main/proto/ga4gh/sequence_annotations.proto b/src/main/proto/ga4gh/sequence_annotations.proto index 0cbe44a3..e94bf601 100644 --- a/src/main/proto/ga4gh/sequence_annotations.proto +++ b/src/main/proto/ga4gh/sequence_annotations.proto @@ -16,7 +16,7 @@ The discrete hierarchical annotations are derived from the Sequence Ontology http://www.sequenceontology.org/gff3.shtml The goal is to be able to store annotations using the GFF3 and SO conceptual -model, although there is not necessarly a one-to-one mapping in Avro records +model, although there is not necessarly a one-to-one mapping in Protobuf messages to GFF3 records. The minimum requirement is to be able to accurately represent the current diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index c0ed7386..ad12969b 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -138,7 +138,7 @@ def type_to_string(f, map_types): elif ref_name.find("InfoEntry") != -1: raise Exception, (f.__dict__, ref_name) else: - kind = ":avro:message:`%s`" % simplify_name(f.ref_type) + kind = ":protobuf:message:`%s`" % simplify_name(f.ref_type) if f.label == 3: # LABEL_REPEATED return "list of " + kind else: @@ -203,10 +203,10 @@ def full_name(package, item): "doc": m.comment, "request": [{ "name": "request", - "type": ":avro:message:`%s`" % simplify_name(m.input_type), + "type": ":protobuf:message:`%s`" % simplify_name(m.input_type), }], - "response": ":avro:message:`%s`" % simplify_name(m.output_type), - "errors" : [ ":avro:message:`GAException`" ] + "response": ":protobuf:message:`%s`" % simplify_name(m.output_type), + "errors" : [ ":protobuf:message:`GAException`" ] } else: raise Exception, item.kind diff --git a/tools/sphinx/avrodomain.py b/tools/sphinx/protobufdomain.py similarity index 73% rename from tools/sphinx/avrodomain.py rename to tools/sphinx/protobufdomain.py index 23a9c238..fa21540b 100644 --- a/tools/sphinx/avrodomain.py +++ b/tools/sphinx/protobufdomain.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- """ - avrodomain + protobufdomain ~~~~~~~~~~ - Apache Avro domain. + Protobuf domain. """ __version__ = "0.1" -# for this module's sphinx doc +# for this module's sphinx doc release = __version__ version = release.rsplit('.', 1)[0] @@ -29,7 +29,7 @@ # By default, disable this warning. WARN_ABOUT_DUPLICATES = False -avro_sig_regex = re.compile( +protobuf_sig_regex = re.compile( r'''^ ([^(]*?) 
# type (\w+) # name @@ -37,23 +37,23 @@ $ ''', re.X) -class AvroObject(ObjectDescription): - """Description of a general Avro object.""" +class ProtobufObject(ObjectDescription): + """Description of a general Protobuf object.""" prefix = None - + def handle_signature(self,sig,signode): sig = sig.strip() - type_name, name, arglist = avro_sig_regex.match(sig).groups() - + type_name, name, arglist = protobuf_sig_regex.match(sig).groups() + if self.prefix: signode += addnodes.desc_annotation(self.prefix+' ', self.prefix+' ') - + if type_name: signode += addnodes.desc_type(type_name, type_name) - + if name: signode += addnodes.desc_name(name,name) - + if arglist: paramlist = addnodes.desc_parameterlist() for arg in arglist.split(','): @@ -63,52 +63,52 @@ def handle_signature(self,sig,signode): param += nodes.emphasis(' '+argname,' '+argname) paramlist += param signode += paramlist - + return name - + def get_index_text(self,name): if self.objtype == 'fixed': - return _('%s (Avro fixed-width value)') % name + return _('%s (Protobuf fixed-width value)') % name if self.objtype == 'enum': - return _('%s (Avro enum)') % name + return _('%s (Protobuf enum)') % name if self.objtype == 'message': - return _('%s (Avro record)') % name + return _('%s (Protobuf message)') % name if self.objtype == 'error': - return _('%s (Avro error)') % name + return _('%s (Protobuf error)') % name if self.objtype == 'rpc': - return _('%s (Avro RPC)') % name - + return _('%s (Protobuf RPC)') % name + def add_target_and_index(self, name, sig, signode): - targetname = 'avro.' + name + targetname = 'protobuf.' + name if targetname not in self.state.document.ids: signode['names'].append(targetname) signode['ids'].append(targetname) signode['first'] = (not self.names) self.state.document.note_explicit_target(signode) - objects = self.env.domaindata['avro']['objects'] + objects = self.env.domaindata['protobuf']['objects'] if name in objects and WARN_ABOUT_DUPLICATES: - self.state_machine.reporter.warning('duplicate Avro object description of %s.' % name, line=self.lineno) + self.state_machine.reporter.warning('duplicate Protobuf object description of %s.' 
% name, line=self.lineno) objects[name] = (self.env.docname, self.objtype) - + indextext = self.get_index_text(name) if indextext: self.indexnode['entries'].append(('single',indextext,targetname,'')) -class AvroFixedField(AvroObject): +class ProtobufFixedField(ProtobufObject): prefix = 'fixed' doc_field_types = [ Field('size', label=l_('Size'), names=('size',)) ] -class AvroEnum(AvroObject): +class ProtobufEnum(ProtobufObject): prefix = 'enum' doc_field_types = [ Field('symbols', label=l_('Symbols'), names=('symbols',)) ] -class AvroRecord(AvroObject): +class ProtobufMessage(ProtobufObject): prefix = 'message' doc_field_types = [ TypedField('fields', label=l_('Fields'), @@ -117,10 +117,10 @@ class AvroRecord(AvroObject): typerolename='message') ] -class AvroError(AvroRecord): +class ProtobufError(ProtobufMessage): prefix = 'error' -class AvroRPCMessage(AvroObject): +class ProtobufRPCMessage(ProtobufObject): doc_field_types = [ TypedField('arguments', label=l_('Arguments'), names=('argument','arg','param'), @@ -132,10 +132,10 @@ class AvroRPCMessage(AvroObject): names=('returns','return')) ] -class AvroDomain(Domain): - name = "avro" - label = "Apache Avro" - +class ProtobufDomain(Domain): + name = "protobuf" + label = "Apache Protobuf" + object_types = { 'fixed': ObjType(l_('fixed'), 'fixed'), 'enum': ObjType(l_('enum'), 'enum'), @@ -143,15 +143,15 @@ class AvroDomain(Domain): 'error': ObjType(l_('error'), 'error'), 'rpc': ObjType(l_('rpc'), 'rpc'), } - + directives = { - 'fixed': AvroFixedField, - 'enum': AvroEnum, - 'message': AvroRecord, - 'error': AvroError, - 'rpc': AvroRPCMessage + 'fixed': ProtobufFixedField, + 'enum': ProtobufEnum, + 'message': ProtobufMessage, + 'error': ProtobufError, + 'rpc': ProtobufRPCMessage } - + roles = { 'fixed': XRefRole(), 'enum': XRefRole(), @@ -159,20 +159,20 @@ class AvroDomain(Domain): 'error': XRefRole(), 'rpc': XRefRole() } - + initial_data = { 'objects': {} } - + def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode): if target not in self.data['objects']: return None obj = self.data['objects'][target] - return make_refnode(builder, fromdocname, obj[0], 'avro.' + target, contnode, target) - + return make_refnode(builder, fromdocname, obj[0], 'protobuf.' + target, contnode, target) + def get_objects(self): for refname, (docname, type) in list(self.data['objects'].items()): - yield (refname, refname, type, docname, 'avro.' + refname, 1) + yield (refname, refname, type, docname, 'protobuf.' + refname, 1) def setup(app): - app.add_domain(AvroDomain) + app.add_domain(ProtobufDomain) diff --git a/tools/sphinx/protodoc2rst.py b/tools/sphinx/protodoc2rst.py index 7cd0a919..956a0ffe 100644 --- a/tools/sphinx/protodoc2rst.py +++ b/tools/sphinx/protodoc2rst.py @@ -73,7 +73,7 @@ def typename(typeobject): output += "\n\n" for item in data['types']: - output += '.. avro:%s:: %s\n\n' % (item['type'], item['name']) + output += '.. 
protobuf:%s:: %s\n\n' % (item['type'], item['name']) if item['type'] == 'message': for field in item['fields']: From 43cdcb0537e092db13c72d8d019ce18bcd041afb Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Thu, 16 Jun 2016 15:25:01 +0100 Subject: [PATCH 37/40] Add various documentation to Protobuf convertors --- tools/sphinx/protobuf-json-docs.py | 34 +++++++++++++++++++++++++----- tools/sphinx/protobufdomain.py | 2 +- tools/sphinx/protodoc2rst.py | 10 ++++----- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/tools/sphinx/protobuf-json-docs.py b/tools/sphinx/protobuf-json-docs.py index ad12969b..55f06a97 100755 --- a/tools/sphinx/protobuf-json-docs.py +++ b/tools/sphinx/protobuf-json-docs.py @@ -1,4 +1,14 @@ #!/usr/bin/env python +""" +Plugin for generation of Sphinx-suitable JSON from Protobuf definitions +It's a plugin for protoc as per https://developers.google.com/protocol-buffers/docs/reference/other + +Usage: + protoc --plugin=protoc-gen-custom=