diff --git a/schema/vrs-source.yaml b/schema/vrs-source.yaml
index 1f4ed00d..4056fd10 100644
--- a/schema/vrs-source.yaml
+++ b/schema/vrs-source.yaml
@@ -150,6 +150,9 @@ $defs:
     required: [ "location", "state" ]
 
   Haplotype:
+    # For SV-VRS 'non-overlapping' is defined with respect to the derivative chromosome.
+    # This allows SNVs within duplications to be reported as they can overlap in reference
+    # coordinate space, but not on the sample molecule.
     maturity: Alpha
     ga4ghDigest:
       prefix: HT
@@ -168,13 +171,30 @@ $defs:
           MUST be "Haplotype"
       members:
         type: array
-        ordered: false
+        # The ordering of members within a Haplotype defines a traversal
+        # order for the derivative chromosome. This allows phasing of
+        # breakpoints as well as the Alleles between them.
+        # This is equivalent to the VCF phase set list (PSL) field
+        # with the VCF PSO field implicitly encoded in the member
+        # ordering within the Haplotype.
+        #
+        # Is this model appropriate? It doesn't support scenarios
+        # such as knowing two SNVs within a duplication are on the same
+        # molecule but not being able to place them on the first or
+        # second copy.
+        # (Note that the current VRS model does not support this
+        # scenario either, as the Alleles must be non-overlapping).
+        ordered: true
         minItems: 2
-        uniqueItems: true
+        # By allowing repeats within a Haplotype, we can now represent
+        # variants within SVs. Notably, we can define SNVs that
+        # occur multiple times within a duplication.
+        uniqueItems: false
         items:
           oneOf:
             - $ref: "#/$defs/Allele"
             - $refCurie: gks.core:IRI
+            - $ref: "#/$defs/Breakpoint"
         description: >-
           A list of :ref:`Alleles <Allele>` (or IRI references to `Alleles`) that comprise a Haplotype.
           Since each `Haplotype` member MUST be an `Allele`, and all members MUST share a common :ref:`SequenceReference`,
@@ -377,7 +397,7 @@ $defs:
         description: >-
           The start coordinate or range of the SequenceLocation.
           The minimum value of this coordinate or range is 0.
-          MUST represent a coordinate or range less than the value of `end`.
+          MUST represent a coordinate or range less than or equal to the value of `end`.
       end:
         oneOf:
           - type: integer
@@ -385,7 +405,7 @@ $defs:
         description: >-
           The end coordinate or range of the SequenceLocation.
           The minimum value of this coordinate or range is 0.
-          MUST represent a coordinate or range greater than the value of `start`.
+          MUST represent a coordinate or range greater than or equal to the value of `start`.
     required:
       - start
       - end
@@ -760,3 +780,137 @@ $defs:
       sequences). IUPAC ambiguity codes are permitted in Sequence Strings.
     type: string
     pattern: '^[A-Z*\-]*$'
+
+
+  # =============================================================================
+  # NEW PROPOSED SV Classes
+  # These classes are not yet organized within the vrs-source.yaml document and are under active discussion.
+  # NOTE(review): the $ref targets used by these classes are assumed to exist under $defs - TODO confirm.
+  # =============================================================================
+
+  Breakend:
+    description: >-
+      A break in a molecule with respect to a reference sequence indicating
+      the sequence deviates from the reference sequence after or before this
+      location.
+    type: object
+    properties:
+      type:
+        type: string
+        const: Breakend
+        default: Breakend
+        description: MUST be "Breakend"
+      location:
+        $ref: "#/$defs/Location"
+        description: The interval within which the break could occur.
+      orientation:
+        type: string
+        enum:
+          - DivergesAfter
+          - DivergesBefore
+        description: >-
+          MUST be one of "DivergesAfter" or "DivergesBefore" indicating whether the
+          sequence diverges from the reference after or before any position in the
+          interval.
+    required:
+      - type
+      - location
+      - orientation
+    additionalProperties: false
+  Breakpoint:
+    description: >-
+      A rearrangement resulting in sequences flanking the two breakends becoming
+      adjacent sequences on the same molecule.
+    type: object
+    properties:
+      type:
+        type: string
+        const: Breakpoint
+        default: Breakpoint
+        description: MUST be "Breakpoint"
+      # I've taken the VCF-style model.
+      # This is actually a lossy representation as many variant callers
+      # can constrain the actual location much more than anywhere in the
+      # [(start1, end1), (start2, end2)] range.
+      # For example, when the intervals are due to homology, then the
+      # interval widths must be the same and, for any given position
+      # in the first breakend interval, there is only one possible
+      # corresponding position in the second breakend interval.
+      # Just specifying the two intervals independently as is done in this
+      # model does not intrinsically encode this information.
+      #
+      # Similarly, even imprecise calls have possibilities that are less
+      # likely than others. For example if a deletion break1 was at start1, then
+      # break2 might be constrained to something like [start2, start2 + (end2 - start2) / 3]
+      # because anything else would imply a longer deletion length than is plausible.
+      #
+      # VCF has a CILEN field that can encode this sort of information.
+      # Do we need an equivalent for VRS?
+      breakends:
+        type: array
+        uniqueItems: false
+        ordered: false
+        items:
+          $ref: "#/$defs/Breakend"
+        description: Breakends involved in the sequence
+        minItems: 1
+        maxItems: 2
+      # Needed to support optical mapping gaps where the sequence between the
+      # breaks is not known but the approximate length is.
+      insertion:
+        $ref: "#/$defs/DefiniteRange"
+        description: Approximate length of unknown sequence between the breaks.
+      homology:
+        # Only valid for breakends=2
+        type: boolean
+        default: false
+        description: >-
+          A flag indicating whether the location interval of the breakend
+          is due to the sequences at the breakends being homologous or
+          whether the interval is due to uncertainty regarding the actual
+          locations of the breakends.
+      # Does anyone have a need to support anything other than a LiteralSequenceExpression?
+      # I'd prefer not to as allowing reference-based sequences in here is just an
+      # alternate representation of multiple breaks and we want to minimise the number
+      # of different ways a sequence can be represented.
+      sequence:
+        # TODO: clarify what this sequence is. We can define this as:
+        # - Traversal from the anchoring sequence (i.e. RevComp DivergesBefore sequences)
+        # - Sequence prepend/concatenation
+        # TODO: What happens when the sequence itself has DerivedSequenceExpression.reverse_complement=true?
+        $ref: "#/$defs/LiteralSequenceExpression"
+        description: >-
+          Sequence occurring after the break.
+      terminal:
+        # TODO: can the schema encode a constraint that a terminal breakend cannot
+        # be part of a breakpoint?
+        # Only valid for breakends=1
+        type: boolean
+        default: false
+        description: >-
+          Indicates the end of the molecule
+    required:
+      - type
+      - breakends
+    additionalProperties: false
+  Event:
+    description: >-
+      An event that results in a set of variants.
+    type: object
+    properties:
+      type:
+        type: string
+        const: Event
+        default: Event
+        description: MUST be "Event"
+      variants:
+        $ref: "#/$defs/VariationSet"
+      classification:
+        # TODO: what event ontology should we use?
+        type: string
+        description: Category of event
+    required:
+      - type
+      - variants
+      - classification
+    additionalProperties: true