diff --git a/schema/vrs.yaml b/schema/vrs.yaml
index e84f369e..736c44a4 100644
--- a/schema/vrs.yaml
+++ b/schema/vrs.yaml
@@ -73,6 +73,11 @@ definitions:
     - type
     additionalProperties: false
   Haplotype:
+    # For SV-VRS, 'non-overlapping' is defined with respect to the
+    # derivative chromosome.
+    # This allows SNVs within duplications to be reported, as they
+    # can overlap in reference coordinate space, but not on the
+    # sample molecule.
     description: A set of non-overlapping Allele members that co-occur on the same
       molecule.
     type: object
@@ -88,12 +93,29 @@ definitions:
       members:
         type: array
         minItems: 2
-        uniqueItems: true
-        ordered: false
+        # By allowing repeats within a Haplotype, we can now represent
+        # variants within an SV. Notably, we can define SNVs that
+        # occur multiple times within a duplication.
+        uniqueItems: false
+        # The ordering of members within a Haplotype defines a traversal
+        # order for the derivative chromosome. This allows phasing of
+        # breakpoints as well as the Alleles between them.
+        # This is equivalent to the VCF phase set list (PSL) field
+        # with the VCF PSO field implicitly encoded in the member
+        # ordering within the Haplotype.
+        #
+        # Is this model appropriate? It doesn't support scenarios
+        # such as knowing two SNVs within a duplication are on the same
+        # molecule but not being able to place them on the first or
+        # second copy.
+        # (Note that the current VRS model does not support this
+        # scenario either, as the Alleles must be non-overlapping.)
+        ordered: true
         items:
           oneOf:
           - $ref: '#/definitions/Allele'
           - $ref: '#/definitions/CURIE'
+          - $ref: '#/definitions/Breakpoint'
         description: List of Alleles, or references to Alleles, that comprise this
           Haplotype.
     required:
@@ -146,6 +168,7 @@ definitions:
           - $ref: '#/definitions/Haplotype'
           - $ref: '#/definitions/Text'
           - $ref: '#/definitions/VariationSet'
+          - $ref: '#/definitions/Breakpoint'
         description: List of Variation objects or identifiers. Attribute is required,
           but MAY be empty.
     required:
@@ -341,7 +364,7 @@ definitions:
         - $ref: '#/definitions/Number'
         description: The start coordinate or range of the interval. The minimum value
           of this coordinate or range is 0. MUST represent a coordinate or range less
-          than the value of `end`.
+          than or equal to the value of `end`.
       end:
         oneOf:
         - $ref: '#/definitions/DefiniteRange'
@@ -349,7 +372,7 @@ definitions:
         - $ref: '#/definitions/Number'
         description: The end coordinate or range of the interval. The minimum value
           of this coordinate or range is 0. MUST represent a coordinate or range greater
-          than the value of `start`.
+          than or equal to the value of `start`.
     required:
     - end
     - start
@@ -678,3 +701,129 @@ definitions:
     - start
     - type
     additionalProperties: false
+  Breakend:
+    description:
+      "A break in a molecule with respect to a reference sequence indicating
+      the sequence deviates from the reference sequence after or before this
+      location."
+    type: object
+    properties:
+      type:
+        type: string
+        const: Breakend
+        default: Breakend
+        description: MUST be "Breakend"
+      location:
+        $ref: '#/definitions/Location'
+        description: The interval in which the break could occur.
+      orientation:
+        type: string
+        enum:
+        - DivergesAfter
+        - DivergesBefore
+        description:
+          MUST be one of "DivergesAfter" or "DivergesBefore" indicating whether the
+          sequence diverges from the reference after or before any position in the
+          interval.
+    required:
+    - type
+    - location
+    - orientation
+    additionalProperties: false
+  Breakpoint:
+    description:
+      A rearrangement resulting in sequences flanking the two breakends becoming
+      adjacent sequences on the same molecule.
+    type: object
+    properties:
+      type:
+        type: string
+        const: Breakpoint
+        default: Breakpoint
+        description: MUST be "Breakpoint"
+      # I've taken the VCF-style model.
+      # This is actually a lossy representation as many variant callers
+      # can constrain the actual location much more than anywhere in the
+      # [(start1, end1), (start2, end2)] range.
+      # For example, when the intervals are due to homology, then the
+      # interval widths must be the same and, for any given position
+      # in the first breakend interval, there is only one corresponding
+      # position in the second breakend interval.
+      # Just specifying the two intervals independently as is done in this
+      # model does not intrinsically encode this information.
+      #
+      # Similarly, even imprecise calls have possibilities that are less
+      # likely than others. For example, if a deletion break1 was at start1, then
+      # break2 might be constrained to something like [start2, start2 + (end2 - start2) / 3]
+      # because anything else would imply a longer deletion length than is plausible.
+      #
+      # VCF has a CILEN field that can encode this sort of information.
+      # Do we need an equivalent for VRS?
+      breakends:
+        type: array
+        uniqueItems: false
+        ordered: false
+        items:
+          $ref: '#/definitions/Breakend'
+        description: Breakends involved in the sequence
+        minItems: 1
+        maxItems: 2
+      # Needed to support optical mapping gaps where the sequence between the
+      # breaks is not known but the approximate length is.
+      insertion:
+        $ref: '#/definitions/DefiniteRange'
+        description: Approximate length of unknown sequence between the breaks.
+      homology:
+        # Only valid for breakends=2
+        type: boolean
+        default: false
+        description:
+          A flag indicating whether the location interval of the breakend
+          is due to the sequences at the breakends being homologous or
+          whether the interval is due to uncertainty regarding the actual
+          locations of the breakends.
+      # Does anyone have a need to support anything other than a LiteralSequenceExpression?
+      # I'd prefer not to, as allowing reference-based sequences in here is just an
+      # alternate representation of multiple breaks and we want to minimise the number
+      # of different ways a sequence can be represented.
+      sequence:
+        $ref: '#/definitions/LiteralSequenceExpression'
+        description:
+          # TODO: clarify what this sequence is. We can define this as:
+          # - Traversal from the anchoring sequence (i.e. RevComp DivergesBefore sequences)
+          # - Sequence prepend/concatenation
+          # TODO: What happens when the sequence itself has DerivedSequenceExpression.reverse_complement=true?
+          Sequence occurring after the break.
+      terminal:
+        # TODO: can the schema encode a constraint that a terminal breakend cannot
+        # be part of a breakpoint?
+        type: boolean
+        default: false
+        description:
+          # Only valid for breakends=1
+          Indicates the end of the molecule
+    required:
+    - type
+    - breakends
+    additionalProperties: false
+  Event:
+    description:
+      An event that results in a set of variants.
+    type: object
+    properties:
+      type:
+        type: string
+        const: Event
+        default: Event
+        description: MUST be "Event"
+      variants:
+        $ref: '#/definitions/VariationSet'
+      classification:
+        # TODO: what event ontology should we use?
+        type: string
+        description: Category of event
+    required:
+    - type
+    - variants
+    - classification
+    additionalProperties: true