Skip to content

Commit

Permalink
fix: still return a result for protein/cdna with no mane data found (#…
Browse files Browse the repository at this point in the history
…507)

- Create an enum for the VRS SequenceLocation accession status
- Fix selection of the priority translation result in `_get_priority_translation_result` when there are multiple translation results for a given accession (ac) status
  • Loading branch information
korikuzma committed Sep 22, 2023
1 parent 8d7150c commit 40e18f2
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 150 deletions.
161 changes: 40 additions & 121 deletions tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,127 +41,37 @@ def dis3_p63a():
"type": "Allele",
},
"molecule_context": "protein",
"gene_context": {
"id": "normalize.gene:DIS3",
"type": "GeneDescriptor",
"label": "DIS3",
"xrefs": ["ensembl:ENSG00000083520", "ncbigene:22894"],
"alternate_labels": [
"dis3p",
"RRP44",
"KIAA1008",
"2810028N01Rik",
"EXOSC11",
],
"extensions": [
{"name": "symbol_status", "value": "approved", "type": "Extension"},
{
"name": "approved_name",
"value": "DIS3 homolog, exosome endoribonuclease and 3'-5' exoribonuclease", # noqa: E501
"type": "Extension",
},
{
"name": "hgnc_locations",
"value": [
{
"species_id": "taxonomy:9606",
"interval": {
"type": "CytobandInterval",
"start": "q21.33",
"end": "q21.33",
},
"_id": "ga4gh:VCL.84IPub_nKl33cWX9pNoPeGsyeVuJnyra",
"type": "ChromosomeLocation",
"chr": "13",
}
],
"type": "Extension",
},
{
"type": "Extension",
"name": "ensembl_locations",
"value": [
{
"_id": "ga4gh:VSL.HZtod8n11kD7jCAbtsJLDGwLKCEhRWO1",
"type": "SequenceLocation",
"sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"interval": {
"start": {"type": "Number", "value": 72752168},
"end": {"type": "Number", "value": 72782096},
"type": "SequenceInterval",
},
}
],
},
{
"type": "Extension",
"name": "ncbi_locations",
"value": [
{
"_id": "ga4gh:VCL.84IPub_nKl33cWX9pNoPeGsyeVuJnyra",
"type": "ChromosomeLocation",
"species_id": "taxonomy:9606",
"chr": "13",
"interval": {
"end": "q21.33",
"start": "q21.33",
"type": "CytobandInterval",
},
},
{
"_id": "ga4gh:VSL.BIRwPLT8rtyOnhd3aUXaz4xlHC4P4zA8",
"type": "SequenceLocation",
"sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"interval": {
"start": {"type": "Number", "value": 72752168},
"end": {"type": "Number", "value": 72781900},
"type": "SequenceInterval",
},
},
],
},
{
"name": "associated_with",
"value": [
"vega:OTTHUMG00000017070",
"ccds:CCDS9447",
"orphanet:470196",
"ena.embl:AB023225",
"ccds:CCDS45057",
"omim:607533",
"pubmed:11935316",
"refseq:NM_014953",
"uniprot:Q9Y2L1",
"ccds:CCDS81772",
"ucsc:uc001vix.6",
"pubmed:9562621",
],
"type": "Extension",
},
{
"name": "previous_symbols",
"value": ["KIAA1008"],
"type": "Extension",
},
{
"type": "Extension",
"name": "hgnc_locus_type",
"value": "gene with protein product",
},
{
"type": "Extension",
"name": "ncbi_gene_type",
"value": "protein-coding",
},
{
"type": "Extension",
"name": "ensembl_biotype",
"value": "protein_coding",
"gene_context": "hgnc:20604",
"vrs_ref_allele_seq": "P",
}
return VariationDescriptor(**params)


@pytest.fixture(scope="module")
def tp53_g262c():
"""Create TP53 G262C test fixture."""
params = {
"id": "normalize.variation:TP53%20G262C",
"type": "VariationDescriptor",
"variation_id": "ga4gh:VA.M_aggPZhA47fKQbmDhajHujncmFjMtB7",
"variation": {
"_id": "ga4gh:VA.M_aggPZhA47fKQbmDhajHujncmFjMtB7",
"location": {
"_id": "ga4gh:VSL.2rV8a3PeziQSLNLBzmXMlRHX-vxYRLpS",
"interval": {
"end": {"value": 262, "type": "Number"},
"start": {"value": 261, "type": "Number"},
"type": "SequenceInterval",
},
],
"gene_id": "hgnc:20604",
"sequence_id": "ga4gh:SQ.YIlmVwD0rxIqnlvb-8WujHPbR0j3WEGI",
"type": "SequenceLocation",
},
"state": {"sequence": "C", "type": "LiteralSequenceExpression"},
"type": "Allele",
},
"vrs_ref_allele_seq": "P",
"molecule_context": "protein",
"gene_context": "hgnc:11998",
"vrs_ref_allele_seq": "G",
}
return VariationDescriptor(**params)

Expand Down Expand Up @@ -934,7 +844,7 @@ def gnomad_vcf_genomic_delins3():


@pytest.mark.asyncio
async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a):
async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a, tp53_g262c):
"""Test that protein substitutions normalize correctly."""
resp = await test_handler.normalize(" BRAF V600E ")
assertion_checks(resp.variation_descriptor, braf_v600e, "BRAF V600E")
Expand Down Expand Up @@ -966,6 +876,11 @@ async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a):
resp = await test_handler.normalize("DIS3 P63A")
assertion_checks(resp.variation_descriptor, dis3_p63a, "DIS3 P63A")

# Case where NA priority
q = "TP53 G262C"
resp = await test_handler.normalize(q)
assertion_checks(resp.variation_descriptor, tp53_g262c, q)


@pytest.mark.asyncio
async def test_polypeptide_truncation(test_handler, vhl):
Expand Down Expand Up @@ -1563,7 +1478,6 @@ async def test_valid_queries(test_handler):
assert await test_handler.normalize("CCND1 Y44D")

resp = await test_handler.normalize("NC_000002.12:g.73448098_73448100delCTC")
assert resp
assert resp.variation_descriptor.variation.state.sequence == "CTC"
assert (
resp.variation_descriptor.variation.id
Expand All @@ -1582,6 +1496,11 @@ async def test_valid_queries(test_handler):
resp = await test_handler.normalize(q)
assert resp.variation_descriptor, q

# Test where Mane data not found
resp = await test_handler.normalize("ALK p.A1280V")
assert resp.variation_descriptor.variation
assert resp.warnings == ["Unable to find MANE representation"]


@pytest.mark.asyncio
async def test_no_matches(test_handler):
Expand Down
22 changes: 17 additions & 5 deletions variation/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, UTADatabase
from ga4gh.vrsatile.pydantic.vrs_models import CopyChange
from ga4gh.vrsatile.pydantic.vrsatile_models import MoleculeContext
from gene.query import QueryHandler as GeneQueryHandler

from variation.classify import Classify
Expand All @@ -19,6 +20,7 @@
from variation.schemas.translation_response_schema import (
AC_PRIORITY_LABELS,
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.to_vrsatile import ToVRSATILE
from variation.tokenize import Tokenize
Expand Down Expand Up @@ -65,7 +67,7 @@ def __init__(

@staticmethod
def _get_priority_translation_result(
translations: List[TranslationResult], ac_status: str
translations: List[TranslationResult], ac_status: VrsSeqLocAcStatus
) -> Optional[TranslationResult]:
"""Get prioritized translation result. Tries to find translation results with
the same `vrs_seq_loc_ac_status` as `ac_status`. If more than one translation
Expand All @@ -87,6 +89,10 @@ def _get_priority_translation_result(
# Different `og_ac`'s can lead to different translation results.
# We must be consistent in what we return in /normalize
if len_preferred_translations > 1:
preferred_translations.sort(
key=lambda t: (t.og_ac.split(".")[0], int(t.og_ac.split(".")[1])),
reverse=True,
)
og_ac_preferred_match = (
[t for t in preferred_translations if t.og_ac == t.vrs_seq_loc_ac]
or [None]
Expand All @@ -99,10 +105,6 @@ def _get_priority_translation_result(
if og_ac_preferred_match:
translation_result = og_ac_preferred_match
else:
preferred_translations.sort(
key=lambda t: (t.og_ac.split(".")[0], int(t.og_ac.split(".")[1])),
reverse=True,
)
translation_result = preferred_translations[0]
elif len_preferred_translations == 1:
translation_result = preferred_translations[0]
Expand Down Expand Up @@ -251,6 +253,16 @@ async def normalize(
translations, ac_status
)
if translation_result:
if (
translation_result.vrs_seq_loc_ac_status
== VrsSeqLocAcStatus.NA
):
molecule_context = (
translation_result.validation_result.classification.molecule_context
)
if molecule_context != MoleculeContext.GENOMIC:
# Only supports protein/cDNA at the moment
warnings.append("Unable to find MANE representation")
break

# Get variation descriptor information
Expand Down
33 changes: 22 additions & 11 deletions variation/schemas/translation_response_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Module for Translation Response Schema."""
from enum import Enum
from typing import Dict, Optional

from cool_seq_tool.schemas import TranscriptPriorityLabel
Expand All @@ -7,21 +8,31 @@
from variation.schemas.validation_response_schema import ValidationResult


class VrsSeqLocAcStatus(str, Enum):
    """Enumerate the accession statuses of a VRS SequenceLocation.

    Definition order matters: the first member has the highest priority and the
    last member has the lowest priority.

    Once issue-191 is resolved in cool-seq-tool, the TranscriptPriorityLabel enum
    should be used instead of this one.
    """

    # The first three members reuse the values of the corresponding
    # TranscriptPriorityLabel members from cool-seq-tool.
    MANE_SELECT = TranscriptPriorityLabel.MANESelect.value
    MANE_PLUS_CLINICAL = TranscriptPriorityLabel.MANEPlusClinical.value
    LONGEST_COMPATIBLE_REMAINING = (
        TranscriptPriorityLabel.LongestCompatibleRemaining.value
    )
    GRCH38 = "GRCh38"  # will change to lowercase in cool-seq-tool issue-191
    NA = "na"  # default `vrs_seq_loc_ac_status` of `TranslationResult`


# Accession-status labels in priority order (highest priority first). The order
# follows the definition order of `VrsSeqLocAcStatus`.
AC_PRIORITY_LABELS = list(VrsSeqLocAcStatus.__members__.values())


class TranslationResult(BaseModel):
"""Translation Result"""

vrs_variation: Optional[Dict]
vrs_seq_loc_ac: Optional[StrictStr]
vrs_seq_loc_ac_status: StrictStr = "na"
vrs_seq_loc_ac_status: VrsSeqLocAcStatus = VrsSeqLocAcStatus.NA
og_ac: Optional[StrictStr]
validation_result: ValidationResult


# Define accession priority. First has highest priority, last has lowest priority
AC_PRIORITY_LABELS = [
TranscriptPriorityLabel.MANESelect.value,
TranscriptPriorityLabel.MANEPlusClinical.value,
TranscriptPriorityLabel.LongestCompatibleRemaining.value,
"GRCh38",
"na",
]
7 changes: 5 additions & 2 deletions variation/translators/genomic_del_dup_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.service_schema import ClinVarAssembly
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator
from variation.utils import get_assembly
Expand Down Expand Up @@ -107,7 +110,7 @@ async def translate(

grch38_data = None
vrs_variation = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if do_liftover or endpoint_name == Endpoint.NORMALIZE:
errors = []
Expand Down
7 changes: 5 additions & 2 deletions variation/translators/genomic_delins.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator

Expand Down Expand Up @@ -54,7 +57,7 @@ async def translate(
classification: GenomicDelInsClassification = validation_result.classification
vrs_allele = None
vrs_seq_loc_ac = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if endpoint_name == Endpoint.NORMALIZE:
gene = (
Expand Down
7 changes: 5 additions & 2 deletions variation/translators/genomic_insertion.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator

Expand Down Expand Up @@ -57,7 +60,7 @@ async def translate(
)
vrs_allele = None
vrs_seq_loc_ac = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if endpoint_name == Endpoint.NORMALIZE:
gene = (
Expand Down
7 changes: 5 additions & 2 deletions variation/translators/genomic_reference_agree.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator

Expand Down Expand Up @@ -56,7 +59,7 @@ async def translate(
)
vrs_allele = None
vrs_seq_loc_ac = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if endpoint_name == Endpoint.NORMALIZE:
gene = (
Expand Down
Loading

0 comments on commit 40e18f2

Please sign in to comment.