Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: still return a result for protein/cdna with no mane data found #507

Merged
merged 2 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 40 additions & 121 deletions tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,127 +41,37 @@ def dis3_p63a():
"type": "Allele",
},
"molecule_context": "protein",
"gene_context": {
"id": "normalize.gene:DIS3",
"type": "GeneDescriptor",
"label": "DIS3",
"xrefs": ["ensembl:ENSG00000083520", "ncbigene:22894"],
"alternate_labels": [
"dis3p",
"RRP44",
"KIAA1008",
"2810028N01Rik",
"EXOSC11",
],
"extensions": [
{"name": "symbol_status", "value": "approved", "type": "Extension"},
{
"name": "approved_name",
"value": "DIS3 homolog, exosome endoribonuclease and 3'-5' exoribonuclease", # noqa: E501
"type": "Extension",
},
{
"name": "hgnc_locations",
"value": [
{
"species_id": "taxonomy:9606",
"interval": {
"type": "CytobandInterval",
"start": "q21.33",
"end": "q21.33",
},
"_id": "ga4gh:VCL.84IPub_nKl33cWX9pNoPeGsyeVuJnyra",
"type": "ChromosomeLocation",
"chr": "13",
}
],
"type": "Extension",
},
{
"type": "Extension",
"name": "ensembl_locations",
"value": [
{
"_id": "ga4gh:VSL.HZtod8n11kD7jCAbtsJLDGwLKCEhRWO1",
"type": "SequenceLocation",
"sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"interval": {
"start": {"type": "Number", "value": 72752168},
"end": {"type": "Number", "value": 72782096},
"type": "SequenceInterval",
},
}
],
},
{
"type": "Extension",
"name": "ncbi_locations",
"value": [
{
"_id": "ga4gh:VCL.84IPub_nKl33cWX9pNoPeGsyeVuJnyra",
"type": "ChromosomeLocation",
"species_id": "taxonomy:9606",
"chr": "13",
"interval": {
"end": "q21.33",
"start": "q21.33",
"type": "CytobandInterval",
},
},
{
"_id": "ga4gh:VSL.BIRwPLT8rtyOnhd3aUXaz4xlHC4P4zA8",
"type": "SequenceLocation",
"sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"interval": {
"start": {"type": "Number", "value": 72752168},
"end": {"type": "Number", "value": 72781900},
"type": "SequenceInterval",
},
},
],
},
{
"name": "associated_with",
"value": [
"vega:OTTHUMG00000017070",
"ccds:CCDS9447",
"orphanet:470196",
"ena.embl:AB023225",
"ccds:CCDS45057",
"omim:607533",
"pubmed:11935316",
"refseq:NM_014953",
"uniprot:Q9Y2L1",
"ccds:CCDS81772",
"ucsc:uc001vix.6",
"pubmed:9562621",
],
"type": "Extension",
},
{
"name": "previous_symbols",
"value": ["KIAA1008"],
"type": "Extension",
},
{
"type": "Extension",
"name": "hgnc_locus_type",
"value": "gene with protein product",
},
{
"type": "Extension",
"name": "ncbi_gene_type",
"value": "protein-coding",
},
{
"type": "Extension",
"name": "ensembl_biotype",
"value": "protein_coding",
"gene_context": "hgnc:20604",
"vrs_ref_allele_seq": "P",
}
return VariationDescriptor(**params)


@pytest.fixture(scope="module")
def tp53_g262c():
"""Create TP53 G262C test fixture."""
params = {
"id": "normalize.variation:TP53%20G262C",
"type": "VariationDescriptor",
"variation_id": "ga4gh:VA.M_aggPZhA47fKQbmDhajHujncmFjMtB7",
"variation": {
"_id": "ga4gh:VA.M_aggPZhA47fKQbmDhajHujncmFjMtB7",
"location": {
"_id": "ga4gh:VSL.2rV8a3PeziQSLNLBzmXMlRHX-vxYRLpS",
"interval": {
"end": {"value": 262, "type": "Number"},
"start": {"value": 261, "type": "Number"},
"type": "SequenceInterval",
},
],
"gene_id": "hgnc:20604",
"sequence_id": "ga4gh:SQ.YIlmVwD0rxIqnlvb-8WujHPbR0j3WEGI",
"type": "SequenceLocation",
},
"state": {"sequence": "C", "type": "LiteralSequenceExpression"},
"type": "Allele",
},
"vrs_ref_allele_seq": "P",
"molecule_context": "protein",
"gene_context": "hgnc:11998",
"vrs_ref_allele_seq": "G",
}
return VariationDescriptor(**params)

Expand Down Expand Up @@ -934,7 +844,7 @@ def gnomad_vcf_genomic_delins3():


@pytest.mark.asyncio
async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a):
async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a, tp53_g262c):
"""Test that protein substitutions normalize correctly."""
resp = await test_handler.normalize(" BRAF V600E ")
assertion_checks(resp.variation_descriptor, braf_v600e, "BRAF V600E")
Expand Down Expand Up @@ -966,6 +876,11 @@ async def test_protein_substitution(test_handler, braf_v600e, dis3_p63a):
resp = await test_handler.normalize("DIS3 P63A")
assertion_checks(resp.variation_descriptor, dis3_p63a, "DIS3 P63A")

# Case where the prioritized translation has NA accession status
q = "TP53 G262C"
resp = await test_handler.normalize(q)
assertion_checks(resp.variation_descriptor, tp53_g262c, q)


@pytest.mark.asyncio
async def test_polypeptide_truncation(test_handler, vhl):
Expand Down Expand Up @@ -1563,7 +1478,6 @@ async def test_valid_queries(test_handler):
assert await test_handler.normalize("CCND1 Y44D")

resp = await test_handler.normalize("NC_000002.12:g.73448098_73448100delCTC")
assert resp
assert resp.variation_descriptor.variation.state.sequence == "CTC"
assert (
resp.variation_descriptor.variation.id
Expand All @@ -1582,6 +1496,11 @@ async def test_valid_queries(test_handler):
resp = await test_handler.normalize(q)
assert resp.variation_descriptor, q

# Test where MANE data is not found
resp = await test_handler.normalize("ALK p.A1280V")
assert resp.variation_descriptor.variation
assert resp.warnings == ["Unable to find MANE representation"]


@pytest.mark.asyncio
async def test_no_matches(test_handler):
Expand Down
22 changes: 17 additions & 5 deletions variation/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, UTADatabase
from ga4gh.vrsatile.pydantic.vrs_models import CopyChange
from ga4gh.vrsatile.pydantic.vrsatile_models import MoleculeContext
from gene.query import QueryHandler as GeneQueryHandler

from variation.classify import Classify
Expand All @@ -19,6 +20,7 @@
from variation.schemas.translation_response_schema import (
AC_PRIORITY_LABELS,
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.to_vrsatile import ToVRSATILE
from variation.tokenize import Tokenize
Expand Down Expand Up @@ -65,7 +67,7 @@ def __init__(

@staticmethod
def _get_priority_translation_result(
translations: List[TranslationResult], ac_status: str
translations: List[TranslationResult], ac_status: VrsSeqLocAcStatus
) -> Optional[TranslationResult]:
"""Get prioritized translation result. Tries to find translation results with
the same `vrs_seq_loc_ac_status` as `ac_status`. If more than one translation
Expand All @@ -87,6 +89,10 @@ def _get_priority_translation_result(
# Different `og_ac`'s can lead to different translation results.
# We must be consistent in what we return in /normalize
if len_preferred_translations > 1:
preferred_translations.sort(
key=lambda t: (t.og_ac.split(".")[0], int(t.og_ac.split(".")[1])),
reverse=True,
)
og_ac_preferred_match = (
[t for t in preferred_translations if t.og_ac == t.vrs_seq_loc_ac]
or [None]
Expand All @@ -99,10 +105,6 @@ def _get_priority_translation_result(
if og_ac_preferred_match:
translation_result = og_ac_preferred_match
else:
preferred_translations.sort(
key=lambda t: (t.og_ac.split(".")[0], int(t.og_ac.split(".")[1])),
reverse=True,
)
translation_result = preferred_translations[0]
elif len_preferred_translations == 1:
translation_result = preferred_translations[0]
Expand Down Expand Up @@ -251,6 +253,16 @@ async def normalize(
translations, ac_status
)
if translation_result:
if (
translation_result.vrs_seq_loc_ac_status
== VrsSeqLocAcStatus.NA
):
molecule_context = (
translation_result.validation_result.classification.molecule_context
)
if molecule_context != MoleculeContext.GENOMIC:
# Only supports protein/cDNA at the moment
warnings.append("Unable to find MANE representation")
break

# Get variation descriptor information
Expand Down
33 changes: 22 additions & 11 deletions variation/schemas/translation_response_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Module for Translation Response Schema."""
from enum import Enum
from typing import Dict, Optional

from cool_seq_tool.schemas import TranscriptPriorityLabel
Expand All @@ -7,21 +8,31 @@
from variation.schemas.validation_response_schema import ValidationResult


class VrsSeqLocAcStatus(str, Enum):
    """Define the possible statuses of a VRS SequenceLocation accession.

    Definition order matters: members are listed from highest priority (first)
    to lowest priority (last).

    Once issue-191 is resolved in cool-seq-tool, this enum should be replaced by
    the TranscriptPriorityLabel enum.
    """

    MANE_SELECT = TranscriptPriorityLabel.MANESelect.value
    MANE_PLUS_CLINICAL = TranscriptPriorityLabel.MANEPlusClinical.value
    LONGEST_COMPATIBLE_REMAINING = (
        TranscriptPriorityLabel.LongestCompatibleRemaining.value
    )
    GRCH38 = "GRCh38" # will change to lowercase in cool-seq-tool issue-191
    # Lowest-priority status; also the default status of a TranslationResult
    NA = "na"


# Accession statuses in priority order (highest first). Enum `__members__`
# preserves definition order, so this mirrors the order in VrsSeqLocAcStatus.
AC_PRIORITY_LABELS = list(VrsSeqLocAcStatus.__members__.values())


class TranslationResult(BaseModel):
"""Translation Result"""

vrs_variation: Optional[Dict]
vrs_seq_loc_ac: Optional[StrictStr]
vrs_seq_loc_ac_status: StrictStr = "na"
vrs_seq_loc_ac_status: VrsSeqLocAcStatus = VrsSeqLocAcStatus.NA
og_ac: Optional[StrictStr]
validation_result: ValidationResult


# Define accession priority. First has highest priority, last has lowest priority
AC_PRIORITY_LABELS = [
TranscriptPriorityLabel.MANESelect.value,
TranscriptPriorityLabel.MANEPlusClinical.value,
TranscriptPriorityLabel.LongestCompatibleRemaining.value,
"GRCh38",
"na",
]
7 changes: 5 additions & 2 deletions variation/translators/genomic_del_dup_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.service_schema import ClinVarAssembly
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator
from variation.utils import get_assembly
Expand Down Expand Up @@ -107,7 +110,7 @@ async def translate(

grch38_data = None
vrs_variation = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if do_liftover or endpoint_name == Endpoint.NORMALIZE:
errors = []
Expand Down
7 changes: 5 additions & 2 deletions variation/translators/genomic_delins.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator

Expand Down Expand Up @@ -54,7 +57,7 @@ async def translate(
classification: GenomicDelInsClassification = validation_result.classification
vrs_allele = None
vrs_seq_loc_ac = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if endpoint_name == Endpoint.NORMALIZE:
gene = (
Expand Down
7 changes: 5 additions & 2 deletions variation/translators/genomic_insertion.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator

Expand Down Expand Up @@ -57,7 +60,7 @@ async def translate(
)
vrs_allele = None
vrs_seq_loc_ac = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if endpoint_name == Endpoint.NORMALIZE:
gene = (
Expand Down
7 changes: 5 additions & 2 deletions variation/translators/genomic_reference_agree.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
from variation.schemas.normalize_response_schema import HGVSDupDelModeOption
from variation.schemas.token_response_schema import AltType
from variation.schemas.translation_response_schema import TranslationResult
from variation.schemas.translation_response_schema import (
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationResult
from variation.translators.translator import Translator

Expand Down Expand Up @@ -56,7 +59,7 @@ async def translate(
)
vrs_allele = None
vrs_seq_loc_ac = None
vrs_seq_loc_ac_status = "na"
vrs_seq_loc_ac_status = VrsSeqLocAcStatus.NA

if endpoint_name == Endpoint.NORMALIZE:
gene = (
Expand Down
Loading