diff --git a/Pipfile b/Pipfile index 7562aa68..7c43e495 100644 --- a/Pipfile +++ b/Pipfile @@ -12,7 +12,7 @@ coveralls = "*" coverage = "*" flake8-docstrings = "*" pre-commit = "*" -variation-normalization = {editable = true, path = "."} +variation-normalizer = {editable = true, path = "."} pyyaml = "*" jupyter = "*" ipykernel = "*" @@ -31,10 +31,9 @@ uvicorn = "*" pydantic = "*" uvloop = "*" httptools = "*" -"ga4gh.vrs" = {version = "==0.7.0rc3", extras = ["extras"]} -gene-normalizer = ">=0.1.21" +"ga4gh.vrs" = {version = ">=0.7.2", extras = ["extras"]} +gene-normalizer = ">=0.1.23" pyliftover = "*" boto3 = "*" -"ga4gh.vrsatile.pydantic" = "*" +"ga4gh.vrsatile.pydantic" = ">=0.0.5" pandas = "*" -jsonschema = ">=2.3, <4.0" diff --git a/README.md b/README.md index 39c3003c..ab90568b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Variation Normalization -Services and guidelines for normalizing variation terms into [VRS (v1.1.1)](https://vrs.ga4gh.org/en/1.1.1) and [VRSATILE (latest)](https://vrsatile.readthedocs.io/en/latest/) compatible representations. +Services and guidelines for normalizing variation terms into [VRS (v1.2.0)](https://vrs.ga4gh.org/en/1.2.0) and [VRSATILE (latest)](https://vrsatile.readthedocs.io/en/latest/) compatible representations. Public OpenAPI endpoint: https://normalize.cancervariants.org/variation @@ -13,12 +13,19 @@ pip install variation-normalizer ## About Variation Normalization works by using four main steps: tokenization, classification, validation, and translation. During tokenization, we split strings on whitespace and parse to determine the type of token. During classification, we specify the order of tokens a classification can have. We then do validation checks such as ensuring references for a nucleotide or amino acid matches the expected value and validating a position exists on the given transcript. During translation, we return a VRS Allele object. 
+Variation Normalization is limited to the following types of variants represented as HGVS expressions and text representations (ex: `BRAF V600E`): + +* **protein (p.)**: substitution, deletion, insertion, deletion-insertion +* **coding DNA (c.)**: substitution, deletion, insertion, deletion-insertion +* **genomic (g.)**: substitution, deletion, ambiguous deletion, insertion, deletion-insertion, duplication + +We are working towards adding more types of variations, coordinates, and representations. + ### Endpoints #### /toVRS -The `/toVRS` endpoint returns a list of valid [Alleles](https://vrs.ga4gh.org/en/1.1.1/terms_and_model.html#allele). +The `/toVRS` endpoint returns a list of valid VRS [Variations](https://vrs.ga4gh.org/en/1.2.0/terms_and_model.html#variation). -#### /normalize -The `/normalize` endpoint returns a [Variation Descriptor](https://vrsatile.readthedocs.io/en/latest/value_object_descriptor/vod_index.html#variation-descriptor) containing the MANE Transcript, if one is found. +The `/normalize` endpoint returns a [Variation Descriptor](https://vrsatile.readthedocs.io/en/latest/value_object_descriptor/vod_index.html#variation-descriptor) containing the MANE Transcript, if one is found. If a genomic query is not given a gene, `normalize` will return its GRCh38 representation. The steps for retrieving MANE Transcript data is as follows: 1. Map starting annotation layer to genomic @@ -30,17 +37,6 @@ The steps for retrieving MANE Transcript data is as follows: 3. Longest Compatible Remaining Transcript 4. Map back to starting annotation layer -#### Limitations -Variation Normalization is limited to the following types of variants represented as HGVS expressions and text representations (ex: `BRAF V600E`): - -* **protein (p.)**: substitution, deletion, insertion, deletion-insertion -* **coding DNA (c.)**: substitution, deletion, insertion, deletion-insertion\ - *Note: c. coordinates will be returned as r. 
coordinates in the VRS and VRSATILE objects* -* **genomic (g.)**: substitution, deletion, insertion, deletion-insertion\ - *Note: If a genomic query is not given a gene, `normalize` will return its GRCh38 representation.* - -We are working towards adding more types of variants, coordinates, and representations. - ## Backend Services Variation Normalization relies on some local data caches which you will need to set up. It uses pipenv to manage its environment, which you will also need to install. @@ -52,17 +48,13 @@ pipenv lock pipenv sync ``` -### Setting up Gene Normalizer -Variation Normalization relies on data from [Gene Normalization](https://github.com/cancervariants/gene-normalization. You must have Gene Normalization's DynamoDB running for the application to work. +### Gene Normalizer -You must run the following when loading the database: - -```commandline -python3 -m gene.cli --update_all --update_merged -``` +Variation Normalization relies on data from [Gene Normalization](https://github.com/cancervariants/gene-normalization). You must load all sources _and_ merged concepts. -For more information, visit see the [README](https://github.com/cancervariants/gene-normalization/blob/main/README.md). +You must also have Gene Normalization's DynamoDB running for the application to work. +For more information about the gene-normalizer, visit the [README](https://github.com/cancervariants/gene-normalization/blob/main/README.md). ### SeqRepo Variation Normalization relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself. diff --git a/docs/hgvs_dup_del_mode.md b/docs/hgvs_dup_del_mode.md new file mode 100644 index 00000000..0c39ca7c --- /dev/null +++ b/docs/hgvs_dup_del_mode.md @@ -0,0 +1,33 @@ +# HGVS Dup Del Mode + +This mode helps us interpret deletions and duplications that are represented as HGVS expressions. 
+ +## Default Characteristics + +- If endpoints are ambiguous: cnv (copies attribute) + - handling X chromosome + - base of 1-2 + - Duplication: Definite Range = 2, 3 + - Deletion: Definite Range = 0, 1 + - handling Y chromosome + - base of 1 + - Duplication: Number = 2 + - Deletion: Number = 0 + - handling chromosomes 1 – 22 + - base of 2 + - Duplication: Number = 3 + - Deletion: Number = 1 +- elif length of deletion or duplication > 100 bp: (use outermost coordinates) + - repeated_seq_expr with a derived_seq_expr subject (Allele) +- else: + - literal_seq_expr (normalized LiteralSequenceExpression Allele) + +## Notes + +- Ambiguous ranges are of the form: + - `(#_#)_(#_#)` + - `(?_#)_(#_?)` + - `(?_#)_#` + - `#_(#_?)` +- We do not normalize any ambiguous ranges. +- We do not change the molecular context for ambiguous ranges. diff --git a/requirements-dev.txt b/requirements-dev.txt index 3ae3e629..f82845e6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,8 +10,7 @@ -i https://pypi.org/simple -e . --e . 
-anyio==3.3.4; python_full_version >= '3.6.2' +anyio==3.4.0; python_full_version >= '3.6.2' appdirs==1.4.4 appnope==0.1.2; sys_platform == 'darwin' argcomplete==1.12.3 @@ -21,13 +20,13 @@ asgiref==3.4.1; python_version >= '3.6' attrs==21.2.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' babel==2.9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' backcall==0.2.0 -backports.entry-points-selectable==1.1.0; python_version >= '2.7' +backports.entry-points-selectable==1.1.1; python_version >= '2.7' beautifulsoup4==4.10.0; python_version >= '3.1' biocommons.seqrepo==0.6.4 bioutils==0.5.5; python_version >= '3.6' bleach==4.1.0; python_version >= '3.6' -boto3==1.19.10 -botocore==1.22.10; python_version >= '3.6' +boto3==1.20.11 +botocore==1.23.11; python_version >= '3.6' bs4==0.0.1 canonicaljson==1.5.0; python_version ~= '3.5' certifi==2021.10.8 @@ -38,8 +37,8 @@ click==8.0.3; python_version >= '3.6' colorama==0.4.4; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' configparser==5.1.0; python_version >= '3.6' -coverage[toml]==6.1.1 -coveralls==3.3.0 +coverage[toml]==6.1.2 +coveralls==3.3.1 cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' cycler==0.11.0; python_version >= '3.6' debugpy==1.5.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' @@ -51,59 +50,60 @@ docutils==0.18; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2 entrypoints==0.3; python_version >= '2.7' fake-useragent==0.1.11 fastapi==0.70.0 -filelock==3.3.2; python_version >= '3.6' +filelock==3.4.0; python_version >= '3.6' flake8-docstrings==1.6.0 flake8==4.0.1 -frozendict==2.0.7; python_version >= '3.6' -ga4gh.vrs[extras]==0.7.0rc3 -ga4gh.vrsatile.pydantic==0.0.3 -gene-normalizer==0.1.22 +fonttools==4.28.2; python_version >= '3.7' +frozendict==2.1.0; 
python_version >= '3.6' +ga4gh.vrs[extras]==0.7.2 +ga4gh.vrsatile.pydantic==0.0.5 +gene-normalizer==0.1.23 gffutils==0.10.1 h11==0.12.0; python_version >= '3.6' hgvs==1.5.1 httptools==0.3.0 humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -identify==2.3.3; python_full_version >= '3.6.1' +identify==2.4.0; python_full_version >= '3.6.1' idna==3.3; python_version >= '3' -importlib-metadata==4.8.1; python_version >= '3.6' +importlib-metadata==4.8.2; python_version < '3.10' inflection==0.5.1; python_version >= '3.5' iniconfig==1.1.1 -ipykernel==6.5.0 +ipykernel==6.5.1 ipython-genutils==0.2.0 ipython==7.29.0; python_version >= '3.7' ipywidgets==7.6.5 -jedi==0.18.0; python_version >= '3.6' -jinja2==3.0.2; python_version >= '3.6' +jedi==0.18.1; python_version >= '3.6' +jinja2==3.0.3; python_version >= '3.6' jmespath==0.10.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' json5==0.9.6 jsonschema==3.2.0 -jupyter-client==7.0.6; python_full_version >= '3.6.1' +jupyter-client==7.1.0; python_full_version >= '3.6.1' jupyter-console==6.4.0; python_version >= '3.6' jupyter-core==4.9.1; python_version >= '3.6' -jupyter-server==1.11.2; python_version >= '3.6' +jupyter-server==1.12.0; python_version >= '3.6' jupyter==1.0.0 jupyterlab-pygments==0.1.2 jupyterlab-server==2.8.2; python_version >= '3.6' jupyterlab-widgets==1.0.2; python_version >= '3.6' -jupyterlab==3.2.1 +jupyterlab==3.2.4 keyring==23.2.1; python_version >= '3.6' kiwisolver==1.3.2; python_version >= '3.7' lxml==4.6.4 -markdown==3.3.4; python_version >= '3.6' +markdown==3.3.6; python_version >= '3.6' markupsafe==2.0.1; python_version >= '3.6' matplotlib-inline==0.1.3; python_version >= '3.5' -matplotlib==3.4.3 +matplotlib==3.5.0 mccabe==0.6.1 mistune==0.8.4 nbclassic==0.3.4; python_version >= '3.6' -nbclient==0.5.4; python_full_version >= '3.6.1' -nbconvert==6.2.0; python_version >= '3.7' +nbclient==0.5.9; python_full_version >= '3.6.1' 
+nbconvert==6.3.0; python_version >= '3.7' nbformat==5.1.3; python_version >= '3.5' nest-asyncio==1.5.1; python_version >= '3.5' nodeenv==1.6.0 -notebook==6.4.5; python_version >= '3.6' -numpy==1.21.3; python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64' -packaging==21.2; python_version >= '3.6' +notebook==6.4.6; python_version >= '3.6' +numpy==1.21.4; python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64' +packaging==21.3; python_version >= '3.6' pandas==1.3.4 pandocfilters==1.5.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' parse==1.19.0 @@ -112,17 +112,17 @@ parso==0.8.2; python_version >= '3.6' pexpect==4.8.0; sys_platform != 'win32' pickleshare==0.7.5 pillow==8.4.0; python_version >= '3.6' -pkginfo==1.7.1 +pkginfo==1.8.1 platformdirs==2.4.0; python_version >= '3.6' pluggy==1.0.0; python_version >= '3.6' pre-commit==2.15.0 prometheus-client==0.12.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' prompt-toolkit==3.0.22; python_full_version >= '3.6.2' -psycopg2-binary==2.9.1; python_version >= '3.6' +psycopg2-binary==2.9.2; python_version >= '3.6' ptyprocess==0.7.0 -py==1.10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' pycodestyle==2.8.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -pycparser==2.20; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +pycparser==2.21; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' pydantic==1.8.2 pydocstyle==6.1.1; python_version >= '3.6' pyee==8.2.2 @@ -130,19 +130,19 @@ pyfaidx==0.6.3.1 pyflakes==2.4.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' pygments==2.10.0; python_version >= '3.5' pyliftover==0.4 -pyparsing==2.4.7; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' 
-pyppeteer==0.2.6; python_full_version >= '3.6.1' and python_full_version < '4.0.0' +pyparsing==3.0.6; python_version >= '3.6' +pyppeteer==0.2.6; python_version < '4' and python_full_version >= '3.6.1' pyquery==1.4.3 pyrsistent==0.18.0; python_version >= '3.6' -pysam==0.17.0 +pysam==0.18.0 pytest-cov==3.0.0 pytest==6.2.5 python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -python-jsonschema-objects==0.3.10 +python-jsonschema-objects==0.4.1 pytz==2021.3 pyyaml==6.0; python_version >= '3.6' pyzmq==22.3.0; python_version >= '3.6' -qtconsole==5.1.1; python_version >= '3.6' +qtconsole==5.2.0; python_version >= '3.6' qtpy==1.11.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' readme-renderer==30.0 requests-html==0.10.0; python_version >= '3.6' @@ -151,11 +151,11 @@ requests==2.26.0 rfc3986==1.5.0 s3transfer==0.5.0; python_version >= '3.6' send2trash==1.8.0 -simplejson==3.17.5; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' +simplejson==3.17.6; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' sniffio==1.2.0; python_version >= '3.5' -snowballstemmer==2.1.0 -soupsieve==2.3; python_version >= '3.6' +snowballstemmer==2.2.0 +soupsieve==2.3.1; python_version >= '3.6' sqlparse==0.4.2; python_version >= '3.5' starlette==0.16.0; python_version >= '3.6' tabulate==0.8.9 @@ -166,9 +166,9 @@ tomli==1.2.2; python_version >= '3.6' tornado==6.1; python_version >= '3.5' tqdm==4.62.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' traitlets==5.1.1; python_version >= '3.7' -twine==3.5.0 -typing-extensions==3.10.0.2 -urllib3==1.26.7; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_full_version < '4.0.0' +twine==3.6.0 +typing-extensions==4.0.0 +urllib3==1.26.7; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 
3.3, 3.4' and python_version < '4' uvicorn==0.15.0 uvloop==0.16.0 virtualenv==20.10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' diff --git a/requirements.txt b/requirements.txt index 437808f4..672d4910 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ # -i https://pypi.org/simple -anyio==3.3.4; python_full_version >= '3.6.2' +anyio==3.4.0; python_full_version >= '3.6.2' appdirs==1.4.4 appnope==0.1.2; sys_platform == 'darwin' argcomplete==1.12.3 @@ -17,8 +17,8 @@ backcall==0.2.0 beautifulsoup4==4.10.0; python_version >= '3.1' biocommons.seqrepo==0.6.4 bioutils==0.5.5; python_version >= '3.6' -boto3==1.19.10 -botocore==1.22.10; python_version >= '3.6' +boto3==1.20.11 +botocore==1.23.11; python_version >= '3.6' bs4==0.0.1 canonicaljson==1.5.0; python_version ~= '3.5' certifi==2021.10.8 @@ -30,26 +30,26 @@ cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3 decorator==5.1.0; python_version >= '3.5' fake-useragent==0.1.11 fastapi==0.70.0 -frozendict==2.0.7; python_version >= '3.6' -ga4gh.vrs[extras]==0.7.0rc3 -ga4gh.vrsatile.pydantic==0.0.3 -gene-normalizer==0.1.22 +frozendict==2.1.0; python_version >= '3.6' +ga4gh.vrs[extras]==0.7.2 +ga4gh.vrsatile.pydantic==0.0.5 +gene-normalizer==0.1.23 gffutils==0.10.1 h11==0.12.0; python_version >= '3.6' hgvs==1.5.1 httptools==0.3.0 humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' idna==3.3; python_version >= '3' -importlib-metadata==4.8.1; python_version >= '3.6' +importlib-metadata==4.8.2; python_version < '3.10' inflection==0.5.1; python_version >= '3.5' ipython==7.29.0; python_version >= '3.7' -jedi==0.18.0; python_version >= '3.6' +jedi==0.18.1; python_version >= '3.6' jmespath==0.10.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' jsonschema==3.2.0 lxml==4.6.4 -markdown==3.3.4; python_version >= '3.6' +markdown==3.3.6; python_version >= '3.6' matplotlib-inline==0.1.3; 
python_version >= '3.5' -numpy==1.21.3; python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64' +numpy==1.21.4; python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64' pandas==1.3.4 parse==1.19.0 parsley==1.3 @@ -57,35 +57,35 @@ parso==0.8.2; python_version >= '3.6' pexpect==4.8.0; sys_platform != 'win32' pickleshare==0.7.5 prompt-toolkit==3.0.22; python_full_version >= '3.6.2' -psycopg2-binary==2.9.1; python_version >= '3.6' +psycopg2-binary==2.9.2; python_version >= '3.6' ptyprocess==0.7.0 pydantic==1.8.2 pyee==8.2.2 pyfaidx==0.6.3.1 pygments==2.10.0; python_version >= '3.5' pyliftover==0.4 -pyppeteer==0.2.6; python_full_version >= '3.6.1' and python_full_version < '4.0.0' +pyppeteer==0.2.6; python_version < '4' and python_full_version >= '3.6.1' pyquery==1.4.3 pyrsistent==0.18.0; python_version >= '3.6' -pysam==0.17.0 +pysam==0.18.0 python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -python-jsonschema-objects==0.3.10 +python-jsonschema-objects==0.4.1 pytz==2021.3 pyyaml==6.0; python_version >= '3.6' requests-html==0.10.0; python_version >= '3.6' requests==2.26.0 s3transfer==0.5.0; python_version >= '3.6' -simplejson==3.17.5; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' +simplejson==3.17.6; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' sniffio==1.2.0; python_version >= '3.5' -soupsieve==2.3; python_version >= '3.6' +soupsieve==2.3.1; python_version >= '3.6' sqlparse==0.4.2; python_version >= '3.5' starlette==0.16.0; python_version >= '3.6' tabulate==0.8.9 tqdm==4.62.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' traitlets==5.1.1; python_version >= '3.7' -typing-extensions==3.10.0.2 -urllib3==1.26.7; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and 
python_full_version < '4.0.0' +typing-extensions==4.0.0 +urllib3==1.26.7; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4' uvicorn==0.15.0 uvloop==0.16.0 w3lib==1.22.0 diff --git a/setup.cfg b/setup.cfg index ecce17e6..39f53fa5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,13 +31,12 @@ install_requires = fastapi uvicorn pydantic - ga4gh.vrs[extras] == 0.7.0rc3 - gene-normalizer + ga4gh.vrs[extras] >= 0.7.2 + gene-normalizer >= 0.1.23 pyliftover boto3 - ga4gh.vrsatile.pydantic + ga4gh.vrsatile.pydantic >= 0.0.5 pandas - jsonschema >=2.3, <4.0 tests_require = pytest diff --git a/tests/classifiers/test_genomic_deletion_range.py b/tests/classifiers/test_genomic_deletion_range.py new file mode 100644 index 00000000..ab10c826 --- /dev/null +++ b/tests/classifiers/test_genomic_deletion_range.py @@ -0,0 +1,17 @@ +"""Module for testing Genomic Deletion Range Classifier.""" +import unittest +from variation.classifiers import GenomicDeletionRangeClassifier +from .classifier_base import ClassifierBase + + +class TestGenomicDeletionRangeClassifier(ClassifierBase, + unittest.TestCase): + """A class to test the Genomic Deletion Range Classifier.""" + + def classifier_instance(self): + """Return Genomic Deletion Range Classifier instance.""" + return GenomicDeletionRangeClassifier() + + def fixture_name(self): + """Return Genomic Deletion Range fixture name.""" + return 'genomic_deletion_range' diff --git a/tests/classifiers/test_genomic_duplication.py b/tests/classifiers/test_genomic_duplication.py new file mode 100644 index 00000000..36c0bc68 --- /dev/null +++ b/tests/classifiers/test_genomic_duplication.py @@ -0,0 +1,16 @@ +"""Module for testing Genomic Duplication Classifier.""" +import unittest +from variation.classifiers import GenomicDuplicationClassifier +from .classifier_base import ClassifierBase + + +class TestGenomicDuplicationClassifier(ClassifierBase, unittest.TestCase): + """A class to test the Genomic Duplication 
Classifier.""" + + def classifier_instance(self): + """Return GenomicDuplicationClassifier instance.""" + return GenomicDuplicationClassifier() + + def fixture_name(self): + """Return GenomicDuplicationClassifier fixture name.""" + return 'genomic_duplication' diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..be0e6c43 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,138 @@ +"""Create methods used throughout tests.""" +import pytest + + +@pytest.fixture(scope='session') +def vhl_gene_context(): + """Create a VHL gene context.""" + return { + "id": "normalize.gene:VHL", + "type": "GeneDescriptor", + "label": "VHL", + "gene_id": "hgnc:12687", + "xrefs": [ + "ncbigene:7428", + "ensembl:ENSG00000134086" + ], + "alternate_labels": [ + "HRCA1", + "VHL1", + "RCA1", + "pVHL" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "von Hippel-Lindau tumor suppressor" + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "ucsc:uc003bvc.4", + "pubmed:9671762", + "refseq:NM_000551", + "cosmic:VHL", + "omim:608537", + "vega:OTTHUMG00000128668", + "ccds:CCDS2598", + "ena.embl:L15409", + "orphanet:120467", + "ccds:CCDS2597", + "uniprot:P40337" + ] + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "_id": + "ga4gh:VCL.S-TtMfLdsgZPVRrWEf1-jiZMyTDCt5y1", + "type": "ChromosomeLocation", + "species_id": "taxonomy:9606", + "chr": "3", + "interval": { + "end": "p25.3", + "start": "p25.3", + "type": "CytobandInterval" + } + } + }, + { + "name": "previous_symbols", + "value": [ + "RCA1" + ], + "type": "Extension" + } + ] + } + + +def assertion_checks(normalize_response, test_variation, ignore_id=False): + """Check that normalize_response and test_variation are equal.""" + if not ignore_id: + assert normalize_response.id == test_variation.id, "id" + assert normalize_response.type == 
test_variation.type, "type" + assert normalize_response.variation_id == \ + test_variation.variation_id, "variation_id" + if test_variation.variation.type != "Text": + if test_variation.variation.id: + assert normalize_response.variation.id == \ + test_variation.variation.id, "variation._id" + if test_variation.variation_id: + assert normalize_response.variation_id == \ + normalize_response.variation.id, "variation_id == variation.id" # noqa: E501 + assert normalize_response.variation == \ + test_variation.variation, "variation" + else: + if not ignore_id: + assert normalize_response.variation.id == \ + test_variation.variation.id + assert normalize_response.variation.type == \ + test_variation.variation.type + assert normalize_response.variation.definition == \ + test_variation.variation.definition + assert normalize_response.molecule_context == \ + test_variation.molecule_context, "molecule_context" + assert normalize_response.structural_type == \ + test_variation.structural_type, "structural_type" + assert normalize_response.vrs_ref_allele_seq == \ + test_variation.vrs_ref_allele_seq, "vrs_ref_allele_seq" + + resp_gene_context = normalize_response.gene_context + test_variation_context = test_variation.gene_context + if resp_gene_context: + assert resp_gene_context.id == \ + test_variation_context.id, "gene_context.id" + assert resp_gene_context.label == \ + test_variation_context.label, "gene_context.label" + assert resp_gene_context.gene_id ==\ + test_variation_context.gene_id, "gene_context.gene_id" + assert set(resp_gene_context.xrefs) ==\ + set(test_variation_context.xrefs), "gene_context.xrefs" + if test_variation_context.alternate_labels: + assert set(resp_gene_context.alternate_labels) == \ + set(test_variation_context.alternate_labels), "gene_context.alternate_labels" # noqa: E501 + assert len(resp_gene_context.extensions) == \ + len(test_variation_context.extensions), "len gene_context.extensions" # noqa: E501 + for resp_ext in 
resp_gene_context.extensions: + for test_var in test_variation_context.extensions: + if resp_ext.name == test_var.name: + if resp_ext.name == 'chromosome_location': + assert resp_ext.value == test_var.value, \ + "gene_context.chromosome_location" + elif resp_ext.name == 'associated_with': + assert set(resp_ext.value) == set(test_var.value), \ + "gene_context.associated_with" + else: + assert resp_ext.value == test_var.value,\ + f"gene_context.{resp_ext.name}" + else: + assert not test_variation_context diff --git a/tests/fixtures/classifiers.yml b/tests/fixtures/classifiers.yml index 9d16991f..8b0e605e 100644 --- a/tests/fixtures/classifiers.yml +++ b/tests/fixtures/classifiers.yml @@ -286,4 +286,31 @@ genomic_uncertain_deletion: confidence: ConfidenceRating.INTERSECTION should_not_match: - query: GENE (?_155980375)_(156013167_?)del - - query: accession:g.(?_155980375)_(156013167_?)del \ No newline at end of file + - query: accession:g.(?_155980375)_(156013167_?)del + +genomic_duplication: + should_match: + - query: NC_000020.11:g.(?_30417576)_(31394018_?)dup + confidence: ConfidenceRating.EXACT + - query: NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup + confidence: ConfidenceRating.EXACT + - query: NC_000023.11:g.(?_154021812)_154092209dup + confidence: ConfidenceRating.EXACT + - query: NC_000003.12:g.49531262dup + confidence: ConfidenceRating.EXACT + - query: NC_000016.10:g.2087938_2087948dup + confidence: ConfidenceRating.EXACT + - query: BRAF g.2087938_2087948dup + confidence: ConfidenceRating.EXACT + should_not_match: + - query: foo (?_30417576)_(31394018_?)dup + - query: Accession:g.49531262dup + +genomic_deletion_range: + should_match: + - query: NC_000023.11:g.(31060227_31100351)_(33274278_33417151)del + confidence: ConfidenceRating.EXACT + - query: BRAF g.(31060227_31100351)_(33274278_33417151)del + confidence: ConfidenceRating.EXACT + should_not_match: + - query: GENE (?_31120496)_(33339477_?)del diff --git a/tests/fixtures/tokenizers.yml 
b/tests/fixtures/tokenizers.yml index cea5c934..ca8e46c7 100644 --- a/tests/fixtures/tokenizers.yml +++ b/tests/fixtures/tokenizers.yml @@ -293,6 +293,7 @@ coding_dna_deletion: - token: c._2277del - token: c.2263_del - token: c.2263 + - token: c.2277_2263del genomic_deletion: should_match: @@ -308,6 +309,7 @@ genomic_deletion: - token: g._37880233del - token: g._37880233del - token: g.37880219 + - token: g.37880233_37880219del amino_acid_insertion: should_match: @@ -353,9 +355,47 @@ genomic_uncertain_deletion: should_match: - token: g.(?_31120496)_(33339477_?)del - token: g.(?_155980375)_(156013167_?)del + - token: g.(?_18575354)_18653629del + - token: g.133462764_(133464858_?)del should_not_match: - token: c.(?_169)_(170_?)del - token: g.(31120496_?)_(?_33339477)del - token: g.(?_31120496)_(33339477_?)delins - token: (?_31120496)_(33339477_?)del - - token: g.(?_33339477)_(31120496_?)del \ No newline at end of file + - token: g.(?_33339477)_(31120496_?)del + - token: g.(?_18653629)_18575354del + - token: g.133464858_(133462764_?)del + +genomic_duplication: + should_match: + - token: g.2087938_2087948dup + - token: g.49531262dup + should_not_match: + - token: 49531262dup + - token: dupdup + - token: g.2087948_2087938dup + - token: g.(?_30417576)_(31394018_?)dup + - token: g.(31060227_31100351)_(33274278_33417151)dup + - token: g.(?_154021812)_154092209dup + - token: g.2087948_2087938dup + +genomic_duplication_range: + should_match: + - token: g.(?_30417576)_(31394018_?)dup + - token: g.(31060227_31100351)_(33274278_33417151)dup + - token: g.(?_154021812)_154092209dup + should_not_match: + - token: g.2087938_2087948dup + - token: g.49531262dup + - token: g.(?_31394018)_(_30417576_?)dup + - token: g.(31060227_33274278)_(31100351_33417151)dup + - token: g.(?_154092209)_154021812dup + +genomic_deletion_range: + should_match: + - token: g.(31060227_31100351)_(33274278_33417151)del + should_not_match: + - token: g.(?_31120496)_(33339477_?)del + - token: g.(4_3)_(2_1)del + 
- token: g.(x_31100351)_(33274278_33417151)del + - token: g.(31100351_31060227)_(33274278_33417151)del diff --git a/tests/fixtures/translators.yml b/tests/fixtures/translators.yml index 4812be9e..9a82ccff 100644 --- a/tests/fixtures/translators.yml +++ b/tests/fixtures/translators.yml @@ -3,87 +3,92 @@ amino_acid_substitution: - query: BRAF V600E variations: [ { - "id": "ga4gh:VA.u6sKlz0mMQvARmrlnt0Aksz6EbSkmL8z", + "_id": "ga4gh:VA.7ys8TiDzrk04O3Upd63__rOBCEhv3P5d", "location": { + "_id": "ga4gh:VSL.Vxqx2bv42rWeu08Eg7JpkdQkMCNLskoz", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number"}, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.ZJwurRo2HLY018wghYjDKSfIlEH0Y8At", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.mJbjSsW541oOsOtBoX36Mppr6hMjbjFr", + "_id": "ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", "location": { + "_id": "ga4gh:VSL.2cHIgn7iLKk4x9z3zLkSTTFMV0e48DR4", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.9dA0egRAIfVFDL1sdU1VP7HsBcG0-DtE", + "_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", "location": { + "_id": "ga4gh:VSL.AqrQ-EkAvTrXOFn70_8i3dXF5shBBZ5i", "interval": { - "start": 639, - "end": 640, - "type": "SimpleInterval" + "start": { "value": 639, "type": "Number" }, + "end": { "value": 640, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", "type": "SequenceLocation" }, "state": { 
"sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.fC1iEWNGM54CnSI3VDiW9h1jETr1NCmI", + "_id": "ga4gh:VA.vimwyw0pFTwatfFhi3rhhb153ARWsPrW", "location": { + "_id": "ga4gh:VSL.FVmsWpfSOA3B2ryq0k995oHMuSGiFvMa", "interval": { - "start": 599, - "end": 600, - "type": "SimpleInterval" + "start": { "value": 599, "type": "Number" }, + "end": { "value": 600, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.lKdPZpuT-VNvRuKDjsUItNgutfWYgWQd", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.2A1O22DON5OKrUXyniXurZuogFLuhGvG", + "_id": "ga4gh:VA.FzlrH5feNcQ3S9GayMU9EF008j-8Pbz5", "location": { + "_id": "ga4gh:VSL.QDLST2nKpPWwIArdO57L2VIWPNZ0DiN3", "interval": { - "start": 599, - "end": 600, - "type": "SimpleInterval" + "start": { "value": 599, "type": "Number" }, + "end": { "value": 600, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.0Q-SgJX1V3seUUIu3qVUtEa55CQsGmEU", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -91,87 +96,92 @@ amino_acid_substitution: - query: braf v600e variations: [ { - "id": "ga4gh:VA.u6sKlz0mMQvARmrlnt0Aksz6EbSkmL8z", + "_id": "ga4gh:VA.7ys8TiDzrk04O3Upd63__rOBCEhv3P5d", "location": { + "_id": "ga4gh:VSL.Vxqx2bv42rWeu08Eg7JpkdQkMCNLskoz", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.ZJwurRo2HLY018wghYjDKSfIlEH0Y8At", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.mJbjSsW541oOsOtBoX36Mppr6hMjbjFr", + "_id": 
"ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", "location": { + "_id": "ga4gh:VSL.2cHIgn7iLKk4x9z3zLkSTTFMV0e48DR4", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.9dA0egRAIfVFDL1sdU1VP7HsBcG0-DtE", + "_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", "location": { + "_id": "ga4gh:VSL.AqrQ-EkAvTrXOFn70_8i3dXF5shBBZ5i", "interval": { - "start": 639, - "end": 640, - "type": "SimpleInterval" + "start": { "value": 639, "type": "Number" }, + "end": { "value": 640, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.fC1iEWNGM54CnSI3VDiW9h1jETr1NCmI", + "_id": "ga4gh:VA.vimwyw0pFTwatfFhi3rhhb153ARWsPrW", "location": { + "_id": "ga4gh:VSL.FVmsWpfSOA3B2ryq0k995oHMuSGiFvMa", "interval": { - "start": 599, - "end": 600, - "type": "SimpleInterval" + "start": { "value": 599, "type": "Number" }, + "end": { "value": 600, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.lKdPZpuT-VNvRuKDjsUItNgutfWYgWQd", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.2A1O22DON5OKrUXyniXurZuogFLuhGvG", + "_id": "ga4gh:VA.FzlrH5feNcQ3S9GayMU9EF008j-8Pbz5", "location": { + "_id": "ga4gh:VSL.QDLST2nKpPWwIArdO57L2VIWPNZ0DiN3", "interval": { - "start": 599, - "end": 600, - "type": "SimpleInterval" + "start": { "value": 599, "type": "Number" }, + "end": { "value": 600, "type": 
"Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.0Q-SgJX1V3seUUIu3qVUtEa55CQsGmEU", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -179,36 +189,38 @@ amino_acid_substitution: - query: NP_004324.2:p.Val600Glu variations: [ { - "id": "ga4gh:VA.mJbjSsW541oOsOtBoX36Mppr6hMjbjFr", + "_id": "ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", "location": { + "_id": "ga4gh:VSL.2cHIgn7iLKk4x9z3zLkSTTFMV0e48DR4", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.9dA0egRAIfVFDL1sdU1VP7HsBcG0-DtE", + "_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", "location": { + "_id": "ga4gh:VSL.AqrQ-EkAvTrXOFn70_8i3dXF5shBBZ5i", "interval": { - "start": 639, - "end": 640, - "type": "SimpleInterval" + "start": { "value": 639, "type": "Number" }, + "end": { "value": 640, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -220,19 +232,20 @@ polypeptide_truncation: - query: NP_000542.1:p.Tyr185Ter (p.Tyr185*) variations: [ { - "id": "ga4gh:VA.5Zx8fM1_wE3T_DFPbJgEe5CD-youM0op", + "_id": "ga4gh:VA._S0nFwX4Y2FPmv5Radf01DAsxQbxA2cc", "location": { + "_id": "ga4gh:VSL._P3rBWI3f7OBs3a4gvZ18QJ6f6dSfqEQ", "interval": { - "start": 184, - "end": 185, - "type": "SimpleInterval" + "start": { "value": 184, "type": "Number" }, + "end": { "value": 185, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": 
"ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "*", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -240,19 +253,20 @@ polypeptide_truncation: - query: NP_000542.1:p.Tyr185Ter variations: [ { - "id": "ga4gh:VA.5Zx8fM1_wE3T_DFPbJgEe5CD-youM0op", + "_id": "ga4gh:VA._S0nFwX4Y2FPmv5Radf01DAsxQbxA2cc", "location": { + "_id": "ga4gh:VSL._P3rBWI3f7OBs3a4gvZ18QJ6f6dSfqEQ", "interval": { - "start": 184, - "end": 185, - "type": "SimpleInterval" + "start": { "value": 184, "type": "Number" }, + "end": { "value": 185, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "*", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -260,19 +274,20 @@ polypeptide_truncation: - query: NP_000542.1:p.(Tyr185Ter) variations: [ { - "id": "ga4gh:VA.5Zx8fM1_wE3T_DFPbJgEe5CD-youM0op", + "_id": "ga4gh:VA._S0nFwX4Y2FPmv5Radf01DAsxQbxA2cc", "location": { + "_id": "ga4gh:VSL._P3rBWI3f7OBs3a4gvZ18QJ6f6dSfqEQ", "interval": { - "start": 184, - "end": 185, - "type": "SimpleInterval" + "start": { "value": 184, "type": "Number" }, + "end": { "value": 185, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "*", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -280,19 +295,20 @@ polypeptide_truncation: - query: NP_000539.2:p.Gln1178Ter variations: [ { - "id": "ga4gh:VA.za7tYoH27J_IdV6xlDcAWimFAQ4ORugp", + "_id": "ga4gh:VA.RZQfQfXPXMjLW1V2GU2r7UIqkoaYSLPw", "location": { + "_id": "ga4gh:VSL.0_xIolylyTbss2s-6KnK14dXW_xKzjG1", "interval": { - "start": 1177, - "end": 1178, - "type": "SimpleInterval" + "start": { "value": 1177, "type": "Number" }, + "end": { "value": 1178, "type": "Number" }, + "type": "SequenceInterval" 
}, "sequence_id": "ga4gh:SQ.OBQRUmiVewAVPYXV5ACyxczHF1Q4YGOm", "type": "SequenceLocation" }, "state": { "sequence": "*", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -304,19 +320,20 @@ silent_mutation: - query: NP_000542.1:p.Pro154= variations: [ { - "id": "ga4gh:VA.rVexjvD44WBTwBulB5OayUPby5y6Px1H", + "_id": "ga4gh:VA.wQlQ-gnmRFPk4mNEHARBiFDNEbwS-ndE", "location": { + "_id": "ga4gh:VSL.0ShyMtnd7c00rtL9ugYd_NRYORNnSSZ_", "interval": { - "start": 153, - "end": 154, - "type": "SimpleInterval" + "start": { "value": 153, "type": "Number" }, + "end": { "value": 154, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "P", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -324,19 +341,20 @@ silent_mutation: - query: NP_000542.1:p.Pro61= variations: [ { - "id": "ga4gh:VA.LBNTm7QqFZp1alJHaFKlKuRY9cOfdHeI", + "_id": "ga4gh:VA.S1GX6EwJV3exmJAH8MnxS8-S9J4i2Ip_", "location": { + "_id": "ga4gh:VSL.zuNGmA02Uq49faqvCIPtwVrF_IJuP4dM", "interval": { - "start": 60, - "end": 61, - "type": "SimpleInterval" + "start": { "value": 60, "type": "Number" }, + "end": { "value": 61, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "P", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -344,19 +362,20 @@ silent_mutation: - query: NP_000542.1:p.Glu55= variations: [ { - "id": "ga4gh:VA.wDFRtlaQa8RY5lYKlEXECYM23g_95yUu", + "_id": "ga4gh:VA.F0W_HqW1omXVMhVHFxh9PyeawcUrV74T", "location": { + "_id": "ga4gh:VSL.HXUFIsiGWasR2zYnKfRcOsQ7lcHtrVHN", "interval": { - "start": 54, - "end": 55, - "type": "SimpleInterval" + "start": { "value": 54, "type": "Number" }, + "end": { "value": 55, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": 
"ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -367,36 +386,38 @@ coding_dna_substitution: - query: NM_004333.4:c.1799T>A variations: [ { - "id": "ga4gh:VA.4p_TTN1dPvgglFGsg8hxDO2FDwh5BYbe", + "_id": "ga4gh:VA.1WMCRA6pQbY6k2plXyZmadcmgTW3kkwv", "location": { + "_id": "ga4gh:VSL.o8cm5c0c_hfTyT-4h0Z8UdEi9vUzIa3o", "interval": { - "start": 1859, - "end": 1860, - "type": "SimpleInterval" + "start": { "value": 1859, "type": "Number" }, + "end": { "value": 1860, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.jkiXxxRjK7uTMiW2KQFjpgvF3VQi-HhX", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.19rEOp0IBkrDkUA4gwwM-4Gde08-kBb1", + "_id": "ga4gh:VA.AfzMBlMIDLDZNjEYEhVTH-KWxq7lAN-B", "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "start": 2144, - "end": 2145, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2145, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -404,36 +425,38 @@ coding_dna_substitution: - query: ENST00000288602.10:c.1799T>A variations: [ { - "id": "ga4gh:VA.B159aDjjLVYYuDuTeY6foJrtX20thzP1", + "_id": "ga4gh:VA.mOTittwHJ9l78td6rbWLZ4QzGvN71bHj", "location": { + "_id": "ga4gh:VSL.Di2zK_IxmrGYgC15qZ8cMtghT0aeV0uR", "interval": { - "start": 1859, - "end": 1860, - "type": "SimpleInterval" + "start": { "value": 1859, "type": "Number" }, + "end": { "value": 1860, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.VTW7KhA6-0s3_nxgkGq05eUGiDFnItW0", "type": 
"SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.19rEOp0IBkrDkUA4gwwM-4Gde08-kBb1", + "_id": "ga4gh:VA.AfzMBlMIDLDZNjEYEhVTH-KWxq7lAN-B", "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "start": 2144, - "end": 2145, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2145, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -446,36 +469,38 @@ genomic_substitution: - query: NC_000007.13:g.140453136A>T variations: [ { - "id": "ga4gh:VA.y3OnKtG-zUM-LTJHOg7IsLVfm8x3PNLI", + "_id": "ga4gh:VA.jCQx4yBcU6u6u1RcT9Tp0PjhaQ6ynicY", "location": { + "_id": "ga4gh:VSL.8cQ9y-2J75mg5ioJvbqtgwiskdUV4zuO", "interval": { - "start": 140453135, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453135, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.HaPTmn-rrjRoZnIVw1I4AZPa6YHa2ojh", + "_id": "ga4gh:VA.fZiBjQEolbkL0AxjoTZf4SOkFy9J0ebU", "location": { + "_id": "ga4gh:VSL.zga82-TpYiNmBESCfvDvAz9DyvJF98I-", "interval": { - "start": 140753335, - "end": 140753336, - "type": "SimpleInterval" + "start": { "value": 140753335, "type": "Number" }, + "end": { "value": 140753336, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, 
"type": "Allele" } @@ -483,36 +508,38 @@ genomic_substitution: - query: BRAF g.140453136A>T variations: [ { - "id": "ga4gh:VA.y3OnKtG-zUM-LTJHOg7IsLVfm8x3PNLI", + "_id": "ga4gh:VA.jCQx4yBcU6u6u1RcT9Tp0PjhaQ6ynicY", "location": { + "_id": "ga4gh:VSL.8cQ9y-2J75mg5ioJvbqtgwiskdUV4zuO", "interval": { - "start": 140453135, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453135, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.19rEOp0IBkrDkUA4gwwM-4Gde08-kBb1", + "_id": "ga4gh:VA.AfzMBlMIDLDZNjEYEhVTH-KWxq7lAN-B", "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "start": 2144, - "end": 2145, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2145, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -520,36 +547,38 @@ genomic_substitution: - query: BRAF V600E g.140453136A>T variations: [ { - "id": "ga4gh:VA.y3OnKtG-zUM-LTJHOg7IsLVfm8x3PNLI", + "_id": "ga4gh:VA.jCQx4yBcU6u6u1RcT9Tp0PjhaQ6ynicY", "location": { + "_id": "ga4gh:VSL.8cQ9y-2J75mg5ioJvbqtgwiskdUV4zuO", "interval": { - "start": 140453135, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453135, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": 
"ga4gh:VA.19rEOp0IBkrDkUA4gwwM-4Gde08-kBb1", + "_id": "ga4gh:VA.AfzMBlMIDLDZNjEYEhVTH-KWxq7lAN-B", "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "start": 2144, - "end": 2145, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2145, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -560,36 +589,38 @@ coding_dna_silent_mutation: - query: NM_004333.4:c.1799= variations: [ { - "id": "ga4gh:VA.JTfb-8jCxV7bMjJitDqR2xu4Q-Rvbc7q", + "_id": "ga4gh:VA.95ovl8XYrKnRYadIt3dzJPrlrbDLDcmY", "location": { + "_id": "ga4gh:VSL.o8cm5c0c_hfTyT-4h0Z8UdEi9vUzIa3o", "interval": { - "start": 1859, - "end": 1860, - "type": "SimpleInterval" + "start": { "value": 1859, "type": "Number" }, + "end": { "value": 1860, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.jkiXxxRjK7uTMiW2KQFjpgvF3VQi-HhX", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.bVNMOANetNE2z4PZ1j0DmwUL1rULmqkN", + "_id": "ga4gh:VA.9wvlCJDeaw5HxwmUJg8qkcoUoT4A3azR", "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "start": 2144, - "end": 2145, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2145, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -600,36 +631,38 @@ genomic_silent_mutation: - query: NC_000007.13:g.140453136= variations: [ { - "id": "ga4gh:VA.d_gv3DxemrrINBM8LnjHLCX7CScUbfbZ", + "_id": 
"ga4gh:VA.xFpHP8_VHxMxnVe6IcRl7vNqcttDbGC9", "location": { + "_id": "ga4gh:VSL.8cQ9y-2J75mg5ioJvbqtgwiskdUV4zuO", "interval": { - "start": 140453135, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453135, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.VkKteRkWR9MncFR3j4EICtAfdD4ZwR-1", + "_id": "ga4gh:VA.aMwnr5rEbtPQe5gXDDO2gZO_zSqN2RmH", "location": { + "_id": "ga4gh:VSL.zga82-TpYiNmBESCfvDvAz9DyvJF98I-", "interval": { - "start": 140753335, - "end": 140753336, - "type": "SimpleInterval" + "start": { "value": 140753335, "type": "Number" }, + "end": { "value": 140753336, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -637,53 +670,56 @@ genomic_silent_mutation: - query: BRAF g.140453136= variations: [ { - "id": "ga4gh:VA.z65Ys4mFA_ViSVPEr5SPCb8Q0n_cegHq", + "_id": "ga4gh:VA.V_nRtVHF8B3fRjjzrLHQqNj25kbHKOXk", "location": { + "_id": "ga4gh:VSL.u7Zmqs2E7g3P-DjFZwCGc_jAXVtgrXKC", "interval": { - "start": 140453135, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453135, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.d_gv3DxemrrINBM8LnjHLCX7CScUbfbZ", + "_id": "ga4gh:VA.xFpHP8_VHxMxnVe6IcRl7vNqcttDbGC9", "location": { + "_id": 
"ga4gh:VSL.8cQ9y-2J75mg5ioJvbqtgwiskdUV4zuO", "interval": { - "start": 140453135, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453135, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.bVNMOANetNE2z4PZ1j0DmwUL1rULmqkN", + "_id": "ga4gh:VA.9wvlCJDeaw5HxwmUJg8qkcoUoT4A3azR", "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "start": 2144, - "end": 2145, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2145, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -694,36 +730,38 @@ amino_acid_delins: - query: NP_001333827.1:p.Leu747_Thr751delinsPro variations: [ { - "id": "ga4gh:VA.4I9gnKdjPjtnlQyzMF6aI-gyXollCMkQ", + "_id": "ga4gh:VA.9uOHyaYzdfy0X5eauJulmEkGLR_ZOLhq", "location": { + "_id": "ga4gh:VSL.pZJXY_-oyY0tNuQzXl9n-eZxGSZnulKJ", "interval": { - "end": 751, - "start": 746, - "type": "SimpleInterval" + "end": { "value": 751, "type": "Number" }, + "start": { "value": 746, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.OWicZUhcw_9nRH2SQjawk7BAEkrUIe__", "type": "SequenceLocation" }, "state": { "sequence": "P", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.drLuUW5T542RCeDlVo4zbQ-_tcAiEnb6", + "_id": "ga4gh:VA.eDMXxJw9shlSKF3znIg5abniGoyJ3GQ4", "location": { - "interval": { - "end": 751, - "start": 746, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", - 
"type": "SequenceLocation" + "_id": "ga4gh:VSL.Mm8duqYDJyel5ZnwScnxLyGH1i9lcl3T", + "interval": { + "end": { "value": 751, "type": "Number" }, + "start": { "value": 746, "type": "Number" }, + "type": "SequenceInterval" + }, + "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", + "type": "SequenceLocation" }, "state": { "sequence": "P", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -731,104 +769,110 @@ amino_acid_delins: - query: ERBB2 G776delinsVC variations: [ { - "id": "ga4gh:VA.J52bVcfOIg2AWOmsq3yy3s4FiOD1t7d3", + "_id": "ga4gh:VA.Ewl4g4FUaCiuGcgX7-PbnS-lMGJdxJz-", "location": { + "_id": "ga4gh:VSL._gHhj5CL2_O6rwya-XkoAaGVU8vGYNkR", "interval": { - "end": 776, - "start": 775, - "type": "SimpleInterval" + "end": { "value": 776, "type": "Number" }, + "start": { "value": 775, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.fuC5BAgMxoOgTKKQTpwtT807ZVF2-zdF", "type": "SequenceLocation" }, "state": { "sequence": "VC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.gex4Elp4SARiLeNqSL9QdQcFLjZkVBAv", + "_id": "ga4gh:VA.NCimxZ1kNB-cBQguf3wuEjuxxviBZVPC", "location": { + "_id": "ga4gh:VSL.LomAULC8vNGNySvNtA4wMgBTuRLZRaB9", "interval": { - "end": 776, - "start": 775, - "type": "SimpleInterval" + "end": { "value": 776, "type": "Number" }, + "start": { "value": 775, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.AF1UFydIo02-bMplonKSfxlWY2q6ze3m", "type": "SequenceLocation" }, "state": { "sequence": "VC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.QSwqvdOkeIYBa2WDPB5-huCtSWjdqwWV", + "_id": "ga4gh:VA.wNZgzeyipGMGzs9z5lLps1bg4ZlpOZJ1", "location": { + "_id": "ga4gh:VSL.u2UXOjYF9xrSFmHIKf5DP7N9wieIBTva", "interval": { - "end": 776, - "start": 775, - "type": "SimpleInterval" + "end": { "value": 776, "type": "Number" }, + "start": { "value": 775, 
"type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.attSSnFSyeB6FFNsZL1uulpdISBVBuXQ", "type": "SequenceLocation" }, "state": { "sequence": "VC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.YnNGQXyfmr40IbT_6UgeSWHu4M7JdER_", + "_id": "ga4gh:VA.ehlZxghDc8OGl0-BMMcn0LcdLxVaEzzI", "location": { + "_id": "ga4gh:VSL.wICyoUlA7PIL5hPAU84qg8krqCFR6mlo", "interval": { - "end": 776, - "start": 775, - "type": "SimpleInterval" + "end": { "value": 776, "type": "Number" }, + "start": { "value": 775, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.gWQeaH0zeaCffRawhGhXkDeP3ZcPIlyF", "type": "SequenceLocation" }, "state": { "sequence": "VC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.GEkuDsNvEvb7qZtc50-JkdjzOp2FMuAo", + "_id": "ga4gh:VA.5j4lZT0FFV7AZfB74eVDjQJMX2T3sL9g", "location": { + "_id": "ga4gh:VSL.maXN0JgJxK39ndNim6VKLcYGofVG3JGm", "interval": { - "end": 776, - "start": 775, - "type": "SimpleInterval" + "end": { "value": 776, "type": "Number" }, + "start": { "value": 775, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.9DqU06SJwLX1WjvlmcZFBZAbRFojVMpp", "type": "SequenceLocation" }, "state": { "sequence": "VC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.bRcsOV_K2Gj56_9HDK34orC8LYvkSRoH", + "_id": "ga4gh:VA.z586Oz2qJ8F53EdsQL9O7r1Vj_1svHnS", "location": { + "_id": "ga4gh:VSL.vDHLwg6T-aZqvpA3MOqnOxeTfxT4h8ZM", "interval": { - "end": 776, - "start": 775, - "type": "SimpleInterval" + "end": { "value": 776, "type": "Number" }, + "start": { "value": 775, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.oDpVzgNlz91eIwDnT1Boi7vgtCMdhKG8", "type": "SequenceLocation" }, "state": { "sequence": "VC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ 
-839,36 +883,38 @@ coding_dna_delins: - query: NM_001289937.1:c.2326_2327delinsCT variations: [ { - "id": "ga4gh:VA._Prdq6s4-XaiLyu7-xPxDCB4cWz9tyaq", + "_id": "ga4gh:VA.-7PBIn7cC4E0BqNFyG9pwNQRAAhQy622", "location": { + "_id": "ga4gh:VSL.J-2QepZ_NHb8qVUJAwSD6nY-KAxtntv8", "interval": { - "start": 2586, - "end": 2588, - "type": "SimpleInterval" + "start": { "value": 2586, "type": "Number" }, + "end": { "value": 2588, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.Djq77klvL3m3p0xzraFfQPyuFhFgN_9w", "type": "SequenceLocation" }, "state": { "sequence": "CT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.sSFX2CO2DPTvE4MqnJ5VifnaQOGS0CVb", + "_id": "ga4gh:VA.eMxxAEjNduAvg5U3eBZxf0nLtfcMNxqy", "location": { + "_id": "ga4gh:VSL.bBzTvpLChbWE2SZ7X0drm8NQj5rzNqTK", "interval": { - "start": 2500, - "end": 2502, - "type": "SimpleInterval" + "start": { "value": 2500, "type": "Number" }, + "end": { "value": 2502, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", "type": "SequenceLocation" }, "state": { "sequence": "CT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -876,36 +922,38 @@ coding_dna_delins: - query: ENST00000256474.2:c.364_365delinsAT variations: [ { - "id": "ga4gh:VA.W-HTLniCyKtrRfozCtVVHI60K4qlAWvI", + "_id": "ga4gh:VA.ngE81Y7VVehqTBVxlav-xG36NoqGGJ5T", "location": { + "_id": "ga4gh:VSL.-_kxnVYdh48LHnlFGOaPFss0OgO4aSjB", "interval": { - "start": 1203, - "end": 1205, - "type": "SimpleInterval" + "start": { "value": 1203, "type": "Number" }, + "end": { "value": 1205, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.guuEF-tM_kFZodRliLEI35w_k-DqTaDs", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.BIqf-VS950ZggNaA2bHKni-KrxJmeBcM", 
+ "_id": "ga4gh:VA.TYDT7elgjDiI1dQjstKncQHhLmdueeEw", "location": { + "_id": "ga4gh:VSL.gp5fjzoxf2Ctl7mQcYGfLj-9hmrhjJet", "interval": { - "start": 433, - "end": 435, - "type": "SimpleInterval" + "start": { "value": 433, "type": "Number" }, + "end": { "value": 435, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -913,36 +961,38 @@ coding_dna_delins: - query: NM_000551.3:c.615delinsAA variations: [ { - "id": "ga4gh:VA.HUkoxsBkK9HpQwOxjuhKu1iQ_IbRMN2i", + "_id": "ga4gh:VA.Y-iW9HzF98syXebXqVAnIKhtpwCWBswe", "location": { + "_id": "ga4gh:VSL.-vLzLKeWNVum4KmUWNFjQ0PLT7Cv0Xdj", "interval": { - "start": 827, - "end": 828, - "type": "SimpleInterval" + "start": { "value": 827, "type": "Number" }, + "end": { "value": 828, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.SjJnUcJL1EyRFUb6f8PSJA4u3fyin2Wj", + "_id": "ga4gh:VA._JN_AF5PO9kWKgKxB5T48cypZl7ccEsQ", "location": { + "_id": "ga4gh:VSL.tQRFfWMPPHErWSbKvoEIh2gI2ehHIYAs", "interval": { - "start": 684, - "end": 685, - "type": "SimpleInterval" + "start": { "value": 684, "type": "Number" }, + "end": { "value": 685, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -950,70 +1000,74 @@ coding_dna_delins: - query: LRG_542:c.900_901delinsAC variations: [ { - "id": "ga4gh:VA._sVoxX8FHXXt5c0D6kwAUpudvhF7Hdvz", + "_id": "ga4gh:VA.SWkjzPhH7x8ZjjQerjtvBYUoWVnit3hR", "location": { + "_id": 
"ga4gh:VSL.b6Kqyg7cbZeffAIlEeODzin4bhD9Wu-3", "interval": { - "start": 1212, - "end": 1214, - "type": "SimpleInterval" + "start": { "value": 1212, "type": "Number" }, + "end": { "value": 1214, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.U_uwMjCJru2vU7eBuynumaQJqT0O6cIV", "type": "SequenceLocation" }, "state": { "sequence": "AC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.t01WNVXlnZQNW9ONzHlVDH3d-ZwknCvs", + "_id": "ga4gh:VA.MMNikmGyviXQcSxByO0_gBkWP47MoE5m", "location": { + "_id": "ga4gh:VSL.rM8zd6ur3O_L-4QQbab64Of4k8aY2oNl", "interval": { - "start": 1218, - "end": 1220, - "type": "SimpleInterval" + "start": { "value": 1218, "type": "Number" }, + "end": { "value": 1220, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.CMHotHdJwhFah2_gNdlwU7DTT4yv-x_1", "type": "SequenceLocation" }, "state": { "sequence": "AC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.1xqhiy3sAyIjgDsv1qArGjtNiwMAuVKT", + "_id": "ga4gh:VA.G7R1jWMgPBDjGNKpjv7bLdaz-yGMMqdr", "location": { + "_id": "ga4gh:VSL.-UM7jf9tD6UAZmNUadMN3dZznWVb_U0C", "interval": { - "start": 1196, - "end": 1198, - "type": "SimpleInterval" + "start": { "value": 1196, "type": "Number" }, + "end": { "value": 1198, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.xpx15sumYFV5iNvkVnWPR4YQFIzHqXj9", "type": "SequenceLocation" }, "state": { "sequence": "AC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.3Vx2HkpFxw-xpaPADt9zCYAHIGqwWLMl", + "_id": "ga4gh:VA.wCwdKEmS_dZ4JwBcKdrFend2VAKlZ3so", "location": { + "_id": "ga4gh:VSL.wzF5mgN7l4RkfKCax0K2LZyU5_JiKzGb", "interval": { - "start": 1295, - "end": 1296, - "type": "SimpleInterval" + "start": { "value": 1295, "type": "Number" }, + "end": { "value": 1296, "type": "Number" }, + "type": "SequenceInterval" }, 
"sequence_id": "ga4gh:SQ.U_uwMjCJru2vU7eBuynumaQJqT0O6cIV", "type": "SequenceLocation" }, "state": { "sequence": "C", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1024,36 +1078,38 @@ genomic_delins: - query: NC_000007.13:g.140453135_140453136delinsAT variations: [ { - "id": "ga4gh:VA.mFIMSODRDoCuDbiOjoCHmOEGgvebdt2T", + "_id": "ga4gh:VA.6qFzaiSaR9Oa4h0fIXo2ZJBtmkghsIQh", "location": { + "_id": "ga4gh:VSL.EMgfC1GciHKDq6cifF0GEm7zyPF6tbQt", "interval": { - "start": 140453134, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453134, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.5cmrRcsVIUBiFii-tOHhxNPNA3OB69fe", + "_id": "ga4gh:VA.mlJVnI7js6Tsb2GSLFlNRbCKE9zFRX5p", "location": { + "_id": "ga4gh:VSL.b0Ldj2KcT2k0n0PZfqHCBH1YzQZYceYX", "interval": { - "start": 140753334, - "end": 140753336, - "type": "SimpleInterval" + "start": { "value": 140753334, "type": "Number" }, + "end": { "value": 140753336, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1061,53 +1117,56 @@ genomic_delins: - query: BRAF g.140453135_140453136delinsAT variations: [ { - "id": "ga4gh:VA.mFIMSODRDoCuDbiOjoCHmOEGgvebdt2T", + "_id": "ga4gh:VA.6qFzaiSaR9Oa4h0fIXo2ZJBtmkghsIQh", "location": { + "_id": "ga4gh:VSL.EMgfC1GciHKDq6cifF0GEm7zyPF6tbQt", "interval": { - "start": 140453134, - "end": 140453136, - "type": "SimpleInterval" + "start": { "value": 140453134, "type": "Number" }, + "end": { "value": 140453136, "type": "Number" }, + "type": 
"SequenceInterval" }, "sequence_id": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA._GzAG8_K8YwcYQk6bEvINNGM_hEViytU", + "_id": "ga4gh:VA.4387UZ6Yssh3XCGKjm71z_WtadpBZT3O", "location": { + "_id": "ga4gh:VSL.6PeoFwkO4ISmUjDWoYLkVsATVx8JRApd", "interval": { - "start": 2144, - "end": 2146, - "type": "SimpleInterval" + "start": { "value": 2144, "type": "Number" }, + "end": { "value": 2146, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.vNNHkZaZK8YROmOWoKq2keDJ1Co9DwF3", + "_id": "ga4gh:VA.NKePityTpOgvs0Nig8TVjtMsvPrzWbfk", "location": { + "_id": "ga4gh:VSL.krAK_8hf67OTr6PM6fr9pcp2V8FC9Qim", "interval": { - "start": 140453134, - "end": 140453135, - "type": "SimpleInterval" + "start": { "value": 140453134, "type": "Number" }, + "end": { "value": 140453135, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1115,19 +1174,20 @@ genomic_delins: - query: NC_000003.12:g.10149938delinsAA variations: [ { - "id": "ga4gh:VA.-f07PmCJYlYbzBHVUQzDYWguUU4usche", + "_id": "ga4gh:VA.ndCPwzek-KU626kK28bQd1gWAAk2ELze", "location": { + "_id": "ga4gh:VSL.mVulVOKoX2frLH1XTIJfpvJa6RGdOVu_", "interval": { - "start": 10149937, - "end": 10149938, - "type": "SimpleInterval" + "start": { "value": 10149937, "type": "Number" }, + "end": { "value": 10149938, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", "type": "SequenceLocation" }, "state": 
{ "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1135,53 +1195,56 @@ genomic_delins: - query: VHL g.10149938delinsAA variations: [ { - "id": "ga4gh:VA.-f07PmCJYlYbzBHVUQzDYWguUU4usche", + "_id": "ga4gh:VA.ndCPwzek-KU626kK28bQd1gWAAk2ELze", "location": { + "_id": "ga4gh:VSL.mVulVOKoX2frLH1XTIJfpvJa6RGdOVu_", "interval": { - "start": 10149937, - "end": 10149938, - "type": "SimpleInterval" + "start": { "value": 10149937, "type": "Number" }, + "end": { "value": 10149938, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.SjJnUcJL1EyRFUb6f8PSJA4u3fyin2Wj", + "_id": "ga4gh:VA._JN_AF5PO9kWKgKxB5T48cypZl7ccEsQ", "location": { + "_id": "ga4gh:VSL.tQRFfWMPPHErWSbKvoEIh2gI2ehHIYAs", "interval": { - "start": 684, - "end": 685, - "type": "SimpleInterval" + "start": { "value": 684, "type": "Number" }, + "end": { "value": 685, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.6QID6Lckjhe-nfdU_jOX0pv23-6TTmBh", + "_id": "ga4gh:VA.KVKcV8JADfndsT23v8btal9kIEEeq7YG", "location": { + "_id": "ga4gh:VSL.t_BjgkkJm-CxdHkW0HUWLUBuQMutb1VZ", "interval": { - "start": 10149937, - "end": 10149938, - "type": "SimpleInterval" + "start": { "value": 10149937, "type": "Number" }, + "end": { "value": 10149938, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1192,19 +1255,20 @@ 
amino_acid_deletion: - query: NP_000542.1:p.Phe76del variations: [ { - "id": "ga4gh:VA.P6K_G0ZM2KZdvCfG0U0P_OtHgGxZfdd9", + "_id": "ga4gh:VA.92AfH5KBJzL4d38wPE9lfhwRwxD67uZq", "location": { + "_id": "ga4gh:VSL.-URUTHHESJWil6xz0wlU5-4HJsVV3Xs2", "interval": { - "end": 76, - "start": 75, - "type": "SimpleInterval" + "end": { "value": 76, "type": "Number" }, + "start": { "value": 75, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1212,19 +1276,20 @@ amino_acid_deletion: - query: NP_000542.1:p.Arg82_Val84del variations: [ { - "id": "ga4gh:VA.1ZjAyIGY2Uwb4_ItKPII33nGbvrtAQmL", + "_id": "ga4gh:VA.Qx37r00Ro1VIZxrrhs548BVwGnDQ0xXf", "location": { + "_id": "ga4gh:VSL.lVF6GOyLpyG_VpwFCQjp9pWtRsNZsKtJ", "interval": { - "end": 84, - "start": 81, - "type": "SimpleInterval" + "end": { "value": 84, "type": "Number" }, + "start": { "value": 81, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1235,36 +1300,38 @@ coding_dna_deletion: - query: NM_004448.3:c.2263_2277delTTGAGGGAAAACACA variations: [ { - "id": "ga4gh:VA.IKpS_bt1Leeo23MFKPfwUG8brkFD4f9x", + "_id": "ga4gh:VA.flgLAEHLaG_O9m5D1FKsQ1P3dRN8NLce", "location": { + "_id": "ga4gh:VSL.eTNgV11ScIpSynz5cOU9Yd6RIiLiOWLw", "interval": { - "end": 2539, - "start": 2523, - "type": "SimpleInterval" + "end": { "value": 2539, "type": "Number" }, + "start": { "value": 2523, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.3Ob8qgT17gn62p5Yhdy5yoCacMqHcGBG", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, 
{ - "id": "ga4gh:VA.pMmAfNtE5g3O3gZ6E9e-uyXJtwbN9whN", + "_id": "ga4gh:VA.tMjlwNf2mYOKPbXXwGo4IKd_OtHuVfMT", "location": { + "_id": "ga4gh:VSL.3uPWAjsdzd8MbAqw8DV46eBLK8tQRyEs", "interval": { - "end": 2453, - "start": 2437, - "type": "SimpleInterval" + "end": { "value": 2453, "type": "Number" }, + "start": { "value": 2437, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1275,36 +1342,38 @@ genomic_deletion: - query: NC_000003.11:g.10191486_10191487delAG variations: [ { - "id": "ga4gh:VA.PcUd92rbhoZxkQMNP9_UOOWHd19LAMdP", + "_id": "ga4gh:VA.2jURgc5DDHYWaUP1eea-sHucy1tza6Pv", "location": { + "_id": "ga4gh:VSL.WMs6q4Q2XWOPMa6ZMgof7xuhKN5nLsGn", "interval": { - "end": 10149803, - "start": 10149799, - "type": "SimpleInterval" + "end": { "value": 10149803, "type": "Number" }, + "start": { "value": 10149799, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.46ZSku6zg54ZWx0Lfp3azV3PV-a1iYbB", + "_id": "ga4gh:VA.VOHJmr3khwFkEKYOHugSr22OpF9KWWDH", "location": { + "_id": "ga4gh:VSL.eY-L18CsfltCfdtTwTPJA5Yd4yE3KNbA", "interval": { - "end": 10191487, - "start": 10191483, - "type": "SimpleInterval" + "end": { "value": 10191487, "type": "Number" }, + "start": { "value": 10191483, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1312,36 +1381,38 @@ genomic_deletion: - query: VHL g.10191486_10191487delAG variations: [ { - "id": 
"ga4gh:VA.cdEPsoFoVBbfs7d0-vBDV1_lIW15tnMI", + "_id": "ga4gh:VA.otmQbPfr2ogACdEmHPRsA4EksX6fKg_k", "location": { + "_id": "ga4gh:VSL.80ORox81ZbFEpgMvBZuaAkIpE5soWy0P", "interval": { - "end": 550, - "start": 546, - "type": "SimpleInterval" + "end": { "value": 550, "type": "Number" }, + "start": { "value": 546, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.46ZSku6zg54ZWx0Lfp3azV3PV-a1iYbB", + "_id": "ga4gh:VA.VOHJmr3khwFkEKYOHugSr22OpF9KWWDH", "location": { + "_id": "ga4gh:VSL.eY-L18CsfltCfdtTwTPJA5Yd4yE3KNbA", "interval": { - "end": 10191487, - "start": 10191483, - "type": "SimpleInterval" + "end": { "value": 10191487, "type": "Number" }, + "start": { "value": 10191483, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1354,19 +1425,20 @@ amino_acid_insertion: - query: NP_005219.2:p.Asp770_Asn771insGlyLeu variations: [ { - "id": "ga4gh:VA.d3dLtsYaLYE2Yh_GENvPUtTVZWlwLnJw", + "_id": "ga4gh:VA.t_WLqe5efVQlBmdbIBgqIeLRu2rSJDJJ", "location": { + "_id": "ga4gh:VSL.DJIP1jlxQIro1oC5re8txtH7N8vAvM7A", "interval": { - "end": 770, - "start": 770, - "type": "SimpleInterval" + "end": { "value": 770, "type": "Number" }, + "start": { "value": 770, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", "type": "SequenceLocation" }, "state": { "sequence": "GL", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -1374,87 +1446,92 @@ amino_acid_insertion: - query: BRAF T599_V600insV variations: [ { - "id": "ga4gh:VA.vYbwb3dHIv-W42hpCEoST-xZ8kCnA9Wh", + "_id": 
"ga4gh:VA.pD5cMR_LZRvCVwQvHcmNQx06vhfLPd2Z", "location": { + "_id": "ga4gh:VSL.Vxqx2bv42rWeu08Eg7JpkdQkMCNLskoz", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.ZJwurRo2HLY018wghYjDKSfIlEH0Y8At", "type": "SequenceLocation" }, "state": { "sequence": "VV", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.JIydJUdKHeViBxtdpGRnJjY6O6iP-B2h", + "_id": "ga4gh:VA.MtpVh_21Qq_y5ezNG-f7oMpWIG28q5lg", "location": { + "_id": "ga4gh:VSL.QDLST2nKpPWwIArdO57L2VIWPNZ0DiN3", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.0Q-SgJX1V3seUUIu3qVUtEa55CQsGmEU", "type": "SequenceLocation" }, "state": { "sequence": "VV", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.aueh1WPw5tborZe_uOxocslI8ABdhGcP", + "_id": "ga4gh:VA.4MnOqA77if6-fXQt8xYJwEBC0Vhd1Was", "location": { + "_id": "ga4gh:VSL.FVmsWpfSOA3B2ryq0k995oHMuSGiFvMa", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.lKdPZpuT-VNvRuKDjsUItNgutfWYgWQd", "type": "SequenceLocation" }, "state": { "sequence": "VV", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.tHKwuQkyozKBytcTNGPXdW_7cxgLzDFf", + "_id": "ga4gh:VA.nD-TPvy9klcXMMMsAPlFCkZv3MxpYAFN", "location": { + "_id": "ga4gh:VSL.2cHIgn7iLKk4x9z3zLkSTTFMV0e48DR4", "interval": { - "end": 600, - "start": 599, - "type": "SimpleInterval" + "end": { "value": 600, "type": "Number" }, + "start": { "value": 599, 
"type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", "type": "SequenceLocation" }, "state": { "sequence": "VV", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.34pPQFJdG6fJfK4g42sEUQZGELv3scbZ", + "_id": "ga4gh:VA.zaMDM5o3bzpsZw_ZohhXI691Ud92TmHW", "location": { + "_id": "ga4gh:VSL.AqrQ-EkAvTrXOFn70_8i3dXF5shBBZ5i", "interval": { - "end": 640, - "start": 639, - "type": "SimpleInterval" + "end": { "value": 640, "type": "Number" }, + "start": { "value": 639, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", "type": "SequenceLocation" }, "state": { "sequence": "VV", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -1465,38 +1542,40 @@ coding_dna_insertion: - query: NM_007294.3:c.2902_2903insTC variations: [ { - "id": "ga4gh:VA.ACiRpocqtheayrWgLa3H9fNz09h0QDDI", + "_id": "ga4gh:VA.jT_wa3AdO58vqwoGCMDCOgdo6xxIfFT0", "location": { + "_id": "ga4gh:VSL.r2xHtlVFCntMsE1cGPzvB1DsgWTY2OIA", "interval": { - "end": 3134, - "start": 3131, - "type": "SimpleInterval" + "end": { "value": 3134, "type": "Number" }, + "start": { "value": 3131, "type": "Number" }, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.jj1RAXMGdOU-D39IRYnDX_fXcM7LQvb6", "type": "SequenceLocation" }, "state": { "sequence": "CTCTC", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, { - "id": "ga4gh:VA.0-QxXjPJrsRwaYlTiZe9SpmIou5oh3EV", - "type": "Allele", - "location": { - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.oR9jzdHf6J23TozeyuvTXtwlm6PpUHHl", - "interval": { - "type": "SimpleInterval", - "start": 3012, - "end": 3015 - } - }, - "state": { - "type": "SequenceState", - "sequence": "CTCTC" + "_id": "ga4gh:VA.zMjbr4BnvjGp_ZfPktxTQlbcLJY16Lv8", + "type": "Allele", + "location": { + "_id": "ga4gh:VSL.ByWi966-x5k6kyDi8JxFyjP3PnTOE92D", + 
"type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.oR9jzdHf6J23TozeyuvTXtwlm6PpUHHl", + "interval": { + "type": "SequenceInterval", + "start": { "value": 3012, "type": "Number" }, + "end": { "value": 3015, "type": "Number" } } + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CTCTC" + } } ] @@ -1505,36 +1584,38 @@ genomic_insertion: - query: NC_000022.10:g.30051593_30051594insT variations: [ { - "id": "ga4gh:VA.yC7OlTrV1A1f6fLr2oRwpSWSL2_OeVz3", + "_id": "ga4gh:VA.YvY2mjFjnKULH4RCWFf7S9Ce_BRG3Jqn", "type": "Allele", "location": { + "_id": "ga4gh:VSL.qFuEOSyo-arCRC1yjVcUZMXybmaUq9P0", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8", "interval": { - "type": "SimpleInterval", - "start": 30051592, - "end": 30051593 + "type": "SequenceInterval", + "start": { "value": 30051592, "type": "Number" }, + "end": { "value": 30051593, "type": "Number" } } }, "state": { - "type": "SequenceState", + "type": "LiteralSequenceExpression", "sequence": "TT" } }, { - "id": "ga4gh:VA.jrTUS8iX0BH7b9lrZX1pFKislZRdjAa3", + "_id": "ga4gh:VA.6vU7_6BeiwUjY6kHVAoZkrlMsQ0Jjm3g", "type": "Allele", "location": { + "_id": "ga4gh:VSL.7qYBQbFB-QTWCElP1BMX7McXOYYW91go", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", "interval": { - "type": "SimpleInterval", - "start": 29655603, - "end": 29655604 + "type": "SequenceInterval", + "start": { "value": 29655603, "type": "Number" }, + "end": { "value": 29655604, "type": "Number" } } }, "state": { - "type": "SequenceState", + "type": "LiteralSequenceExpression", "sequence": "TT" } } @@ -1542,53 +1623,56 @@ genomic_insertion: - query: NF2 g.30051593_30051594insT variations: [ { - "id": "ga4gh:VA.yC7OlTrV1A1f6fLr2oRwpSWSL2_OeVz3", + "_id": "ga4gh:VA.YvY2mjFjnKULH4RCWFf7S9Ce_BRG3Jqn", "type": "Allele", "location": { + "_id": "ga4gh:VSL.qFuEOSyo-arCRC1yjVcUZMXybmaUq9P0", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8", 
"interval": { - "type": "SimpleInterval", - "start": 30051592, - "end": 30051593 + "type": "SequenceInterval", + "start": { "value": 30051592, "type": "Number" }, + "end": { "value": 30051593, "type": "Number" } } }, "state": { - "type": "SequenceState", + "type": "LiteralSequenceExpression", "sequence": "TT" } }, { - "id": "ga4gh:VA.kLDWiu0w7pLtZs2vrnMHoJ4clC8J4bDg", + "_id": "ga4gh:VA.LGH3wmHR1lgAYQWE5_cyIAHsLPUyBZD0", "type": "Allele", "location": { + "_id": "ga4gh:VSL.NlAimLSD0gSyQ3HQ6TwykbSUy7h5mLsa", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.pz00wSmvjP2gQjqFlCrADh_E3R0HIUia", "interval": { - "type": "SimpleInterval", - "start": 892, - "end": 893 + "type": "SequenceInterval", + "start": { "value": 892, "type": "Number" }, + "end": { "value": 893, "type": "Number" } } }, "state": { - "type": "SequenceState", + "type": "LiteralSequenceExpression", "sequence": "TT" } }, { - "id": "ga4gh:VA.V8Dre-auh4zfwdX3KplO78U0jVKl1I_o", + "_id": "ga4gh:VA.FuzkymtMbsFqrmu9dwOSJskLB1LN7lfT", "type": "Allele", "location": { + "_id": "ga4gh:VSL.eDzicXnixZP_AIeHdOlwNHwPf8_86liM", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", "interval": { - "type": "SimpleInterval", - "start": 30051593, - "end": 30051593 + "type": "SequenceInterval", + "start": { "value": 30051593, "type": "Number" }, + "end": { "value": 30051593, "type": "Number" } } }, "state": { - "type": "SequenceState", + "type": "LiteralSequenceExpression", "sequence": "T" } } @@ -1599,9 +1683,10 @@ genomic_uncertain_deletion: - query: NC_000023.11:g.(?_31120496)_(33339477_?)del variations: [ { - "id": "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk", + "_id": "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk", "subject": { "location": { + "_id": "ga4gh:VSL.7OJ5EFgu_2C4zPFDUBgn-ziE6BZwsRcv", "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "interval": { "start": { @@ -1632,9 +1717,10 @@ genomic_uncertain_deletion: - query: NC_000023.10:g.(?_31138613)_(33357594_?)del 
variations: [ { - "id": "ga4gh:VCN.gRshXhruFQw-QdKwU4xc2iKBNLIbFNzt", + "_id": "ga4gh:VCN.gRshXhruFQw-QdKwU4xc2iKBNLIbFNzt", "subject": { "location": { + "_id": "ga4gh:VSL.7Chaj1X9NH2G9sSK1diUKgBEZO4pHqr8", "sequence_id": "ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm", "interval": { "start": { @@ -1662,9 +1748,10 @@ genomic_uncertain_deletion: "type": "CopyNumber" }, { - "id": "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk", + "_id": "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk", "subject": { "location": { + "_id": "ga4gh:VSL.7OJ5EFgu_2C4zPFDUBgn-ziE6BZwsRcv", "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "interval": { "start": { @@ -1695,9 +1782,10 @@ genomic_uncertain_deletion: - query: NC_000002.12:g.(?_110104900)_(110207160_?)del variations: [ { - "id": "ga4gh:VCN.8o5X1HTglUvwUAFo9vGL5OBnZqgpylys", + "_id": "ga4gh:VCN.8o5X1HTglUvwUAFo9vGL5OBnZqgpylys", "subject": { "location": { + "_id": "ga4gh:VSL.75GQmJvq7dyP9-wom8Jffjk0Q9Le7Q9O", "sequence_id": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", "interval": { "start": { @@ -1727,9 +1815,10 @@ genomic_uncertain_deletion: - query: NC_000024.10:g.(?_14076802)_(57165209_?)del variations: [ { - "id": "ga4gh:VCN._T4dHJIfXB-cpqQSJ5g5pAM1JnwupWuv", + "_id": "ga4gh:VCN._T4dHJIfXB-cpqQSJ5g5pAM1JnwupWuv", "subject": { "location": { + "_id": "ga4gh:VSL.1xIN_RumlXTIsdTWvyJznzuzxJlwUfiD", "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", "interval": { "start": { @@ -1755,4 +1844,149 @@ genomic_uncertain_deletion: }, "type": "CopyNumber" } - ] \ No newline at end of file + ] + +genomic_duplication: + tests: + - query: NC_000003.12:g.49531262dup + variations: [ + { + "_id": "ga4gh:VA.aeNse-a8IJzqHiG-P5zTRYA_eVFhrJXw", + "type": "Allele", + "location": { + "_id": "ga4gh:VSL.G_J9WrfooiONRgjbmGPuCBYbBYFQnYOg", + "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", + "interval": { + "type": "SequenceInterval", + "start": { "value": 49531260, "type": "Number" }, + "end": { "value": 49531262, "type": "Number" }, 
+ }, + "type": "SequenceLocation", + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "GGG" + } + } + ] + - query: NC_000016.10:g.2087938_2087948dup + variations: [ + { + "_id": "ga4gh:VA.wqqxfUCrFSndedI2-4oiIuHLHHGjBFof", + "type": "Allele", + "location": { + "_id": "ga4gh:VSL.4mH68huylkPmu6zyUwH4wiazIYr9cQUX", + "sequence_id": "ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0", + "interval": { + "type": "SequenceInterval", + "start": { "value": 2087937, "type": "Number" }, + "end": { "value": 2087948, "type": "Number" }, + }, + "type": "SequenceLocation", + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "AAAGGTAGGGCAAAGGTAGGGC" + } + } + ] + - query: NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup + variations: [ + { + "_id": "ga4gh:VCN.IgQATuKrM_J5MDHm2VemKThFOkzz-7AZ", + "type": "CopyNumber", + "subject": { + "location": { + "_id": "ga4gh:VSL.DgEMxYt1AdPe-HZAQbT2AVz5OejICnOj", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "min": 31060226, + "max": 31100350, + "type": "DefiniteRange" + }, + "end": { + "min": 33274279, + "max": 33417152, + "type": "DefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 2, + "max": 3 + } + } + ] + - query: NC_000023.11:g.(?_154021812)_154092209dup + variations: [ + { + "_id": "ga4gh:VCN.eLAZZ-ht1h2dTtZqzhO9TVhBdFufv67-", + "type": "CopyNumber", + "subject": { + "location": { + "_id": "ga4gh:VSL.k2FXLyqyS8pbtZxEHCpNd2SHD6iCtH9C", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 154021811, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 154092209, + "type": "Number" + } + } + }, + "reverse_complement": false, + "type": "DerivedSequenceExpression" + 
}, + "copies": { + "min": 2, + "max": 3, + "type": "DefiniteRange" + } + } + ] + - query: NC_000020.11:g.(?_30417576)_(31394018_?)dup + variations: [ + { + "_id": "ga4gh:VCN.3rvfUmiIb4hSxVQhXKOonuOY6Q3xTkKx", + "type": "CopyNumber", + "subject": { + "location": { + "_id": "ga4gh:VSL.us51izImAQQWr-Hu6Q7HQm-vYvmb-jJo", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 30417575, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 31394018, + "comparator": ">=", + "type": "IndefiniteRange" + } + } + }, + "reverse_complement": false, + "type": "DerivedSequenceExpression" + }, + "copies": { + "value": 3, + "type": "Number" + } + } + ] diff --git a/tests/fixtures/validators.yml b/tests/fixtures/validators.yml index d2721944..9cad3490 100644 --- a/tests/fixtures/validators.yml +++ b/tests/fixtures/validators.yml @@ -191,4 +191,16 @@ genomic_uncertain_deletion: - query: NC_000024.10:g.(?_14076802)_(57165209_?)del should_not_match: - query: NC_000012.12:g.123256943_123256947del - - query: NC_000020.11:g.(?_30417576)_(31394018_?)dup \ No newline at end of file + - query: NC_000020.11:g.(?_30417576)_(31394018_?)dup + +genomic_duplication: + should_match: + - query: NC_000003.12:g.49531262dup + - query: NC_000016.10:g.2087938_2087948dup + - query: NC_000020.11:g.(?_30417576)_(31394018_?)dup + - query: NC_000023.11:g.(?_154021812)_154092209dup + - query: NC_000023.11:g.154021812_(154092209_?)dup + - query: NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup + - query: NC_000023.10:g.(31078344_31118468)_(33292395_33435268)dup + should_not_match: + - query: NC_000003.12:g.495312625165465465465dup diff --git a/tests/test_hgvs_dup_del_mode.py b/tests/test_hgvs_dup_del_mode.py new file mode 100644 index 00000000..7b40f313 --- /dev/null +++ b/tests/test_hgvs_dup_del_mode.py @@ -0,0 +1,3193 @@ +"""Module for testing HGVS Dup Del mode.""" +import 
pytest +from variation.query import QueryHandler +from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor +from tests.conftest import assertion_checks +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum + + +@pytest.fixture(scope="module") +def test_normalize(): + """Build normalize test fixture.""" + class TestNormalize: + + def __init__(self): + self.query_handler = QueryHandler() + self.warnings = [] + + def normalize(self, q, hgvs_dup_del_mode=HGVSDupDelModeEnum.DEFAULT): + resp = self.query_handler.normalize( + q, hgvs_dup_del_mode=hgvs_dup_del_mode) + self.warnings = \ + self.query_handler.normalize_handler.warnings + return resp + return TestNormalize() + + +@pytest.fixture(scope='module') +def dmd_gene_context(): + """Create test fixture for DMD gene context""" + return { + "id": "normalize.gene:DMD", + "type": "GeneDescriptor", + "label": "DMD", + "xrefs": [ + "ensembl:ENSG00000198947", + "ncbigene:1756" + ], + "alternate_labels": [ + "DXS272", + "DXS230", + "DXS206", + "DXS142", + "CMD3B", + "DXS269", + "BMD", + "DXS268", + "MRX85", + "DXS164", + "DXS270", + "DXS239" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "dystrophin" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "p21.2", + "end": "p21.1" + }, + "_id": "ga4gh:VCL.JgyIOPZJ9G6Hn6QziVAs8SQpaIWPK46H", + "type": "ChromosomeLocation", + "chr": "X" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "omim:300377", + "ucsc:uc004dda.2", + "ccds:CCDS14234", + "ccds:CCDS55394", + "pubmed:3607877", + "ccds:CCDS14232", + "ccds:CCDS55395", + "orphanet:121117", + "ccds:CCDS14233", + "ccds:CCDS75965", + "ccds:CCDS48091", + "vega:OTTHUMG00000021336", + "uniprot:P11532", + 
"ccds:CCDS14231", + "ena.embl:AF047505", + "pubmed:23900271", + "refseq:NM_004006", + "pubmed:3282674" + ] + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "MRX85" + ] + } + ], + "gene_id": "hgnc:2928" + } + + +@pytest.fixture(scope='module') +def mecp2_gene_context(): + """Create test fixture for MECP2 gene context""" + return { + "id": "normalize.gene:MECP2", + "type": "GeneDescriptor", + "label": "MECP2", + "xrefs": [ + "ensembl:ENSG00000169057", + "ncbigene:4204" + ], + "alternate_labels": [ + "RTT", + "AUTSX3", + "RS", + "PPMX", + "MRX16", + "MRXS13", + "LOC113065", + "RTS", + "MRX79", + "MRXSL" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "methyl-CpG binding protein 2" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "q28", + "end": "q28" + }, + "_id": "ga4gh:VCL.fEBeCyej0jVKsvjw4vxyW6j1h8UVLb5S", + "type": "ChromosomeLocation", + "chr": "X" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "ccds:CCDS48193", + "ucsc:uc004fjv.3", + "ccds:CCDS14741", + "omim:300005", + "orphanet:123186", + "pubmed:1606614", + "pubmed:10508514", + "uniprot:P51608", + "refseq:NM_004992", + "vega:OTTHUMG00000024229", + "ena.embl:AF158180" + ] + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "LOC113065", + "PPMX", + "MRX16", + "RTT", + "MRX79" + ] + } + ], + "gene_id": "hgnc:6990" + } + + +@pytest.fixture(scope='module') +def genomic_dup1(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:NC_000003.12%3Ag.49531262dup", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:1000035", + "vrs_ref_allele_seq": "GG" + } + 
return params + + +@pytest.fixture(scope='module') +def genomic_dup1_seq_loc(): + """Create test fixture containing genomic dup1 sequence location""" + return { + "_id": "ga4gh:VSL.G_J9WrfooiONRgjbmGPuCBYbBYFQnYOg", + "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", + "interval": { + "type": "SequenceInterval", + "start": {"value": 49531260, "type": "Number"}, + "end": {"value": 49531262, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_dup1_default(genomic_dup1, genomic_dup1_seq_loc): + """Create a test fixture for genomic dup default and LSE.""" + _id = "ga4gh:VA.aeNse-a8IJzqHiG-P5zTRYA_eVFhrJXw" + genomic_dup1["variation_id"] = _id + genomic_dup1["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup1_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "GGG" + } + } + return VariationDescriptor(**genomic_dup1) + + +@pytest.fixture(scope='module') +def genomic_dup1_cnv(genomic_dup1, genomic_dup1_seq_loc): + """Create a test fixture for genomic dup CNV.""" + _id = "ga4gh:VCN.KdBguJLeiXM2yr3JaRQ2kxLxaAd4pPlq" + genomic_dup1["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_dup1_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 3 + } + } + genomic_dup1["variation_id"] = _id + return VariationDescriptor(**genomic_dup1) + + +@pytest.fixture(scope='module') +def genomic_dup1_rse(genomic_dup1, genomic_dup1_seq_loc): + """Create a test fixture for genomic dup RSE.""" + _id = "ga4gh:VA.lAyulP9JxvQReKWjpq0LbO50r2UTeMkl" + genomic_dup1["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup1_seq_loc, + "state": { + "type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_dup1_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 2 + 
} + } + } + genomic_dup1["variation_id"] = _id + return VariationDescriptor(**genomic_dup1) + + +@pytest.fixture(scope='module') +def genomic_dup1_free_text(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:DAG1%20g.49568695dup", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "transcript", + "structural_type": "SO:1000035", + "vrs_ref_allele_seq": "GG", + "gene_context": { + "id": "normalize.gene:DAG1", + "type": "GeneDescriptor", + "label": "DAG1", + "xrefs": [ + "ensembl:ENSG00000173402", + "ncbigene:1605" + ], + "alternate_labels": [ + "156DAG", + "MDDGA9", + "AGRNR", + "DAG", + "LGMDR16", + "A3a", + "MDDGC7", + "MDDGC9" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "dystroglycan 1" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "p21.31", + "end": "p21.31" + }, + "_id": "ga4gh:VCL.l_F_O8hoRfwdUsaN3UScymcvqRWLeKQT", + "type": "ChromosomeLocation", + "chr": "3" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "pubmed:7774920", + "pubmed:1741056", + "merops:S72.001", + "uniprot:Q14118", + "orphanet:280347", + "ucsc:uc021wxz.1", + "vega:OTTHUMG00000156869", + "omim:128239", + "refseq:NM_001165928", + "ena.embl:L19711", + "ccds:CCDS2799" + ] + } + ], + "gene_id": "hgnc:2666" + } + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup1_free_text_seq_loc(): + """Create genomic dup1 free text sequence location""" + return { + "_id": "ga4gh:VSL.wasOdqigAN-is7O2nEqJeDwkPlwpiOak", + "sequence_id": "ga4gh:SQ.tpvbnWsfEGqip8gJQZnWJAF8-bWDUDKd", + "interval": { + "type": "SequenceInterval", + "start": {"value": 1032, "type": "Number"}, + "end": {"value": 1034, "type": 
"Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_dup1_free_text_default(genomic_dup1_free_text): + """Create a test fixture for genomic dup default and LSE.""" + _id = "ga4gh:VA.eE5Kr1zJrv3PSXeBabbKTFnZxToaYxat" + genomic_dup1_free_text["variation_id"] = _id + genomic_dup1_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": { + "_id": "ga4gh:VSL.wasOdqigAN-is7O2nEqJeDwkPlwpiOak", + "sequence_id": "ga4gh:SQ.tpvbnWsfEGqip8gJQZnWJAF8-bWDUDKd", + "interval": { + "type": "SequenceInterval", + "start": {"value": 1032, "type": "Number"}, + "end": {"value": 1034, "type": "Number"}, + }, + "type": "SequenceLocation", + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "GGG" + } + } + return VariationDescriptor(**genomic_dup1_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup1_free_text_cnv(genomic_dup1_free_text, + genomic_dup1_free_text_seq_loc): + """Create a test fixture for genomic dup CNV.""" + _id = "ga4gh:VCN.QaiY27vxjYq1pNlI7xWSxom2S-JHkW-r" + genomic_dup1_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_dup1_free_text_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 3 + } + } + genomic_dup1_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_dup1_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup1_free_text_rse(genomic_dup1_free_text, + genomic_dup1_free_text_seq_loc): + """Create a test fixture for genomic dup RSE.""" + _id = "ga4gh:VA.VQKwP3GpeObfGc3MzvA9JNL1YwkZynKO" + genomic_dup1_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup1_free_text_seq_loc, + "state": { + "type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_dup1_free_text_seq_loc, + "reverse_complement": False + }, + "count": { + "type": 
"Number", + "value": 2 + } + } + } + genomic_dup1_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_dup1_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup2(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:NC_000016.10%3Ag.2087938_2087948dup", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:1000035", + "vrs_ref_allele_seq": "AAAGGTAGGGC" + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup2_seq_loc(): + """Create genomic dup2 sequence location""" + return { + "_id": "ga4gh:VSL.4mH68huylkPmu6zyUwH4wiazIYr9cQUX", + "sequence_id": "ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0", + "interval": { + "type": "SequenceInterval", + "start": {"value": 2087937, "type": "Number"}, + "end": {"value": 2087948, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_dup2_default(genomic_dup2, genomic_dup2_seq_loc): + """Create a test fixture for genomic dup default and LSE.""" + _id = "ga4gh:VA.wqqxfUCrFSndedI2-4oiIuHLHHGjBFof" + genomic_dup2["variation_id"] = _id + genomic_dup2["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup2_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "AAAGGTAGGGCAAAGGTAGGGC" + } + } + return VariationDescriptor(**genomic_dup2) + + +@pytest.fixture(scope='module') +def genomic_dup2_cnv(genomic_dup2, genomic_dup2_seq_loc): + """Create a test fixture for genomic dup CNV.""" + _id = "ga4gh:VCN.rd1wobb8NXRxk6O__njJUQg_ekZUALGx" + genomic_dup2["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_dup2_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 3 + } + } + genomic_dup2["variation_id"] = _id + return VariationDescriptor(**genomic_dup2) + + 
+@pytest.fixture(scope='module') +def genomic_dup2_rse(genomic_dup2, genomic_dup2_seq_loc): + """Create a test fixture for genomic dup RSE.""" + _id = "ga4gh:VA.fXANtjCcUPJ1A4bCSgcAxSSrxoqXuL3A" + genomic_dup2["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup2_seq_loc, + "state": { + "type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_dup2_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 2 + } + } + } + genomic_dup2["variation_id"] = _id + return VariationDescriptor(**genomic_dup2) + + +@pytest.fixture(scope='module') +def genomic_dup2_free_text(dmd_gene_context): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:TSC2%20g.2137939_2137949dup", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "transcript", + "structural_type": "SO:1000035", + "vrs_ref_allele_seq": "TAGA", + "gene_context": dmd_gene_context + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup2_free_text_seq_loc(): + """Create genomic dup2 free text sequence location""" + return { + "_id": "ga4gh:VSL.3JAa1wqyQWE510wqzNXoPptxYVXocFqj", + "sequence_id": "ga4gh:SQ.1DeZLYHMnd-smp3GDlpRxETb9_0AokO7", + "interval": { + "type": "SequenceInterval", + "start": {"value": 256, "type": "Number"}, + "end": {"value": 260, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_dup2_free_text_default(genomic_dup2_free_text, + genomic_dup2_free_text_seq_loc): + """Create a test fixture for genomic dup default and LSE.""" + _id = "ga4gh:VA.BRi89LZSxVMXaa6xVjuXIh0I_u2MyPkc" + genomic_dup2_free_text["variation_id"] = _id + genomic_dup2_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup2_free_text_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "TAGATAGA" 
+ } + } + return VariationDescriptor(**genomic_dup2_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup2_free_text_cnv(genomic_dup2_free_text, + genomic_dup2_free_text_seq_loc): + """Create a test fixture for genomic dup CNV.""" + _id = "ga4gh:VCN.KfNh7wnKkw5pfvauEK2lu5TOdgDZfnJP" + genomic_dup2_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_dup2_free_text_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 2, + "max": 3 + } + } + genomic_dup2_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_dup2_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup2_free_text_rse(genomic_dup2_free_text, + genomic_dup2_free_text_seq_loc): + """Create a test fixture for genomic dup RSE.""" + _id = "ga4gh:VA.Rby7K6TikhqXL9BhM8xDJHNudJlRmS3j" + genomic_dup2_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_dup2_free_text_seq_loc, + "state": { + "type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_dup2_free_text_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 2 + } + } + } + genomic_dup2_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_dup2_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup3(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:NC_000023.11%3Ag.%2831060227_31100351%29_%2833274278_33417151%29dup", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup3_default(genomic_dup3): + """Create a test fixture for genomic dup default and cnv.""" + _id = 
"ga4gh:VCN.IgQATuKrM_J5MDHm2VemKThFOkzz-7AZ" + genomic_dup3["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.DgEMxYt1AdPe-HZAQbT2AVz5OejICnOj", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "min": 31060226, + "max": 31100350, + "type": "DefiniteRange" + }, + "end": { + "min": 33274279, + "max": 33417152, + "type": "DefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 2, + "max": 3 + } + } + genomic_dup3["variation_id"] = _id + return VariationDescriptor(**genomic_dup3) + + +@pytest.fixture(scope='module') +def genomic_dup3_rse_lse(genomic_dup3): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup3["id"], + "type": genomic_dup3["type"], + "variation": { + "_id": "ga4gh:VT.15sKDgSyoCPOgfrFHvSea-fHVeu7huVT", + "type": "Text", + "definition": "NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup" # noqa: E501 + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup3_free_text(dmd_gene_context): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:DMD%20g.%2831147274_31147278%29_%2831182737_31182739%29dup", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None, + "gene_context": dmd_gene_context + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup3_free_text_default(genomic_dup3_free_text): + """Create a test fixture for genomic dup default and cnv.""" + _id = "ga4gh:VCN.mMt9eqOhTHjRLR_gAJ7zgbDMVOblxSLo" + genomic_dup3_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + 
"_id": "ga4gh:VSL.6JRgXRroqGleDLuwmOdHSbUK8Lm27fos", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "min": 31147273, + "max": 31147277, + "type": "DefiniteRange" + }, + "end": { + "min": 31182738, + "max": 31182740, + "type": "DefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 2, + "max": 3 + } + } + genomic_dup3_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_dup3_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup3_free_text_rse_lse(genomic_dup3_free_text): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup3_free_text["id"], + "type": genomic_dup3_free_text["type"], + "variation": { + "_id": "ga4gh:VT.F0AX-RkMN4U8KLkIE68ECGU83Y-ICWXh", + "type": "Text", + "definition": "DMD g.(31147274_31147278)_(31182737_31182739)dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup4(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:NC_000020.11%3Ag.%28%3F_30417576%29_%2831394018_%3F%29dup", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup4_default(genomic_dup4): + """Create a test fixture for genomic dup default and cnv.""" + _id = "ga4gh:VCN.3rvfUmiIb4hSxVQhXKOonuOY6Q3xTkKx" + genomic_dup4["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.us51izImAQQWr-Hu6Q7HQm-vYvmb-jJo", + "sequence_id": "ga4gh:SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 30417575, + 
"comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 31394018, + "comparator": ">=", + "type": "IndefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 3 + } + } + genomic_dup4["variation_id"] = _id + return VariationDescriptor(**genomic_dup4) + + +@pytest.fixture(scope='module') +def genomic_dup4_rse_lse(genomic_dup4): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup4["id"], + "type": genomic_dup4["type"], + "variation": { + "_id": "ga4gh:VT.Pga4IH82qga2iZAodjxYw9OXhB4Xa2g8", + "type": "Text", + "definition": "NC_000020.11:g.(?_30417576)_(31394018_?)dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup4_free_text(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:PRF8%20g.%28%3F_1577736%29_%281587865_%3F%29", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None, + "gene_context": { + "id": "normalize.gene:PRPF8", + "type": "GeneDescriptor", + "label": "PRPF8", + "xrefs": [ + "ensembl:ENSG00000174231", + "ncbigene:10594" + ], + "alternate_labels": [ + "PRPC8", + "PRP8", + "HPRP8", + "Prp8", + "RP13", + "hPrp8", + "SNRNP220" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "pre-mRNA processing factor 8" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "p13.3", + "end": "p13.3" + }, + "_id": "ga4gh:VCL.GJ_KKaBnwZCC9_0vezbSxp_yAwM6R8c4", + "type": "ChromosomeLocation", + "chr": "17" + } + }, + { + "type": 
"Extension", + "name": "associated_with", + "value": [ + "pubmed:10411133", + "ucsc:uc002fte.3", + "pubmed:11468273", + "orphanet:118066", + "ccds:CCDS11010", + "refseq:NM_006445", + "vega:OTTHUMG00000090553", + "uniprot:Q6P2Q9", + "ena.embl:AB007510", + "omim:607300" + ] + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "RP13" + ] + } + ], + "gene_id": "hgnc:17340" + } + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup4_free_text_default(genomic_dup4_free_text): + """Create a test fixture for genomic dup default and cnv.""" + _id = "ga4gh:VCN.Yq_C5caHcDU8tLVHWFLoBFF4Xvv2g5Qp" + genomic_dup4_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.4eNCJnROfnoO-YvGnf-iGCeDHF_68g8H", + "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 1674441, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 1684571, + "comparator": ">=", + "type": "IndefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 3 + } + } + genomic_dup4_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_dup4_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup4_free_text_rse_lse(genomic_dup4_free_text): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup4_free_text["id"], + "type": genomic_dup4_free_text["type"], + "variation": { + "_id": "ga4gh:VT.Pga4IH82qga2iZAodjxYw9OXhB4Xa2g8", + "type": "Text", + "definition": "NC_000020.11:g.(?_30417576)_(31394018_?)dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup5(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": 
"normalize.variation:NC_000023.11%3Ag.%28%3F_154021812%29_154092209dup", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None + } + return params + + +def genomic_dup5_copy_number(params): + """Create genomic dup5 copy number object""" + _id = "ga4gh:VCN.eLAZZ-ht1h2dTtZqzhO9TVhBdFufv67-" + params["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.k2FXLyqyS8pbtZxEHCpNd2SHD6iCtH9C", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 154021811, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 154092209, + "type": "Number" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 2, + "max": 3 + } + } + params["variation_id"] = _id + + +@pytest.fixture(scope='module') +def genomic_dup5_default(genomic_dup5): + """Create a test fixture for genomic dup default and cnv.""" + genomic_dup5_copy_number(genomic_dup5) + return VariationDescriptor(**genomic_dup5) + + +@pytest.fixture(scope='module') +def genomic_dup5_rse_lse(genomic_dup5): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup5["id"], + "type": genomic_dup5["type"], + "variation": { + "_id": "ga4gh:VT.of16BEeHyU9od62SrjSCQ4LyUtbbGoKi", + "type": "Text", + "definition": "NC_000023.11:g.(?_154021812)_154092209dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup5_free_text(mecp2_gene_context): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:MECP2%20g.%28%3F_154021812%29_154092209dup", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + 
"molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None, + "gene_context": mecp2_gene_context + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup5_free_text_default(genomic_dup5_free_text): + """Create a test fixture for genomic dup default and cnv.""" + genomic_dup5_copy_number(genomic_dup5_free_text) + return VariationDescriptor(**genomic_dup5_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup5_free_text_rse_lse(genomic_dup5_free_text): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup5_free_text["id"], + "type": genomic_dup5_free_text["type"], + "variation": { + "_id": "ga4gh:VT.Kw18bSFpQp9xdKg88fqW7zUx4_VXFIiW", + "type": "Text", + "definition": "MECP2 g.(?_154021812)_154092209dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup6(): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:NC_000023.11%3Ag.154021812_%28154092209_%3F%29dup", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None + } + return params + + +def genomic_dup6_copy_number(params): + """Create genomic dup6 copy number object""" + _id = "ga4gh:VCN.Rekk_MmUQ777V76S51x7nZGjh4U3LkLy" + params["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.h0_xXu36uSnPEuLoxvVmTAFQCS1ZFuLN", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 154021811, + "type": "Number" + }, + "end": { + "value": 154092209, + "comparator": ">=", + "type": "IndefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 2, + 
"max": 3 + } + } + params["variation_id"] = _id + + +@pytest.fixture(scope='module') +def genomic_dup6_default(genomic_dup6): + """Create a test fixture for genomic dup default and cnv.""" + genomic_dup6_copy_number(genomic_dup6) + return VariationDescriptor(**genomic_dup6) + + +@pytest.fixture(scope='module') +def genomic_dup6_rse_lse(genomic_dup6): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup6["id"], + "type": genomic_dup6["type"], + "variation": { + "_id": "ga4gh:VT.2k5AWTbGJxvLVT6bUW0pUMq6XGAcEjXW", + "type": "Text", + "definition": "NC_000023.11:g.154021812_(154092209_?)dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_dup6_free_text(mecp2_gene_context): + """Create test fixture containing params for genomic dup VD.""" + params = { + "id": "normalize.variation:MECP2%20g.154021812_%28154092209_%3F%29dup", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001742", + "vrs_ref_allele_seq": None, + "gene_context": mecp2_gene_context + } + return params + + +@pytest.fixture(scope='module') +def genomic_dup6_free_text_default(genomic_dup6_free_text): + """Create a test fixture for genomic dup default and cnv.""" + genomic_dup6_copy_number(genomic_dup6_free_text) + return VariationDescriptor(**genomic_dup6_free_text) + + +@pytest.fixture(scope='module') +def genomic_dup6_free_text_rse_lse(genomic_dup6_free_text): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_dup6_free_text["id"], + "type": genomic_dup6_free_text["type"], + "variation": { + "_id": "ga4gh:VT.LbAqiLmJs1t9-FgEKD0-KDKwzvM3AAlz", + "type": "Text", + "definition": "MECP2 g.154021812_(154092209_?)dup" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del1(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": 
"normalize.variation:NC_000003.12%3Ag.10149811del", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0000159", + "vrs_ref_allele_seq": "T" + } + return params + + +@pytest.fixture(scope='module') +def genomic_del1_seq_loc(): + """Create genomic del1 sequence location""" + return { + "_id": "ga4gh:VSL.Yg5B66zErDjK9Lqeaw-kuzAB9w5-uUaS", + "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", + "interval": { + "type": "SequenceInterval", + "start": {"value": 10149810, "type": "Number"}, + "end": {"value": 10149811, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_del1_default(genomic_del1, genomic_del1_seq_loc): + """Create a test fixture for genomic del default and LSE.""" + _id = "ga4gh:VA.jUeT1n4AuBzwtt5TT-Iaac1KasATWjKE" + genomic_del1["variation_id"] = _id + genomic_del1["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del1_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "" + } + } + return VariationDescriptor(**genomic_del1) + + +@pytest.fixture(scope='module') +def genomic_del1_cnv(genomic_del1, genomic_del1_seq_loc): + """Create a test fixture for genomic del CNV.""" + _id = "ga4gh:VCN._Iv1RBu8ctlHOaobb4emjxwbxPdkBIVF" + genomic_del1["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_del1_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 1 + } + } + genomic_del1["variation_id"] = _id + return VariationDescriptor(**genomic_del1) + + +@pytest.fixture(scope='module') +def genomic_del1_rse(genomic_del1, genomic_del1_seq_loc): + """Create a test fixture for genomic del RSE.""" + _id = "ga4gh:VA.6fIEZ3R2W4wIaltUX1jyw9ap5YN6oGDT" + genomic_del1["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del1_seq_loc, + "state": { + 
"type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_del1_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 0 + } + } + } + genomic_del1["variation_id"] = _id + return VariationDescriptor(**genomic_del1) + + +@pytest.fixture(scope='module') +def genomic_del1_free_text(vhl_gene_context): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:VHL%20g.10191495del", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "transcript", + "structural_type": "SO:0000159", + "vrs_ref_allele_seq": "T", + "gene_context": vhl_gene_context + } + return params + + +@pytest.fixture(scope='module') +def genomic_del1_free_text_seq_loc(): + """Create genomic del1 free text sequence location""" + return { + "_id": "ga4gh:VSL.90XXYrpPCTvaFcyb7L4W4EcE9OexpmNv", + "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", + "interval": { + "type": "SequenceInterval", + "start": {"value": 557, "type": "Number"}, + "end": {"value": 558, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_del1_free_text_default(genomic_del1_free_text, + genomic_del1_free_text_seq_loc): + """Create a test fixture for genomic del default and LSE.""" + _id = "ga4gh:VA.DdtLZ_d22R0O0VU020WcCLvNhXNZtU2j" + genomic_del1_free_text["variation_id"] = _id + genomic_del1_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del1_free_text_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "" + } + } + return VariationDescriptor(**genomic_del1_free_text) + + +@pytest.fixture(scope='module') +def genomic_del1_free_text_cnv(genomic_del1_free_text, + genomic_del1_free_text_seq_loc): + """Create a test fixture for genomic del CNV.""" + _id = "ga4gh:VCN.HBeZfrNQLpVppisn_FHfWbpa8ehL-49P" + 
genomic_del1_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_del1_free_text_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 1 + } + } + genomic_del1_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_del1_free_text) + + +@pytest.fixture(scope='module') +def genomic_del1_free_text_rse(genomic_del1_free_text, + genomic_del1_free_text_seq_loc): + """Create a test fixture for genomic del RSE.""" + _id = "ga4gh:VA.o8kDqsCKM-cakyb_Pa5HWXLFxKqHtZA4" + genomic_del1_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del1_free_text_seq_loc, + "state": { + "type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_del1_free_text_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 0 + } + } + } + genomic_del1_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_del1_free_text) + + +@pytest.fixture(scope='module') +def genomic_del2(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:NC_000003.12%3Ag.10146595_10146613del", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0000159", + "vrs_ref_allele_seq": "ATGTTGACGGACAGCCTAT" + } + return params + + +@pytest.fixture(scope='module') +def genomic_del2_seq_loc(): + """Create genomic del2 sequence location""" + return { + "_id": "ga4gh:VSL.lksYAhEQvP8biy_nxoOJ_Zwu75a_kYtQ", + "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", + "interval": { + "type": "SequenceInterval", + "start": {"value": 10146594, "type": "Number"}, + "end": {"value": 10146613, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_del2_default(genomic_del2, 
genomic_del2_seq_loc): + """Create a test fixture for genomic del default and LSE.""" + _id = "ga4gh:VA.CSWNhR5w_geMmJTxkbO3UCLCvT0S2Ypx" + genomic_del2["variation_id"] = _id + genomic_del2["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del2_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "" + } + } + return VariationDescriptor(**genomic_del2) + + +@pytest.fixture(scope='module') +def genomic_del2_cnv(genomic_del2, genomic_del2_seq_loc): + """Create a test fixture for genomic del CNV.""" + _id = "ga4gh:VCN.gBHXvaw64pQg04DAhp_Gtzh8ADUf7HuI" + genomic_del2["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_del2_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 1 + } + } + genomic_del2["variation_id"] = _id + return VariationDescriptor(**genomic_del2) + + +@pytest.fixture(scope='module') +def genomic_del2_rse(genomic_del2, genomic_del2_seq_loc): + """Create a test fixture for genomic del RSE.""" + _id = "ga4gh:VA.aQeEhbisBWYrzVbf3-VPOZtGJu1vKmfx" + genomic_del2["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del2_seq_loc, + "state": { + "type": "RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_del2_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 0 + } + } + } + genomic_del2["variation_id"] = _id + return VariationDescriptor(**genomic_del2) + + +@pytest.fixture(scope='module') +def genomic_del2_free_text(vhl_gene_context): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:VHL%20g.10188279_10188297del", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "transcript", + "structural_type": "SO:0000159", + "vrs_ref_allele_seq": "ATGTTGACGGACAGCCTAT", + "gene_context": 
vhl_gene_context + } + return params + + +@pytest.fixture(scope='module') +def genomic_del2_free_text_seq_loc(): + """Create genomic del2 free text sequence location""" + return { + "_id": "ga4gh:VSL.9fIfzZxIhfm4AlUhBlU9PswkG8ei57lR", + "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", + "interval": { + "type": "SequenceInterval", + "start": {"value": 491, "type": "Number"}, + "end": {"value": 510, "type": "Number"}, + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope='module') +def genomic_del2_free_text_default(genomic_del2_free_text, + genomic_del2_free_text_seq_loc): + """Create a test fixture for genomic del default and LSE.""" + _id = "ga4gh:VA.V0TeIIZTlBnFTIc64hqxzjbhAH3I4VZI" + genomic_del2_free_text["variation_id"] = _id + genomic_del2_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del2_free_text_seq_loc, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "" + } + } + return VariationDescriptor(**genomic_del2_free_text) + + +@pytest.fixture(scope='module') +def genomic_del2_free_text_cnv(genomic_del2_free_text, + genomic_del2_free_text_seq_loc): + """Create a test fixture for genomic del CNV.""" + _id = "ga4gh:VCN.aTh-gPjB3WdB27ihgFWJFJs52rGVm35z" + genomic_del2_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": genomic_del2_free_text_seq_loc, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 1 + } + } + genomic_del2_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_del2_free_text) + + +@pytest.fixture(scope='module') +def genomic_del2_free_text_rse(genomic_del2_free_text, + genomic_del2_free_text_seq_loc): + """Create a test fixture for genomic del RSE.""" + _id = "ga4gh:VA.uED5jM7zwbFLiXfCufVuwIs2ufkPF2KJ" + genomic_del2_free_text["variation"] = { + "type": "Allele", + "_id": _id, + "location": genomic_del2_free_text_seq_loc, + "state": { + "type": 
"RepeatedSequenceExpression", + "seq_expr": { + "type": "DerivedSequenceExpression", + "location": genomic_del2_free_text_seq_loc, + "reverse_complement": False + }, + "count": { + "type": "Number", + "value": 0 + } + } + } + genomic_del2_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_del2_free_text) + + +@pytest.fixture(scope='module') +def genomic_del3(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:NC_000023.11%3Ag.%2831060227_31100351%29_%2833274278_33417151%29del", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None + } + return params + + +@pytest.fixture(scope='module') +def genomic_del3_default(genomic_del3): + """Create a test fixture for genomic del default and cnv.""" + _id = "ga4gh:VCN.9h2LkajTwHBdXYMRyrD9HkYwU9d7fIBr" + genomic_del3["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.DgEMxYt1AdPe-HZAQbT2AVz5OejICnOj", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "min": 31060226, + "max": 31100350, + "type": "DefiniteRange" + }, + "end": { + "min": 33274279, + "max": 33417152, + "type": "DefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 0, + "max": 1 + } + } + genomic_del3["variation_id"] = _id + return VariationDescriptor(**genomic_del3) + + +@pytest.fixture(scope='module') +def genomic_del3_rse_lse(genomic_del3): + """Create test fixture for genomic del rse and lse.""" + params = { + "id": genomic_del3["id"], + "type": genomic_del3["type"], + "variation": { + "_id": "ga4gh:VT.tmA3mpMy9HKUweaB8aYsq6uuejEx9iK7", + "type": "Text", + "definition": 
"NC_000023.11:g.(31060227_31100351)_(33274278_33417151)del" # noqa: E501 + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del3_free_text(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:EFNB1%20g.%2868839265_68839268%29_%2868841120_68841125%29del", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None, + "gene_context": { + "id": "normalize.gene:EFNB1", + "type": "GeneDescriptor", + "label": "EFNB1", + "xrefs": [ + "ensembl:ENSG00000090776", + "ncbigene:1947" + ], + "alternate_labels": [ + "EPLG2", + "Elk-L", + "CFND", + "CFNS", + "EFB1", + "LERK2", + "EFL3" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "ephrin B1" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "q13.1", + "end": "q13.1" + }, + "_id": "ga4gh:VCL.2INIrDKtMs_uh9lw8BWws2AMpzlbMaBB", + "type": "ChromosomeLocation", + "chr": "X" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "uniprot:P98172", + "ena.embl:U09303", + "ucsc:uc004dxd.5", + "omim:300035", + "pubmed:16526919", + "refseq:NM_004429", + "ccds:CCDS14391", + "orphanet:121305", + "vega:OTTHUMG00000021751", + "iuphar:4913", + "pubmed:7774950" + ] + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "CFNS", + "EPLG2" + ] + } + ], + "gene_id": "hgnc:3226" + } + } + return params + + +@pytest.fixture(scope='module') +def genomic_del3_free_text_default(genomic_del3_free_text): + """Create a test fixture for genomic del default and cnv.""" + _id = "ga4gh:VCN.-sOh0hKxd_KA2v6mRDsCliowXNAl-4lV" + 
genomic_del3_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.gqWO-oN2bMIXm_YuZR4_beT57QN-kRGJ", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "min": 68839264, + "max": 68839267, + "type": "DefiniteRange" + }, + "end": { + "min": 68841121, + "max": 68841126, + "type": "DefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 0, + "max": 1 + } + } + genomic_del3_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_del3_free_text) + + +@pytest.fixture(scope='module') +def genomic_del3_free_text_rse_lse(genomic_del3_free_text): + """Create test fixture for genomic del rse and lse.""" + params = { + "id": genomic_del3_free_text["id"], + "type": genomic_del3_free_text["type"], + "variation": { + "_id": "ga4gh:VT.9mGg0U_Z7NZCFV3jrLdGxSQU03g7z3Z1", + "type": "Text", + "definition": "EFNB1 g.(68839265_68839268)_(68841120_68841125)del" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del4(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:NC_000023.11%3Ag.%28%3F_31120496%29_%2833339477_%3F%29del", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None + } + return params + + +@pytest.fixture(scope='module') +def genomic_del4_default(genomic_del4): + """Create a test fixture for genomic del default and cnv.""" + _id = "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk" + genomic_del4["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.7OJ5EFgu_2C4zPFDUBgn-ziE6BZwsRcv", + "sequence_id": 
"ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 31120495, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 33339477, + "comparator": ">=", + "type": "IndefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 0, + "max": 1 + } + } + genomic_del4["variation_id"] = _id + return VariationDescriptor(**genomic_del4) + + +@pytest.fixture(scope='module') +def genomic_del4_rse_lse(genomic_del4): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_del4["id"], + "type": genomic_del4["type"], + "variation": { + "_id": "ga4gh:VT.whBY5P24WVxF1wneDcI8x8btqorJUWXQ", + "type": "Text", + "definition": "NC_000023.11:g.(?_31120496)_(33339477_?)del" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del4_free_text(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:COL4A4%20g.%28%3F_227022028%29_%28227025830_%3F%29del", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None, + "gene_context": { + "id": "normalize.gene:COL4A4", + "type": "GeneDescriptor", + "label": "COL4A4", + "xrefs": [ + "ensembl:ENSG00000081052", + "ncbigene:1286" + ], + "alternate_labels": [ + "BFH", + "ATS2", + "CA44" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "collagen type IV alpha 4 chain" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "q36.3", + "end": "q36.3" + }, + "_id": 
"ga4gh:VCL.1raDfW4j_diAb62KX4wnjRGD3A6va_BB", + "type": "ChromosomeLocation", + "chr": "2" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "omim:120131", + "ucsc:uc061teu.1", + "ccds:CCDS42828", + "orphanet:120720", + "pubmed:1639407", + "vega:OTTHUMG00000149892", + "refseq:NM_000092", + "uniprot:P53420" + ] + } + ], + "gene_id": "hgnc:2206" + } + } + return params + + +@pytest.fixture(scope='module') +def genomic_del4_free_text_default(genomic_del4_free_text): + """Create a test fixture for genomic del default and cnv.""" + _id = "ga4gh:VCN.SvFPk7UrVFhzI3ANMJidDk5GItHgw0j_" + genomic_del4_free_text["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.s4_6D986zFS0HIBuEDFl5aq2-VCl45h1", + "sequence_id": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 227022027, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 227025830, + "comparator": ">=", + "type": "IndefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 1 + } + } + genomic_del4_free_text["variation_id"] = _id + return VariationDescriptor(**genomic_del4_free_text) + + +@pytest.fixture(scope='module') +def genomic_del4_free_text_rse_lse(genomic_del4_free_text): + """Create test fixture for genomic dup rse and lse.""" + params = { + "id": genomic_del4_free_text["id"], + "type": genomic_del4_free_text["type"], + "variation": { + "_id": "ga4gh:VT.lT0rFYhOGFLA9MYA8ypnCf5q-CkV8dJv", + "type": "Text", + "definition": "COL4A4 g.(?_227022028)_(227025830_?)del" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_uncertain_del_2(): + """Create a genomic uncertain deletion on chr 2 test fixture.""" + params = { + "id": 
'normalize.variation:NC_000002.12%3Ag.%28%3F_110104900%29_%28110207160_%3F%29del', # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "ga4gh:VCN.8o5X1HTglUvwUAFo9vGL5OBnZqgpylys", + "variation": { + "_id": "ga4gh:VCN.8o5X1HTglUvwUAFo9vGL5OBnZqgpylys", + "subject": { + "location": { + "_id": "ga4gh:VSL.75GQmJvq7dyP9-wom8Jffjk0Q9Le7Q9O", + "sequence_id": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", + "interval": { + "start": { + "value": 110104899, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 110207160, + "comparator": ">=", + "type": "IndefiniteRange" + }, + "type": "SequenceInterval" + }, + "type": "SequenceLocation" + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "value": 1, + "type": "Number" + }, + "type": "CopyNumber" + }, + "molecule_context": "genomic", + "structural_type": "SO:0001743" + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_uncertain_del_y(): + """Create a genomic uncertain deletion on chr Y test fixture.""" + params = { + "id": 'normalize.variation:NC_000024.10%3Ag.%28%3F_14076802%29_%2857165209_%3F%29del', # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "ga4gh:VCN._T4dHJIfXB-cpqQSJ5g5pAM1JnwupWuv", + "variation": { + "_id": "ga4gh:VCN._T4dHJIfXB-cpqQSJ5g5pAM1JnwupWuv", + "subject": { + "location": { + "_id": "ga4gh:VSL.1xIN_RumlXTIsdTWvyJznzuzxJlwUfiD", + "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + "interval": { + "start": { + "value": 14076801, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 57165209, + "comparator": ">=", + "type": "IndefiniteRange" + }, + "type": "SequenceInterval" + }, + "type": "SequenceLocation" + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "value": 0, + "type": "Number" + }, + "type": "CopyNumber" + }, + "molecule_context": "genomic", + "structural_type": "SO:0001743" + } + 
return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del5(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:NC_000023.11%3Ag.%28%3F_18575354%29_18653629del", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None + } + return params + + +def genomic_del5_copy_number(params): + """Create genomic del5 copy number""" + _id = "ga4gh:VCN._RIw5UC5bZeLeHnBLYAow7Ml-lv2nKJW" + params["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.jURzcCBf3kJVx19uuJJtwt78LuBbtfwD", + "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 18575353, + "comparator": "<=", + "type": "IndefiniteRange" + }, + "end": { + "value": 18653629, + "type": "Number" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "DefiniteRange", + "min": 0, + "max": 1 + } + } + params["variation_id"] = _id + + +@pytest.fixture(scope='module') +def genomic_del5_default(genomic_del5): + """Create a test fixture for genomic del default and cnv.""" + genomic_del5_copy_number(genomic_del5) + return VariationDescriptor(**genomic_del5) + + +@pytest.fixture(scope='module') +def genomic_del5_rse_lse(genomic_del5): + """Create test fixture for genomic del rse and lse.""" + params = { + "id": genomic_del5["id"], + "type": genomic_del5["type"], + "variation": { + "_id": "ga4gh:VT.xCLHh3GpCebrP6KDMsWZRdIiW7Sti27H", + "type": "Text", + "definition": "NC_000023.11:g.(?_18575354)_18653629del" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del5_free_text(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": 
"normalize.variation:CDKL5%20g.%28%3F_18575354%29_18653629del", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None, + "gene_context": { + "id": "normalize.gene:CDKL5", + "type": "GeneDescriptor", + "label": "CDKL5", + "xrefs": [ + "ensembl:ENSG00000008086", + "ncbigene:6792" + ], + "alternate_labels": [ + "STK9", + "ISSX", + "EIEE2", + "CFAP247", + "DEE2" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "cyclin dependent kinase like 5" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "p22.13", + "end": "p22.13" + }, + "_id": "ga4gh:VCL.BzhQOPmaVZVLol6JOVltNZrsv0XRekWR", + "type": "ChromosomeLocation", + "chr": "X" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "orphanet:119297", + "ena.embl:Y15057", + "pubmed:16935860", + "ccds:CCDS83458", + "omim:300203", + "vega:OTTHUMG00000021214", + "uniprot:O76039", + "pubmed:9721213", + "refseq:NM_003159", + "ccds:CCDS14186", + "iuphar:1986", + "ucsc:uc004cyn.4" + ] + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "STK9" + ] + } + ], + "gene_id": "hgnc:11411" + } + } + return params + + +@pytest.fixture(scope='module') +def genomic_del5_free_text_default(genomic_del5_free_text): + """Create a test fixture for genomic del default and cnv.""" + genomic_del5_copy_number(genomic_del5_free_text) + return VariationDescriptor(**genomic_del5_free_text) + + +@pytest.fixture(scope='module') +def genomic_del5_free_text_rse_lse(genomic_del5_free_text): + """Create test fixture for genomic del rse and lse.""" + params = { + "id": genomic_del5_free_text["id"], + "type": genomic_del5_free_text["type"], + "variation": { + 
"_id": "ga4gh:VT.xCLHh3GpCebrP6KDMsWZRdIiW7Sti27H", + "type": "Text", + "definition": "NC_000023.11:g.(?_18575354)_18653629del" + } + } + return VariationDescriptor(**params) + + +@pytest.fixture(scope='module') +def genomic_del6(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:NC_000006.12%3Ag.133462764_%28133464858_%3F%29del", # noqa: E501 + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None + } + return params + + +def genomic_del6_copy_number(params): + """Create genomic del6 copy number""" + _id = "ga4gh:VCN.F3U6Rmov1WO2mhmRHWumJb-YALOMkeeI" + params["variation"] = { + "type": "CopyNumber", + "_id": _id, + "subject": { + "location": { + "_id": "ga4gh:VSL.TPwsB5ymsNI7TynTlI8_8CI_NmNrBHUQ", + "sequence_id": "ga4gh:SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV", + "interval": { + "type": "SequenceInterval", + "start": { + "value": 133462763, + "type": "Number" + }, + "end": { + "value": 133464858, + "comparator": ">=", + "type": "IndefiniteRange" + } + }, + "type": "SequenceLocation", + }, + "reverse_complement": False, + "type": "DerivedSequenceExpression" + }, + "copies": { + "type": "Number", + "value": 1 + } + } + params["variation_id"] = _id + + +@pytest.fixture(scope='module') +def genomic_del6_default(genomic_del6): + """Create a test fixture for genomic del default and cnv.""" + genomic_del6_copy_number(genomic_del6) + return VariationDescriptor(**genomic_del6) + + +@pytest.fixture(scope='module') +def genomic_del6_rse_lse(genomic_del6): + """Create test fixture for genomic del rse and lse.""" + params = { + "id": genomic_del6["id"], + "type": genomic_del6["type"], + "variation": { + "_id": "ga4gh:VT.Df49jbB-kZ2LSm180uA9wn4TT_p215yX", + "type": "Text", + "definition": "NC_000006.12:g.133462764_(133464858_?)del" + } + } + return VariationDescriptor(**params) + + 
+@pytest.fixture(scope='module') +def genomic_del6_free_text(): + """Create test fixture containing params for genomic del VD.""" + params = { + "id": "normalize.variation:EYA4%20g.133462764_%28133464858_%3F%29del", + "type": "VariationDescriptor", + "variation_id": "", + "variation": dict(), + "molecule_context": "genomic", + "structural_type": "SO:0001743", + "vrs_ref_allele_seq": None, + "gene_context": { + "id": "normalize.gene:EYA4", + "type": "GeneDescriptor", + "label": "EYA4", + "xrefs": [ + "ensembl:ENSG00000112319", + "ncbigene:2070" + ], + "alternate_labels": [ + "CMD1J", + "DFNA10" + ], + "extensions": [ + { + "type": "Extension", + "name": "symbol_status", + "value": "approved" + }, + { + "type": "Extension", + "name": "approved_name", + "value": "EYA transcriptional coactivator " + "and phosphatase 4" + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "q23.2", + "end": "q23.2" + }, + "_id": "ga4gh:VCL.h3RPutxMQk5_6dIPcN5GL8KRahoTi9fm", + "type": "ChromosomeLocation", + "chr": "6" + } + }, + { + "type": "Extension", + "name": "associated_with", + "value": [ + "pubmed:9887327", + "ccds:CCDS43506", + "ccds:CCDS75521", + "pubmed:11159937", + "ucsc:uc011ecs.3", + "orphanet:121654", + "ccds:CCDS75523", + "omim:603550", + "refseq:NM_004100", + "ccds:CCDS5165", + "ena.embl:Y17114", + "vega:OTTHUMG00000015602", + "uniprot:O95677" + ] + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "CMD1J", + "DFNA10" + ] + } + ], + "gene_id": "hgnc:3522" + } + } + return params + + +@pytest.fixture(scope='module') +def genomic_del6_free_text_default(genomic_del6_free_text): + """Create a test fixture for genomic del default and cnv.""" + genomic_del6_copy_number(genomic_del6_free_text) + return VariationDescriptor(**genomic_del6_free_text) + + +@pytest.fixture(scope='module') +def 
genomic_del6_free_text_rse_lse(genomic_del6_free_text): + """Create test fixture for genomic del rse and lse.""" + params = { + "id": genomic_del6_free_text["id"], + "type": genomic_del6_free_text["type"], + "variation": { + "_id": "ga4gh:VT.a3kXhodtO3tgsdPlEL39Ql4jOuCpOc0s", + "type": "Text", + "definition": "EYA4 g.133462764_(133464858_?)del" + } + } + return VariationDescriptor(**params) + + +def assert_text_variation(query_list, test_normalize): + """Make assertion checks for invalid queries""" + for q in query_list: + resp = test_normalize.normalize(q, "default") + assert (resp.variation.type == "Text"), q + + +def test_genomic_dup1(test_normalize, genomic_dup1_default, + genomic_dup1_cnv, genomic_dup1_rse, + genomic_dup1_free_text_default, + genomic_dup1_free_text_cnv, genomic_dup1_free_text_rse): + """Test that genomic duplication works correctly.""" + q = "NC_000003.12:g.49531262dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup1_default) + + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup1_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup1_cnv) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup1_rse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup1_default) + + q = "NC_000003.11:g.49568695dup" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup1_default, ignore_id=True) + + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup1_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup1_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup1_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup1_default, 
ignore_id=True) + + # Free Text + for q in [ + "DAG1 g.49568695dup", # 37 + "DAG1 g.49531262dup" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup1_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup1_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup1_free_text_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup1_free_text_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup1_free_text_default, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000007.13:g.159138670dup", + "NC_000007.14:g.159345976dup", + "BRAF g.140219337dup", "BRAF g.141024929dup" + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_dup2(test_normalize, genomic_dup2_default, genomic_dup2_cnv, + genomic_dup2_rse, genomic_dup2_free_text_default, + genomic_dup2_free_text_cnv, genomic_dup2_free_text_rse): + """Test that genomic duplication works correctly.""" + q = "NC_000016.10:g.2087938_2087948dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup2_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup2_cnv) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup2_rse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup2_default) + + q = "NC_000016.9:g.2137939_2137949dup" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup2_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup2_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup2_rse, 
ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup2_default, ignore_id=True) + + # Free text + for q in [ + "DMD g.33229407_33229410dup", # 37 + "DMD g.33211290_33211293dup" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup2_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup2_free_text_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup2_free_text_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup2_free_text_default, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000007.13:g.140413127_159138670dup", + "NC_000007.14:g.140413127_159345976dup", + "BRAF g.140219337_140924929dup", "BRAF g.140719326_141024929dup" + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_dup3(test_normalize, genomic_dup3_default, + genomic_dup3_rse_lse, genomic_dup3_free_text_default, + genomic_dup3_free_text_rse_lse): + """Test that genomic duplication works correctly.""" + q = "NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup3_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup3_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup3_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup3_rse_lse) + + q = "NC_000023.10:g.(31078344_31118468)_(33292395_33435268)dup" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup3_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup3_default, ignore_id=True) + + 
genomic_dup3_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup3_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup3_rse_lse, ignore_id=True) + + # Free Text + for q in [ + # TODO: issue-176 + # "DMD g.(31165391_31165395)_(31200854_31200856)dup", + "DMD g.(31147274_31147278)_(31182737_31182739)dup" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup3_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup3_free_text_default, ignore_id=True) + + genomic_dup3_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup3_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup3_free_text_rse_lse, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000023.10:g.(31119221_31119227)_(31119300_155270562)dup", + "NC_000023.11:g.(31119221_31119227)_(31119300_156040899)dup", + "DMD g.(31060227_31100351)_(33274278_33417151)dup" + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_dup4(test_normalize, genomic_dup4_default, + genomic_dup4_rse_lse, genomic_dup4_free_text_default, + genomic_dup4_free_text_rse_lse): + """Test that genomic duplication works correctly.""" + q = "NC_000020.11:g.(?_30417576)_(31394018_?)dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup4_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup4_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup4_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup4_rse_lse) + + q = 
"NC_000020.10:g.(?_29652252)_(29981821_?)dup" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup4_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup4_default, ignore_id=True) + + genomic_dup4_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup4_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup4_rse_lse, ignore_id=True) + + # Free Text + for q in [ + "PRPF8 g.(?_1577736)_(1587865_?)dup", # 37 + "PRPF8 g.(?_1674442)_(1684571_?)dup" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup4_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup4_free_text_default, ignore_id=True) + + genomic_dup4_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + genomic_dup4_free_text_rse_lse.variation.definition = q + assertion_checks(resp, genomic_dup4_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup4_free_text_rse_lse, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000020.10:g.(?_29652252)_(63025530_?)dup", + "NC_000020.11:g.(?_29652252)_(64444169_?)dup", + "PRPF8 g.(?_1650628)_(1684571_?)dup" + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_dup5(test_normalize, genomic_dup5_default, + genomic_dup5_rse_lse, genomic_dup5_free_text_default, + genomic_dup5_free_text_rse_lse): + """Test that genomic duplication works correctly.""" + q = "NC_000023.11:g.(?_154021812)_154092209dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup5_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup5_default) + + resp = 
test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup5_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup5_rse_lse) + + q = "NC_000023.10:g.(?_153287263)_153357667dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup5_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup5_default, ignore_id=True) + + genomic_dup5_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup5_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup5_rse_lse, ignore_id=True) + + # Free Text + for q in [ + "MECP2 g.(?_153287263)_153357667dup", # 37 + "MECP2 g.(?_154021812)_154092209dup" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup5_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup5_free_text_default, ignore_id=True) + + genomic_dup5_free_text_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup5_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup5_free_text_rse_lse, ignore_id=True) + + # Invalid + for q in [ + "NC_000023.10:g.(?_153287263)_155270561dup", + "NC_000023.11:g.(?_154021812)_156040896dup", + "MECP2 g.(?_154021812)_154097733dup" # 37 + "MECP2 g.(?_154021572)_154092209dup", # 38 + ]: + resp = test_normalize.normalize(q, "default") + assert resp.variation.type == "Text" + + +def test_genomic_dup6(test_normalize, genomic_dup6_default, + genomic_dup6_rse_lse, genomic_dup6_free_text_default, + genomic_dup6_free_text_rse_lse): + """Test that genomic duplication works correctly.""" + q = 
"NC_000023.11:g.154021812_(154092209_?)dup" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup6_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup6_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup6_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup6_rse_lse) + + q = "NC_000023.10:g.153287263_(153357667_?)dup" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup6_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup6_default, ignore_id=True) + + genomic_dup6_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup6_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup6_rse_lse, ignore_id=True) + + # Free Text + for q in [ + "MECP2 g.153287263_(153357667_?)dup", # 37 + "MECP2 g.154021812_(154092209_?)dup" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_dup6_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_dup6_free_text_default, ignore_id=True) + + genomic_dup6_free_text_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_dup6_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_dup6_free_text_rse_lse, ignore_id=True) + + # Invalid + for q in [ + "NC_000023.10:g.153287263_(155270561_?)dup", + "NC_000023.11:g.154021812_(156040896_?)dup", + "MECP2 g.154021812_(154097733_?)dup", # 37 + "MECP2 g.154021572_(154092209_?)dup", # 38 + ]: + resp = test_normalize.normalize(q, "default") + assert resp.variation.type
== "Text" + + +def test_genomic_del1(test_normalize, genomic_del1_default, genomic_del1_cnv, + genomic_del1_rse, genomic_del1_free_text_default, + genomic_del1_free_text_cnv, genomic_del1_free_text_rse): + """Test that genomic deletion works correctly.""" + q = "NC_000003.12:g.10149811del" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del1_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del1_cnv) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del1_rse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del1_default) + + q = "NC_000003.11:g.10191495del" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del1_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del1_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del1_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del1_default, ignore_id=True) + + # Free text + for q in [ + "VHL g.10191495del", # 37 + "VHL g.10149811del" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del1_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del1_free_text_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del1_free_text_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del1_free_text_default, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000003.11:g.198022431del", + "NC_000003.12:g.198295567del", + "BRAF g.140413127del", "BRAF g.141024929del" + ] + assert_text_variation(invalid_queries, test_normalize) + + +def 
test_genomic_del2(test_normalize, genomic_del2_default, genomic_del2_cnv, + genomic_del2_rse, genomic_del2_free_text_default, + genomic_del2_free_text_cnv, genomic_del2_free_text_rse): + """Test that genomic deletion works correctly.""" + q = "NC_000003.12:g.10146595_10146613del" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del2_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del2_cnv) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del2_rse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del2_default) + + q = "NC_000003.11:g.10188279_10188297del" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del2_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del2_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del2_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del2_default, ignore_id=True) + + # Free text + for q in [ + "VHL g.10188279_10188297del", # 37 + "VHL g.10146595_10146613del" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del2_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del2_free_text_cnv, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del2_free_text_rse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del2_free_text_default, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000003.12:g.10146595_198295580del", + "NC_000003.11:g.198022435_198022437del", + "BRAF g.140413127_140419136del", "BRAF g.140719326_141024929del" + ] + 
assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_del3(test_normalize, genomic_del3_default, + genomic_del3_rse_lse, genomic_del3_free_text_default, + genomic_del3_free_text_rse_lse): + """Test that genomic deletion works correctly.""" + q = "NC_000023.11:g.(31060227_31100351)_(33274278_33417151)del" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del3_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del3_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del3_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del3_rse_lse) + + q = "NC_000023.10:g.(31078344_31118468)_(33292395_33435268)del" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del3_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del3_default, ignore_id=True) + + genomic_del3_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del3_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del3_rse_lse, ignore_id=True) + + # Free Text + for q in [ + "EFNB1 g.(68059108_68059111)_(68060963_68060968)del", # 37 + "EFNB1 g.(68839265_68839268)_(68841120_68841125)del" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del3_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del3_free_text_default, ignore_id=True) + + genomic_del3_free_text_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del3_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, 
genomic_del3_free_text_rse_lse, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000023.11:g.(156040880_156040883)_(156040896_156040899)del", + "NC_000023.10:g.(155270550_155270555)_(155270560_155270562)del", + "EFNB1 g.(68048863_68048870)_(68842150_68842152)del", # 37 + "EFNB1 g.(68829022_68829030)_(68842150_68842161)del" # 38 + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_del4(test_normalize, genomic_del4_default, + genomic_del4_rse_lse, genomic_uncertain_del_2, + genomic_uncertain_del_y, genomic_del4_free_text_default, + genomic_del4_free_text_rse_lse): + """Test that genomic deletion works correctly.""" + q = "NC_000023.11:g.(?_31120496)_(33339477_?)del" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del4_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del4_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del4_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del4_rse_lse) + + q = "NC_000023.10:g.(?_31138613)_(33357594_?)del" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del4_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del4_default, ignore_id=True) + + genomic_del4_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del4_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del4_rse_lse, ignore_id=True) + + q = "NC_000002.12:g.(?_110104900)_(110207160_?)del" + resp = test_normalize.normalize(q) + assertion_checks(resp, genomic_uncertain_del_2) + + q = "NC_000024.10:g.(?_14076802)_(57165209_?)del" + resp = test_normalize.normalize(q) + assertion_checks(resp, genomic_uncertain_del_y) + + # Free Text + for q 
in [ + # TODO: issue-176 + # "COL4A4 g.(?_227886744)_(227890546_?)del", # 37 + "COL4A4 g.(?_227022028)_(227025830_?)del" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del4_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del4_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del4_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del4_free_text_rse_lse, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000023.11:g.(?_156040899)_(156040900_?)del", + "NC_000024.10:g.(?_155270565)_(155270568_?)del", + "COL4A4 g.(?_227002710)_(227003710_?)del", + "COL4A4 g.(?_227867430)_(228029276_?)del", + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_del5(test_normalize, genomic_del5_default, + genomic_del5_rse_lse, genomic_del5_free_text_default, + genomic_del5_free_text_rse_lse): + """Test that genomic deletion works correctly.""" + q = "NC_000023.11:g.(?_18575354)_18653629del" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del5_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del5_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del5_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del5_rse_lse) + + q = "NC_000023.10:g.(?_18593474)_18671749del" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del5_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del5_default, ignore_id=True) + + genomic_del5_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, 
genomic_del5_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del5_rse_lse, ignore_id=True) + + # Free text + for q in [ + # TODO: issue-176 + # "CDKL5 g.(?_18593474)_18671749del", + "CDKL5 g.(?_18575354)_18653629del" + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del5_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del5_free_text_default, ignore_id=True) + + genomic_del5_free_text_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del5_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del5_free_text_rse_lse, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000023.10:g.(?_155270550)_155270570del", + "NC_000023.11:g.(?_18593474)_18671749del", + "CDKL5 g.(?_18443702)_18671700del", # 37 + "CDKL5 g.(?_18425585)_18653631del", # 38 + "CDKL5 g.(?_18425582)_18653500del" # 38 + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_genomic_del6(test_normalize, genomic_del6_default, + genomic_del6_rse_lse, genomic_del6_free_text_default, + genomic_del6_free_text_rse_lse): + """Test that genomic deletion works correctly.""" + q = "NC_000006.12:g.133462764_(133464858_?)del" # 38 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del6_default) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del6_default) + + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del6_rse_lse) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del6_rse_lse) + + q = "NC_000006.11:g.133783902_(133785996_?)del" # 37 + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del6_default,
ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del6_default, ignore_id=True) + + genomic_del6_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del6_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del6_rse_lse, ignore_id=True) + + # Free text + for q in [ + # TODO: issue-176 + # "EYA4 g.133783902_(133785996_?)del", # 37 + "EYA4 g.133462764_(133464858_?)del" # 38 + ]: + resp = test_normalize.normalize(q, "default") + assertion_checks(resp, genomic_del6_free_text_default, ignore_id=True) + + resp = test_normalize.normalize(q, "cnv") + assertion_checks(resp, genomic_del6_free_text_default, ignore_id=True) + + genomic_del6_free_text_rse_lse.variation.definition = q + resp = test_normalize.normalize(q, "repeated_seq_expr") + assertion_checks(resp, genomic_del6_free_text_rse_lse, ignore_id=True) + + resp = test_normalize.normalize(q, "literal_seq_expr") + assertion_checks(resp, genomic_del6_free_text_rse_lse, ignore_id=True) + + # Invalid + invalid_queries = [ + "NC_000006.11:g.171115069_(171115080_?)del", + "NC_000006.12:g.170805981_(170805989_?)del", + "EYA4 g.133561700_(133853270_?)del", # 37 + "EYA4 g.133561651_(133561708_?)del", # 37 + "EYA4 g.133240513_(133240600_?)del", # 38 + "EYA4 g.133240515_(133532130_?)del" # 38 + ] + assert_text_variation(invalid_queries, test_normalize) + + +def test_parameters(test_normalize): + """Check that valid and invalid parameters work as intended.""" + q = "NC_000003.12:g.49531262dup" + warnings = ["hgvs_dup_del_mode must be one of: ['default', 'cnv', " + "'repeated_seq_expr', 'literal_seq_expr']"] + resp = test_normalize.normalize(q, "copy_number") + assert resp is None + assert test_normalize.warnings == warnings + + resp = test_normalize.normalize(q, "repeated_seq_exprs") + assert resp is None + assert test_normalize.warnings == warnings + + warnings =
["hgvs_dup_del_mode cannot be None"] + resp = test_normalize.normalize(q, '') + assert resp is None + assert test_normalize.warnings == warnings + + resp = test_normalize.normalize(q, None) + assert resp is None + assert test_normalize.warnings == warnings + + resp = test_normalize.normalize(q, " CnV ") + assert resp is not None + assert test_normalize.warnings == [] diff --git a/tests/test_mane_transcript.py b/tests/test_mane_transcript.py index a254a8e0..ce6ab921 100644 --- a/tests/test_mane_transcript.py +++ b/tests/test_mane_transcript.py @@ -83,7 +83,7 @@ def nm_004333v6_g(): 'tx_ac': 'NM_004333.6', 'tx_pos_range': (1967, 2086), 'alt_ac': 'NC_000007.14', - 'alt_pos_change_range': (140753331, 140753333), + 'alt_pos_change_range': (140753336, 140753334), 'alt_pos_range': (140753274, 140753393), 'pos_change': (57, 60), 'strand': '-', @@ -373,6 +373,19 @@ def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, 55259515, gene='EGFR') assert mane_c == egfr_l858r_mane_c + mane_c = test_mane_transcript.g_to_mane_c('NC_000012.11', 25398284, None, + gene='KRAS') + assert mane_c == { + 'refseq': 'NM_004985.5', + 'ensembl': 'ENST00000311936.8', + 'pos': (35, 35), + 'status': 'MANE Select', + 'strand': '-', + 'coding_start_site': 190, + 'coding_end_site': 757, + 'gene': 'KRAS' + } + mane_c = test_mane_transcript.g_to_mane_c('NC_000007.13', 140453136, None, gene='BRAF') assert mane_c == braf_v600e_mane_c diff --git a/tests/test_normalize.py b/tests/test_normalize.py index fbb81ba3..1131eabe 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -1,11 +1,12 @@ """Module for testing the normalize endpoint.""" import pytest from variation.query import QueryHandler -from ga4gh.vrsatile.pydantic.vrsatile_model import VariationDescriptor +from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor from datetime import datetime from variation.main import normalize as normalize_get_response from variation.main import to_vrs as to_vrs_get_response 
import copy +from tests.conftest import assertion_checks @pytest.fixture(scope="module") @@ -93,79 +94,6 @@ def braf_gene_context(): } -@pytest.fixture(scope='module') -def vhl_gene_context(): - """Create a VHL gene context.""" - return { - "id": "normalize.gene:VHL", - "type": "GeneDescriptor", - "label": "VHL", - "gene_id": "hgnc:12687", - "xrefs": [ - "ncbigene:7428", - "ensembl:ENSG00000134086" - ], - "alternate_labels": [ - "HRCA1", - "VHL1", - "RCA1", - "pVHL" - ], - "extensions": [ - { - "type": "Extension", - "name": "symbol_status", - "value": "approved" - }, - { - "type": "Extension", - "name": "approved_name", - "value": "von Hippel-Lindau tumor suppressor" - }, - { - "type": "Extension", - "name": "associated_with", - "value": [ - "ucsc:uc003bvc.4", - "pubmed:9671762", - "refseq:NM_000551", - "cosmic:VHL", - "omim:608537", - "vega:OTTHUMG00000128668", - "ccds:CCDS2598", - "ena.embl:L15409", - "orphanet:120467", - "ccds:CCDS2597", - "uniprot:P40337" - ] - }, - { - "type": "Extension", - "name": "chromosome_location", - "value": { - "_id": - "ga4gh:VCL.S-TtMfLdsgZPVRrWEf1-jiZMyTDCt5y1", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "3", - "interval": { - "end": "p25.3", - "start": "p25.3", - "type": "CytobandInterval" - } - } - }, - { - "name": "previous_symbols", - "value": [ - "RCA1" - ], - "type": "Extension" - } - ] - } - - @pytest.fixture(scope='module') def erbb2_context(): """Create test fixture for ERBB2 Gene Context.""" @@ -391,20 +319,22 @@ def braf_v600e(braf_gene_context): params = { "id": "normalize.variation:BRAF%20V600E", "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.9dA0egRAIfVFDL1sdU1VP7HsBcG0-DtE", + "variation_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", "variation": { + "_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", "location": { + "_id": "ga4gh:VSL.AqrQ-EkAvTrXOFn70_8i3dXF5shBBZ5i", "interval": { - "end": 640, - "start": 639, - "type": "SimpleInterval" + "end": {"value": 640, "type": 
"Number"}, + "start": {"value": 639, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -416,26 +346,129 @@ def braf_v600e(braf_gene_context): return VariationDescriptor(**params) +@pytest.fixture(scope="module") +def dis3_p63a(): + """Create DIS3 P63A test fixture.""" + params = { + "id": "normalize.variation:DIS3%20P63A", + "type": "VariationDescriptor", + "variation_id": "ga4gh:VA.ueB4LJE1wba68a1w1w6Mrb-MOquy8n3C", + "variation": { + "_id": "ga4gh:VA.ueB4LJE1wba68a1w1w6Mrb-MOquy8n3C", + "location": { + "_id": "ga4gh:VSL.JcyesAEvndeQYxn4Gzi68hqC1ANZnrGN", + "interval": { + "end": {"value": 63, "type": "Number"}, + "start": {"value": 62, "type": "Number"}, + "type": "SequenceInterval" + }, + "sequence_id": "ga4gh:SQ.mlWsxfPKINN3o300stAI8oqN5U7P6kEu", + "type": "SequenceLocation" + }, + "state": { + "sequence": "A", + "type": "LiteralSequenceExpression" + }, + "type": "Allele" + }, + "molecule_context": "protein", + "structural_type": "SO:0001606", + "gene_context": { + "id": "normalize.gene:DIS3", + "type": "GeneDescriptor", + "label": "DIS3", + "xrefs": [ + "ensembl:ENSG00000083520", + "ncbigene:22894" + ], + "alternate_labels": [ + "dis3p", + "RRP44", + "KIAA1008", + "2810028N01Rik", + "EXOSC11" + ], + "extensions": [ + { + "name": "symbol_status", + "value": "approved", + "type": "Extension" + }, + { + "name": "approved_name", + "value": "DIS3 homolog, exosome endoribonuclease and 3'-5' exoribonuclease", # noqa: E501 + "type": "Extension" + }, + { + "name": "chromosome_location", + "value": { + "species_id": "taxonomy:9606", + "interval": { + "type": "CytobandInterval", + "start": "q21.33", + "end": "q21.33" + }, + "_id": "ga4gh:VCL.84IPub_nKl33cWX9pNoPeGsyeVuJnyra", + "type": "ChromosomeLocation", + "chr": "13" + }, + "type": "Extension" + }, + { + "name": 
"associated_with", + "value": [ + "vega:OTTHUMG00000017070", + "ccds:CCDS9447", + "orphanet:470196", + "ena.embl:AB023225", + "ccds:CCDS45057", + "omim:607533", + "pubmed:11935316", + "refseq:NM_014953", + "uniprot:Q9Y2L1", + "ccds:CCDS81772", + "ucsc:uc001vix.6", + "pubmed:9562621" + ], + "type": "Extension" + }, + { + "name": "previous_symbols", + "value": [ + "KIAA1008" + ], + "type": "Extension" + } + ], + "gene_id": "hgnc:20604" + }, + "vrs_ref_allele_seq": "P" + } + return VariationDescriptor(**params) + + @pytest.fixture(scope="module") def vhl(vhl_gene_context): """Create VHL Tyr185Ter fixture.""" params = { "id": "normalize.variation:NP_000542.1%3Ap.Tyr185Ter", "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.5Zx8fM1_wE3T_DFPbJgEe5CD-youM0op", + "variation_id": "ga4gh:VA._S0nFwX4Y2FPmv5Radf01DAsxQbxA2cc", "variation": { + "_id": "ga4gh:VA._S0nFwX4Y2FPmv5Radf01DAsxQbxA2cc", "location": { + "_id": "ga4gh:VSL._P3rBWI3f7OBs3a4gvZ18QJ6f6dSfqEQ", "interval": { - "end": 185, - "start": 184, - "type": "SimpleInterval" + "end": {"value": 185, "type": "Number"}, + "start": {"value": 184, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "*", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -453,20 +486,22 @@ def vhl_silent(vhl_gene_context): params = { "id": "normalize.variation:NP_000542.1%3Ap.Pro61%3D", "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.LBNTm7QqFZp1alJHaFKlKuRY9cOfdHeI", + "variation_id": "ga4gh:VA.S1GX6EwJV3exmJAH8MnxS8-S9J4i2Ip_", "variation": { + "_id": "ga4gh:VA.S1GX6EwJV3exmJAH8MnxS8-S9J4i2Ip_", "location": { + "_id": "ga4gh:VSL.zuNGmA02Uq49faqvCIPtwVrF_IJuP4dM", "interval": { - "end": 61, - "start": 60, - "type": "SimpleInterval" + "end": {"value": 61, "type": "Number"}, + "start": {"value": 60, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": 
"ga4gh:SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", "type": "SequenceLocation" }, "state": { "sequence": "P", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -481,11 +516,13 @@ def vhl_silent(vhl_gene_context): @pytest.fixture(scope='module') def braf_v600e_nucleotide(braf_gene_context, braf_nuc_value): """Create a test fixture for BRAF V600E MANE select nucleotide hgvs.""" + variation = copy.deepcopy(braf_nuc_value) + variation["_id"] = "ga4gh:VA.AfzMBlMIDLDZNjEYEhVTH-KWxq7lAN-B" params = { "id": "normalize.variation:NM_004333.4%3Ac.1799T%3EA", "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.19rEOp0IBkrDkUA4gwwM-4Gde08-kBb1", - "variation": braf_nuc_value, + "variation_id": variation["_id"], + "variation": variation, "molecule_context": "transcript", "structural_type": "SO:0001483", "vrs_ref_allele_seq": "T", @@ -500,20 +537,22 @@ def nm_004448_coding_dna_delins(erbb2_context): params = { "id": "normalize.variation:NM_004448.4%3Ac.2326_2327delinsCT", "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.sSFX2CO2DPTvE4MqnJ5VifnaQOGS0CVb", + "variation_id": "ga4gh:VA.eMxxAEjNduAvg5U3eBZxf0nLtfcMNxqy", "variation": { + "_id": "ga4gh:VA.eMxxAEjNduAvg5U3eBZxf0nLtfcMNxqy", "location": { + "_id": "ga4gh:VSL.bBzTvpLChbWE2SZ7X0drm8NQj5rzNqTK", "interval": { - "end": 2502, - "start": 2500, - "type": "SimpleInterval" + "end": {"value": 2502, "type": "Number"}, + "start": {"value": 2500, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", "type": "SequenceLocation" }, "state": { "sequence": "CT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -531,20 +570,22 @@ def nc_000007_genomic_delins(braf_gene_context): params = { "id": "normalize.variation:NC_000007.13%3Ag.140453135_140453136delinsAT", # noqa: E501 "type": "VariationDescriptor", - "variation_id": "ga4gh:VA._GzAG8_K8YwcYQk6bEvINNGM_hEViytU", + "variation_id": 
"ga4gh:VA.4387UZ6Yssh3XCGKjm71z_WtadpBZT3O", "variation": { + "_id": "ga4gh:VA.4387UZ6Yssh3XCGKjm71z_WtadpBZT3O", "location": { + "_id": "ga4gh:VSL.6PeoFwkO4ISmUjDWoYLkVsATVx8JRApd", "interval": { - "end": 2146, - "start": 2144, - "type": "SimpleInterval" + "end": {"value": 2146, "type": "Number"}, + "start": {"value": 2144, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -562,20 +603,22 @@ def nm_000551(vhl_gene_context): params = { "id": 'normalize.variation:temp', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.SjJnUcJL1EyRFUb6f8PSJA4u3fyin2Wj", + "variation_id": "ga4gh:VA._JN_AF5PO9kWKgKxB5T48cypZl7ccEsQ", "variation": { + "_id": "ga4gh:VA._JN_AF5PO9kWKgKxB5T48cypZl7ccEsQ", "location": { + "_id": "ga4gh:VSL.tQRFfWMPPHErWSbKvoEIh2gI2ehHIYAs", "interval": { - "end": 685, - "start": 684, - "type": "SimpleInterval" + "end": {"value": 685, "type": "Number"}, + "start": {"value": 684, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -592,17 +635,18 @@ def braf_nuc_value(): """Create test fixture for BRAF V600E value on c. 
coordinate.""" return { "location": { + "_id": "ga4gh:VSL.qF6Dh-Rk6DY75gAmJrIdNYDN8xhaf_Nr", "interval": { - "end": 2145, - "start": 2144, - "type": "SimpleInterval" + "end": {"value": 2145, "type": "Number"}, + "start": {"value": 2144, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.I_0feOk5bZ3VfH8ejhWQiMDe9o6o4QdR", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" } @@ -613,10 +657,11 @@ def coding_dna_silent_mutation(braf_gene_context, braf_nuc_value): """Create test fixture for NM_004333.4:c.1799=.""" value = copy.deepcopy(braf_nuc_value) value['state']['sequence'] = 'T' + value['_id'] = "ga4gh:VA.9wvlCJDeaw5HxwmUJg8qkcoUoT4A3azR" params = { "id": 'normalize.variation:NM_004333.4%3Ac.1799%3D', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.bVNMOANetNE2z4PZ1j0DmwUL1rULmqkN", + "variation_id": value['_id'], "variation": value, "molecule_context": "transcript", "structural_type": "SO:0002073", @@ -631,10 +676,11 @@ def nc_000007_silent_mutation(braf_gene_context, braf_nuc_value): """Create test fixture for NC_000007.13:g.140453136=.""" value = copy.deepcopy(braf_nuc_value) value['state']['sequence'] = 'T' + value['_id'] = "ga4gh:VA.9wvlCJDeaw5HxwmUJg8qkcoUoT4A3azR" params = { "id": 'normalize.variation:NC_000007.13%3Ag.140453136%3D', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.bVNMOANetNE2z4PZ1j0DmwUL1rULmqkN", + "variation_id": value['_id'], "variation": value, "molecule_context": "transcript", "structural_type": "SO:0002073", @@ -650,20 +696,22 @@ def amino_acid_delins(egfr_context): params = { "id": 'normalize.variation:NP_001333827.1%3Ap.Leu747_Thr751delinsPro', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.drLuUW5T542RCeDlVo4zbQ-_tcAiEnb6", + "variation_id": "ga4gh:VA.eDMXxJw9shlSKF3znIg5abniGoyJ3GQ4", "variation": { + "_id": "ga4gh:VA.eDMXxJw9shlSKF3znIg5abniGoyJ3GQ4", "location": { + "_id": 
"ga4gh:VSL.Mm8duqYDJyel5ZnwScnxLyGH1i9lcl3T", "interval": { - "end": 751, - "start": 746, - "type": "SimpleInterval" + "end": {"value": 751, "type": "Number"}, + "start": {"value": 746, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", "type": "SequenceLocation" }, "state": { "sequence": "P", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -683,20 +731,22 @@ def amino_acid_deletion_np_range(erbb2_context): params = { "id": 'normalize.variation:NP_004439.2%3Ap.Leu755_Thr759del', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.Kzk2XquE5w5Ujd_tPBLVOZcylXMP8xbW", + "variation_id": "ga4gh:VA.rFwsfnekdWjwKNmsAw9fZOCGgIvcMnCn", "variation": { + "_id": "ga4gh:VA.rFwsfnekdWjwKNmsAw9fZOCGgIvcMnCn", "location": { + "_id": "ga4gh:VSL.vhpNJ0vsJx3WbnCfwJzxFU-wWyZwvPdL", "interval": { - "end": 759, - "start": 754, - "type": "SimpleInterval" + "end": {"value": 759, "type": "Number"}, + "start": {"value": 754, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.AF1UFydIo02-bMplonKSfxlWY2q6ze3m", "type": "SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -716,20 +766,22 @@ def coding_dna_deletion(erbb2_context): params = { "id": 'normalize.variation:NM_004448.3%3Ac.2263_2277delTTGAGGGAAAACACA', # noqa: E501 "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.pMmAfNtE5g3O3gZ6E9e-uyXJtwbN9whN", + "variation_id": "ga4gh:VA.tMjlwNf2mYOKPbXXwGo4IKd_OtHuVfMT", "variation": { + "_id": "ga4gh:VA.tMjlwNf2mYOKPbXXwGo4IKd_OtHuVfMT", "location": { + "_id": "ga4gh:VSL.3uPWAjsdzd8MbAqw8DV46eBLK8tQRyEs", "interval": { - "end": 2453, - "start": 2437, - "type": "SimpleInterval" + "end": {"value": 2453, "type": "Number"}, + "start": {"value": 2437, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", "type": 
"SequenceLocation" }, "state": { "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -741,57 +793,28 @@ def coding_dna_deletion(erbb2_context): return VariationDescriptor(**params) -@pytest.fixture(scope='module') -def genomic_deletion(vhl_gene_context): - """Create test fixture for genomic deletion range.""" - params = { - "id": 'normalize.variation:NC_000003.11%3Ag.10188279_10188297del', - "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.uagNswLQY5rgN2c30_J3-45UMpIySM4C", - "variation": { - "location": { - "interval": { - "end": 510, - "start": 491, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", - "type": "SequenceLocation" - }, - "state": { - "sequence": "", - "type": "SequenceState" - }, - "type": "Allele" - }, - "molecule_context": "transcript", - "structural_type": "SO:0000159", - "vrs_ref_allele_seq": "ATGTTGACGGACAGCCTAT", - "gene_context": vhl_gene_context - } - return VariationDescriptor(**params) - - @pytest.fixture(scope='module') def amino_acid_insertion(egfr_context): """Create test fixture for NP amino acid insertion.""" params = { "id": 'normalize.variation:NP_005219.2%3Ap.Asp770_Asn771insGlyLeu', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.d3dLtsYaLYE2Yh_GENvPUtTVZWlwLnJw", + "variation_id": "ga4gh:VA.t_WLqe5efVQlBmdbIBgqIeLRu2rSJDJJ", "variation": { + "_id": "ga4gh:VA.t_WLqe5efVQlBmdbIBgqIeLRu2rSJDJJ", "location": { + "_id": "ga4gh:VSL.DJIP1jlxQIro1oC5re8txtH7N8vAvM7A", "interval": { - "end": 770, - "start": 770, - "type": "SimpleInterval" + "end": {"value": 770, "type": "Number"}, + "start": {"value": 770, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", "type": "SequenceLocation" }, "state": { "sequence": "GL", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -808,20 +831,22 @@ def 
coding_dna_insertion(limk2_gene_context): params = { "id": 'normalize.variation:ENST00000331728.9%3Ac.2049_2050insA', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.195Sg1AkyM4uQOhxLhBhANe2BUbnbEcR", + "variation_id": "ga4gh:VA.RAcEF24tRNB_J5Hz9E6GY-qQSi4ZG932", "variation": { + "_id": "ga4gh:VA.RAcEF24tRNB_J5Hz9E6GY-qQSi4ZG932", "location": { + "_id": "ga4gh:VSL.WKymELRFwu4LDDmH5ci5Ip0M3XA2RObr", "interval": { - "end": 2160, - "start": 2160, - "type": "SimpleInterval" + "end": {"value": 2160, "type": "Number"}, + "start": {"value": 2160, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -838,20 +863,22 @@ def genomic_insertion(erbb2_context): params = { "id": 'normalize.variation:NC_000017.10%3Ag.37880993_37880994insGCTTACGTGATG', # noqa: E501 "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.qk5UNMFwxqQQWjO6CGMk3tryHBN3Sm_P", + "variation_id": "ga4gh:VA.nHB0_mpsq2t90S-znr81oCi2cY5CMdUe", "variation": { + "_id": "ga4gh:VA.nHB0_mpsq2t90S-znr81oCi2cY5CMdUe", "location": { + "_id": "ga4gh:VSL.E0o4HCXjy1EUthF1m32oj_Bc45g5YmEm", "interval": { - "end": 2500, - "start": 2488, - "type": "SimpleInterval" + "end": {"value": 2500, "type": "Number"}, + "start": {"value": 2488, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", "type": "SequenceLocation" }, "state": { "sequence": "TACGTGATGGCTTACGTGATGGCT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -869,20 +896,22 @@ def genomic_substitution(egfr_context): params = { "id": 'normalize.variation:NC_000007.13%3Ag.55249071C%3ET', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.s_9_u_eDwRQMTK8eeWKGEQmfNjdKVuRk", + "variation_id": "ga4gh:VA.VkcuqgqMuSQeq8Hy0VPOGRIeyr8uSBV2", "variation": { 
+ "_id": "ga4gh:VA.VkcuqgqMuSQeq8Hy0VPOGRIeyr8uSBV2", "location": { + "_id": "ga4gh:VSL.G1gIZ-om-8Exl3F0ZLxXYY8CjliwCaO1", "interval": { - "end": 2630, - "start": 2629, - "type": "SimpleInterval" + "end": {"value": 2630, "type": "Number"}, + "start": {"value": 2629, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.d_QsP29RWJi6bac7GOC9cJ9AO7s_HUMN", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -900,20 +929,22 @@ def genomic_sub_grch38(): params = { "id": 'normalize.variation:NC_000007.13%3Ag.55249071C%3ET', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.vWT6m5QcrdIJ37MfeQRsEO0avQiufIEx", + "variation_id": "ga4gh:VA.1ewlywoD423K7YH_K4YefZg6J_87pQTp", "variation": { + "_id": "ga4gh:VA.1ewlywoD423K7YH_K4YefZg6J_87pQTp", "location": { + "_id": "ga4gh:VSL.0p1nWj9-sryfUD5jvPTZZdnZeiHVHXls", "interval": { - "end": 55181378, - "start": 55181377, - "type": "SimpleInterval" + "end": {"value": 55181378, "type": "Number"}, + "start": {"value": 55181377, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -930,20 +961,22 @@ def egfr_grch38_sub(genomic_sub_grch38, egfr_context): params = { "id": 'normalize.variation:NC_000007.13%3Ag.55249071C%3ET', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.vWT6m5QcrdIJ37MfeQRsEO0avQiufIEx", + "variation_id": "ga4gh:VA.1ewlywoD423K7YH_K4YefZg6J_87pQTp", "variation": { + "_id": "ga4gh:VA.1ewlywoD423K7YH_K4YefZg6J_87pQTp", "location": { + "_id": "ga4gh:VSL.0p1nWj9-sryfUD5jvPTZZdnZeiHVHXls", "interval": { - "end": 55181378, - "start": 55181377, - "type": "SimpleInterval" + "end": {"value": 55181378, "type": "Number"}, + "start": {"value": 55181377, "type": "Number"}, + "type": 
"SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -963,8 +996,10 @@ def genomic_uncertain_del_x(): "type": "VariationDescriptor", "variation_id": "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk", "variation": { + "_id": "ga4gh:VCN.yQJnQz12MXlZGWx6BuzccVGrCCic_tMk", "subject": { "location": { + "_id": "ga4gh:VSL.7OJ5EFgu_2C4zPFDUBgn-ziE6BZwsRcv", "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "interval": { "start": { @@ -997,108 +1032,28 @@ def genomic_uncertain_del_x(): return VariationDescriptor(**params) -@pytest.fixture(scope='module') -def genomic_uncertain_del_2(): - """Create a genomic uncertain deletion on chr 2 test fixture.""" - params = { - "id": 'normalize.variation:NC_000002.12%3Ag.%28%3F_110104900%29_%28110207160_%3F%29del', # noqa: E501 - "type": "VariationDescriptor", - "variation_id": "ga4gh:VCN.8o5X1HTglUvwUAFo9vGL5OBnZqgpylys", - "variation": { - "subject": { - "location": { - "sequence_id": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", - "interval": { - "start": { - "value": 110104899, - "comparator": "<=", - "type": "IndefiniteRange" - }, - "end": { - "value": 110207160, - "comparator": ">=", - "type": "IndefiniteRange" - }, - "type": "SequenceInterval" - }, - "type": "SequenceLocation" - }, - "reverse_complement": False, - "type": "DerivedSequenceExpression" - }, - "copies": { - "value": 1, - "type": "Number" - }, - "type": "CopyNumber" - }, - "molecule_context": "genomic", - "structural_type": "SO:0001743" - } - return VariationDescriptor(**params) - - -@pytest.fixture(scope='module') -def genomic_uncertain_del_y(): - """Create a genomic uncertain deletion on chr Y test fixture.""" - params = { - "id": 'normalize.variation:NC_000024.10%3Ag.%28%3F_14076802%29_%2857165209_%3F%29del', # noqa: E501 - "type": "VariationDescriptor", - "variation_id": 
"ga4gh:VCN._T4dHJIfXB-cpqQSJ5g5pAM1JnwupWuv", - "variation": { - "subject": { - "location": { - "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", - "interval": { - "start": { - "value": 14076801, - "comparator": "<=", - "type": "IndefiniteRange" - }, - "end": { - "value": 57165209, - "comparator": ">=", - "type": "IndefiniteRange" - }, - "type": "SequenceInterval" - }, - "type": "SequenceLocation" - }, - "reverse_complement": False, - "type": "DerivedSequenceExpression" - }, - "copies": { - "value": 0, - "type": "Number" - }, - "type": "CopyNumber" - }, - "molecule_context": "genomic", - "structural_type": "SO:0001743" - } - return VariationDescriptor(**params) - - @pytest.fixture(scope='module') def grch38_braf_genom_sub(): """Create a genomic substitution GRCh38 test fixture for BRAF.""" params = { "id": 'normalize.variation:NC_000007.13%3Ag.140453136A%3ET', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.HaPTmn-rrjRoZnIVw1I4AZPa6YHa2ojh", + "variation_id": "ga4gh:VA.fZiBjQEolbkL0AxjoTZf4SOkFy9J0ebU", "variation": { + "_id": "ga4gh:VA.fZiBjQEolbkL0AxjoTZf4SOkFy9J0ebU", "location": { + "_id": "ga4gh:VSL.zga82-TpYiNmBESCfvDvAz9DyvJF98I-", "interval": { - "end": 140753336, - "start": 140753335, - "type": "SimpleInterval" + "end": {"value": 140753336, "type": "Number"}, + "start": {"value": 140753335, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "T", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -1115,20 +1070,22 @@ def grch38_braf_genom_silent_mutation(): params = { "id": 'normalize.variation:NC_000007.13%3Ag.140453136%3D', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.VkKteRkWR9MncFR3j4EICtAfdD4ZwR-1", + "variation_id": "ga4gh:VA.aMwnr5rEbtPQe5gXDDO2gZO_zSqN2RmH", "variation": { + "_id": "ga4gh:VA.aMwnr5rEbtPQe5gXDDO2gZO_zSqN2RmH", "location": { + "_id": 
"ga4gh:VSL.zga82-TpYiNmBESCfvDvAz9DyvJF98I-", "interval": { - "end": 140753336, - "start": 140753335, - "type": "SimpleInterval" + "end": {"value": 140753336, "type": "Number"}, + "start": {"value": 140753335, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "A", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -1144,22 +1101,24 @@ def grch38_genomic_delins1(): """Create a test fixture for NC_000007.13:g.140453135_140453136delinsAT.""" params = { "id": - 'normalize.variation:NC_000007.13%3Ag.140453135_140453136delinsAT', + "normalize.variation:NC_000007.13%3Ag.140453135_140453136delinsAT", "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.5cmrRcsVIUBiFii-tOHhxNPNA3OB69fe", + "variation_id": "ga4gh:VA.mlJVnI7js6Tsb2GSLFlNRbCKE9zFRX5p", "variation": { + "_id": "ga4gh:VA.mlJVnI7js6Tsb2GSLFlNRbCKE9zFRX5p", "location": { + "_id": "ga4gh:VSL.b0Ldj2KcT2k0n0PZfqHCBH1YzQZYceYX", "interval": { - "end": 140753336, - "start": 140753334, - "type": "SimpleInterval" + "end": {"value": 140753336, "type": "Number"}, + "start": {"value": 140753334, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceLocation" }, "state": { "sequence": "AT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -1176,20 +1135,22 @@ def grch38_genomic_delins2(): params = { "id": 'normalize.variation:NC_000003.12%3Ag.10149938delinsAA', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.-f07PmCJYlYbzBHVUQzDYWguUU4usche", + "variation_id": "ga4gh:VA.ndCPwzek-KU626kK28bQd1gWAAk2ELze", "variation": { + "_id": "ga4gh:VA.ndCPwzek-KU626kK28bQd1gWAAk2ELze", "location": { + "_id": "ga4gh:VSL.mVulVOKoX2frLH1XTIJfpvJa6RGdOVu_", "interval": { - "start": 10149937, - "end": 10149938, - "type": "SimpleInterval" + "start": {"value": 
10149937, "type": "Number"}, + "end": {"value": 10149938, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", "type": "SequenceLocation" }, "state": { "sequence": "AA", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -1206,20 +1167,22 @@ def grch38_genomic_deletion(): params = { "id": 'normalize.variation:NC_000003.11%3Ag.10188279_10188297del', "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.Id8H1AfB1DsgBPZnWJD3X4AyrpelKegQ", + "variation_id": "ga4gh:VA.CSWNhR5w_geMmJTxkbO3UCLCvT0S2Ypx", "variation": { + "_id": "ga4gh:VA.CSWNhR5w_geMmJTxkbO3UCLCvT0S2Ypx", "location": { + "_id": "ga4gh:VSL.lksYAhEQvP8biy_nxoOJ_Zwu75a_kYtQ", + "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", "interval": { - "end": 10146613, - "start": 10146594, - "type": "SimpleInterval" + "type": "SequenceInterval", + "start": {"value": 10146594, "type": "Number"}, + "end": {"value": 10146613, "type": "Number"}, }, - "sequence_id": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", - "type": "SequenceLocation" + "type": "SequenceLocation", }, "state": { - "sequence": "", - "type": "SequenceState" + "type": "LiteralSequenceExpression", + "sequence": "" }, "type": "Allele" }, @@ -1238,20 +1201,22 @@ def grch38_genomic_insertion(): params = { "id": 'normalize.variation:NC_000017.10%3Ag.37880993_37880994insGCTTACGTGATG', # noqa: E501 "type": "VariationDescriptor", - "variation_id": "ga4gh:VA.HxgI7LQ80HI0Q3t1letlRiWSNz8C8ea-", + "variation_id": "ga4gh:VA.tCjV190dUsV7tSjdR8qOLSQIR7Hr8VMe", "variation": { + "_id": "ga4gh:VA.tCjV190dUsV7tSjdR8qOLSQIR7Hr8VMe", "location": { + "_id": "ga4gh:VSL.fJ80Ab9JP0GXtDNeEaoDxE35tlI-k9Cd", "interval": { - "end": 39724743, - "start": 39724731, - "type": "SimpleInterval" + "end": {"value": 39724743, "type": "Number"}, + "start": {"value": 39724731, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", 
"type": "SequenceLocation" }, "state": { "sequence": "TACGTGATGGCTTACGTGATGGCT", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -1262,45 +1227,7 @@ def grch38_genomic_insertion(): return VariationDescriptor(**params) -def assertion_checks(normalize_response, test_variation): - """Check that normalize_response and test_variation are equal.""" - assert normalize_response.id == test_variation.id - assert normalize_response.type == test_variation.type - assert normalize_response.variation_id == test_variation.variation_id - assert normalize_response.variation == test_variation.variation - assert normalize_response.molecule_context == \ - test_variation.molecule_context - assert normalize_response.structural_type == test_variation.structural_type - assert normalize_response.vrs_ref_allele_seq == \ - test_variation.vrs_ref_allele_seq - - resp_gene_context = normalize_response.gene_context - test_variation_context = test_variation.gene_context - if resp_gene_context: - assert resp_gene_context.id == test_variation_context.id - assert resp_gene_context.label == test_variation_context.label - assert resp_gene_context.gene_id == test_variation_context.gene_id - assert set(resp_gene_context.xrefs) ==\ - set(test_variation_context.xrefs) - if test_variation_context.alternate_labels: - assert set(resp_gene_context.alternate_labels) == \ - set(test_variation_context.alternate_labels) - assert len(resp_gene_context.extensions) == \ - len(test_variation_context.extensions) - for resp_ext in resp_gene_context.extensions: - for test_var in test_variation_context.extensions: - if resp_ext.name == test_var.name: - if resp_ext.name == 'chromosome_location': - assert resp_ext.value == test_var.value - elif resp_ext.name == 'associated_with': - assert set(resp_ext.value) == set(test_var.value) - else: - assert resp_ext.value == test_var.value - else: - assert not test_variation_context - - -def test_amino_acid_substitution(test_normalize, 
braf_v600e): +def test_amino_acid_substitution(test_normalize, braf_v600e, dis3_p63a): """Test that amino acid substitutions normalize correctly.""" resp = test_normalize.normalize(' BRAF V600E ') assertion_checks(resp, braf_v600e) @@ -1322,6 +1249,9 @@ def test_amino_acid_substitution(test_normalize, braf_v600e): resp.id = braf_id assertion_checks(resp, braf_v600e) + resp = test_normalize.normalize('DIS3 P63A') + assertion_checks(resp, dis3_p63a) + def test_polypeptide_truncation(test_normalize, vhl): """Test that polypeptide truncations normalize correctly.""" @@ -1509,18 +1439,6 @@ def test_coding_dna_deletion(test_normalize, coding_dna_deletion): assertion_checks(resp, coding_dna_deletion) -def test_genomic_deletion(test_normalize, genomic_deletion, - grch38_genomic_deletion): - """Test that genomic deletion normalizes correctly.""" - resp = test_normalize.normalize('NC_000003.11:g.10188279_10188297del') - assertion_checks(resp, grch38_genomic_deletion) - - resp = test_normalize.normalize('VHL g.10188279_10188297del') - assert resp.id == 'normalize.variation:VHL%20g.10188279_10188297del' - resp.id = 'normalize.variation:NC_000003.11%3Ag.10188279_10188297del' - assertion_checks(resp, genomic_deletion) - - def test_amino_acid_insertion(test_normalize, amino_acid_insertion): """Test that amino acid insertion normalizes correctly.""" resp = test_normalize.normalize('NP_005219.2:p.Asp770_Asn771insGlyLeu') @@ -1557,10 +1475,11 @@ def test_coding_dna_insertion(test_normalize, coding_dna_insertion): resp = test_normalize.normalize('ENST00000331728.9:c.2049_2050insA') assertion_checks(resp, coding_dna_insertion) - resp = test_normalize.normalize('LIMK2 c.2049_2050insA') - assert resp.id == 'normalize.variation:LIMK2%20c.2049_2050insA' - resp.id = 'normalize.variation:ENST00000331728.9%3Ac.2049_2050insA' - assertion_checks(resp, coding_dna_insertion) + # TODO: issue-136 + # resp = test_normalize.normalize('LIMK2 c.2049_2050insA') + # assert resp.id == 
'normalize.variation:LIMK2%20c.2049_2050insA' + # resp.id = 'normalize.variation:ENST00000331728.9%3Ac.2049_2050insA' + # assertion_checks(resp, coding_dna_insertion) def test_genomic_insertion(test_normalize, genomic_insertion, @@ -1577,31 +1496,6 @@ def test_genomic_insertion(test_normalize, genomic_insertion, assertion_checks(resp, genomic_insertion) -def test_genomic_uncertain_deletion(test_normalize, genomic_uncertain_del_x, - genomic_uncertain_del_2, - genomic_uncertain_del_y): - """Test that genomic uncertain deletion normalizes correctly.""" - # 38 Assembly - resp = test_normalize.normalize( - 'NC_000023.11:g.(?_31120496)_(33339477_?)del') - assertion_checks(resp, genomic_uncertain_del_x) - - # 37 Assembly - resp = test_normalize.normalize( - 'NC_000023.10:g.(?_31138613)_(33357594_?)del') - assert resp.id == 'normalize.variation:NC_000023.10%3Ag.%28%3F_31138613%29_%2833357594_%3F%29del' # noqa: E501 - resp.id = 'normalize.variation:NC_000023.11%3Ag.%28%3F_31120496%29_%2833339477_%3F%29del' # noqa: E501 - assertion_checks(resp, genomic_uncertain_del_x) - - resp = test_normalize.normalize( - 'NC_000002.12:g.(?_110104900)_(110207160_?)del') - assertion_checks(resp, genomic_uncertain_del_2) - - resp = test_normalize.normalize( - 'NC_000024.10:g.(?_14076802)_(57165209_?)del') - assertion_checks(resp, genomic_uncertain_del_y) - - def test_no_matches(test_normalize): """Test no matches work correctly.""" queries = [ @@ -1623,7 +1517,9 @@ def test_no_matches(test_normalize): resp = test_normalize.normalize('clinvar:10') assert resp.type == 'VariationDescriptor' + assert resp.variation.type == 'Text' assert resp.variation.definition == 'clinvar:10' + assert resp.variation.id == 'ga4gh:VT.xw9m9LZAyn6Z2-GPGwcpDT0ixqCm5g36' resp = test_normalize.normalize(' ') assert resp is None @@ -1637,14 +1533,14 @@ def test_no_matches(test_normalize): def test_service_meta(): """Test that service meta info populates correctly.""" - response = normalize_get_response('BRAF v600e') + 
response = normalize_get_response('BRAF v600e', 'default') service_meta = response.service_meta_ assert service_meta.name == "variation-normalizer" assert service_meta.version assert isinstance(service_meta.response_datetime, datetime) assert service_meta.url == 'https://github.com/cancervariants/variation-normalization' # noqa: E501 - response = normalize_get_response('this-wont-normalize') + response = normalize_get_response('this-wont-normalize', 'default') service_meta = response.service_meta_ assert service_meta.name == "variation-normalizer" assert service_meta.version diff --git a/tests/tokenizers/test_genomic_deletion_range.py b/tests/tokenizers/test_genomic_deletion_range.py new file mode 100644 index 00000000..06001a31 --- /dev/null +++ b/tests/tokenizers/test_genomic_deletion_range.py @@ -0,0 +1,20 @@ +"""A module for testing Genomic Deletion Ranges tokenization.""" +import unittest +from variation.tokenizers import GenomicDeletionRange +from .tokenizer_base import TokenizerBase + + +class TestGenomicDeletionRangeTokenizer(TokenizerBase, unittest.TestCase): + """A class for testing Genomic Deletion Range Tokenization.""" + + def tokenizer_instance(self): + """Return Genomic Deletion Range instance.""" + return GenomicDeletionRange() + + def token_type(self): + """Return genomic deletion range token type.""" + return 'GenomicDeletionRange' + + def fixture_name(self): + """Return the fixture name for genomic deletion range.""" + return 'genomic_deletion_range' diff --git a/tests/tokenizers/test_genomic_duplication.py b/tests/tokenizers/test_genomic_duplication.py new file mode 100644 index 00000000..029951c4 --- /dev/null +++ b/tests/tokenizers/test_genomic_duplication.py @@ -0,0 +1,20 @@ +"""A module for testing Genomic Duplication Tokenization.""" +import unittest +from variation.tokenizers import GenomicDuplication +from .tokenizer_base import TokenizerBase + + +class TestGenomicDuplicationTokenizer(TokenizerBase, unittest.TestCase): + """A class for 
testing Genomic Duplication Tokenization.""" + + def tokenizer_instance(self): + """Return Genomic Duplication instance.""" + return GenomicDuplication() + + def token_type(self): + """Return genomic duplication token type.""" + return 'GenomicDuplication' + + def fixture_name(self): + """Return the fixture name for Genomic Duplication.""" + return 'genomic_duplication' diff --git a/tests/tokenizers/test_genomic_duplication_range.py b/tests/tokenizers/test_genomic_duplication_range.py new file mode 100644 index 00000000..d8f00335 --- /dev/null +++ b/tests/tokenizers/test_genomic_duplication_range.py @@ -0,0 +1,20 @@ +"""A module for testing Genomic Duplication Range Tokenization.""" +import unittest +from variation.tokenizers import GenomicDuplication +from .tokenizer_base import TokenizerBase + + +class TestGenomicDuplicationRangeTokenizer(TokenizerBase, unittest.TestCase): + """A class for testing Genomic Duplication Tokenization.""" + + def tokenizer_instance(self): + """Return Genomic Duplication instance.""" + return GenomicDuplication() + + def token_type(self): + """Return genomic duplication token type.""" + return 'GenomicDuplicationRange' + + def fixture_name(self): + """Return the fixture name for Genomic Duplication.""" + return 'genomic_duplication_range' diff --git a/tests/tokenizers/tokenizer_base.py b/tests/tokenizers/tokenizer_base.py index 68364cc7..20ae1ce4 100644 --- a/tests/tokenizers/tokenizer_base.py +++ b/tests/tokenizers/tokenizer_base.py @@ -38,4 +38,7 @@ def test_not_matches(self): """Test that tokenizer matches correctly.""" for x in self.fixtures['should_not_match']: res = self.tokenizer_instance().match(x['token']) - self.assertIsNone(res, msg=x) + try: + self.assertIsNone(res, msg=x) + except AssertionError: + assert self.token_type() != res.token_type diff --git a/tests/translators/test_amino_acid_deletion.py b/tests/translators/test_amino_acid_deletion.py index d5b85140..47aace54 100644 --- 
a/tests/translators/test_amino_acid_deletion.py +++ b/tests/translators/test_amino_acid_deletion.py @@ -28,12 +28,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AAD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def translator_instance(self): """Return amino acid deletion instance.""" diff --git a/tests/translators/test_amino_acid_delins.py b/tests/translators/test_amino_acid_delins.py index 997c7181..488bc773 100644 --- a/tests/translators/test_amino_acid_delins.py +++ b/tests/translators/test_amino_acid_delins.py @@ -28,12 +28,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AAD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def translator_instance(self): """Return amino acid delins instance.""" diff --git a/tests/translators/test_amino_acid_insertion.py b/tests/translators/test_amino_acid_insertion.py index ed967af5..276727ba 100644 --- a/tests/translators/test_amino_acid_insertion.py +++ b/tests/translators/test_amino_acid_insertion.py @@ -28,12 +28,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AAI_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), 
MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def translator_instance(self): """Return amino acid insertion instance.""" diff --git a/tests/translators/test_amino_acid_substitution.py b/tests/translators/test_amino_acid_substitution.py index 5a2e876a..cec611ef 100644 --- a/tests/translators/test_amino_acid_substitution.py +++ b/tests/translators/test_amino_acid_substitution.py @@ -28,12 +28,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AASUB_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def translator_instance(self): """Return amino acid substitution instance.""" diff --git a/tests/translators/test_coding_dna_deletion.py b/tests/translators/test_coding_dna_deletion.py index b675d9a7..78c0efbb 100644 --- a/tests/translators/test_coding_dna_deletion.py +++ b/tests/translators/test_coding_dna_deletion.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CDNAD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_coding_dna_delins.py b/tests/translators/test_coding_dna_delins.py index 272dc7a9..dba4248d 100644 --- a/tests/translators/test_coding_dna_delins.py +++ 
b/tests/translators/test_coding_dna_delins.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CDNADELINS_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_coding_dna_insertion.py b/tests/translators/test_coding_dna_insertion.py index 62fec15a..29e15a21 100644 --- a/tests/translators/test_coding_dna_insertion.py +++ b/tests/translators/test_coding_dna_insertion.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CDNAD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_coding_dna_silent_mutation.py b/tests/translators/test_coding_dna_silent_mutation.py index de04e6b9..f24b60c5 100644 --- a/tests/translators/test_coding_dna_silent_mutation.py +++ b/tests/translators/test_coding_dna_silent_mutation.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CDNASM_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git 
a/tests/translators/test_coding_dna_substitution.py b/tests/translators/test_coding_dna_substitution.py index d029fe4e..d89833ee 100644 --- a/tests/translators/test_coding_dna_substitution.py +++ b/tests/translators/test_coding_dna_substitution.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CDNASUB_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_genomic_deletion.py b/tests/translators/test_genomic_deletion.py index 5f905107..546d12c3 100644 --- a/tests/translators/test_genomic_deletion.py +++ b/tests/translators/test_genomic_deletion.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_genomic_delins.py b/tests/translators/test_genomic_delins.py index 2b24e443..2aad2dda 100644 --- a/tests/translators/test_genomic_delins.py +++ b/tests/translators/test_genomic_delins.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GENOMICDELINS_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, 
transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_genomic_duplication.py b/tests/translators/test_genomic_duplication.py new file mode 100644 index 00000000..aeacd7c9 --- /dev/null +++ b/tests/translators/test_genomic_duplication.py @@ -0,0 +1,45 @@ +"""Module for testing Genomic Duplication Translator.""" +import unittest +from variation.classifiers import GenomicDuplicationClassifier +from variation.translators import GenomicDuplication +from variation.validators import GenomicDuplication as GD_V +from .translator_base import TranslatorBase +from variation.tokenizers import GeneSymbol +from variation.data_sources import TranscriptMappings, SeqRepoAccess, \ + MANETranscriptMappings, UTA +from variation.mane_transcript import MANETranscript +from ga4gh.vrs.dataproxy import SeqRepoDataProxy +from ga4gh.vrs.extras.translator import Translator +from gene.query import QueryHandler as GeneQueryHandler + + +class TestGenomicDuplicationTranslator(TranslatorBase, unittest.TestCase): + """A class to test the Genomic Duplication Translator.""" + + def classifier_instance(self): + """Return genomic duplication instance.""" + return GenomicDuplicationClassifier() + + def validator_instance(self): + """Return genomic duplication instance.""" + seqrepo_access = SeqRepoAccess() + transcript_mappings = TranscriptMappings() + uta = UTA() + dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) + tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() + return GD_V( + seqrepo_access, transcript_mappings, + GeneSymbol(gene_normalizer), + MANETranscript(seqrepo_access, transcript_mappings, + MANETranscriptMappings(), uta), + uta, dp, tlr, gene_normalizer + ) + + def translator_instance(self): + """Return genomic duplication instance.""" + return GenomicDuplication() + + def fixture_name(self): + """Return the fixture name for genomic duplication.""" + 
return 'genomic_duplication' diff --git a/tests/translators/test_genomic_insertion.py b/tests/translators/test_genomic_insertion.py index 06056687..c0157430 100644 --- a/tests/translators/test_genomic_insertion.py +++ b/tests/translators/test_genomic_insertion.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_genomic_silent_mutation.py b/tests/translators/test_genomic_silent_mutation.py index c09380d1..ee4c5c93 100644 --- a/tests/translators/test_genomic_silent_mutation.py +++ b/tests/translators/test_genomic_silent_mutation.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GENOMICSM_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_genomic_substitution.py b/tests/translators/test_genomic_substitution.py index a745ef22..e6d71e4e 100644 --- a/tests/translators/test_genomic_substitution.py +++ b/tests/translators/test_genomic_substitution.py @@ -27,12 +27,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GSUB_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + 
GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_genomic_uncertain_deletion.py b/tests/translators/test_genomic_uncertain_deletion.py index fe69d879..93905811 100644 --- a/tests/translators/test_genomic_uncertain_deletion.py +++ b/tests/translators/test_genomic_uncertain_deletion.py @@ -28,12 +28,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GUD_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def translator_instance(self): diff --git a/tests/translators/test_polypeptide_truncation.py b/tests/translators/test_polypeptide_truncation.py index be8b6269..cba3cd9b 100644 --- a/tests/translators/test_polypeptide_truncation.py +++ b/tests/translators/test_polypeptide_truncation.py @@ -28,12 +28,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return PT_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def translator_instance(self): """Return polypeptide truncation instance.""" diff --git a/tests/translators/test_silent_mutation.py b/tests/translators/test_silent_mutation.py index 648d5e7b..503c9876 100644 --- a/tests/translators/test_silent_mutation.py +++ b/tests/translators/test_silent_mutation.py @@ -28,12 +28,13 @@ def 
validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return SM_V( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def translator_instance(self): """Return silent mutation instance.""" diff --git a/tests/translators/translator_base.py b/tests/translators/translator_base.py index f509692c..2b4d8c25 100644 --- a/tests/translators/translator_base.py +++ b/tests/translators/translator_base.py @@ -52,13 +52,15 @@ def test_translator(self): found = list() for vr in validation_results: if vr.is_valid: - variation = (self.translator.translate(vr)).__dict__ + variation = self.translator.translate(vr) if variation['type'] == 'Allele': - variation['location'] = variation['location'].dict() - del variation['location']['id'] + variation['location'] = variation['location'] + if 'id' in variation['location'].keys(): + del variation['location']['id'] elif variation['type'] == 'CopyNumber': - variation['subject'] = variation['subject'].dict() - del variation['subject']['location']['id'] + variation['subject'] = variation['subject'] + if 'id' in variation['subject']['location'].keys(): + del variation['subject']['location']['id'] if variation not in found: found.append(variation) diff --git a/tests/validators/test_amino_acid_deletion.py b/tests/validators/test_amino_acid_deletion.py index 1692e86f..269f7d00 100644 --- a/tests/validators/test_amino_acid_deletion.py +++ b/tests/validators/test_amino_acid_deletion.py @@ -23,12 +23,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AminoAcidDeletion( seqrepo_access, 
transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def classifier_instance(self): """Return the amino acid deletion classifier instance.""" diff --git a/tests/validators/test_amino_acid_delins.py b/tests/validators/test_amino_acid_delins.py index 77141bfb..cf10defe 100644 --- a/tests/validators/test_amino_acid_delins.py +++ b/tests/validators/test_amino_acid_delins.py @@ -23,12 +23,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AminoAcidDelIns( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def classifier_instance(self): """Return the amino acid delins classifier instance.""" diff --git a/tests/validators/test_amino_acid_insertion.py b/tests/validators/test_amino_acid_insertion.py index b4e608a1..d677da4f 100644 --- a/tests/validators/test_amino_acid_insertion.py +++ b/tests/validators/test_amino_acid_insertion.py @@ -23,12 +23,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AminoAcidInsertion( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def classifier_instance(self): """Return the amino acid insertion classifier instance.""" diff --git 
a/tests/validators/test_amino_acid_substitution.py b/tests/validators/test_amino_acid_substitution.py index d7982b2e..0ab3784a 100644 --- a/tests/validators/test_amino_acid_substitution.py +++ b/tests/validators/test_amino_acid_substitution.py @@ -23,12 +23,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return AminoAcidSubstitution( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def classifier_instance(self): """Return the protein substitution classifier instance.""" diff --git a/tests/validators/test_coding_dna_deletion.py b/tests/validators/test_coding_dna_deletion.py index 68d5488f..5e2f36de 100644 --- a/tests/validators/test_coding_dna_deletion.py +++ b/tests/validators/test_coding_dna_deletion.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CodingDNADeletion( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_coding_dna_delins.py b/tests/validators/test_coding_dna_delins.py index a04d4e44..d8cba570 100644 --- a/tests/validators/test_coding_dna_delins.py +++ b/tests/validators/test_coding_dna_delins.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CodingDNADelIns( seqrepo_access, 
transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_coding_dna_insertion.py b/tests/validators/test_coding_dna_insertion.py index 60a1a4e7..5e3ba404 100644 --- a/tests/validators/test_coding_dna_insertion.py +++ b/tests/validators/test_coding_dna_insertion.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CodingDNAInsertion( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_coding_dna_silent_mutation.py b/tests/validators/test_coding_dna_silent_mutation.py index 8916172f..3680b2fb 100644 --- a/tests/validators/test_coding_dna_silent_mutation.py +++ b/tests/validators/test_coding_dna_silent_mutation.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CodingDNASilentMutation( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_coding_dna_subsitution.py b/tests/validators/test_coding_dna_subsitution.py index 6a3741b9..6216f47e 100644 --- a/tests/validators/test_coding_dna_subsitution.py +++ b/tests/validators/test_coding_dna_subsitution.py @@ -22,12 +22,13 @@ def 
validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return CodingDNASubstitution( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_genomic_deletion.py b/tests/validators/test_genomic_deletion.py index d7a37f44..3cb950f2 100644 --- a/tests/validators/test_genomic_deletion.py +++ b/tests/validators/test_genomic_deletion.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GenomicDeletion( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_genomic_delins.py b/tests/validators/test_genomic_delins.py index 797f4ab9..63cfbbb2 100644 --- a/tests/validators/test_genomic_delins.py +++ b/tests/validators/test_genomic_delins.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GenomicDelIns( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_genomic_duplication.py b/tests/validators/test_genomic_duplication.py new file mode 100644 index 
00000000..26a70050 --- /dev/null +++ b/tests/validators/test_genomic_duplication.py @@ -0,0 +1,40 @@ +"""Module for testing Genomic Duplication Validator.""" +import unittest +from variation.validators import GenomicDuplication +from variation.classifiers import GenomicDuplicationClassifier +from .validator_base import ValidatorBase +from variation.tokenizers import GeneSymbol +from variation.data_sources import TranscriptMappings, SeqRepoAccess, \ + MANETranscriptMappings, UTA +from variation.mane_transcript import MANETranscript +from ga4gh.vrs.dataproxy import SeqRepoDataProxy +from ga4gh.vrs.extras.translator import Translator +from gene.query import QueryHandler as GeneQueryHandler + + +class TestGenomicDuplicationValidator(ValidatorBase, unittest.TestCase): + """A class to test the Genomic Duplication Validator.""" + + def validator_instance(self): + """Return genomic duplication instance.""" + seqrepo_access = SeqRepoAccess() + transcript_mappings = TranscriptMappings() + uta = UTA() + dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) + tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() + return GenomicDuplication( + seqrepo_access, transcript_mappings, + GeneSymbol(gene_normalizer), + MANETranscript(seqrepo_access, transcript_mappings, + MANETranscriptMappings(), uta), + uta, dp, tlr, gene_normalizer + ) + + def classifier_instance(self): + """Return the genomic duplication classifier instance.""" + return GenomicDuplicationClassifier() + + def fixture_name(self): + """Return the fixture name for genomic duplication.""" + return 'genomic_duplication' diff --git a/tests/validators/test_genomic_insertion.py b/tests/validators/test_genomic_insertion.py index cc2c456a..3a8d4606 100644 --- a/tests/validators/test_genomic_insertion.py +++ b/tests/validators/test_genomic_insertion.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + 
gene_normalizer = GeneQueryHandler() return GenomicInsertion( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_genomic_silent_mutation.py b/tests/validators/test_genomic_silent_mutation.py index 6b8372d0..44209aba 100644 --- a/tests/validators/test_genomic_silent_mutation.py +++ b/tests/validators/test_genomic_silent_mutation.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GenomicSilentMutation( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_genomic_substitution.py b/tests/validators/test_genomic_substitution.py index f4242494..bf9fcbe8 100644 --- a/tests/validators/test_genomic_substitution.py +++ b/tests/validators/test_genomic_substitution.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GenomicSubstitution( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_genomic_uncertain_deletion.py b/tests/validators/test_genomic_uncertain_deletion.py index c8a978a2..9f9cd815 100644 --- a/tests/validators/test_genomic_uncertain_deletion.py 
+++ b/tests/validators/test_genomic_uncertain_deletion.py @@ -22,12 +22,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return GenomicUncertainDeletion( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) def classifier_instance(self): diff --git a/tests/validators/test_polypeptide_truncation.py b/tests/validators/test_polypeptide_truncation.py index 61acb66d..0e898a29 100644 --- a/tests/validators/test_polypeptide_truncation.py +++ b/tests/validators/test_polypeptide_truncation.py @@ -23,12 +23,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return PolypeptideTruncation( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def classifier_instance(self): """Return the Polypeptide Truncation classifier instance.""" diff --git a/tests/validators/test_silent_mutation.py b/tests/validators/test_silent_mutation.py index 7de22ad5..71b37a85 100644 --- a/tests/validators/test_silent_mutation.py +++ b/tests/validators/test_silent_mutation.py @@ -23,12 +23,13 @@ def validator_instance(self): uta = UTA() dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client) tlr = Translator(data_proxy=dp) + gene_normalizer = GeneQueryHandler() return SilentMutation( seqrepo_access, transcript_mappings, - GeneSymbol(GeneQueryHandler()), + GeneSymbol(gene_normalizer), MANETranscript(seqrepo_access, transcript_mappings, MANETranscriptMappings(), uta), - 
uta, dp, tlr, AminoAcidCache()) + uta, dp, tlr, gene_normalizer, AminoAcidCache()) def classifier_instance(self): """Return the Silent Mutation classifier instance.""" diff --git a/tests/validators/validator_base.py b/tests/validators/validator_base.py index e37ebf4d..78e1c923 100644 --- a/tests/validators/validator_base.py +++ b/tests/validators/validator_base.py @@ -40,7 +40,8 @@ def test_matches(self): tokens = self.tokenizer.perform(x['query'], []) classification = self.classifier.match(tokens) validation_results = self.validator.validate( - classification, normalize_endpoint=True + classification, normalize_endpoint=True, + hgvs_dup_del_mode="default" ) is_valid = False for vr in validation_results: diff --git a/variation/classifiers/__init__.py b/variation/classifiers/__init__.py index b57dff0b..36962600 100644 --- a/variation/classifiers/__init__.py +++ b/variation/classifiers/__init__.py @@ -26,4 +26,6 @@ from .coding_dna_insertion_classifier import CodingDNAInsertionClassifier # noqa: F401, E501 from .genomic_insertion_classifier import GenomicInsertionClassifier # noqa: F401, E501 from .genomic_uncertain_deletion_classifier import GenomicUncertainDeletionClassifier # noqa: F401, E501 +from .genomic_duplication_classifier import GenomicDuplicationClassifier # noqa: F401, E501 +from .genomic_deletion_range_classifier import GenomicDeletionRangeClassifier # noqa: F401, E501 from .classify import Classify # noqa: F401 diff --git a/variation/classifiers/classify.py b/variation/classifiers/classify.py index 68dc85f8..6cd5fe7e 100644 --- a/variation/classifiers/classify.py +++ b/variation/classifiers/classify.py @@ -15,7 +15,8 @@ AminoAcidDeletionClassifier, CodingDNADeletionClassifier, \ GenomicDeletionClassifier, AminoAcidInsertionClassifier, \ CodingDNAInsertionClassifier, GenomicInsertionClassifier, \ - GenomicUncertainDeletionClassifier, Classifier + GenomicUncertainDeletionClassifier, GenomicDuplicationClassifier, \ + GenomicDeletionRangeClassifier, 
Classifier class Classify: @@ -48,7 +49,9 @@ def __init__(self) -> None: AminoAcidInsertionClassifier(), CodingDNAInsertionClassifier(), GenomicInsertionClassifier(), - GenomicUncertainDeletionClassifier() + GenomicUncertainDeletionClassifier(), + GenomicDuplicationClassifier(), + GenomicDeletionRangeClassifier() ] def perform(self, tokens: List[Token]) -> List[Classification]: diff --git a/variation/classifiers/genomic_deletion_range_classifier.py b/variation/classifiers/genomic_deletion_range_classifier.py new file mode 100644 index 00000000..44930b51 --- /dev/null +++ b/variation/classifiers/genomic_deletion_range_classifier.py @@ -0,0 +1,22 @@ +"""A module for the Genomic Deletion Range Classifier.""" +from typing import List +from .set_based_classifier import SetBasedClassifier +from variation.schemas.classification_response_schema import ClassificationType + + +class GenomicDeletionRangeClassifier(SetBasedClassifier): + """The Genomic Deletion Range Classifier class.""" + + def classification_type(self) -> ClassificationType: + """Return the Genomic Deletion Range classification type.""" + return ClassificationType.GENOMIC_DELETION_RANGE + + def exact_match_candidates(self) -> List[List[str]]: + """Return the exact match token type candidates.""" + return [ + ['GeneSymbol', 'GenomicDeletionRange'], + ['GenomicDeletionRange', 'GeneSymbol'], + ['HGVS', 'GenomicDeletionRange'], + ['ReferenceSequence', 'GenomicDeletionRange'], + ['LocusReferenceGenomic', 'GenomicDeletionRange'] + ] diff --git a/variation/classifiers/genomic_duplication_classifier.py b/variation/classifiers/genomic_duplication_classifier.py new file mode 100644 index 00000000..843ecaba --- /dev/null +++ b/variation/classifiers/genomic_duplication_classifier.py @@ -0,0 +1,27 @@ +"""A module for the Genomic Duplication Classifier.""" +from typing import List +from .set_based_classifier import SetBasedClassifier +from variation.schemas.classification_response_schema import ClassificationType + + 
+class GenomicDuplicationClassifier(SetBasedClassifier): + """The Genomic Duplication Classifier class.""" + + def classification_type(self) -> ClassificationType: + """Return the Genomic Duplication classification type.""" + return ClassificationType.GENOMIC_DUPLICATION + + def exact_match_candidates(self) -> List[List[str]]: + """Return the exact match token type candidates.""" + return [ + ['GenomicDuplication', 'GeneSymbol'], + ['GeneSymbol', 'GenomicDuplication'], + ['HGVS', 'GenomicDuplication'], + ['ReferenceSequence', 'GenomicDuplication'], + ['LocusReferenceGenomic', 'GenomicDuplication'], + ['GenomicDuplicationRange', 'GeneSymbol'], + ['GeneSymbol', 'GenomicDuplicationRange'], + ['HGVS', 'GenomicDuplicationRange'], + ['ReferenceSequence', 'GenomicDuplicationRange'], + ['LocusReferenceGenomic', 'GenomicDuplicationRange'] + ] diff --git a/variation/data_sources/seq_repo_access.py b/variation/data_sources/seq_repo_access.py index d67870ea..a8180edf 100644 --- a/variation/data_sources/seq_repo_access.py +++ b/variation/data_sources/seq_repo_access.py @@ -74,3 +74,13 @@ def aliases(self, input_str) -> List[str]: except KeyError: logger.warning(f"SeqRepo could not translate alias: {input_str}") return [] + + def ac_to_chromosome(self, ac: str) -> Optional[str]: + """Get chromosome for accession. + + :param str ac: Accession + :return: Chromosome + """ + aliases = self.aliases(ac) + return ([a.split(':')[-1] for a in aliases + if a.startswith('GRCh') and '.' 
not in a and 'chr' not in a] or [None])[0] # noqa: E501 diff --git a/variation/data_sources/uta.py b/variation/data_sources/uta.py index 7e7d4a2c..795fa2d6 100644 --- a/variation/data_sources/uta.py +++ b/variation/data_sources/uta.py @@ -36,6 +36,7 @@ def __init__(self, db_url=UTA_DB_URL, db_pwd=None) -> None: self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) self._create_genomic_table() self.liftover = LiftOver('hg19', 'hg38') + self.liftover_to_37 = LiftOver('hg38', 'hg19') def _update_db_url(self, db_pwd, db_url) -> Optional[str]: """Return new db_url containing password. @@ -373,6 +374,13 @@ def get_mane_c_genomic_data(self, ac, alt_ac, start_pos, end_pos): data['alt_pos_range'][0] + data['alt_pos_change'][0], data['alt_pos_range'][1] - data['alt_pos_change'][1] ) + + if data['strand'] == '-': + data['alt_pos_change_range'] = ( + data['alt_pos_range'][1] - data['alt_pos_change'][0], + data['alt_pos_range'][0] + data['alt_pos_change'][1] + ) + return data def get_genomic_tx_data(self, ac, pos) -> Optional[Dict]: @@ -402,6 +410,13 @@ def get_genomic_tx_data(self, ac, pos) -> Optional[Dict]: data['alt_pos_range'][0] + data['pos_change'][0], data['alt_pos_range'][1] - data['pos_change'][1] ) + + if data['strand'] == '-': + data['alt_pos_change_range'] = ( + data['alt_pos_range'][1] - data['pos_change'][0], + data['alt_pos_range'][0] + data['pos_change'][1] + ) + return data def get_ac_from_gene(self, gene) -> Optional[List[str]]: diff --git a/variation/hgvs_dup_del_mode.py b/variation/hgvs_dup_del_mode.py new file mode 100644 index 00000000..0c75a4c7 --- /dev/null +++ b/variation/hgvs_dup_del_mode.py @@ -0,0 +1,208 @@ +"""Module for hgvs_dup_del_mode in normalize endpoint.""" +import logging +from typing import Optional, Dict, Tuple, List +from variation.data_sources.seq_repo_access import SeqRepoAccess +from ga4gh.vrs import models +from ga4gh.core import ga4gh_identify +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as 
HGVSDupDelModeEnum + +logger = logging.getLogger('variation') +logger.setLevel(logging.DEBUG) + + +class HGVSDupDelMode: + """Class for handling how to interpret HGVS duplications and deletions.""" + + def __init__(self, seqrepo_access: SeqRepoAccess) -> None: + """Initialize HGVS Dup Del Mode. + + :param SeqRepoAccess seqrepo_access: Access to seqrepo + """ + self.seqrepo_access = seqrepo_access + self.valid_modes = [mode.value for mode in + HGVSDupDelModeEnum.__members__.values()] + + def is_valid_mode(self, mode: str) -> bool: + """Determine if mode is a valid input. + + :param str mode: Entered mode + :return: `True` if valid mode. `False` otherwise. + """ + hgvs_dup_del_mode = mode.strip().lower() + return hgvs_dup_del_mode in self.valid_modes + + def default_mode(self, ac: str, alt_type: str, pos: Tuple[int, int], + del_or_dup: str, location: Dict, + chromosome: str = None, + allele: Dict = None) -> Optional[Dict]: + """Use default characteristics to return a variation. + If endpoints are ambiguous: cnv + handling X chromosome, make cnv a definite range with base 1-2 + handling Y chromosome, base of 1 + handling anything else, base of 2 + elif len del or dup > 100bp: + repeated_seq_expr with a derived_seq_expr subject + else: + literal_seq_expr (normalized LiteralSequenceExpression Allele) + + :param str ac: Accession + :param str alt_type: Alteration type + :param tuple pos: start_pos, end_pos + :param str del_or_dup: Must be either `del` or `dup` + :param dict location: Sequence Location object + :param str chromosome: Chromosome + :param dict allele: VRS Allele object represented as a dict + :return: VRS Variation object represented as a dict + """ + if 'uncertain' in alt_type or 'range' in alt_type: + variation = self.cnv_mode(ac, del_or_dup, + location, chromosome=chromosome) + elif pos and (pos[1] - pos[0] > 100): + variation = self.repeated_seq_expr_mode(alt_type, location) + else: + variation = self.literal_seq_expr_mode(allele, alt_type) + return 
variation + + def cnv_mode(self, ac: str, del_or_dup: str, location: Dict, + chromosome: str = None) -> Optional[Dict]: + """Return a VRS Copy Number Variation. + + :param str ac: Accession + :param str del_or_dup: Must be either `del` or `dup` + :param dict location: VRS SequenceLocation + :param str chromosome: Chromosome + :return: VRS Copy Number object represented as a dict + """ + if chromosome is None: + chromosome = self.seqrepo_access.ac_to_chromosome(ac) + + if chromosome is None: + logger.warning(f"Unable to find chromosome on {ac}") + return None + + if chromosome == 'X': + copies = models.DefiniteRange( + min=0 if del_or_dup == 'del' else 2, + max=1 if del_or_dup == 'del' else 3 + ) + elif chromosome == 'Y': + copies = models.Number( + value=0 if del_or_dup == 'del' else 2 + ) + else: + # Chr 1-22 + copies = models.Number( + value=1 if del_or_dup == 'del' else 3 + ) + + variation = models.CopyNumber( + subject=models.DerivedSequenceExpression( + location=location, + reverse_complement=False + ), + copies=copies + ) + return self._ga4gh_identify_variation(variation) + + def repeated_seq_expr_mode(self, alt_type: str, + location: Dict) -> Optional[Dict]: + """Return a VRS Allele with a RepeatedSequenceExpression. + The RepeatedSequenceExpression subject will be a + DerivedSequenceExpression. 
+ + :param str alt_type: Alteration type + :param dict location: VRS SequenceLocation + :return: VRS Allele object represented as a dict + """ + if 'range' in alt_type: + # Ranges should return an error + return None + + if alt_type == 'duplication': + count = models.Number(value=2) + elif alt_type == 'deletion': + count = models.Number(value=0) + else: + return None + + seq_expr = models.RepeatedSequenceExpression( + seq_expr=models.DerivedSequenceExpression( + location=location, + reverse_complement=False + ), + count=count + ) + + variation = models.Allele( + location=location, + state=seq_expr + ) + return self._ga4gh_identify_variation(variation) + + def literal_seq_expr_mode(self, allele: Dict, + alt_type: str) -> Optional[Dict]: + """Return a VRS Allele with a normalized LiteralSequenceExpression. + + :param dict allele: normalized VRS Allele object represented as a dict + :param str alt_type: Alteration type + :return: VRS Allele object represented as a dict + """ + if 'range' in alt_type or 'uncertain' in alt_type: + return None + + variation = models.Allele(**allele) if allele else None + return self._ga4gh_identify_variation(variation) + + def _ga4gh_identify_variation(self, + variation: models.Variation) -> Optional[Dict]: # noqa: E501 + """Return variation with GA4GH digest-based id. 
+ + :param models.Variation variation: VRS variation object + :return: VRS Variation with GA4GH digest-based id represented as a dict + """ + if variation is None: + return None + else: + variation._id = ga4gh_identify(variation) + return variation.as_dict() + + def interpret_variation(self, ac: str, alt_type: str, allele: Dict, + errors: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum, + pos: Optional[Tuple[int, int]] = None) -> Dict: + """Interpret variation using HGVSDupDelMode + + :param str ac: Accession + :param str alt_type: Alteration type + :param dict allele: VRS Allele object + :param List errors: List of errors + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Mode to use for + interpreting HGVS duplications and deletions + :param Optional[Tuple[int, int]] pos: Position changes + :return: VRS Variation object + """ + if 'deletion' in alt_type: + del_or_dup = 'del' + else: + del_or_dup = 'dup' + variation = None + if allele is None: + errors.append("Unable to get Allele") + else: + if hgvs_dup_del_mode == HGVSDupDelModeEnum.DEFAULT: + variation = self.default_mode( + ac, alt_type, pos, del_or_dup, + allele['location'], allele=allele + ) + elif hgvs_dup_del_mode == HGVSDupDelModeEnum.CNV: + variation = self.cnv_mode(ac, del_or_dup, allele['location']) + elif hgvs_dup_del_mode == HGVSDupDelModeEnum.REPEATED_SEQ_EXPR: + variation = self.repeated_seq_expr_mode( + alt_type, allele['location'] + ) + elif hgvs_dup_del_mode == HGVSDupDelModeEnum.LITERAL_SEQ_EXPR: + variation = self.literal_seq_expr_mode(allele, alt_type) + if not variation: + errors.append("Unable to get VRS Variation") + return variation diff --git a/variation/main.py b/variation/main.py index 643629ac..dd467611 100644 --- a/variation/main.py +++ b/variation/main.py @@ -6,6 +6,9 @@ from datetime import datetime import html from variation.query import QueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum + app = FastAPI(docs_url='/variation', 
openapi_url='/variation/openapi.json') query_handler = QueryHandler() @@ -71,6 +74,10 @@ def to_vrs(q: str = Query(..., description=q_description)): normalize_description = \ 'Return VRSATILE compatible object for variation provided.' q_description = 'Variation to normalize.' +hgvs_dup_del_mode_decsr = ('Must be one of: `default`, `cnv`, ' + '`repeated_seq_expr`, `literal_seq_expr`. This' + ' parameter determines how to interpret HGVS ' + 'dup/del expressions in VRS.') @app.get('/variation/normalize', @@ -80,14 +87,22 @@ def to_vrs(q: str = Query(..., description=q_description)): description=normalize_description, response_model_exclude_none=True ) -def normalize(q: str = Query(..., description=q_description)): +def normalize(q: str = Query(..., description=q_description), + hgvs_dup_del_mode: HGVSDupDelModeEnum = Query( + HGVSDupDelModeEnum.DEFAULT, + description=hgvs_dup_del_mode_decsr)): """Return Value Object Descriptor for variation. - :param q: Variation to normalize + :param str q: Variation to normalize + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to interpret HGVS dup/del expressions + in VRS. 
:return: NormalizeService for variation """ normalize_resp = \ - query_handler.normalize(html.unescape(q)) + query_handler.normalize(html.unescape(q), + hgvs_dup_del_mode=hgvs_dup_del_mode) warnings = query_handler.normalize_handler.warnings if \ query_handler.normalize_handler.warnings else None diff --git a/variation/mane_transcript.py b/variation/mane_transcript.py index 20bb3074..b21dc3a3 100644 --- a/variation/mane_transcript.py +++ b/variation/mane_transcript.py @@ -217,8 +217,13 @@ def _g_to_mane_c(self, g, mane_data) -> Optional[Dict]: return None coding_start_site = cds_start_end[0] - g_pos = g['alt_pos_change_range'][0], g['alt_pos_change_range'][1] - g_pos_change = g_pos[0] - result[5], result[6] - g_pos[1] + g_pos = g['alt_pos_change_range'] # start/end genomic change + mane_g_pos = result[5], result[6] # alt_start_i, alt_end_i + g_pos_change = g_pos[0] - mane_g_pos[0], mane_g_pos[1] - g_pos[1] + if mane_data["chr_strand"] == "-": + g_pos_change = ( + mane_g_pos[1] - g_pos[0], g_pos[1] - mane_g_pos[0] + ) mane_tx_pos_range = result[2], result[3] mane_c_pos_change = ( @@ -320,10 +325,13 @@ def _validate_index(self, ac, pos, coding_start_site): """ start_pos = pos[0] + coding_start_site end_pos = pos[1] + coding_start_site - if self.seqrepo_access.get_sequence(ac, start_pos, end_pos): - return True + if start_pos > end_pos: + if self.seqrepo_access.get_sequence(ac, end_pos, start_pos): + return True else: - return None + if self.seqrepo_access.get_sequence(ac, start_pos, end_pos): + return True + return False def get_longest_compatible_transcript(self, gene, start_pos, end_pos, start_annotation_layer, @@ -454,7 +462,7 @@ def get_mane_transcript(self, ac, start_pos, end_pos, g = self._c_to_g(c_ac, c_pos) if g is None: return None - # Go from g -> mane transcript + # Get mane data for gene mane_data = \ self.mane_transcript_mappings.get_gene_mane_data(g['gene']) if not mane_data: @@ -614,12 +622,14 @@ def g_to_mane_c(self, ac, start_pos, end_pos, 
gene=None): mane_tx_genomic_data = None if grch38: # GRCh38 -> MANE C + g_pos = grch38['pos'] mane_tx_genomic_data = self.uta.get_mane_c_genomic_data( mane_c_ac, None, grch38['pos'][0], grch38['pos'][1] ) if not grch38 or not mane_tx_genomic_data: - # GRCh38 did not work, so let's try original assembly + # GRCh38 did not work, so let's try original assembly (37) + g_pos = start_pos, end_pos mane_tx_genomic_data = self.uta.get_mane_c_genomic_data( mane_c_ac, ac, start_pos, end_pos ) @@ -629,22 +639,25 @@ def g_to_mane_c(self, ac, start_pos, end_pos, gene=None): logger.info("Not using most recent assembly") tx_pos_range = mane_tx_genomic_data['tx_pos_range'] - alt_pos_change = mane_tx_genomic_data['alt_pos_change'] + mane_g_pos = mane_tx_genomic_data['alt_pos_range'] + g_pos_change = g_pos[0] - mane_g_pos[0], mane_g_pos[1] - g_pos[1] coding_start_site = mane_tx_genomic_data['coding_start_site'] coding_end_site = mane_tx_genomic_data['coding_end_site'] if mane_tx_genomic_data['strand'] == '-': - alt_pos_change = (alt_pos_change[1] + 1, alt_pos_change[0] - 1) + g_pos_change = ( + mane_g_pos[1] - g_pos[0] + 1, g_pos[1] - mane_g_pos[0] - 1 + ) mane_c_pos_change = ( - tx_pos_range[0] + alt_pos_change[0] - coding_start_site, - tx_pos_range[1] - alt_pos_change[1] - coding_start_site + tx_pos_range[0] + g_pos_change[0] - coding_start_site, + tx_pos_range[1] - g_pos_change[1] - coding_start_site ) if not self._validate_index(mane_c_ac, mane_c_pos_change, coding_start_site): logger.warning(f"{mane_c_pos_change} are not valid positions" - f" on {mane_c_ac}with coding start site " + f" on {mane_c_ac} with coding start site " f"{coding_start_site}") return None diff --git a/variation/normalize.py b/variation/normalize.py index 5e90d524..70e3000d 100644 --- a/variation/normalize.py +++ b/variation/normalize.py @@ -1,11 +1,16 @@ """Module for Variation Normalization.""" -from typing import Optional, List, Tuple -from ga4gh.vrsatile.pydantic.vrsatile_model import VariationDescriptor 
-from ga4gh.vrsatile.pydantic.vrs_model import Text +from typing import Optional, List, Tuple, Dict +from ga4gh.vrsatile.pydantic.vrs_models import Text +from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor, \ + GeneDescriptor +from ga4gh.vrs import models +from ga4gh.core import ga4gh_identify from variation.data_sources import SeqRepoAccess, UTA from urllib.parse import quote from variation import logger from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.token_response_schema import GeneMatchToken, Token +from variation.schemas.validation_response_schema import ValidationSummary class Normalize: @@ -25,12 +30,13 @@ def __init__(self, seqrepo_access: SeqRepoAccess, uta: UTA, self._gene_norm_cache = dict() self.gene_normalizer = gene_normalizer - def normalize(self, q, validations, warnings): + def normalize(self, q: str, validations: ValidationSummary, + warnings: List) -> Optional[VariationDescriptor]: """Normalize a given variation. :param str q: The variation to normalize :param ValidationSummary validations: Invalid and valid results - :param list warnings: List of warnings + :param List warnings: List of warnings :return: An variation descriptor for a valid result if one exists. Else, None. 
""" @@ -53,19 +59,23 @@ def normalize(self, q, validations, warnings): variation = valid_result.variation - variation_id = variation.pop('_id') + variation_id = variation['_id'] identifier = valid_result.identifier + token_type = \ + valid_result.classification_token.token_type.lower() - if variation['type'] == 'Allele': - vrs_ref_allele_seq = self.get_ref_allele_seq( - variation, identifier - ) - elif variation['type'] == 'CopyNumber': - vrs_ref_allele_seq = self.get_ref_allele_seq( - variation['subject'], identifier - ) - else: - vrs_ref_allele_seq = None + vrs_ref_allele_seq = None + if 'uncertain' in token_type: + warnings = ['Ambiguous regions cannot be normalized'] + elif 'range' not in token_type: + if variation['type'] == 'Allele': + vrs_ref_allele_seq = self.get_ref_allele_seq( + variation, identifier + ) + elif variation['type'] == 'CopyNumber': + vrs_ref_allele_seq = self.get_ref_allele_seq( + variation['subject'], identifier + ) if valid_result.gene_tokens: gene_token = valid_result.gene_tokens[0] @@ -73,16 +83,13 @@ def normalize(self, q, validations, warnings): else: gene_context = None - if 'Uncertain' in valid_result.classification_token.token_type: - warnings = ['Ambiguous regions cannot be normalized'] - resp = VariationDescriptor( id=_id, variation_id=variation_id, variation=variation, molecule_context=valid_result.classification_token.molecule_context, # noqa: E501 structural_type=valid_result.classification_token.so_id, - vrs_ref_allele_seq=vrs_ref_allele_seq, + vrs_ref_allele_seq=vrs_ref_allele_seq if vrs_ref_allele_seq else None, # noqa: E501 gene_context=gene_context ) else: @@ -90,9 +97,11 @@ def normalize(self, q, validations, warnings): resp, warnings = self._no_variation_entered() else: warning = f"Unable to normalize {q}" + text = models.Text(definition=q) + text._id = ga4gh_identify(text) resp = VariationDescriptor( id=_id, - variation=Text(definition=q) + variation=Text(**text.as_dict()) ) if not warnings: warnings.append(warning) @@ 
-109,7 +118,8 @@ def _no_variation_entered(self) -> Tuple[None, List[str]]: logger.warning(warnings) return None, warnings - def get_gene_descriptor(self, gene_token): + def get_gene_descriptor( + self, gene_token: GeneMatchToken) -> Optional[GeneDescriptor]: """Return a GA4GH Gene Descriptor using Gene Normalization. :param GeneMatchToken gene_token: A gene token @@ -127,10 +137,11 @@ def get_gene_descriptor(self, gene_token): return gene_descriptor return None - def get_ref_allele_seq(self, allele, identifier) -> Optional[str]: + def get_ref_allele_seq(self, allele: Dict, + identifier: str) -> Optional[str]: """Return ref allele seq for transcript. - :param dict allele: VRS Allele object + :param Dict allele: VRS Allele object :param str identifier: Identifier for allele :return: Ref seq allele """ @@ -138,11 +149,7 @@ def get_ref_allele_seq(self, allele, identifier) -> Optional[str]: end = None interval = allele['location']['interval'] ival_type = interval['type'] - if ival_type == 'SimpleInterval': - if interval['start'] != interval['end']: - start = interval['start'] + 1 - end = interval['end'] - elif ival_type == 'SequenceInterval': + if ival_type == 'SequenceInterval': if interval['start']['type'] == 'Number': start = interval['start']['value'] + 1 end = interval['end']['value'] @@ -152,10 +159,11 @@ def get_ref_allele_seq(self, allele, identifier) -> Optional[str]: return self.seqrepo_access.get_sequence(identifier, start, end) - def _is_token_type(self, valid_result_tokens, token_type) -> bool: + def _is_token_type(self, valid_result_tokens: List, + token_type: str) -> bool: """Return whether or not token_type is in valid_result_tokens. 
- :param list valid_result_tokens: Valid token matches + :param List valid_result_tokens: Valid token matches :param str token_type: The token's type :return: Whether or not token_type is in valid_result_tokens """ @@ -164,10 +172,11 @@ def _is_token_type(self, valid_result_tokens, token_type) -> bool: return True return False - def _get_instance_type_token(self, valid_result_tokens, instance_type): + def _get_instance_type_token(self, valid_result_tokens: List, + instance_type: Token) -> Optional[Token]: """Return the tokens for a given instance type. - :param list valid_result_tokens: A list of valid tokens for the input + :param List valid_result_tokens: A list of valid tokens for the input string :param Token instance_type: The instance type to check :return: Token for a given instance type diff --git a/variation/query.py b/variation/query.py index 88059ed8..677e62b5 100644 --- a/variation/query.py +++ b/variation/query.py @@ -3,6 +3,8 @@ from gene.query import QueryHandler as GeneQueryHandler from ga4gh.vrs.dataproxy import SeqRepoDataProxy from ga4gh.vrs.extras.translator import Translator +from ga4gh.core import ga4gh_identify +from ga4gh.vrs import models from variation import SEQREPO_DATA_PATH, TRANSCRIPT_MAPPINGS_PATH, \ REFSEQ_GENE_SYMBOL_PATH, AMINO_ACID_PATH, UTA_DB_URL, REFSEQ_MANE_PATH from variation.to_vrs import ToVRS @@ -14,11 +16,14 @@ from variation.data_sources import SeqRepoAccess, TranscriptMappings, \ UTA, MANETranscriptMappings from variation.mane_transcript import MANETranscript +from variation.hgvs_dup_del_mode import HGVSDupDelMode from variation.tokenizers import GeneSymbol from variation.tokenizers.caches import AminoAcidCache -from ga4gh.vrsatile.pydantic.vrs_model import Text, Allele, CopyNumber, \ +from ga4gh.vrsatile.pydantic.vrs_models import Text, Allele, CopyNumber, \ Haplotype, VariationSet -from ga4gh.vrsatile.pydantic.vrsatile_model import VariationDescriptor +from ga4gh.vrsatile.pydantic.vrsatile_models import 
VariationDescriptor +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum class QueryHandler: @@ -27,15 +32,15 @@ class QueryHandler: def __init__(self, dynamodb_url: str = '', dynamodb_region: str = 'us-east-2', - seqrepo_data_path=SEQREPO_DATA_PATH, - uta_db_url=UTA_DB_URL, - uta_db_pwd=None) -> None: + seqrepo_data_path: str = SEQREPO_DATA_PATH, + uta_db_url: str = UTA_DB_URL, + uta_db_pwd: Optional[str] = None) -> None: """Initialize QueryHandler instance. :param str dynamodb_url: URL to gene-normalizer database source. :param str dynamodb_region: AWS default region for gene-normalizer. :param str seqrepo_data_path: Path to seqrepo data directory :param str uta_db_url: URL for UTA database - :param str uta_db_pwd: Password for UTA database user + :param Optional[str] uta_db_pwd: Password for UTA database user """ self.gene_normalizer = GeneQueryHandler(db_url=dynamodb_url, db_region=dynamodb_region) @@ -43,15 +48,18 @@ def __init__(self, seqrepo_data_path=seqrepo_data_path ) self.uta = UTA(db_url=uta_db_url, db_pwd=uta_db_pwd) + self.dp = SeqRepoDataProxy(self.seqrepo_access.seq_repo_client) + self.hgvs_dup_del_mode = HGVSDupDelMode(self.seqrepo_access) self.to_vrs_handler = self._init_to_vrs() self.normalize_handler = Normalize( self.seqrepo_access, self.uta, self.gene_normalizer ) - def _init_to_vrs(self, transcript_file_path=TRANSCRIPT_MAPPINGS_PATH, - refseq_file_path=REFSEQ_GENE_SYMBOL_PATH, - amino_acids_file_path=AMINO_ACID_PATH, - mane_data_path=REFSEQ_MANE_PATH) -> ToVRS: + def _init_to_vrs(self, + transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH, + refseq_file_path: str = REFSEQ_GENE_SYMBOL_PATH, + amino_acids_file_path: str = AMINO_ACID_PATH, + mane_data_path: str = REFSEQ_MANE_PATH) -> ToVRS: """Return toVRS instance :param str transcript_file_path: Path to transcript mappings file @@ -73,24 +81,25 @@ def _init_to_vrs(self, transcript_file_path=TRANSCRIPT_MAPPINGS_PATH, mane_transcript_mappings = 
MANETranscriptMappings( mane_data_path=mane_data_path ) - dp = SeqRepoDataProxy(self.seqrepo_access.seq_repo_client) - tlr = Translator(data_proxy=dp) + + tlr = Translator(data_proxy=self.dp) mane_transcript = MANETranscript( self.seqrepo_access, transcript_mappings, mane_transcript_mappings, self.uta ) validator = Validate( self.seqrepo_access, transcript_mappings, gene_symbol, - mane_transcript, self.uta, dp, tlr, amino_acid_cache + mane_transcript, self.uta, self.dp, tlr, + amino_acid_cache, self.gene_normalizer ) translator = Translate() return ToVRS( tokenizer, classifier, self.seqrepo_access, transcript_mappings, gene_symbol, amino_acid_cache, self.uta, mane_transcript_mappings, - mane_transcript, validator, translator + mane_transcript, validator, translator, self.gene_normalizer ) - def to_vrs(self, q)\ + def to_vrs(self, q: str)\ -> Tuple[Optional[Union[List[Allele], List[CopyNumber], List[Text], List[Haplotype], List[VariationSet]]], @@ -107,17 +116,42 @@ def to_vrs(self, q)\ if not translations: if q and q.strip(): - translations = [Text(definition=q)] + text = models.Text(definition=q) + text._id = ga4gh_identify(text) + translations = [Text(**text.as_dict())] else: translations = None return translations, warnings - def normalize(self, q) -> VariationDescriptor: + def normalize( + self, q: str, + hgvs_dup_del_mode: HGVSDupDelModeEnum = HGVSDupDelModeEnum.DEFAULT + ) -> Optional[VariationDescriptor]: """Return normalized Variation Descriptor for variation. :param q: Variation to normalize + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to interpret HGVS dup/del expressions + in VRS. 
:return: Variation Descriptor for variation """ - validations, warnings = \ - self.to_vrs_handler.get_validations(q, normalize_endpoint=True) - return self.normalize_handler.normalize(q, validations, warnings) + if hgvs_dup_del_mode: + hgvs_dup_del_mode = hgvs_dup_del_mode.strip().lower() + if not self.hgvs_dup_del_mode.is_valid_mode(hgvs_dup_del_mode): + self.normalize_handler.warnings = \ + [f"hgvs_dup_del_mode must be one of: " + f"{self.hgvs_dup_del_mode.valid_modes}"] + return None + else: + validations, warnings = \ + self.to_vrs_handler.get_validations( + q, normalize_endpoint=True, + hgvs_dup_del_mode=hgvs_dup_del_mode + ) + return self.normalize_handler.normalize(q, validations, + warnings) + else: + self.normalize_handler.warnings = \ + ["hgvs_dup_del_mode cannot be None"] + return None diff --git a/variation/schemas/classification_response_schema.py b/variation/schemas/classification_response_schema.py index c5583fbd..dfea70e7 100644 --- a/variation/schemas/classification_response_schema.py +++ b/variation/schemas/classification_response_schema.py @@ -35,6 +35,8 @@ class ClassificationType(IntEnum): CODING_DNA_INSERTION = 25 GENOMIC_INSERTION = 26 GENOMIC_UNCERTAIN_DELETION = 27 + GENOMIC_DUPLICATION = 28 + GENOMIC_DELETION_RANGE = 29 class ConfidenceRating(IntEnum): diff --git a/variation/schemas/normalize_response_schema.py b/variation/schemas/normalize_response_schema.py index 4a7836b2..56ca1f71 100644 --- a/variation/schemas/normalize_response_schema.py +++ b/variation/schemas/normalize_response_schema.py @@ -1,35 +1,47 @@ """Module for normalize endpoint response schema.""" +from enum import Enum from pydantic import BaseModel from pydantic.types import StrictStr -from ga4gh.vrsatile.pydantic.vrsatile_model import VariationDescriptor +from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor from typing import List, Optional, Dict, Any, Type from datetime import datetime +class HGVSDupDelMode(str, Enum): + """Define options for 
HGVSDupDelMode. + This mode determines how to interpret HGVS dup/del. + """ + + DEFAULT = "default" + CNV = "cnv" + REPEATED_SEQ_EXPR = "repeated_seq_expr" + LITERAL_SEQ_EXPR = "literal_seq_expr" + + class ServiceMeta(BaseModel): """Metadata regarding the variation-normalization service.""" - name = 'variation-normalizer' + name = "variation-normalizer" version: StrictStr response_datetime: datetime - url = 'https://github.com/cancervariants/variation-normalization' + url = "https://github.com/cancervariants/variation-normalization" class Config: """Configure schema example.""" @staticmethod def schema_extra(schema: Dict[str, Any], - model: Type['ServiceMeta']) -> None: + model: Type["ServiceMeta"]) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { - 'name': 'variation-normalizer', - 'version': '0.1.0', - 'response_datetime': '2021-04-05T16:44:15.367831', - 'url': 'https://github.com/cancervariants/variation-normalization' # noqa: E501 + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { + "name": "variation-normalizer", + "version": "0.1.0", + "response_datetime": "2021-04-05T16:44:15.367831", + "url": "https://github.com/cancervariants/variation-normalization" # noqa: E501 } @@ -46,32 +58,32 @@ class Config: @staticmethod def schema_extra(schema: Dict[str, Any], - model: Type['NormalizeService']) -> None: + model: Type["NormalizeService"]) -> None: """Configure OpenAPI schema.""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { 
"variation_query": "BRAF V600E", "variation_descriptor": { "id": "normalize.variation:BRAF%20V600E", "type": "VariationDescriptor", "label": "NP_001361187.1:p.Val640Glu", - "variation_id": "ga4gh:VA.9dA0egRAIfVFDL1sdU1VP7HsBcG0-DtE", # noqa: E501 + "variation_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", # noqa: E501 "variation": { "location": { "interval": { - "end": 640, - "start": 639, - "type": "SimpleInterval" + "end": {"value": 640, "type": "Number"}, + "start": {"value": 639, "type": "Number"}, + "type": "SequenceInterval" }, "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", # noqa: E501 "type": "SequenceLocation" }, "state": { "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression" }, "type": "Allele" }, @@ -82,10 +94,7 @@ def schema_extra(schema: Dict[str, Any], "id": "normalize.gene:BRAF", "type": "GeneDescriptor", "label": "BRAF", - "value": { - "id": "hgnc:1097", - "type": "Gene" - }, + "gene_id": "hgnc:1097", "xrefs": [ "ncbigene:673", "ensembl:ENSG00000157764" @@ -138,9 +147,9 @@ def schema_extra(schema: Dict[str, Any], } }, "service_meta_": { - 'name': 'variation-normalizer', - 'version': '0.1.0', - 'response_datetime': '2021-04-05T16:44:15.367831', - 'url': 'https://github.com/cancervariants/variation-normalization' # noqa: E501 + "name": "variation-normalizer", + "version": "0.2.13", + "response_datetime": "2021-11-18T14:10:53.909158", + "url": "https://github.com/cancervariants/variation-normalization" # noqa: E501 } } diff --git a/variation/schemas/to_vrs_response_schema.py b/variation/schemas/to_vrs_response_schema.py index 192b4ec3..94413fa9 100644 --- a/variation/schemas/to_vrs_response_schema.py +++ b/variation/schemas/to_vrs_response_schema.py @@ -2,7 +2,7 @@ from pydantic import BaseModel from typing import List, Dict, Type, Any, Optional, Union from pydantic.types import StrictStr -from ga4gh.vrsatile.pydantic.vrs_model import Allele, Text, Haplotype, \ +from ga4gh.vrsatile.pydantic.vrs_models import 
Allele, Text, Haplotype, \ CopyNumber, VariationSet from variation.schemas.normalize_response_schema import ServiceMeta @@ -21,38 +21,117 @@ class Config: @staticmethod def schema_extra(schema: Dict[str, Any], - model: Type['ToVRSService']) -> None: + model: Type["ToVRSService"]) -> None: """Configure OpenAPI schema.""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "search_term": "BRAF V600E", "variations": [ { - "_id": "ga4gh:VA.u6sKlz0mMQvARmrlnt0Aksz6EbSkmL8z", + "_id": "ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", + "type": "Allele", "location": { + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", # noqa: E501 "interval": { - "start": 599, - "end": 600, - "type": "SimpleInterval" - }, + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 599 + }, + "end": { + "type": "Number", + "value": 600 + } + } + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "E" + } + }, + { + "_id": "ga4gh:VA.vimwyw0pFTwatfFhi3rhhb153ARWsPrW", + "location": { + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.lKdPZpuT-VNvRuKDjsUItNgutfWYgWQd", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 599 + }, + "end": { + "type": "Number", + "value": 600 + } + } + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "E" + } + }, + { + "_id": "ga4gh:VA.7ys8TiDzrk04O3Upd63__rOBCEhv3P5d", + "type": "Allele", + "location": { + "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.ZJwurRo2HLY018wghYjDKSfIlEH0Y8At", # noqa: E501 - "type": "SequenceLocation" + "interval": { + "type": "SequenceInterval", + "start": {"type": "Number", "value": 599}, + "end": {"type": "Number", 
"value": 600} + } }, "state": { - "sequence": "E", - "type": "SequenceState" + "type": "LiteralSequenceExpression", + "sequence": "E" + } + }, + { + "_id": "ga4gh:VA.FzlrH5feNcQ3S9GayMU9EF008j-8Pbz5", + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.0Q-SgJX1V3seUUIu3qVUtEa55CQsGmEU", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": {"type": "Number", "value": 599}, + "end": {"type": "Number", "value": 600} + } }, - "type": "Allele" + "state": { + "type": "LiteralSequenceExpression", + "sequence": "E" + } + }, + { + "_id": "ga4gh:VA.8JkgnqIgYqufNl-OV_hpRG_aWF9UFQCE", + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.WaAJ_cXXn9YpMNfhcq9lnzIvaB9ALawo", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": {"type": "Number", "value": 639}, + "end": {"type": "Number", "value": 640} + } + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "E" + } } ], "service_meta_": { - 'name': 'variation-normalizer', - 'version': '0.1.0', - 'response_datetime': '2021-04-05T16:44:15.367831', - 'url': 'https://github.com/cancervariants/variation-normalization' # noqa: E501 + "name": "variation-normalizer", + "version": "0.2.13", + "response_datetime": "2021-11-18T14:10:53.909158", + "url": "https://github.com/cancervariants/variation-normalization" # noqa: E501 } } diff --git a/variation/schemas/token_response_schema.py b/variation/schemas/token_response_schema.py index 4769eecf..fee2d9c2 100644 --- a/variation/schemas/token_response_schema.py +++ b/variation/schemas/token_response_schema.py @@ -1,6 +1,6 @@ """Module for Token Schema.""" from pydantic import BaseModel -from typing import List, Union, Dict, Any, Type, Optional +from typing import List, Union, Dict, Any, Type, Optional, Literal from enum import IntEnum, Enum @@ -14,6 +14,14 @@ class TokenMatchType(IntEnum): UNSPECIFIED = 5 +class TokenType(str, Enum): + """Define token types.""" + + 
# TODO: Add other token types (issue 162) + GENOMIC_DUPLICATION = "GenomicDuplication" + GENOMIC_DUPLICATION_RANGE = "GenomicDuplicationRange" + + class Token(BaseModel): """A string from a given query.""" @@ -389,6 +397,63 @@ class LocusReferenceGenomicToken(Token): token_type = 'LocusReferenceGenomic' +class Insertion(Token): + """a sequence change between the translation initiation (start) and + termination (stop) codon where, compared to the reference sequence, + one or more amino acids are inserted, which is not a frame shift and + where the insertion is not a copy of a sequence immediately N-terminal + (5') - varnomen.hgvs.org + """ + + start_pos_flank: int + end_pos_flank: int + inserted_sequence: str + reference_sequence: ReferenceSequence + token_type: str + so_id: str + molecule_context: str + alt_type = 'insertion' + + +class AminoAcidInsertionToken(Insertion): + """Amino Acid Insertion.""" + + start_aa_flank: str + end_aa_flank: str + reference_sequence = ReferenceSequence.PROTEIN + token_type = 'AminoAcidInsertion' + so_id = 'SO:0001605' + molecule_context = 'protein' + + +class CodingDNAInsertionToken(Insertion): + """Coding DNA Insertion.""" + + reference_sequence = ReferenceSequence.CODING_DNA + inserted_sequence2: Optional[str] + token_type = 'CodingDNAInsertion' + so_id = 'SO:0000667' + molecule_context = 'transcript' + + +class GenomicInsertionToken(Insertion): + """Genomic Insertion.""" + + reference_sequence = ReferenceSequence.LINEAR_GENOMIC + inserted_sequence2: Optional[str] + token_type = 'GenomicInsertion' + so_id = 'SO:0000667' + molecule_context = 'genomic' + + +class DeletionAltType(str, Enum): + """Define alt types for deletions.""" + + DELETION = "deletion" + DELETION_RANGE = "deletion_range" + UNCERTAIN_DELETION = "uncertain_deletion" + + class Deletion(Token): """The point at which one or more contiguous nucleotides were excised. 
- Sequence Ontology @@ -400,7 +465,7 @@ class Deletion(Token): token_type: str so_id: str molecule_context: str - alt_type = 'deletion' + alt_type: DeletionAltType.DELETION = DeletionAltType.DELETION class AminoAcidDeletionToken(Deletion): @@ -441,66 +506,37 @@ class GenomicDeletionToken(Deletion): molecule_context = 'genomic' -class Insertion(Token): - """a sequence change between the translation initiation (start) and - termination (stop) codon where, compared to the reference sequence, - one or more amino acids are inserted, which is not a frame shift and - where the insertion is not a copy of a sequence immediately N-terminal - (5') - varnomen.hgvs.org - """ +class DeletionRange(Token): + """Deletions of the form (pos_pos)_(pos_pos).""" - start_pos_flank: int - end_pos_flank: int - inserted_sequence: str - reference_sequence: ReferenceSequence + start_pos1_del: Union[int, str] + start_pos2_del: int + end_pos1_del: int + end_pos2_del: Union[int, str] token_type: str - so_id: str + so_id = "SO:0001743" molecule_context: str - alt_type = 'insertion' - - -class AminoAcidInsertionToken(Insertion): - """Amino Acid Insertion.""" - - start_aa_flank: str - end_aa_flank: str - reference_sequence = ReferenceSequence.PROTEIN - token_type = 'AminoAcidInsertion' - so_id = 'SO:0001605' - molecule_context = 'protein' - - -class CodingDNAInsertionToken(Insertion): - """Coding DNA Insertion.""" - - reference_sequence = ReferenceSequence.CODING_DNA - inserted_sequence2: Optional[str] - token_type = 'CodingDNAInsertion' - so_id = 'SO:0000667' - molecule_context = 'transcript' + alt_type: Union[Literal[DeletionAltType.DELETION_RANGE], Literal[DeletionAltType.UNCERTAIN_DELETION]] = DeletionAltType.DELETION_RANGE # noqa: E501 -class GenomicInsertionToken(Insertion): - """Genomic Insertion.""" +class GenomicDeletionRangeToken(DeletionRange): + """Genomic deletion range token.""" + token_type = "GenomicDeletionRange" + molecule_context = "genomic" reference_sequence = 
ReferenceSequence.LINEAR_GENOMIC - inserted_sequence2: Optional[str] - token_type = 'GenomicInsertion' - so_id = 'SO:0000667' - molecule_context = 'genomic' -class UncertainDeletion(Token): +class UncertainDeletion(DeletionRange): """Uncertain Deletion.""" - start_pos1_del = "?" - start_pos2_del: int + start_pos1_del: Optional[Union[Literal['?'], int]] + start_pos2_del: Optional[int] end_pos1_del: int - end_pos2_del = "?" + end_pos2_del: Optional[Union[Literal['?'], int]] token_type: str - so_id = "SO:0001743" molecule_context: str - alt_type = 'uncertain_deletion' + alt_type: Literal[DeletionAltType.UNCERTAIN_DELETION] = DeletionAltType.UNCERTAIN_DELETION # noqa: E501 class GenomicUncertainDeletionToken(UncertainDeletion): @@ -509,3 +545,47 @@ class GenomicUncertainDeletionToken(UncertainDeletion): token_type = "GenomicUncertainDeletion" molecule_context = "genomic" reference_sequence = ReferenceSequence.LINEAR_GENOMIC + + +class DuplicationAltType(str, Enum): + """Define alt types for duplications.""" + + DUPLICATION = "duplication" + DUPLICATION_RANGE = "duplication_range" + UNCERTAIN_DUPLICATION = "uncertain_duplication" + + +class Duplication(Token): + """Duplications.""" + + start_pos1_dup: Union[Literal['?'], int] + start_pos2_dup: Optional[int] + token_type: TokenType + so_id = "SO:1000035" + molecule_context: str + alt_type: DuplicationAltType + + +class GenomicDuplicationToken(Duplication): + """Genomic duplication token schema.""" + + token_type = TokenType.GENOMIC_DUPLICATION + molecule_context = "genomic" + reference_sequence = ReferenceSequence.LINEAR_GENOMIC + alt_type: Literal[DuplicationAltType.DUPLICATION] = DuplicationAltType.DUPLICATION # noqa: E501 + + +class DuplicationRange(Duplication): + """Duplications of the form (#_#)_(#_#)dup""" + + end_pos1_dup: int + end_pos2_dup: Optional[Union[Literal['?'], int]] + so_id = "SO:0001742" # check: Copy Number gain? 
+ + +class GenomicDuplicationRangeToken(DuplicationRange): + """Genomic Duplication Range token schema""" + + token_type = TokenType.GENOMIC_DUPLICATION_RANGE + molecule_context = "genomic" + reference_sequence = ReferenceSequence.LINEAR_GENOMIC diff --git a/variation/to_vrs.py b/variation/to_vrs.py index c08d2c89..921bf54a 100644 --- a/variation/to_vrs.py +++ b/variation/to_vrs.py @@ -1,6 +1,6 @@ """Module for to VRS translation.""" from typing import Tuple, Optional, List, Union -from ga4gh.vrsatile.pydantic.vrs_model import Allele, Haplotype, CopyNumber,\ +from ga4gh.vrsatile.pydantic.vrs_models import Allele, Haplotype, CopyNumber,\ VariationSet, Text from variation.schemas.validation_response_schema import ValidationSummary from variation.classifiers import Classify @@ -13,6 +13,9 @@ from variation.tokenizers import GeneSymbol from variation.tokenizers.caches import AminoAcidCache from urllib.parse import unquote +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum class ToVRS: @@ -24,8 +27,25 @@ def __init__(self, tokenizer: Tokenize, classifier: Classify, gene_symbol: GeneSymbol, amino_acid_cache: AminoAcidCache, uta: UTA, mane_transcript_mappings: MANETranscriptMappings, mane_transcript: MANETranscript, validator: Validate, - translator: Translate): - """Initialize the ToVRS class.""" + translator: Translate, + gene_normalizer: GeneQueryHandler) -> None: + """Initialize the ToVRS class. 
+ + :param Tokenize tokenizer: Tokenizer class for tokenizing + :param Classify classifier: Classifier class for classifying tokens + :param SeqRepoAccess seqrepo_access: Access to SeqRepo + :param TranscriptMappings transcript_mappings: Transcript mappings + data class + :param GeneSymbol gene_symbol: Class for identifying gene symbols + :param AminoAcidCache amino_acid_cache: Amino Acid data class + :param UTA uta: UTA DB and queries + :param MANETranscriptMappings mane_transcript_mappings: Class for + getting mane transcript data from gene + :param MANETranscript mane_transcript: Mane transcript data class + :param Validate validator: Validator class for validating valid inputs + :param Translate translator: Translating valid inputs + :param GeneQueryHandler gene_normalizer: Gene normalizer access + """ self.tokenizer = tokenizer self.classifier = classifier self.seq_repo_access = seqrepo_access @@ -37,14 +57,21 @@ def __init__(self, tokenizer: Tokenize, classifier: Classify, self.mane_transcript = mane_transcript self.validator = validator self.translator = translator + self.gene_normalizer = gene_normalizer - def get_validations(self, q, normalize_endpoint=False)\ - -> Tuple[Optional[ValidationSummary], Optional[List[str]]]: + def get_validations( + self, q: str, normalize_endpoint: bool = False, + hgvs_dup_del_mode: HGVSDupDelModeEnum = HGVSDupDelModeEnum.DEFAULT + ) -> Tuple[Optional[ValidationSummary], Optional[List[str]]]: """Return validation results for a given variation. :param str q: variation to get validation results for :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to interpret HGVS dup/del expressions + in VRS. 
:return: ValidationSummary for the variation and list of warnings """ warnings = list() @@ -53,13 +80,15 @@ def get_validations(self, q, normalize_endpoint=False)\ tokens = self.tokenizer.perform(unquote(q.strip()), warnings) classifications = self.classifier.perform(tokens) validations = self.validator.perform( - classifications, normalize_endpoint, warnings + classifications, normalize_endpoint, warnings, + hgvs_dup_del_mode=hgvs_dup_del_mode ) if not warnings: warnings = validations.warnings return validations, warnings - def get_translations(self, validations, warnings)\ + def get_translations(self, validations: ValidationSummary, + warnings: List)\ -> Tuple[Optional[Union[List[Allele], List[CopyNumber], List[Text], List[Haplotype], List[VariationSet]]], @@ -67,7 +96,7 @@ def get_translations(self, validations, warnings)\ """Return a list translations from a ValidationSummary. :param ValidationSummary validations: Valid and Invalid results - :param list warnings: List of warnings + :param List warnings: List of warnings :return: A list of unique translations from valid results """ translations = [] diff --git a/variation/tokenizers/__init__.py b/variation/tokenizers/__init__.py index 7c1f2407..b8db09e2 100644 --- a/variation/tokenizers/__init__.py +++ b/variation/tokenizers/__init__.py @@ -39,3 +39,5 @@ from .coding_dna_insertion import CodingDNAInsertion # noqa: F401 from .genomic_insertion import GenomicInsertion # noqa: F401 from .genomic_uncertain_deletion import GenomicUncertainDeletion # noqa: F401 +from .genomic_duplication import GenomicDuplication # noqa: F401 +from .genomic_deletion_range import GenomicDeletionRange # noqa: F401 diff --git a/variation/tokenizers/deletion_base.py b/variation/tokenizers/deletion_base.py index e3af4582..e3716fee 100644 --- a/variation/tokenizers/deletion_base.py +++ b/variation/tokenizers/deletion_base.py @@ -1,6 +1,6 @@ """A module for Deletion Tokenization Base Class.""" from abc import abstractmethod -from typing 
import Optional, Dict +from typing import Optional, Dict, List from .tokenizer import Tokenizer from .caches import AminoAcidCache, NucleotideCache from .tokenize_base import TokenizeBase @@ -21,7 +21,11 @@ def __init__(self, amino_acid_cache: AminoAcidCache, self.tokenize_base = TokenizeBase(amino_acid_cache, nucleotide_cache) def match(self, input_string: str) -> Optional[Deletion]: - """Return tokens that match the input string.""" + """Return tokens that match the input string. + + :param str input_string: Input string + :return: Deletion token if a match is found + """ if input_string is None: return None @@ -48,13 +52,13 @@ def match(self, input_string: str) -> Optional[Deletion]: self._get_parts(parts) return self.return_token(self.parts) - def _get_parts(self, parts): - """Get parts for DelIns. + def _get_parts(self, parts: List) -> None: + """Get parts for DelIns by updating `self.parts` - :param list parts: Parts of input string + :param List parts: Parts of input string """ if len(parts) != 2: - return + return None # Get reference sequence reference_sequence = parts[0][:1] @@ -62,7 +66,7 @@ def _get_parts(self, parts): positions_deleted = self.tokenize_base.get_positions_deleted(parts) if not positions_deleted: - return + return None if parts[1]: self.parts['deleted_sequence'] = \ @@ -77,11 +81,19 @@ def _get_parts(self, parts): else: end_pos_del = None + if start_pos_del and end_pos_del: + if start_pos_del > end_pos_del: + return None + self.parts['start_pos_del'] = start_pos_del self.parts['end_pos_del'] = end_pos_del self.parts['reference_sequence'] = reference_sequence @abstractmethod - def return_token(self, params: Dict[str, str]): - """Return token instance.""" + def return_token(self, params: Dict[str, str]) -> Optional[Deletion]: + """Return token instance. 
+ + :param Dict params: Params for Deletion token + :return: Deletion token + """ raise NotImplementedError diff --git a/variation/tokenizers/deletion_range_base.py b/variation/tokenizers/deletion_range_base.py new file mode 100644 index 00000000..95a08b58 --- /dev/null +++ b/variation/tokenizers/deletion_range_base.py @@ -0,0 +1,68 @@ +"""A module for tokenizing genomic deletion ranges.""" +from typing import Dict, Optional, List +from .tokenizer import Tokenizer +from variation.schemas.token_response_schema import TokenMatchType, \ + DeletionRange +from abc import abstractmethod + + +class DeletionRangeBase(Tokenizer): + """The tokenizer class for genomic deletion range.""" + + def __init__(self) -> None: + """Initialize the Genomic Deletion Range Class.""" + self.parts = None + + def match(self, input_string: str) -> Optional[DeletionRange]: + """Return tokens that match the input string. + + :param str input_string: Input string + :return: DeletionRange token if a match is found + """ + if input_string is None: + return None + + self.parts = { + 'token': input_string, + 'input_string': input_string, + 'match_type': TokenMatchType.UNSPECIFIED.value, + 'start_pos1_del': None, + 'start_pos2_del': None, + 'end_pos1_del': None, + 'end_pos2_del': None, + 'reference_sequence': None + } + + input_string = str(input_string).lower() + if not input_string.endswith('del'): + return None + + if input_string.startswith('g.'): + self.parts['reference_sequence'] = 'g' + elif input_string.startswith('c.'): + self.parts['reference_sequence'] = 'c' + elif input_string.startswith('p.'): + self.parts['reference_sequence'] = 'p' + else: + return None + + parts = input_string.split('_') + self._get_parts(parts) + return self.return_token(self.parts) + + @abstractmethod + def _get_parts(self, parts: List) -> None: + """Set `self.parts` for genomic deletion range + + :param List parts: Parts of input string + """ + raise NotImplementedError + + @abstractmethod + def return_token(self, 
params: Dict[str, str]) -> Optional[DeletionRange]: + """Return token instance. + + :param Dict params: Params for DeletionRange token + :return: DeletionRange token + """ + raise NotImplementedError diff --git a/variation/tokenizers/duplication_base.py b/variation/tokenizers/duplication_base.py new file mode 100644 index 00000000..3ed02d1e --- /dev/null +++ b/variation/tokenizers/duplication_base.py @@ -0,0 +1,51 @@ +"""A module for Duplication Tokenization Base Class.""" +from abc import abstractmethod +from typing import Optional +from .tokenizer import Tokenizer +from variation.schemas.token_response_schema import Duplication, \ + TokenMatchType, DuplicationAltType + + +class DuplicationBase(Tokenizer): + """Class for tokenizing Duplications.""" + + def __init__(self) -> None: + """Initialize the Duplication Base Class.""" + self.parts = None + + def match(self, input_string: str) -> Optional[Duplication]: + """Return tokens that match the input string.""" + if input_string is None: + return None + + self.parts = { + 'token': input_string, + 'input_string': input_string, + 'match_type': TokenMatchType.UNSPECIFIED.value, + 'start_pos1_dup': None, + 'start_pos2_dup': None, + 'end_pos1_dup': None, + 'end_pos2_dup': None, + 'reference_sequence': None, + 'alt_type': DuplicationAltType.DUPLICATION + } + + input_string = str(input_string).lower() + if not input_string.endswith('dup'): + return None + + parts = [input_string[:-3]] + self._get_parts(parts) + return self.return_token() + + @abstractmethod + def _get_parts(self, parts): + """Get parts for Duplication.
+ :param list parts: Parts of input string + """ + raise NotImplementedError + + @abstractmethod + def return_token(self): + """Return token instance.""" + raise NotImplementedError diff --git a/variation/tokenizers/genomic_deletion_range.py b/variation/tokenizers/genomic_deletion_range.py new file mode 100644 index 00000000..a50155b3 --- /dev/null +++ b/variation/tokenizers/genomic_deletion_range.py @@ -0,0 +1,63 @@ +"""A module for Genomic Deletion Range Tokenization.""" +from variation.schemas.token_response_schema import GenomicDeletionRangeToken +from variation.tokenizers.deletion_range_base import DeletionRangeBase + + +class GenomicDeletionRange(DeletionRangeBase): + """Class for tokenizing deletion range at the genomic coordinate.""" + + def _get_parts(self, parts): + """Set parts for genomic deletion range. + + :param list parts: Parts of input string + """ + if len(parts) != 4: + return None + + conditions = ( + parts[0].startswith('g.('), + parts[1].endswith(')'), + parts[2].startswith('('), + parts[3].endswith(')del') + ) + + if all(conditions): + parts[0] = parts[0][3:] + parts[1] = parts[1][:-1] + parts[2] = parts[2][1:] + parts[3] = parts[3][:-4] + + try: + parts[0] = int(parts[0]) + parts[1] = int(parts[1]) + parts[2] = int(parts[2]) + parts[3] = int(parts[3]) + except ValueError: + return None + else: + prev_val = None + for i in range(4): + val = parts[i] + if val not in ["?", None]: + if prev_val is not None: + if prev_val > val: + return None + prev_val = val + + self.parts['start_pos1_del'] = parts[0] + self.parts['start_pos2_del'] = parts[1] + self.parts['end_pos1_del'] = parts[2] + self.parts['end_pos2_del'] = parts[3] + return None + + def return_token(self, params): + """Return Genomic Deletion Range token.""" + conditions = ( + params['start_pos1_del'] is not None, + params['start_pos2_del'] is not None, + params['end_pos1_del'] is not None, + params['end_pos2_del'] is not None + ) + if all(conditions): + if params['reference_sequence'] == 
'g': + return GenomicDeletionRangeToken(**params) diff --git a/variation/tokenizers/genomic_duplication.py b/variation/tokenizers/genomic_duplication.py new file mode 100644 index 00000000..e804dd91 --- /dev/null +++ b/variation/tokenizers/genomic_duplication.py @@ -0,0 +1,118 @@ +"""A module for Genomic Duplication Tokenization.""" +from variation.schemas.token_response_schema import DuplicationAltType, \ + GenomicDuplicationToken, GenomicDuplicationRangeToken +from variation.tokenizers.duplication_base import DuplicationBase + + +class GenomicDuplication(DuplicationBase): + """Class for tokenizing duplications on the genomic coordinate.""" + + def _get_parts(self, parts): + if len(parts) != 1 or not parts[0].startswith('g.'): + return None + + parts[0] = parts[0][2:] + if '_' in parts[0]: + if parts[0].count('_') == 1: + pos = parts[0].split('_') + try: + pos[0] = int(pos[0]) + pos[1] = int(pos[1]) + except ValueError: + pass + else: + if pos[0] < pos[1]: + self.parts['start_pos1_dup'] = pos[0] + self.parts['start_pos2_dup'] = pos[1] + self.parts['reference_sequence'] = 'g' + else: + self.parts['alt_type'] = DuplicationAltType.DUPLICATION_RANGE + parts = parts[0].split('_') + len_parts = len(parts) + if len_parts == 4: + for part_ix, parts_field in [ + (0, 'start_pos1_dup'), + (1, 'start_pos2_dup'), + (2, 'end_pos1_dup'), + (3, 'end_pos2_dup') + ]: + part_val = self._check_uncertain_or_int(parts[part_ix]) + if part_val is None: + return None + else: + self.parts[parts_field] = part_val + self.parts['reference_sequence'] = 'g' + + elif len_parts == 3: + if '(' in parts[0] and ')' in parts[1]: + # Format is: (?_#)_#dup + for part_ix, parts_field in [ + (0, 'start_pos1_dup'), + (1, 'start_pos2_dup'), + (2, 'end_pos1_dup') + ]: + part_val = self._check_uncertain_or_int( + parts[part_ix]) + if part_val is None: + return None + else: + self.parts[parts_field] = part_val + else: + # Format is #_(#_?)dup + for part_ix, parts_field in [ + (0, 'start_pos1_dup'), + (1, 
'end_pos1_dup'), + (2, 'end_pos2_dup') + ]: + part_val = self._check_uncertain_or_int( + parts[part_ix]) + if part_val is None: + return None + else: + self.parts[parts_field] = part_val + self.parts['reference_sequence'] = 'g' + else: + try: + pos = int(parts[0]) + except ValueError: + pass + else: + self.parts['start_pos1_dup'] = pos + self.parts['reference_sequence'] = 'g' + return None + + def _check_uncertain_or_int(self, part): + part = part.replace('(', '') + part = part.replace(')', '') + try: + return int(part) + except ValueError: + if part == '?': + self.parts['alt_type'] = \ + DuplicationAltType.UNCERTAIN_DUPLICATION + return part + return None + + def return_token(self): + """Return token instance if a match is found.""" + # we only set this field if it's valid + if self.parts['reference_sequence'] == 'g': + if self.parts['end_pos1_dup'] is None and \ + self.parts['end_pos2_dup'] is None: + + if self.parts['start_pos2_dup'] and self.parts['start_pos1_dup'] > self.parts['start_pos2_dup']: # noqa: E501 + return + + self.parts['alt_type'] = DuplicationAltType.DUPLICATION + return GenomicDuplicationToken(**self.parts) + else: + prev_val = None + for field in ['start_pos1_dup', 'start_pos2_dup', + 'end_pos1_dup', 'end_pos2_dup']: + val = self.parts[field] + if val not in ["?", None]: + if prev_val is not None: + if prev_val > val: + return + prev_val = val + return GenomicDuplicationRangeToken(**self.parts) diff --git a/variation/tokenizers/genomic_uncertain_deletion.py b/variation/tokenizers/genomic_uncertain_deletion.py index 037938cf..3b676ff0 100644 --- a/variation/tokenizers/genomic_uncertain_deletion.py +++ b/variation/tokenizers/genomic_uncertain_deletion.py @@ -1,69 +1,102 @@ """A module for tokenizing genomic uncertain deletion.""" -from typing import Optional -from .tokenizer import Tokenizer -from variation.schemas.token_response_schema import TokenMatchType, \ +from pydantic.error_wrappers import ValidationError + +from 
variation.schemas.token_response_schema import \ GenomicUncertainDeletionToken +from variation.tokenizers.deletion_range_base import DeletionRangeBase -class GenomicUncertainDeletion(Tokenizer): +class GenomicUncertainDeletion(DeletionRangeBase): """The tokenizer class for genomic uncertain deletion.""" - def __init__(self) -> None: - """Initialize the Genomic Uncertain Deletion Class.""" - self.parts = None - - def match(self, input_string: str)\ - -> Optional[GenomicUncertainDeletionToken]: - """Return tokens that match the input string.""" - if input_string is None: - return None - - self.parts = { - 'token': input_string, - 'input_string': input_string, - 'match_type': TokenMatchType.UNSPECIFIED.value, - 'start_pos2_del': None, - 'end_pos1_del': None - } + def _get_parts(self, parts): + """Set parts for genomic uncertain deletion. - input_string = str(input_string).lower() - conditions = ( - input_string.endswith('del'), - input_string.startswith('g.'), - input_string.count('_') == 3 - ) - if not all(conditions): + :param list parts: Parts of input string + """ + len_parts = len(parts) + if len_parts not in [3, 4]: return None - parts = input_string.split('_') - self._get_parts(parts) - if self.parts['start_pos2_del'] is None or \ - self.parts['end_pos1_del'] is None: + if not parts[0].startswith('g.'): return None - return GenomicUncertainDeletionToken(**self.parts) - def _get_parts(self, parts): - """Set parts for genomic copy number loss. - - :param list parts: Parts of input string - """ - conditions = ( - len(parts) == 4, - parts[0] == 'g.(?' 
and parts[3] == '?)del', - parts[1].endswith(')'), - parts[2].startswith('(') - ) - if all(conditions): - parts[1] = parts[1][:-1] - parts[2] = parts[2][1:] + parts[0] = parts[0][2:] + parts[len_parts - 1] = parts[len_parts - 1][:-3] - try: - parts[1] = int(parts[1]) - parts[2] = int(parts[2]) - except ValueError: - return None + if len_parts == 3: + if '(' in parts[0] and ')' in parts[1]: + # Format is: (?_#)_#del + for part_ix, parts_field in [ + (0, 'start_pos1_del'), + (1, 'start_pos2_del'), + (2, 'end_pos1_del') + ]: + part_val = self._check_uncertain_or_int( + parts[part_ix]) + if part_val is None: + return None + else: + self.parts[parts_field] = part_val else: - if parts[1] < parts[2]: - self.parts['start_pos2_del'] = parts[1] - self.parts['end_pos1_del'] = parts[2] + # Format is #_(#_?)del + for part_ix, parts_field in [ + (0, 'start_pos1_del'), + (1, 'end_pos1_del'), + (2, 'end_pos2_del') + ]: + part_val = self._check_uncertain_or_int( + parts[part_ix]) + if part_val is None: + return None + else: + self.parts[parts_field] = part_val + self.parts['reference_sequence'] = 'g' + elif len_parts == 4: + for part_ix, parts_field in [ + (0, 'start_pos1_del'), + (1, 'start_pos2_del'), + (2, 'end_pos1_del'), + (3, 'end_pos2_del') + ]: + part_val = self._check_uncertain_or_int(parts[part_ix]) + if part_val is None: + return None + else: + self.parts[parts_field] = part_val + self.parts['reference_sequence'] = 'g' return None + + def _check_uncertain_or_int(self, part): + part = part.replace('(', '') + part = part.replace(')', '') + try: + return int(part) + except ValueError: + if part == '?': + return part + return None + + def return_token(self, params): + """Return Genomic Uncertain Deletion token.""" + if params['reference_sequence'] == 'g': + conditions = ( + isinstance(params['start_pos1_del'], int), + isinstance(params['start_pos2_del'], int), + isinstance(params['end_pos1_del'], int), + isinstance(params['end_pos1_del'], int) + ) + if not all(conditions): + 
try: + prev_val = None + for field in ['start_pos1_del', 'start_pos2_del', + 'end_pos1_del', 'end_pos2_del']: + val = self.parts[field] + if val not in ["?", None]: + if prev_val is not None: + if prev_val > val: + return + prev_val = val + return GenomicUncertainDeletionToken(**params) + except ValidationError: + return None diff --git a/variation/tokenizers/tokenize.py b/variation/tokenizers/tokenize.py index 8da32ec2..9ce53aef 100644 --- a/variation/tokenizers/tokenize.py +++ b/variation/tokenizers/tokenize.py @@ -35,6 +35,8 @@ from .coding_dna_insertion import CodingDNAInsertion from .genomic_insertion import GenomicInsertion from .genomic_uncertain_deletion import GenomicUncertainDeletion +from .genomic_duplication import GenomicDuplication +from .genomic_deletion_range import GenomicDeletionRange from variation.schemas.token_response_schema import Token, TokenMatchType from .caches import NucleotideCache @@ -77,7 +79,9 @@ def __init__(self, amino_acid_cache, gene_symbol: GeneSymbol) -> None: AminoAcidInsertion(amino_acid_cache, nucleotide_cache), CodingDNAInsertion(amino_acid_cache, nucleotide_cache), GenomicInsertion(amino_acid_cache, nucleotide_cache), - GenomicUncertainDeletion() + GenomicUncertainDeletion(), + GenomicDuplication(), + GenomicDeletionRange() # ProteinTermination(amino_acid_cache), # UnderExpression(), # WildType(), diff --git a/variation/translators/__init__.py b/variation/translators/__init__.py index 894cc164..1b9bc542 100644 --- a/variation/translators/__init__.py +++ b/variation/translators/__init__.py @@ -18,3 +18,5 @@ from .coding_dna_insertion import CodingDNAInsertion # noqa: F401 from .genomic_insertion import GenomicInsertion # noqa: F401 from .genomic_uncertain_deletion import GenomicUncertainDeletion # noqa: F401 +from .genomic_duplication import GenomicDuplication # noqa: F401 +from .genomic_deletion_range import GenomicDeletionRange # noqa: F401 diff --git a/variation/translators/genomic_deletion_range.py 
b/variation/translators/genomic_deletion_range.py new file mode 100644 index 00000000..70ebe702 --- /dev/null +++ b/variation/translators/genomic_deletion_range.py @@ -0,0 +1,17 @@ +"""Module for Genomic Deletion Range Translation.""" +from variation.translators.translator import Translator +from variation.schemas.classification_response_schema import ClassificationType +from variation.schemas.token_response_schema import \ + GenomicDeletionRangeToken + + +class GenomicDeletionRange(Translator): + """The Genomic Deletion Range Translator class.""" + + def can_translate(self, type: ClassificationType) -> bool: + """Return if classification type is Genomic Deletion Range.""" + return type == ClassificationType.GENOMIC_DELETION_RANGE + + def is_token_instance(self, token): + """Return if the token is a Genomic Deletion Range token instance.""" + return isinstance(token, GenomicDeletionRangeToken) diff --git a/variation/translators/genomic_duplication.py b/variation/translators/genomic_duplication.py new file mode 100644 index 00000000..797c9fbd --- /dev/null +++ b/variation/translators/genomic_duplication.py @@ -0,0 +1,18 @@ +"""Module for Genomic Duplication Translation.""" +from variation.translators.translator import Translator +from variation.schemas.classification_response_schema import ClassificationType +from variation.schemas.token_response_schema import \ + GenomicDuplicationRangeToken, GenomicDuplicationToken + + +class GenomicDuplication(Translator): + """The Genomic Duplication Translator class.""" + + def can_translate(self, type: ClassificationType) -> bool: + """Return if classification type is Genomic Duplication.""" + return type == ClassificationType.GENOMIC_DUPLICATION + + def is_token_instance(self, token): + """Return if the token is a Genomic Duplication token instance.""" + return isinstance(token, (GenomicDuplicationToken, + GenomicDuplicationRangeToken)) diff --git a/variation/translators/translate.py b/variation/translators/translate.py index
be89588b..bc8adecd 100644 --- a/variation/translators/translate.py +++ b/variation/translators/translate.py @@ -1,6 +1,4 @@ """Module for translation.""" -from ga4gh.vrsatile.pydantic.vrs_model import Allele, CopyNumber,\ - VariationSet, Haplotype from variation.schemas.validation_response_schema import ValidationResult from .translator import Translator from .amino_acid_substitution import AminoAcidSubstitution @@ -20,7 +18,9 @@ from .coding_dna_insertion import CodingDNAInsertion from .genomic_insertion import GenomicInsertion from .genomic_uncertain_deletion import GenomicUncertainDeletion -from typing import List, Optional, Union +from .genomic_duplication import GenomicDuplication +from .genomic_deletion_range import GenomicDeletionRange +from typing import List, Optional, Dict class Translate: @@ -45,11 +45,13 @@ def __init__(self) -> None: AminoAcidInsertion(), CodingDNAInsertion(), GenomicInsertion(), - GenomicUncertainDeletion() + GenomicDeletionRange(), + GenomicUncertainDeletion(), + GenomicDuplication() ] def perform(self, res: ValidationResult) \ - -> Optional[Union[Allele, CopyNumber, Haplotype, VariationSet]]: + -> Optional[Dict]: """Translate a valid variation query.""" for translator in self.all_translators: if translator.can_translate( diff --git a/variation/translators/translator.py b/variation/translators/translator.py index 85e53ffa..6117f21e 100644 --- a/variation/translators/translator.py +++ b/variation/translators/translator.py @@ -1,6 +1,8 @@ """Module for translation.""" from abc import ABC, abstractmethod -from ga4gh.vrsatile.pydantic.vrs_model import Allele, CopyNumber # noqa: F401 +from typing import Dict, Optional +from ga4gh.vrsatile.pydantic.vrs_models import Allele, CopyNumber +from pydantic.error_wrappers import ValidationError from variation.schemas.validation_response_schema import ValidationResult from variation.schemas.classification_response_schema import ClassificationType @@ -18,8 +20,8 @@ def is_token_instance(self, 
token): """Check that the token is the correct instance for a translator.""" raise NotImplementedError - def translate(self, res: ValidationResult) -> Allele: - """Translate to VRS representation for an Allele.""" + def translate(self, res: ValidationResult) -> Optional[Dict]: + """Translate to VRS Variation representation.""" instance_tokens = [t for t in res.classification.all_tokens if self.is_token_instance(t)] len_instance_tokens = len(instance_tokens) @@ -35,7 +37,12 @@ def translate(self, res: ValidationResult) -> Allele: t = f"{tokens[0]} ({tokens[0].replace('ter', '*')})" if t.lower() == tokens[1].lower(): if variation_type == 'Allele': - return Allele(**res.variation) + try: + Allele(**res.variation) + except ValidationError: + return None + else: + return res.variation raise Exception(f'Should not have more than one ' f'{self.__class__.__name__} ' @@ -44,14 +51,23 @@ def translate(self, res: ValidationResult) -> Allele: if variation_type == 'Allele': if not res.variation['location']: raise Exception("Cannot translate an allele with no location") - variation = Allele(**res.variation) + try: + Allele(**res.variation) + except ValidationError: + variation = None + else: + variation = res.variation elif variation_type == 'CopyNumber': if res.variation['subject']['type'] == "Allele": if not res.variation['subject']['location']: raise Exception("Cannot translate a CNV with no location") - variation = CopyNumber(**res.variation) + try: + CopyNumber(**res.variation) + except ValidationError: + variation = None + else: + variation = res.variation else: raise Exception(f"{variation_type} not supported in " f"Variation Normalization") - return variation diff --git a/variation/validators/__init__.py b/variation/validators/__init__.py index 3e7612e2..cace9f05 100644 --- a/variation/validators/__init__.py +++ b/variation/validators/__init__.py @@ -21,3 +21,5 @@ from .coding_dna_insertion import CodingDNAInsertion # noqa: F401 from .genomic_insertion import 
GenomicInsertion # noqa: F401 from .genomic_uncertain_deletion import GenomicUncertainDeletion # noqa: F401 +from .genomic_duplication import GenomicDuplication # noqa: F401 +from .genomic_deletion_range import GenomicDeletionRange # noqa: F401 diff --git a/variation/validators/amino_acid_deletion.py b/variation/validators/amino_acid_deletion.py index 56596e85..81e7f2af 100644 --- a/variation/validators/amino_acid_deletion.py +++ b/variation/validators/amino_acid_deletion.py @@ -13,6 +13,7 @@ from ga4gh.vrs.dataproxy import SeqRepoDataProxy from ga4gh.vrs.extras.translator import Translator import logging +from gene.query import QueryHandler as GeneQueryHandler logger = logging.getLogger('variation') @@ -27,6 +28,7 @@ def __init__(self, seq_repo_access: SeqRepoAccess, gene_symbol: GeneSymbol, mane_transcript: MANETranscript, uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, + gene_normalizer: GeneQueryHandler, amino_acid_cache: AminoAcidCache) \ -> None: """Initialize the validator. @@ -38,11 +40,12 @@ def __init__(self, seq_repo_access: SeqRepoAccess, :param MANETranscript mane_transcript: Access MANE Transcript information :param UTA uta: Access to UTA queries + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer :param amino_acid_cache: Amino Acid codes and conversions """ super().__init__( seq_repo_access, transcript_mappings, gene_symbol, mane_transcript, - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) self._amino_acid_cache = amino_acid_cache self.amino_acid_base = AminoAcidBase(seq_repo_access, amino_acid_cache) @@ -63,7 +66,8 @@ def get_transcripts(self, gene_tokens, classification, errors)\ def get_valid_invalid_results(self, classification_tokens, transcripts, classification, results, gene_tokens, normalize_endpoint, mane_data_found, - is_identifier) -> None: + is_identifier, hgvs_dup_del_mode)\ + -> None: """Add validation result objects to a list of results. 
:param list classification_tokens: A list of classification Tokens @@ -77,6 +81,10 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, :param dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. """ valid_alleles = list() for s in classification_tokens: @@ -106,7 +114,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, ) self.add_mane_data( mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens + s.alt_type, s, ) self.add_validation_result(allele, valid_alleles, results, diff --git a/variation/validators/amino_acid_delins.py b/variation/validators/amino_acid_delins.py index be922102..351ca2a2 100644 --- a/variation/validators/amino_acid_delins.py +++ b/variation/validators/amino_acid_delins.py @@ -1,8 +1,8 @@ """The module for Amino Acid DelIns Validation.""" from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import AminoAcidDelInsToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.validators.validator import Validator from variation.schemas.token_response_schema import GeneMatchToken from variation.tokenizers import GeneSymbol @@ -13,6 +13,9 @@ from ga4gh.vrs.dataproxy import SeqRepoDataProxy from ga4gh.vrs.extras.translator import Translator import logging +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') @@ -27,6 +30,7 @@ def __init__(self, seq_repo_access: SeqRepoAccess, gene_symbol: GeneSymbol, 
mane_transcript: MANETranscript, uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, + gene_normalizer: GeneQueryHandler, amino_acid_cache: AminoAcidCache) \ -> None: """Initialize the validator. @@ -38,11 +42,12 @@ def __init__(self, seq_repo_access: SeqRepoAccess, :param MANETranscript mane_transcript: Access MANE Transcript information :param UTA uta: Access to UTA queries + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer :param amino_acid_cache: Amino Acid codes and conversions """ super().__init__( seq_repo_access, transcript_mappings, gene_symbol, mane_transcript, - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) self._amino_acid_cache = amino_acid_cache self.amino_acid_base = AminoAcidBase(seq_repo_access, amino_acid_cache) @@ -60,23 +65,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_protein_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -106,7 +117,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, ) self.add_mane_data( mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens, alt=s.inserted_sequence + s.alt_type, s, alt=s.inserted_sequence ) self.add_validation_result(allele, valid_alleles, results, diff --git a/variation/validators/amino_acid_insertion.py b/variation/validators/amino_acid_insertion.py index 375df5a6..6ba546fa 100644 --- a/variation/validators/amino_acid_insertion.py +++ b/variation/validators/amino_acid_insertion.py @@ -1,8 +1,8 @@ """The module for Amino Acid Insertion Validation.""" from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import AminoAcidInsertionToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.validators.validator import Validator from variation.schemas.token_response_schema import GeneMatchToken from variation.tokenizers import GeneSymbol @@ -13,6 +13,9 @@ from ga4gh.vrs.dataproxy import SeqRepoDataProxy from ga4gh.vrs.extras.translator import Translator import logging +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') @@ -27,6 +30,7 @@ def __init__(self, seq_repo_access: SeqRepoAccess, gene_symbol: GeneSymbol, mane_transcript: MANETranscript, uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, + gene_normalizer: GeneQueryHandler, amino_acid_cache: AminoAcidCache) \ -> None: """Initialize the validator. 
@@ -38,11 +42,12 @@ def __init__(self, seq_repo_access: SeqRepoAccess, :param MANETranscript mane_transcript: Access MANE Transcript information :param UTA uta: Access to UTA queries + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer :param amino_acid_cache: Amino Acid codes and conversions """ super().__init__( seq_repo_access, transcript_mappings, gene_symbol, mane_transcript, - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) self._amino_acid_cache = amino_acid_cache self.amino_acid_base = AminoAcidBase(seq_repo_access, amino_acid_cache) @@ -60,23 +65,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_protein_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. 
- :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. """ valid_alleles = list() for s in classification_tokens: @@ -108,7 +119,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, ) self.add_mane_data(mane, mane_data_found, s.reference_sequence, s.alt_type, s, - gene_tokens, alt=s.inserted_sequence) + alt=s.inserted_sequence) self.add_validation_result(allele, valid_alleles, results, classification, s, t, gene_tokens, diff --git a/variation/validators/coding_dna_deletion.py b/variation/validators/coding_dna_deletion.py index ef2e9abb..e8d290f5 100644 --- a/variation/validators/coding_dna_deletion.py +++ b/variation/validators/coding_dna_deletion.py @@ -1,18 +1,21 @@ """The module for Coding DNA Deletion Validation.""" -from variation.validators.deletion_base import DeletionBase +from variation.validators.duplication_deletion_base import\ + DuplicationDeletionBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import CodingDNADeletionToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.schemas.token_response_schema import GeneMatchToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) -class CodingDNADeletion(DeletionBase): +class CodingDNADeletion(DuplicationDeletionBase): """The Coding DNA Deletion Validator class.""" def get_transcripts(self, gene_tokens, classification, 
errors)\ @@ -27,23 +30,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_coding_dna_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -80,7 +89,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, self.add_mane_data( mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens + s.alt_type, s, ) self.add_validation_result( diff --git a/variation/validators/coding_dna_delins.py b/variation/validators/coding_dna_delins.py index ec30bc34..ee373342 100644 --- a/variation/validators/coding_dna_delins.py +++ b/variation/validators/coding_dna_delins.py @@ -1,11 +1,13 @@ """The module for Coding DNA DelIns Validation.""" from variation.validators.delins_base import DelInsBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import CodingDNADelInsToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.schemas.token_response_schema import GeneMatchToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') @@ -27,23 +29,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_coding_dna_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -75,7 +83,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, self.add_mane_data( mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens, alt=s.inserted_sequence1 + s.alt_type, s, alt=s.inserted_sequence1 ) self.add_validation_result( diff --git a/variation/validators/coding_dna_silent_mutation.py b/variation/validators/coding_dna_silent_mutation.py index 32ff076a..6b574a79 100644 --- a/variation/validators/coding_dna_silent_mutation.py +++ b/variation/validators/coding_dna_silent_mutation.py @@ -1,12 +1,14 @@ """The module for Coding DNA Substitution Validation.""" from .single_nucleotide_variation_base import SingleNucleotideVariationBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import\ CodingDNASilentMutationToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.schemas.token_response_schema import GeneMatchToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') @@ -28,23 +30,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_coding_dna_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" self.silent_mutation_valid_invalid_results( classification_tokens, transcripts, classification, results, diff --git a/variation/validators/coding_dna_substitution.py b/variation/validators/coding_dna_substitution.py index 59c52622..a85154a8 100644 --- a/variation/validators/coding_dna_substitution.py +++ b/variation/validators/coding_dna_substitution.py @@ -1,11 +1,13 @@ """The module for Coding DNA Substitution Validation.""" from .single_nucleotide_variation_base import SingleNucleotideVariationBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import CodingDNASubstitutionToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.schemas.token_response_schema import GeneMatchToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum # TODO: # LRG_ (LRG_199t1:c) @@ -30,23 +32,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_coding_dna_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -84,8 +92,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, self.add_mane_data( mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens, - alt=s.new_nucleotide + s.alt_type, s, alt=s.new_nucleotide ) self.add_validation_result( diff --git a/variation/validators/deletion_base.py b/variation/validators/deletion_base.py deleted file mode 100644 index 69c80d8b..00000000 --- a/variation/validators/deletion_base.py +++ /dev/null @@ -1,77 +0,0 @@ -"""The module for Deletion Validation.""" -from typing import Optional -from variation.validators.validator import Validator -import logging - -logger = logging.getLogger('variation') -logger.setLevel(logging.DEBUG) - - -class DeletionBase(Validator): - """The Deletion Validator Base class.""" - - def get_reference_sequence(self, ac, start, end, errors, cds_start=None)\ - -> Optional[str]: - """Get deleted reference sequence. - - :param str ac: Accession - :param int start: Start position - :param int end: End position - :param list errors: List of errors - :param int cds_start: Coding start site - :return: Reference sequence of nucleotides - """ - if cds_start: - start += cds_start - if end is not None: - end += cds_start - - if start and not end: - ref_sequence = self.seqrepo_access.get_sequence( - ac, start - ) - elif start is not None and end is not None: - ref_sequence = self.seqrepo_access.get_sequence( - ac, start, end - ) - else: - ref_sequence = None - - if not ref_sequence: - errors.append("Unable to get reference sequence.") - return ref_sequence - - def check_reference_sequence(self, t, s, errors, cds_start=None) -> bool: - """Check that reference sequence matches deleted sequence. - - :param str t: Accession - :param Token s: Classification token - :param list errors: List of errors - :param int cds_start: Coding start site - :return: `True` if ref_sequences matches deleted_sequence. - `False` otherwise. 
- """ - ref_sequence = self.get_reference_sequence( - t, s.start_pos_del, s.end_pos_del, errors, cds_start=cds_start - ) - - if not errors and ref_sequence and s.deleted_sequence: - if ref_sequence != s.deleted_sequence: - errors.append(f"Expected deleted sequence {ref_sequence} " - f"but got {s.deleted_sequence}") - - def concise_description(self, transcript, token) -> str: - """Return a HGVS description of the identified variation. - - :param str transcript: Transcript accession - :param Token token: Classification token - :return: HGVS expression - """ - position = f"{token.start_pos_del}" - if token.end_pos_del is not None: - position += f"_{token.end_pos_del}" - - descr = f"{transcript}:{token.reference_sequence}.{position}del" - if token.deleted_sequence: - descr += f"{token.deleted_sequence}" - return descr diff --git a/variation/validators/delins_base.py b/variation/validators/delins_base.py index 07434a5e..1ba34a59 100644 --- a/variation/validators/delins_base.py +++ b/variation/validators/delins_base.py @@ -1,6 +1,12 @@ """The module for DelIns Validation.""" +from typing import List, Dict, Optional +from variation.schemas.classification_response_schema import Classification, \ + ClassificationType +from variation.schemas.token_response_schema import Token, GeneMatchToken from variation.validators.validator import Validator import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -9,6 +15,89 @@ class DelInsBase(Validator): """The DelIns Validator Base class.""" + def is_token_instance(self, t: Token) -> bool: + """Check to see if token is instance of a token type. + + :param Token t: Classification token to find type of + :return: `True` if token is instance of class token. `False` otherwise. + """ + raise NotImplementedError + + def variation_name(self) -> str: + """Return the variation name. 
+ + :return: variation class name + """ + raise NotImplementedError + + def human_description(self, transcript: str, token: Token) -> str: + """Return a human description of the identified variation. + + :param str transcript: Transcript accession + :param Token token: Classification token + :return: Human description of the variation change + """ + raise NotImplementedError + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: + """Return a list of gene tokens for a classification. + + :param Classification classification: Classification for a list of + tokens + :return: A list of gene tokens for the classification + """ + raise NotImplementedError + + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: + """Get transcript accessions for a given classification. + + :param List gene_tokens: A list of gene tokens + :param Classification classification: A classification for a list of + tokens + :param List errors: List of errors + :return: List of transcript accessions + """ + raise NotImplementedError + + def validates_classification_type( + self, classification_type: ClassificationType) -> bool: + """Check that classification type can be validated by validator. + + :param ClassificationType classification_type: The type of variation + :return: `True` if classification_type matches validator's + classification type. `False` otherwise. + """ + raise NotImplementedError + + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum) \ + -> None: + """Add validation result objects to a list of results. 
+ + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions + :param Classification classification: A classification for a list of + tokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification + :param bool normalize_endpoint: `True` if normalize endpoint is being + used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + """ + raise NotImplementedError + def concise_description(self, transcript, token) -> str: """Return a HGVS description of the identified variation. diff --git a/variation/validators/duplication_deletion_base.py b/variation/validators/duplication_deletion_base.py new file mode 100644 index 00000000..9c18b9ea --- /dev/null +++ b/variation/validators/duplication_deletion_base.py @@ -0,0 +1,340 @@ +"""The base class for Duplication and Deletion Validation.""" +from typing import Optional, List, Dict, Tuple +from variation.schemas.classification_response_schema import Classification, \ + ClassificationType +from variation.schemas.token_response_schema import Token, GeneMatchToken +from variation.validators.validator import Validator +from variation.hgvs_dup_del_mode import HGVSDupDelMode +from variation.data_sources import SeqRepoAccess, TranscriptMappings, UTA +from variation.tokenizers import GeneSymbol +from variation.mane_transcript import MANETranscript +from ga4gh.vrs.dataproxy import SeqRepoDataProxy +from ga4gh.vrs.extras.translator import Translator +from gene.query import QueryHandler as GeneQueryHandler +import logging +from 
variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum + +logger = logging.getLogger('variation') +logger.setLevel(logging.DEBUG) + + +class DuplicationDeletionBase(Validator): + """The Duplication and Deletion Validator Base class.""" + + def __init__(self, seq_repo_access: SeqRepoAccess, + transcript_mappings: TranscriptMappings, + gene_symbol: GeneSymbol, + mane_transcript: MANETranscript, + uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, + gene_normalizer: GeneQueryHandler): + """Initialize the Duplication and Deletion Base validator. + + :param SeqRepoAccess seq_repo_access: Access to SeqRepo data + :param TranscriptMappings transcript_mappings: Access to transcript + mappings + :param GeneSymbol gene_symbol: Gene symbol tokenizer + :param MANETranscript mane_transcript: Access MANE Transcript + information + :param UTA uta: Access to UTA queries + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer + """ + super().__init__( + seq_repo_access, transcript_mappings, gene_symbol, mane_transcript, + uta, dp, tlr, gene_normalizer + ) + self.hgvs_dup_del_mode = HGVSDupDelMode(seq_repo_access) + + def is_token_instance(self, t: Token) -> bool: + """Check to see if token is instance of a token type. + + :param Token t: Classification token to find type of + :return: `True` if token is instance of class token. `False` otherwise. + """ + raise NotImplementedError + + def variation_name(self) -> str: + """Return the variation name. + + :return: variation class name + """ + raise NotImplementedError + + def human_description(self, transcript: str, token: Token) -> str: + """Return a human description of the identified variation. + + :param str transcript: Transcript accession + :param Token token: Classification token + :return: Human description of the variation change + """ + raise NotImplementedError + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: + """Return a list of gene tokens for a classification. 
+ + :param Classification classification: Classification for a list of + tokens + :return: A list of gene tokens for the classification + """ + raise NotImplementedError + + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: + """Get transcript accessions for a given classification. + + :param List gene_tokens: A list of gene tokens + :param Classification classification: A classification for a list of + tokens + :param List errors: List of errors + :return: List of transcript accessions + """ + raise NotImplementedError + + def validates_classification_type( + self, classification_type: ClassificationType) -> bool: + """Check that classification type can be validated by validator. + + :param ClassificationType classification_type: The type of variation + :return: `True` if classification_type matches validator's + classification type. `False` otherwise. + """ + raise NotImplementedError + + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum)\ + -> None: + """Add validation result objects to a list of results. + + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions + :param Classification classification: A classification for a list of + tokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification + :param bool normalize_endpoint: `True` if normalize endpoint is being + used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. 
+ :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + """ + raise NotImplementedError + + def get_reference_sequence( + self, ac: str, start: int, end: int, errors: List, + cds_start: int = None) -> Optional[str]: + """Get deleted reference sequence. + + :param str ac: Accession + :param int start: Start position + :param int end: End position + :param list errors: List of errors + :param int cds_start: Coding start site + :return: Reference sequence of nucleotides + """ + if cds_start: + start += cds_start + if end is not None: + end += cds_start + + if start and not end: + ref_sequence = self.seqrepo_access.get_sequence( + ac, start + ) + elif start is not None and end is not None: + ref_sequence = self.seqrepo_access.get_sequence( + ac, start, end + ) + else: + ref_sequence = None + + if not ref_sequence: + errors.append("Unable to get reference sequence.") + return ref_sequence + + def check_reference_sequence(self, t: str, s: Token, errors: List, + cds_start: int = None) -> bool: + """Check that reference sequence matches deleted sequence. + + :param str t: Accession + :param Token s: Classification token + :param List errors: List of errors + :param int cds_start: Coding start site + :return: `True` if ref_sequences matches deleted_sequence. + `False` otherwise. + """ + ref_sequence = self.get_reference_sequence( + t, s.start_pos_del, s.end_pos_del, errors, cds_start=cds_start + ) + + if errors: + return False + else: + if ref_sequence and s.deleted_sequence: + if ref_sequence != s.deleted_sequence: + errors.append(f"Expected deleted sequence {ref_sequence} " + f"but got {s.deleted_sequence}") + return False + return True + + def concise_description(self, transcript: str, token: Token) -> str: + """Return a HGVS description of the identified variation. 
+ + :param str transcript: Transcript accession + :param Token token: Classification token + :return: HGVS expression + """ + position = f"{token.start_pos_del}" + if token.end_pos_del is not None: + position += f"_{token.end_pos_del}" + + descr = f"{transcript}:{token.reference_sequence}.{position}del" + if token.deleted_sequence: + descr += f"{token.deleted_sequence}" + return descr + + def add_normalized_genomic_dup_del( + self, s: Token, t: str, start: int, end: int, gene: str, + so_id: str, errors: List, hgvs_dup_del_mode: HGVSDupDelModeEnum, + mane_data_found: Dict) -> None: + """Add normalized genomic dup or del to mane data + + :param Token s: Classification token + :param str t: Accession + :param int start: Start position + :param int end: End position + :param str gene: Gene + :param str so_id: Sequence ontology id + :param List errors: List of errors + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + :param Dict mane_data_found: MANE Transcript information found + """ + mane = self.mane_transcript.get_mane_transcript( + t, start, end, s.reference_sequence, gene=gene, + normalize_endpoint=True + ) + + if mane: + s.reference_sequence = 'c' + s.molecule_context = 'transcript' + s.so_id = so_id + + allele = self.to_vrs_allele( + mane['refseq'], mane['pos'][0], mane['pos'][1], + s.reference_sequence, s.alt_type, errors, + cds_start=mane['coding_start_site'] + ) + + mane_variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, hgvs_dup_del_mode) + + if mane_variation: + self._add_dict_to_mane_data( + mane['refseq'], s, mane_variation, + mane_data_found, mane['status'] + ) + + def validate_gene_or_accession_pos(self, t: str, pos_list: List, + errors: List, + gene: Optional[str] = None) -> None: + """Validate positions on gene or accession. 
+ If not valid, add to list of errors + + :param str t: Accession + :param List pos_list: List of positions to validate + :param List errors: List of errors + :param Optional[str] gene: Gene + """ + if gene: + len_pos_list = len(pos_list) + pos1, pos2 = pos_list[0], pos_list[1] + if len_pos_list == 2: + pos3, pos4 = None, None + elif len_pos_list == 4: + pos3, pos4 = pos_list[2], pos_list[3] + else: + errors.append(f"Unexpected amount of positions:" + f" {len_pos_list}") + return + self._validate_gene_pos( + gene, t, pos1, pos2, errors, pos3=pos3, pos4=pos4) + else: + for pos in pos_list: + self._check_index(t, pos, errors) + + def get_grch38_pos_ac( + self, t: str, pos1: int, pos2: int, + pos3: Optional[int] = None, pos4: Optional[int] = None + ) -> Tuple[str, int, int, Optional[int], Optional[int], Optional[Dict]]: + """Get GRCh38 start/end and accession + + :param str t: Accession + :param int pos1: Position 1 (assumes 1-based) + :param int pos2: Position 2 (assumes 1-based) + :param int pos3: Position 3 (assumes 1-based) + :param int pos4: Position 4 (assumes 1-based) + :return: GRCh38 data (accession and positions) + """ + grch38_start = self.mane_transcript.g_to_grch38(t, pos1, pos2) + if grch38_start: + pos1, pos2 = grch38_start['pos'] + if pos3 is not None and pos4 is not None: + grch38_end = self.mane_transcript.g_to_grch38(t, pos3, pos4) + if grch38_end: + pos3, pos4 = grch38_end['pos'] + t = grch38_start['ac'] + return t, pos1, pos2, pos3, pos4, grch38_start + + def add_grch38_to_mane_data( + self, t: str, s: Token, errors: List, + grch38: Dict, mane_data_found: Dict, + hgvs_dup_del_mode: HGVSDupDelModeEnum, + ival: Optional[Tuple] = None, + use_vrs_allele_range: bool = True) -> None: + """Add grch38 variation to mane data + + :param str t: Accession + :param Token s: Classification token + :param List errors: List of errors + :param Dict grch38: GRCh38 data + :param Dict mane_data_found: MANE data found for initial query + :param HGVSDupDelModeEnum 
hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + :param Optional[Tuple] ival: Interval + :param bool use_vrs_allele_range: `True` if allele should be computed + using `to_vrs_allele_ranges` method. `False` if allele should be + computed using `to_vrs_allele` method. + """ + if errors: + return + + if grch38: + t = grch38['ac'] + + if use_vrs_allele_range: + allele = self.to_vrs_allele_ranges( + t, s.reference_sequence, s.alt_type, errors, ival) + else: + allele = self.to_vrs_allele( + t, grch38['pos'][0], grch38['pos'][1], s.reference_sequence, + s.alt_type, errors) + + grch38_variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, hgvs_dup_del_mode) + + if grch38_variation: + self._add_dict_to_mane_data( + grch38['ac'], s, grch38_variation, mane_data_found, 'GRCh38') diff --git a/variation/validators/genomic_deletion.py b/variation/validators/genomic_deletion.py index 472a8fd4..9ef45aaa 100644 --- a/variation/validators/genomic_deletion.py +++ b/variation/validators/genomic_deletion.py @@ -1,22 +1,26 @@ """The module for Genomic Deletion Validation.""" -from variation.validators.deletion_base import DeletionBase +from variation.validators.duplication_deletion_base import\ + DuplicationDeletionBase from variation.schemas.classification_response_schema import \ - ClassificationType -from variation.schemas.token_response_schema import GenomicDeletionToken -from typing import List, Optional + ClassificationType, Classification +from variation.schemas.token_response_schema import GenomicDeletionToken, Token +from typing import List, Optional, Dict from variation.schemas.token_response_schema import GeneMatchToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) -class 
GenomicDeletion(DeletionBase): +class GenomicDeletion(DuplicationDeletionBase): """The Genomic Deletion Validator class.""" - def get_transcripts(self, gene_tokens, classification, errors)\ - -> Optional[List[str]]: + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: """Get transcript accessions for a given classification. :param list gene_tokens: A list of gene tokens @@ -27,48 +31,67 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_genomic_transcripts(classification, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of Tokens - :param list transcripts: A list of transcript strings + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: A list to store validation result objects - :param list gene_tokens: List of GeneMatchTokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. 
+ :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. """ valid_alleles = list() for s in classification_tokens: for t in transcripts: errors = list() t = self.get_accession(t, classification) - - allele = self.to_vrs_allele(t, s.start_pos_del, s.end_pos_del, - s.reference_sequence, s.alt_type, - errors) - - if not errors: + start, end = s.start_pos_del, None + if s.end_pos_del is None: + end = start + else: + end = s.end_pos_del + + # Validate pos + if gene_tokens: + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene_tokens[0].token) + else: self.check_reference_sequence(t, s, errors) if not errors: - mane = self.mane_transcript.get_mane_transcript( + allele = self.to_vrs_allele( t, s.start_pos_del, s.end_pos_del, - s.reference_sequence, - gene=gene_tokens[0].token if gene_tokens else None, - normalize_endpoint=normalize_endpoint - ) - self.add_mane_data( - mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens + s.reference_sequence, s.alt_type, errors) + + variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, hgvs_dup_del_mode, + pos=(start, end) ) + else: + variation = None + + if not errors: + self._get_normalize_variation( + gene_tokens, s, t, errors, hgvs_dup_del_mode, + mane_data_found, start, end) self.add_validation_result( - allele, valid_alleles, results, + variation, valid_alleles, results, classification, s, t, gene_tokens, errors ) @@ -80,7 +103,41 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, classification, gene_tokens ) - def get_gene_tokens(self, classification) -> List[GeneMatchToken]: + def _get_normalize_variation( + self, gene_tokens: List, s: Token, t: str, errors: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum, mane_data_found: Dict, + start: int, end: int) -> None: + """Get variation that 
will be returned in normalize endpoint. + + :param List gene_tokens: List of gene tokens + :param Token s: Classification token + :param str t: Accession + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Mode to use for + interpreting HGVS duplications and deletions + :param Dict mane_data_found: MANE Transcript data found for given query + :param int start: Start pos change + :param int end: End pos change + """ + if gene_tokens: + self.add_normalized_genomic_dup_del( + s, t, s.start_pos_del, s.end_pos_del, gene_tokens[0].token, + 'SO:0000159', errors, hgvs_dup_del_mode, mane_data_found) + else: + # No gene provided, then use GRCh38 assesmbly + if not self._is_grch38_assembly(t): + grch38 = self.mane_transcript.g_to_grch38(t, start, end) + else: + grch38 = dict(ac=t, pos=(start, end)) + + if grch38: + self.validate_gene_or_accession_pos( + grch38['ac'], [grch38['pos'][0], grch38['pos'][1]], errors) + self.add_grch38_to_mane_data( + t, s, errors, grch38, mane_data_found, hgvs_dup_del_mode, + use_vrs_allele_range=False) + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: """Return gene tokens for a classification. :param Classification classification: The classification for tokens @@ -88,12 +145,15 @@ def get_gene_tokens(self, classification) -> List[GeneMatchToken]: """ return self.get_gene_symbol_tokens(classification) - def variation_name(self): + def variation_name(self) -> str: """Return the variation name.""" return 'genomic deletion' - def is_token_instance(self, t): - """Check that token is Genomic Deletion.""" + def is_token_instance(self, t: Token): + """Check that token is Genomic Deletion. + + :param Token t: Classification token + """ return t.token_type == 'GenomicDeletion' def validates_classification_type( @@ -101,6 +161,9 @@ def validates_classification_type( classification_type: ClassificationType) -> bool: """Return whether or not the classification type is Genomic DelIns. 
+ + :param ClassificationType classification_type: Classification type + :return: `True` if classification type matches, `False` otherwise """ return classification_type == ClassificationType.GENOMIC_DELETION diff --git a/variation/validators/genomic_deletion_range.py b/variation/validators/genomic_deletion_range.py new file mode 100644 index 00000000..b86378d2 --- /dev/null +++ b/variation/validators/genomic_deletion_range.py @@ -0,0 +1,249 @@ +"""The module for Genomic Deletion Range Validation.""" +from variation.schemas.classification_response_schema import \ + ClassificationType, Classification +from variation.schemas.token_response_schema import \ + GenomicDeletionRangeToken, Token +from typing import List, Optional, Dict, Tuple +from variation.schemas.token_response_schema import GeneMatchToken +import logging +from variation.hgvs_dup_del_mode import HGVSDupDelMode +from variation.data_sources import SeqRepoAccess, TranscriptMappings, UTA +from variation.tokenizers import GeneSymbol +from variation.mane_transcript import MANETranscript +from ga4gh.vrs.dataproxy import SeqRepoDataProxy +from ga4gh.vrs.extras.translator import Translator +from ga4gh.vrs import models +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum +from variation.validators.duplication_deletion_base import\ + DuplicationDeletionBase +logger = logging.getLogger('variation') +logger.setLevel(logging.DEBUG) + + +class GenomicDeletionRange(DuplicationDeletionBase): + """The Genomic Deletion Range Validator class.""" + + def __init__(self, seq_repo_access: SeqRepoAccess, + transcript_mappings: TranscriptMappings, + gene_symbol: GeneSymbol, + mane_transcript: MANETranscript, + uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, + gene_normalizer: GeneQueryHandler): + """Initialize the Genomic Deletion Range validator. 
+ + :param SeqRepoAccess seq_repo_access: Access to SeqRepo data + :param TranscriptMappings transcript_mappings: Access to transcript + mappings + :param GeneSymbol gene_symbol: Gene symbol tokenizer + :param MANETranscript mane_transcript: Access MANE Transcript + information + :param UTA uta: Access to UTA queries + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer + """ + super().__init__( + seq_repo_access, transcript_mappings, gene_symbol, mane_transcript, + uta, dp, tlr, gene_normalizer + ) + self.hgvs_dup_del_mode = HGVSDupDelMode(seq_repo_access) + + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: + """Get transcript accessions for a given classification. + + :param List gene_tokens: A list of gene tokens + :param Classification classification: A classification for a list of + tokens + :param List errors: List of errors + :return: List of transcript accessions + """ + return self.get_genomic_transcripts(classification, errors) + + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: + """Add validation result objects to a list of results. + + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions + :param Classification classification: A classification for a list of + tokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification + :param bool normalize_endpoint: `True` if normalize endpoint is being + used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. 
+ :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + """ + valid_alleles = list() + for s in classification_tokens: + for t in transcripts: + errors = list() + t = self.get_accession(t, classification) + + variation = self._get_variation( + s, t, errors, gene_tokens, hgvs_dup_del_mode) + + if not errors: + self._get_normalize_variation( + gene_tokens, s, t, errors, hgvs_dup_del_mode, + mane_data_found) + + self.add_validation_result( + variation, valid_alleles, results, + classification, s, t, gene_tokens, errors + ) + + if is_identifier: + break + + self.add_mane_to_validation_results( + mane_data_found, valid_alleles, results, + classification, gene_tokens + ) + + def _get_variation( + self, s: Token, t: str, errors: List, gene_tokens: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum) -> Optional[Dict]: + """Get variation data. + + :param Token s: Classification token + :param str t: Accession + :param List errors: List of errors + :param List gene_tokens: List of GeneMatchTokens for a classification + :param HGVSDupDelMode hgvs_dup_del_mode: Mode to use for interpreting + HGVS duplications and deletions + :return: Dictionary containing start/end position changes and variation + """ + variation, start, end = None, None, None + ival, grch38 = self._get_ival(t, s, errors, gene_tokens) + + if not errors: + if grch38: + t = grch38['ac'] + + allele = self.to_vrs_allele_ranges( + t, s.reference_sequence, s.alt_type, errors, ival) + + if start is not None and end is not None: + pos = (start, end) + else: + pos = None + + variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, + hgvs_dup_del_mode, pos=pos) + return variation + + def _get_normalize_variation( + self, gene_tokens: List, s: Token, t: str, errors: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum, + mane_data_found: Dict) -> None: 
+ """Get variation that will be returned in normalize endpoint. + + :param List gene_tokens: List of gene tokens + :param Token s: Classification token + :param str t: Accession + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Mode to use for + interpreting HGVS duplications and deletions + :param Dict mane_data_found: MANE Transcript data found for given query + """ + ival, grch38 = self._get_ival(t, s, errors, gene_tokens, is_norm=True) + self.add_grch38_to_mane_data( + t, s, errors, grch38, mane_data_found, hgvs_dup_del_mode, + ival=ival) + + def _get_ival( + self, t: str, s: Token, errors: List, gene_tokens: List, + is_norm: bool = False + ) -> Optional[Tuple[Optional[models.SequenceInterval], Optional[Dict]]]: + """Get ival for variations with ranges. + + :param str t: Accession + :param Token t: Classification token + :param List errors: List of errors + :param List gene_tokens: LIst of gene tokens + :param bool is_norm: `True` if normalize endpoint is being used. + `False` otherwise. + :return: Sequence Interval and GRCh38 data if normalize endpoint + is being used + """ + ival, grch38, start1, start2, end1, end2 = None, None, None, None, None, None # noqa: E501 + # (#_#)_(#_#) + if is_norm: + t, start1, start2, end1, end2, grch38 = self.get_grch38_pos_ac( + t, s.start_pos1_del, s.start_pos2_del, pos3=s.end_pos1_del, + pos4=s.end_pos2_del + ) + else: + start1 = s.start_pos1_del + start2 = s.start_pos2_del + end1 = s.end_pos1_del + end2 = s.end_pos2_del + + gene = gene_tokens[0].token if gene_tokens else None + self.validate_gene_or_accession_pos( + t, [start1, start2, end1, end2], errors, gene=gene) + + if not errors and start1 and start2 and end1 and end2: + ival = self._get_ival_certain_range( + start1, start2, end1, end2 + ) + + return ival, grch38 + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: + """Return gene tokens for a classification. 
+ + :param Classification classification: The classification for tokens + :return: A list of Gene Match Tokens in the classification + """ + return self.get_gene_symbol_tokens(classification) + + def variation_name(self) -> str: + """Return the variation name.""" + return 'genomic deletion range' + + def is_token_instance(self, t: Token) -> bool: + """Check that token is Genomic Deletion Range. + + :param Token t: Classification token + """ + return t.token_type == 'GenomicDeletionRange' + + def validates_classification_type( + self, + classification_type: ClassificationType) -> bool: + """Return whether or not the classification type is + Genomic Deletion Range. + + :param ClassificationType classification_type: Classification type + :return: `True` if classification type matches, `False` otherwise + """ + return classification_type == \ + ClassificationType.GENOMIC_DELETION_RANGE + + def human_description(self, transcript, + token: GenomicDeletionRangeToken) -> str: + """Return a human description of the identified variation.""" + descr = f"A Genomic Deletion from" \ + f" ({token.start_pos1_del}_{token.start_pos2_del}) to " \ + f"({token.end_pos1_del}_{token.end_pos2_del}) on {transcript}" + return descr + + def concise_description(self, transcript, token): + """Return a concise description of the identified variation.""" + return f"{transcript}:g.({token.start_pos1_del}_" \ + f"{token.start_pos2_del})_({token.end_pos1_del}_" \ + f"{token.end_pos2_del})" diff --git a/variation/validators/genomic_delins.py b/variation/validators/genomic_delins.py index f08d0e2a..0038a25e 100644 --- a/variation/validators/genomic_delins.py +++ b/variation/validators/genomic_delins.py @@ -1,11 +1,13 @@ """The module for Genomic DelIns Validation.""" from variation.validators.delins_base import DelInsBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import 
GenomicDelInsToken -from typing import List, Optional +from typing import List, Optional, Dict from variation.schemas.token_response_schema import GeneMatchToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') @@ -27,23 +29,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_genomic_transcripts(classification, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. 
+ :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. """ valid_alleles = list() for s in classification_tokens: @@ -67,7 +75,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, self.add_mane_data( mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens, alt=s.inserted_sequence1 + s.alt_type, s, alt=s.inserted_sequence1 ) self.add_validation_result( diff --git a/variation/validators/genomic_duplication.py b/variation/validators/genomic_duplication.py new file mode 100644 index 00000000..8f329157 --- /dev/null +++ b/variation/validators/genomic_duplication.py @@ -0,0 +1,360 @@ +"""The module for Genomic Duplication Validation.""" +from variation.validators.duplication_deletion_base import\ + DuplicationDeletionBase +from variation.schemas.classification_response_schema import \ + ClassificationType, Classification +from variation.schemas.token_response_schema import \ + TokenType, DuplicationAltType, \ + GenomicDuplicationToken, GenomicDuplicationRangeToken, Token # noqa: F401 +from typing import List, Optional, Dict, Tuple +from variation.schemas.token_response_schema import GeneMatchToken +import logging +from ga4gh.vrs import models +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum + +logger = logging.getLogger('variation') +logger.setLevel(logging.DEBUG) + + +class GenomicDuplication(DuplicationDeletionBase): + """The Genomic Duplication Validator class.""" + + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: + """Get transcript accessions for a given classification. 
+ + :param List gene_tokens: A list of gene tokens + :param Classification classification: A classification for a list of + tokens + :param List errors: List of errors + :return: List of transcript accessions + """ + return self.get_genomic_transcripts(classification, errors) + + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: + """Add validation result objects to a list of results. + + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions + :param Classification classification: A classification for a list of + tokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification + :param bool normalize_endpoint: `True` if normalize endpoint is being + used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
+ """ + valid_alleles = list() + for s in classification_tokens: + for t in transcripts: + errors = list() + t = self.get_accession(t, classification) + + result = self._get_variation( + s, t, errors, gene_tokens, hgvs_dup_del_mode, + gene=gene_tokens[0].token if gene_tokens else None) + variation = result['variation'] + start = result['start'] + end = result['end'] + + if not errors: + self._get_normalize_variation( + gene_tokens, s, t, errors, hgvs_dup_del_mode, + mane_data_found, start, end) + + self.add_validation_result( + variation, valid_alleles, results, + classification, s, t, gene_tokens, errors + ) + + if is_identifier: + break + + self.add_mane_to_validation_results( + mane_data_found, valid_alleles, results, + classification, gene_tokens + ) + + def _get_variation(self, s: Token, t: str, errors: List, gene_tokens: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum, + gene: str = None) -> Optional[Dict]: + """Get variation data. + + :param Token s: Classification token + :param str t: Accession + :param List errors: List of errors + :param HGVSDupDelMode hgvs_dup_del_mode: Mode to use for interpreting + HGVS duplications and deletions + :param str gene: Gene symbol token + :return: Dictionary containing start/end position changes and variation + """ + variation, start, end = None, None, None + if s.token_type == TokenType.GENOMIC_DUPLICATION: + start = s.start_pos1_dup + if s.start_pos2_dup is None: + # Format: #dup + end = s.start_pos1_dup + else: + # Format: #_#dup + end = s.start_pos2_dup + + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene) + + if not errors: + allele = self.to_vrs_allele( + t, start, end, s.reference_sequence, + s.alt_type, errors) + variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, hgvs_dup_del_mode, + pos=(start, end)) + elif s.token_type == TokenType.GENOMIC_DUPLICATION_RANGE: + ival, grch38 = self._get_ival(t, s, gene_tokens, errors) + + if not errors: + if grch38: + t 
= grch38['ac'] + + allele = self.to_vrs_allele_ranges( + t, s.reference_sequence, s.alt_type, errors, ival) + if start is not None and end is not None: + pos = (start, end) + else: + pos = None + variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, + hgvs_dup_del_mode, pos=pos) + else: + errors.append(f"Token type not supported: {s.token_type}") + + return { + 'start': start, + 'end': end, + 'variation': variation + } + + def _get_normalize_variation(self, gene_tokens: List, s: Token, t: str, + errors: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum, + mane_data_found: Dict, start: int, + end: int) -> None: + """Get variation that will be returned in normalize endpoint. + + :param List gene_tokens: List of gene tokens + :param Token s: Classification token + :param str t: Accession + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Mode to use for + interpreting HGVS duplications and deletions + :param Dict mane_data_found: MANE Transcript data found for given query + :param int start: Start pos change + :param int end: End pos change + """ + if s.token_type == TokenType.GENOMIC_DUPLICATION_RANGE: + if s.alt_type != DuplicationAltType.UNCERTAIN_DUPLICATION: + # (#_#)_(#_#) + ival, grch38 = self._get_ival( + t, s, gene_tokens, errors, is_norm=True) + self.add_grch38_to_mane_data( + t, s, errors, grch38, mane_data_found, + hgvs_dup_del_mode, ival=ival + ) + else: + ival, grch38 = self._get_ival(t, s, gene_tokens, errors, + is_norm=True) + self.add_grch38_to_mane_data( + t, s, errors, grch38, mane_data_found, + hgvs_dup_del_mode, ival=ival) + else: + # #dup or #_#dup + if gene_tokens: + gene = gene_tokens[0].token + + # Validate position + self._validate_gene_pos(gene, t, start, end, errors) + if errors: + return + + self.add_normalized_genomic_dup_del( + s, t, start, end, gene_tokens[0].token, 'SO:1000035', + errors, hgvs_dup_del_mode, mane_data_found) + else: + grch38 = self.mane_transcript.g_to_grch38( + t, start, end) + + if grch38: + 
self.validate_gene_or_accession_pos( + grch38['ac'], [grch38['pos'][0], grch38['pos'][1]], + errors) + self.add_grch38_to_mane_data( + t, s, errors, grch38, mane_data_found, + hgvs_dup_del_mode, use_vrs_allele_range=False + ) + + def _get_ival( + self, t: str, s: Token, gene_tokens: List, errors: List, + is_norm: bool = False + ) -> Optional[Tuple[models.SequenceInterval, Dict]]: + """Get ival for variations with ranges. + + :param str t: Accession + :param Token t: Classification token + :param List gene_tokens: List of gene tokens + :param List errors: List of errors + :param bool is_norm: `True` if normalize endpoint is being used. + `False` otherwise. + :return: Sequence Interval and GRCh38 data if normalize endpoint + is being used + """ + ival, start, end, grch38 = None, None, None, None + gene = gene_tokens[0].token if gene_tokens else None + if s.alt_type != DuplicationAltType.UNCERTAIN_DUPLICATION: + # (#_#)_(#_#) + if is_norm: + t, start1, start2, end1, end2, grch38 = self.get_grch38_pos_ac( + t, s.start_pos1_dup, s.start_pos2_dup, pos3=s.end_pos1_dup, + pos4=s.end_pos2_dup + ) + else: + start1 = s.start_pos1_dup + start2 = s.start_pos2_dup + end1 = s.end_pos1_dup + end2 = s.end_pos2_dup + + if start1 and start2 and end1 and end2: + self.validate_gene_or_accession_pos( + t, [start1, start2, end1, end2], errors, gene=gene) + + if not errors: + ival = self._get_ival_certain_range( + start1, start2, end1, end2) + else: + if s.start_pos1_dup == '?' and s.end_pos2_dup == '?': + # format: (?_#)_(#_?) + if is_norm: + t, start, end, _, _, grch38 = self.get_grch38_pos_ac( + t, s.start_pos2_dup, s.end_pos1_dup + ) + else: + start = s.start_pos2_dup + end = s.end_pos1_dup + + # Validate positions + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene) + + if not errors and start and end: + ival = models.SequenceInterval( + start=self._get_start_indef_range(start), + end=self._get_end_indef_range(end) + ) + elif s.start_pos1_dup == '?' 
and \ + s.start_pos2_dup != '?' and \ + s.end_pos1_dup != '?' and \ + s.end_pos2_dup is None: + # format: (?_#)_# + if is_norm: + t, start, end, _, _, grch38 = self.get_grch38_pos_ac( + t, s.start_pos2_dup, s.end_pos1_dup) + else: + start = s.start_pos2_dup + end = s.end_pos1_dup + + # Validate positions + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene) + + if not errors and start and end: + ival = models.SequenceInterval( + start=self._get_start_indef_range(start), + end=models.Number(value=end) + ) + elif s.start_pos1_dup != '?' and \ + s.start_pos2_dup is None and \ + s.end_pos1_dup != '?' and \ + s.end_pos2_dup == '?': + # format: #_(#_?) + if is_norm: + t, start, end, _, _, grch38 = self.get_grch38_pos_ac( + t, s.start_pos1_dup, s.end_pos1_dup + ) + else: + start = s.start_pos1_dup + end = s.end_pos1_dup + start -= 1 + + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene) + + if not errors and start and end: + ival = models.SequenceInterval( + start=models.Number(value=start), + end=self._get_end_indef_range(end) + ) + else: + errors.append("Not yet supported") + return ival, grch38 + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: + """Return gene tokens for a classification. + + :param Classification classification: The classification for tokens + :return: A list of Gene Match Tokens in the classification + """ + return self.get_gene_symbol_tokens(classification) + + def variation_name(self) -> str: + """Return the variation name.""" + return 'genomic duplication' + + def is_token_instance(self, t): + """Check that token is an instance of Genomic Duplication.""" + return t.token_type in ['GenomicDuplication', + 'GenomicDuplicationRange'] + + def validates_classification_type( + self, + classification_type: ClassificationType) -> bool: + """Return whether or not the classification type is + Genomic Duplication. 
+ + :param ClassificationType classification_type: Classification type + :return: `True` if classification type matches, `False` otherwise + """ + return classification_type == \ + ClassificationType.GENOMIC_DUPLICATION + + def human_description( + self, transcript: str, token: Token) -> str: + """Return a human description of the identified variation. + + :param str transcript: Accession + :param Token token: Classification token + :return: Human description of variant + """ + if token.token_type == 'GenomicDuplication': + descr = "A Genomic Deletion " + else: + descr = "A Genomic Deletion Range " + return descr + + def concise_description(self, transcript: str, token: Token) -> str: + """Return a concise description of the identified variation. + + :param str transcript: Accession + :param Token token: Classification token + """ + return "" diff --git a/variation/validators/genomic_silent_mutation.py b/variation/validators/genomic_silent_mutation.py index 744d5143..afec298c 100644 --- a/variation/validators/genomic_silent_mutation.py +++ b/variation/validators/genomic_silent_mutation.py @@ -1,11 +1,12 @@ """The module for Genomic Silent Mutation Validation.""" -from typing import Optional, List +from typing import Optional, List, Dict from .single_nucleotide_variation_base import SingleNucleotideVariationBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import GenomicSilentMutationToken import logging - +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -25,23 +26,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_genomic_transcripts(classification, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, - gene_tokens, 
normalize_endpoint, - mane_data_found, is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" self.silent_mutation_valid_invalid_results( classification_tokens, transcripts, classification, results, diff --git a/variation/validators/genomic_substitution.py b/variation/validators/genomic_substitution.py index 9c29da16..32e5d90e 100644 --- a/variation/validators/genomic_substitution.py +++ b/variation/validators/genomic_substitution.py @@ -1,10 +1,12 @@ """The module for Genomic Substitution Validation.""" -from typing import Optional, List +from typing import Optional, List, Dict from .single_nucleotide_variation_base import SingleNucleotideVariationBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import GenomicSubstitutionToken import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -25,23 +27,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_genomic_transcripts(classification, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -67,7 +75,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, self.add_mane_data(mane, mane_data_found, s.reference_sequence, s.alt_type, s, - gene_tokens, alt=s.new_nucleotide) + alt=s.new_nucleotide) self.add_validation_result( allele, valid_alleles, results, diff --git a/variation/validators/genomic_uncertain_deletion.py b/variation/validators/genomic_uncertain_deletion.py index 6d0a434c..cad0c966 100644 --- a/variation/validators/genomic_uncertain_deletion.py +++ b/variation/validators/genomic_uncertain_deletion.py @@ -1,47 +1,61 @@ """The module for Genomic Uncertain Deletion Validation.""" -from .validator import Validator +from variation.validators.duplication_deletion_base import\ + DuplicationDeletionBase from variation.schemas.classification_response_schema import \ - ClassificationType + ClassificationType, Classification from variation.schemas.token_response_schema import \ - GenomicUncertainDeletionToken -from typing import List, Optional + GenomicUncertainDeletionToken, Token +from typing import List, Optional, Dict, Tuple from variation.schemas.token_response_schema import GeneMatchToken import logging +from ga4gh.vrs import models +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) -class GenomicUncertainDeletion(Validator): +class GenomicUncertainDeletion(DuplicationDeletionBase): """The Genomic UncertainDeletion Validator class.""" - def get_transcripts(self, gene_tokens, classification, errors)\ - -> Optional[List[str]]: + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: """Get transcript accessions for a given classification. 
- :param list gene_tokens: A list of gene tokens + :param List gene_tokens: A list of gene tokens :param Classification classification: A classification for a list of tokens - :param list errors: List of errors + :param List errors: List of errors :return: List of transcript accessions """ return self.get_genomic_transcripts(classification, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of Tokens - :param list transcripts: A list of transcript strings + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: A list to store validation result objects - :param list gene_tokens: List of GeneMatchTokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -49,37 +63,17 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, errors = list() t = self.get_accession(t, classification) - allele = self.to_vrs_allele(t, s.start_pos2_del, - s.end_pos1_del, - s.reference_sequence, s.alt_type, - errors) - cnv = self.to_vrs_cnv(t, allele, 'del') - if not cnv: - errors.append(f"Unable to get CNV for {t}") + result = self._get_variation(s, t, errors, gene_tokens, + hgvs_dup_del_mode) + variation = result['variation'] if not errors: - grch38 = self.mane_transcript.g_to_grch38( - t, s.start_pos2_del, s.end_pos1_del) - - if grch38: - mane = dict( - gene=None, - refseq=grch38['ac'] if grch38['ac'].startswith('NC') else None, # noqa: E501 - ensembl=grch38['ac'] if grch38['ac'].startswith('ENSG') else None, # noqa: E501 - pos=grch38['pos'], - strand=None, - status='GRCh38' - ) - else: - mane = None - - self.add_mane_data( - mane, mane_data_found, s.reference_sequence, - s.alt_type, s, gene_tokens - ) + self._get_normalize_variation( + gene_tokens, s, t, errors, hgvs_dup_del_mode, + mane_data_found) self.add_validation_result( - cnv, valid_alleles, results, + variation, valid_alleles, results, classification, s, t, gene_tokens, errors ) @@ -91,7 +85,147 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, classification, gene_tokens ) - def get_gene_tokens(self, classification) -> List[GeneMatchToken]: + def _get_variation( + self, s: Token, t: str, errors: List, gene_tokens: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum) -> Optional[Dict]: + """Get variation data. 
+ + :param Token s: Classification token + :param str t: Accession + :param List errors: List of errors + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Mode to use for + interpreting HGVS duplications and deletions + :return: Dictionary containing start/end position changes and variation + """ + variation, start, end = None, None, None + ival, grch38 = self._get_ival(t, s, errors, gene_tokens) + + if not errors: + if grch38: + t = grch38['ac'] + + allele = self.to_vrs_allele_ranges( + t, s.reference_sequence, s.alt_type, errors, ival) + if start is not None and end is not None: + pos = (start, end) + else: + pos = None + variation = self.hgvs_dup_del_mode.interpret_variation( + t, s.alt_type, allele, errors, + hgvs_dup_del_mode, pos=pos) + + return { + 'start': start, + 'end': end, + 'variation': variation + } + + def _get_normalize_variation( + self, gene_tokens: List, s: Token, t: str, errors: List, + hgvs_dup_del_mode: HGVSDupDelModeEnum, + mane_data_found: Dict) -> None: + """Get variation that will be returned in normalize endpoint. + + :param List gene_tokens: List of gene tokens + :param Token s: Classification token + :param str t: Accession + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Mode to use for + interpreting HGVS duplications and deletions + :param Dict mane_data_found: MANE Transcript data found for given query + """ + if not gene_tokens: + ival, grch38 = self._get_ival( + t, s, errors, gene_tokens, is_norm=True) + self.add_grch38_to_mane_data( + t, s, errors, grch38, mane_data_found, hgvs_dup_del_mode, + ival=ival) + + def _get_ival( + self, t: str, s: Token, errors: List, gene_tokens: List, + is_norm: bool = False + ) -> Optional[Tuple[models.SequenceInterval, Dict]]: + """Get ival for variations with ranges. + + :param str t: Accession + :param Token s: Classification token + :param List errors: List of errors + :param bool is_norm: `True` if normalize endpoint is being used. + `False` otherwise. 
+ :return: Sequence Interval and GRCh38 data if normalize endpoint + is being used + """ + ival = None + grch38 = None + gene = gene_tokens[0].token if gene_tokens else None + if s.start_pos1_del == '?' and s.end_pos2_del == '?': + # format: (?_#)_(#_?) + if is_norm: + t, start, end, _, _, grch38 = self.get_grch38_pos_ac( + t, s.start_pos2_del, s.end_pos1_del + ) + else: + start = s.start_pos2_del + end = s.end_pos1_del + + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene) + + if not errors and start and end: + ival = models.SequenceInterval( + start=self._get_start_indef_range(start), + end=self._get_end_indef_range(end) + ) + elif s.start_pos1_del == '?' and \ + s.start_pos2_del != '?' and \ + s.end_pos1_del != '?' and \ + s.end_pos2_del is None: + # format: (?_#)_# + if is_norm: + t, start, end, _, _, grch38 = self.get_grch38_pos_ac( + t, s.start_pos2_del, s.end_pos1_del + ) + else: + start = s.start_pos2_del + end = s.end_pos1_del + + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene + ) + + if not errors and start and end: + ival = models.SequenceInterval( + start=self._get_start_indef_range(start), # noqa: E501 + end=models.Number(value=end) + ) + elif s.start_pos1_del != '?' and \ + s.start_pos2_del is None and \ + s.end_pos1_del != '?' and \ + s.end_pos2_del == '?': + # format: #_(#_?) + if is_norm: + t, start, end, _, _, grch38 = self.get_grch38_pos_ac( + t, s.start_pos1_del, s.end_pos1_del + ) + else: + start = s.start_pos1_del + end = s.end_pos1_del + + start -= 1 + + self.validate_gene_or_accession_pos( + t, [start, end], errors, gene=gene) + + if not errors and start and end: + ival = models.SequenceInterval( + start=models.Number(value=start), + end=self._get_end_indef_range(end) + ) + else: + errors.append("Not yet supported") + return ival, grch38 + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: """Return gene tokens for a classification. 
:param Classification classification: The classification for tokens @@ -99,12 +233,15 @@ def get_gene_tokens(self, classification) -> List[GeneMatchToken]: """ return self.get_gene_symbol_tokens(classification) - def variation_name(self): + def variation_name(self) -> str: """Return the variation name.""" return 'genomic uncertain deletion' - def is_token_instance(self, t): - """Check that token is Genomic Uncertain Deletion.""" + def is_token_instance(self, t: Token) -> bool: + """Check that token is Genomic Uncertain Deletion. + + :param Token t: Classification token + """ return t.token_type == 'GenomicUncertainDeletion' def validates_classification_type( @@ -112,19 +249,31 @@ def validates_classification_type( classification_type: ClassificationType) -> bool: """Return whether or not the classification type is Genomic Uncertain Deletion. + + :param ClassificationType classification_type: Classification type + :return: `True` if classification type matches, `False` otherwise """ return classification_type == \ ClassificationType.GENOMIC_UNCERTAIN_DELETION - def human_description(self, transcript, + def human_description(self, transcript: str, token: GenomicUncertainDeletionToken) -> str: - """Return a human description of the identified variation.""" + """Return a human description of the identified variation. + + :param str transcript: Accession + :param GenomicUncertainDeletionToken token: Classification token + """ descr = f"A Genomic Uncertain Deletion from" \ f" (?_{token.start_pos2_del}) to {token.end_pos1_del}_? " \ f"on {transcript}" return descr - def concise_description(self, transcript, token): - """Return a consice description of the identified variation.""" + def concise_description(self, transcript: str, + token: GenomicUncertainDeletionToken) -> str: + """Return a concise description of the identified variation. 
+ + :param str transcript: Accession + :param GenomicUncertainDeletionToken token: Classification token + """ return f"{transcript}:g.(?_{token.start_pos2_del})_" \ f"({token.end_pos1_del}_?)" diff --git a/variation/validators/insertion_base.py b/variation/validators/insertion_base.py index 7834c02d..4a5389e2 100644 --- a/variation/validators/insertion_base.py +++ b/variation/validators/insertion_base.py @@ -1,7 +1,11 @@ """The module for Insertion Validation.""" +from typing import List, Dict +from variation.schemas.classification_response_schema import Classification from variation.schemas.token_response_schema import Token from variation.validators.validator import Validator import logging +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -10,23 +14,29 @@ class InsertionBase(Validator): """The Insertion Validator Base class.""" - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" valid_alleles = list() for s in classification_tokens: @@ -65,7 +75,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, ) self.add_mane_data(mane, mane_data_found, s.reference_sequence, s.alt_type, s, - gene_tokens, alt=s.inserted_sequence) + alt=s.inserted_sequence) self.add_validation_result( allele, valid_alleles, results, diff --git a/variation/validators/polypeptide_sequence_variation_base.py b/variation/validators/polypeptide_sequence_variation_base.py index 45251f38..f85636d6 100644 --- a/variation/validators/polypeptide_sequence_variation_base.py +++ b/variation/validators/polypeptide_sequence_variation_base.py @@ -1,5 +1,5 @@ """The module for Polypeptide Sequence Variation Validation.""" -from typing import List, Optional +from typing import List, Optional, Dict from abc import abstractmethod from .validator import Validator from variation.schemas.token_response_schema import GeneMatchToken @@ -11,6 +11,10 @@ from ga4gh.vrs.dataproxy import SeqRepoDataProxy from ga4gh.vrs.extras.translator import Translator import logging +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.classification_response_schema import Classification +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -24,7 +28,8 @@ def __init__(self, seq_repo_access: SeqRepoAccess, gene_symbol: GeneSymbol, mane_transcript: MANETranscript, uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, - amino_acid_cache: AminoAcidCache) \ + amino_acid_cache: AminoAcidCache, + gene_normalizer: GeneQueryHandler) \ -> None: """Initialize the validator. 
@@ -35,11 +40,12 @@ def __init__(self, seq_repo_access: SeqRepoAccess, :param MANETranscript mane_transcript: Access MANE Transcript information :param UTA uta: Access to UTA queries + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer :param amino_acid_cache: Amino Acid codes and conversions """ super().__init__( seq_repo_access, transcript_mappings, gene_symbol, mane_transcript, - uta, dp, tlr + uta, dp, tlr, gene_normalizer ) self._amino_acid_cache = amino_acid_cache self.amino_acid_base = AminoAcidBase(seq_repo_access, amino_acid_cache) @@ -57,23 +63,29 @@ def get_transcripts(self, gene_tokens, classification, errors)\ """ return self.get_protein_transcripts(gene_tokens, errors) - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum + ) -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. 
- :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. """ valid_alleles = list() for s in classification_tokens: @@ -99,7 +111,7 @@ def get_valid_invalid_results(self, classification_tokens, transcripts, self.add_mane_data(mane, mane_data_found, s.reference_sequence, s.alt_type, - s, gene_tokens, alt=s.alt_protein) + s, alt=s.alt_protein) self.add_validation_result(allele, valid_alleles, results, classification, s, t, gene_tokens, diff --git a/variation/validators/single_nucleotide_variation_base.py b/variation/validators/single_nucleotide_variation_base.py index eebf603a..ab06eb7e 100644 --- a/variation/validators/single_nucleotide_variation_base.py +++ b/variation/validators/single_nucleotide_variation_base.py @@ -1,6 +1,12 @@ """The module for Single Nucleotide Variation Validation.""" +from typing import List, Dict, Optional from .validator import Validator import logging +from variation.schemas.classification_response_schema import Classification, \ + ClassificationType +from variation.schemas.token_response_schema import Token, GeneMatchToken +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -9,24 +15,106 @@ class SingleNucleotideVariationBase(Validator): """The Single Nucleotide Variation Validator Base class.""" - def silent_mutation_valid_invalid_results(self, classification_tokens, - transcripts, classification, - results, gene_tokens, - normalize_endpoint, - mane_data_found, - is_identifier) -> None: + def is_token_instance(self, t: Token) -> bool: + """Check to see 
if token is instance of a token type. + + :param Token t: Classification token to find type of + :return: `True` if token is instance of class token. `False` otherwise. + """ + raise NotImplementedError + + def variation_name(self) -> str: + """Return the variation name. + + :return: variation class name + """ + raise NotImplementedError + + def human_description(self, transcript: str, token: Token) -> str: + """Return a human description of the identified variation. + + :param str transcript: Transcript accession + :param Token token: Classification token + :return: Human description of the variation change + """ + raise NotImplementedError + + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: + """Return a list of gene tokens for a classification. + + :param Classification classification: Classification for a list of + tokens + :return: A list of gene tokens for the classification + """ + raise NotImplementedError + + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: + """Get transcript accessions for a given classification. + + :param List gene_tokens: A list of gene tokens + :param Classification classification: A classification for a list of + tokens + :param List errors: List of errors + :return: List of transcript accessions + """ + raise NotImplementedError + + def validates_classification_type( + self, classification_type: ClassificationType) -> bool: + """Check that classification type can be validated by validator. + + :param ClassificationType classification_type: The type of variation + :return: `True` if classification_type matches validator's + classification type. `False` otherwise. 
+ """ + raise NotImplementedError + + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum) \ + -> None: + """Add validation result objects to a list of results. + + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions + :param Classification classification: A classification for a list of + tokens + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification + :param bool normalize_endpoint: `True` if normalize endpoint is being + used. `False` otherwise. + :param Dict mane_data_found: MANE Transcript information found + :param bool is_identifier: `True` if identifier is given for exact + location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + """ + raise NotImplementedError + + def silent_mutation_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool) -> None: """Add validation result objects to a list of results for Silent Mutations. 
- :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. """ @@ -71,8 +159,7 @@ def silent_mutation_valid_invalid_results(self, classification_tokens, ) self.add_mane_data(mane, mane_data_found, - s.reference_sequence, s.alt_type, s, - gene_tokens) + s.reference_sequence, s.alt_type, s) self.add_validation_result( allele, valid_alleles, results, diff --git a/variation/validators/validate.py b/variation/validators/validate.py index 0ee69b65..bea9492c 100644 --- a/variation/validators/validate.py +++ b/variation/validators/validate.py @@ -22,9 +22,14 @@ from .coding_dna_insertion import CodingDNAInsertion from .genomic_insertion import GenomicInsertion from .genomic_uncertain_deletion import GenomicUncertainDeletion +from .genomic_duplication import GenomicDuplication +from .genomic_deletion_range import GenomicDeletionRange from ga4gh.vrs.dataproxy import SeqRepoDataProxy from ga4gh.vrs.extras.translator import Translator from typing import List +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum class Validate: @@ -35,7 +40,8 @@ 
def __init__(self, seqrepo_access: SeqRepoAccess, gene_symbol: GeneSymbol, mane_transcript: MANETranscript, uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, - amino_acid_cache: AminoAcidCache) -> None: + amino_acid_cache: AminoAcidCache, + gene_normalizer: GeneQueryHandler) -> None: """Initialize the validate class. :param SeqRepoAccess seqrepo_access: Access to SeqRepo data @@ -45,11 +51,13 @@ def __init__(self, seqrepo_access: SeqRepoAccess, :param MANETranscript mane_transcript: Access MANE Transcript information :param UTA uta: Access to UTA queries + :param Translator tlr: Translator class + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer :param amino_acid_cache: Amino Acid codes and conversions """ params = [ seqrepo_access, transcript_mappings, gene_symbol, - mane_transcript, uta, dp, tlr + mane_transcript, uta, dp, tlr, gene_normalizer ] amino_acid_params = params[:] amino_acid_params.append(amino_acid_cache) @@ -70,13 +78,28 @@ def __init__(self, seqrepo_access: SeqRepoAccess, AminoAcidInsertion(*amino_acid_params), CodingDNAInsertion(*params), GenomicInsertion(*params), - GenomicUncertainDeletion(*params) + GenomicDeletionRange(*params), + GenomicUncertainDeletion(*params), + GenomicDuplication(*params) ] - def perform(self, classifications: List[Classification], - normalize_endpoint, warnings=None) \ - -> ValidationSummary: - """Validate a list of classifications.""" + def perform( + self, classifications: List[Classification], + normalize_endpoint: bool, warnings: List = None, + hgvs_dup_del_mode: HGVSDupDelModeEnum = HGVSDupDelModeEnum.DEFAULT + ) -> ValidationSummary: + """Validate a list of classifications. + + :param List classifications: List of classifications + :param bool normalize_endpoint: `True` if normalize endpoint is being + used. `False` otherwise. + :param List warnings: List of warnings + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. 
+ This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + :return: ValidationSummary containing valid and invalid results + """ valid_possibilities = list() invalid_possibilities = list() if not warnings: @@ -87,8 +110,8 @@ def perform(self, classifications: List[Classification], for validator in self.validators: if validator.validates_classification_type( classification.classification_type): - results = validator.validate(classification, - normalize_endpoint) + results = validator.validate( + classification, normalize_endpoint, hgvs_dup_del_mode) for res in results: if res.is_valid: found_classification = True diff --git a/variation/validators/validator.py b/variation/validators/validator.py index 69622dce..49a2bb4d 100644 --- a/variation/validators/validator.py +++ b/variation/validators/validator.py @@ -1,12 +1,11 @@ """Module for Validation.""" import copy -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Tuple, Union from abc import ABC, abstractmethod -from ga4gh.vrsatile.pydantic.vrs_model import CopyNumber from variation.schemas.classification_response_schema import Classification, \ ClassificationType -from variation.schemas.token_response_schema import GeneMatchToken -import variation.schemas.token_response_schema as token_schema +from variation.schemas.token_response_schema import GeneMatchToken, Token, \ + GenomicSubstitutionToken from variation.schemas.validation_response_schema import ValidationResult, \ LookupType from variation.tokenizers import GeneSymbol @@ -21,6 +20,9 @@ from variation.validators.genomic_base import GenomicBase from variation.data_sources import UTA from bioutils.accessions import coerce_namespace +from gene.query import QueryHandler as GeneQueryHandler +from variation.schemas.normalize_response_schema\ + import HGVSDupDelMode as HGVSDupDelModeEnum logger = logging.getLogger('variation') logger.setLevel(logging.DEBUG) @@ -33,7 +35,8 @@ def __init__(self, 
seqrepo_access: SeqRepoAccess, transcript_mappings: TranscriptMappings, gene_symbol: GeneSymbol, mane_transcript: MANETranscript, - uta: UTA, dp: SeqRepoDataProxy, tlr: Translator) -> None: + uta: UTA, dp: SeqRepoDataProxy, tlr: Translator, + gene_normalizer: GeneQueryHandler) -> None: """Initialize the DelIns validator. :param SeqRepoAccess seqrepo_access: Access to SeqRepo data @@ -43,6 +46,8 @@ def __init__(self, seqrepo_access: SeqRepoAccess, :param MANETranscript mane_transcript: Access MANE Transcript information :param UTA uta: Access to UTA queries + :param Translator tlr: Translator class + :param GeneQueryHandler gene_normalizer: Access to gene-normalizer """ self.transcript_mappings = transcript_mappings self.seqrepo_access = seqrepo_access @@ -53,9 +58,10 @@ def __init__(self, seqrepo_access: SeqRepoAccess, self.uta = uta self.genomic_base = GenomicBase(self.dp, self.uta) self.mane_transcript = mane_transcript + self.gene_normalizer = gene_normalizer @abstractmethod - def is_token_instance(self, t) -> bool: + def is_token_instance(self, t: Token) -> bool: """Check to see if token is instance of a token type. :param Token t: Classification token to find type of @@ -72,7 +78,7 @@ def variation_name(self) -> str: raise NotImplementedError @abstractmethod - def human_description(self, transcript, token) -> str: + def human_description(self, transcript: str, token: Token) -> str: """Return a human description of the identified variation. :param str transcript: Transcript accession @@ -82,7 +88,7 @@ def human_description(self, transcript, token) -> str: raise NotImplementedError @abstractmethod - def concise_description(self, transcript, token) -> str: + def concise_description(self, transcript: str, token: Token) -> str: """Return a HGVS description of the identified variation. 
:param str transcript: Transcript accession @@ -92,7 +98,8 @@ def concise_description(self, transcript, token) -> str: raise NotImplementedError @abstractmethod - def get_gene_tokens(self, classification) -> List[GeneMatchToken]: + def get_gene_tokens( + self, classification: Classification) -> List[GeneMatchToken]: """Return a list of gene tokens for a classification. :param Classification classification: Classification for a list of @@ -102,60 +109,73 @@ def get_gene_tokens(self, classification) -> List[GeneMatchToken]: raise NotImplementedError @abstractmethod - def get_transcripts(self, gene_tokens, classification, errors)\ - -> Optional[List[str]]: + def get_transcripts(self, gene_tokens: List, + classification: Classification, + errors: List) -> Optional[List[str]]: """Get transcript accessions for a given classification. - :param list gene_tokens: A list of gene tokens + :param List gene_tokens: A list of gene tokens :param Classification classification: A classification for a list of tokens - :param list errors: List of errors + :param List errors: List of errors :return: List of transcript accessions """ raise NotImplementedError @abstractmethod - def validates_classification_type(self, - classification_type: ClassificationType)\ - -> bool: + def validates_classification_type( + self, classification_type: ClassificationType) -> bool: """Check that classification type can be validated by validator. - :param str classification_type: The type of variation + :param ClassificationType classification_type: The type of variation :return: `True` if classification_type matches validator's classification type. `False` otherwise. 
""" raise NotImplementedError @abstractmethod - def get_valid_invalid_results(self, classification_tokens, transcripts, - classification, results, gene_tokens, - normalize_endpoint, mane_data_found, - is_identifier) -> None: + def get_valid_invalid_results( + self, classification_tokens: List, transcripts: List, + classification: Classification, results: List, gene_tokens: List, + normalize_endpoint: bool, mane_data_found: Dict, + is_identifier: bool, hgvs_dup_del_mode: HGVSDupDelModeEnum)\ + -> None: """Add validation result objects to a list of results. - :param list classification_tokens: A list of classification Tokens - :param list transcripts: A list of transcript accessions + :param List classification_tokens: A list of classification Tokens + :param List transcripts: A list of transcript accessions :param Classification classification: A classification for a list of tokens - :param list results: Stores validation result objects - :param list gene_tokens: List of GeneMatchTokens for a classification + :param List results: Stores validation result objects + :param List gene_tokens: List of GeneMatchTokens for a classification :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :param dict mane_data_found: MANE Transcript information found + :param Dict mane_data_found: MANE Transcript information found :param bool is_identifier: `True` if identifier is given for exact location. `False` otherwise. + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. 
""" raise NotImplementedError - def validate(self, classification: Classification, normalize_endpoint) \ - -> List[ValidationResult]: + def validate( + self, classification: Classification, normalize_endpoint: bool, + hgvs_dup_del_mode: HGVSDupDelModeEnum = HGVSDupDelModeEnum.DEFAULT + ) -> List[ValidationResult]: """Return validation result for a given classification. :param Classification classification: A classification for a list of tokens :param bool normalize_endpoint: `True` if normalize endpoint is being used. `False` otherwise. - :return: Validation Result's containing valid and invalid results + :param HGVSDupDelModeEnum hgvs_dup_del_mode: Must be: `default`, `cnv`, + `repeated_seq_expr`, `literal_seq_expr`. + This parameter determines how to represent HGVS dup/del expressions + as VRS objects. + :return: List of ValidationResult's containing valid and invalid + results """ results = list() errors = list() @@ -201,28 +221,29 @@ def validate(self, classification: Classification, normalize_endpoint) \ self.get_valid_invalid_results( classification_tokens, transcripts, classification, results, gene_tokens, normalize_endpoint, mane_data_found, - is_identifier + is_identifier, hgvs_dup_del_mode ) return results @staticmethod - def get_validation_result(classification, classification_token, is_valid, - confidence_score, variation, human_description, - concise_description, errors, gene_tokens, - identifier=None, is_mane_transcript=False)\ - -> ValidationResult: + def get_validation_result( + classification: Classification, classification_token: Token, + is_valid: bool, confidence_score: int, variation: Dict, + human_description: str, concise_description: str, errors: List, + gene_tokens: List, identifier: str = None, + is_mane_transcript: bool = False) -> ValidationResult: """Return a validation result object. 
:param Classification classification: The classification for tokens :param Token classification_token: Classification token - :param boolean is_valid: Whether or not the classification is valid + :param bool is_valid: Whether or not the classification is valid :param int confidence_score: The classification confidence score - :param dict variation: A VRS Variation object + :param Dict variation: A VRS Variation object :param str human_description: A human description describing the variation :param str concise_description: HGVS expression for variation - :param list errors: A list of errors for the classification - :param list gene_tokens: List of GeneMatchTokens + :param List errors: A list of errors for the classification + :param List gene_tokens: List of GeneMatchTokens :param str identifier: Identifier for variation :param bool is_mane_transcript: `True` if result is MANE transcript. `False` otherwise. @@ -242,12 +263,12 @@ def get_validation_result(classification, classification_token, is_valid, identifier=identifier ) - def get_protein_transcripts(self, gene_tokens, errors)\ - -> Optional[List[str]]: + def get_protein_transcripts(self, gene_tokens: List, + errors: List) -> Optional[List[str]]: """Get transcripts for variations with protein reference sequence. 
- :param list gene_tokens: List of gene tokens for a classification - :param list errors: List of errors + :param List gene_tokens: List of gene tokens for a classification + :param List errors: List of errors :return: List of possible transcript accessions for the variation """ transcripts = self.transcript_mappings.protein_transcripts( @@ -257,12 +278,12 @@ def get_protein_transcripts(self, gene_tokens, errors)\ f'{gene_tokens[0].token}') return transcripts - def get_coding_dna_transcripts(self, gene_tokens, errors)\ - -> Optional[List[str]]: + def get_coding_dna_transcripts(self, gene_tokens: List, + errors: List) -> Optional[List[str]]: """Get transcripts for variations with coding DNA reference sequence. - :param list gene_tokens: List of gene tokens for a classification - :param list errors: List of errors + :param List gene_tokens: List of gene tokens for a classification + :param List errors: List of errors :return: List of possible transcript accessions for the variation """ transcripts = self.transcript_mappings.coding_dna_transcripts( @@ -272,13 +293,13 @@ def get_coding_dna_transcripts(self, gene_tokens, errors)\ f'{gene_tokens[0].token}') return transcripts - def get_genomic_transcripts(self, classification, errors)\ - -> Optional[List[str]]: + def get_genomic_transcripts(self, classification: Classification, + errors: List) -> Optional[List[str]]: """Get NC accessions for variations with genomic reference sequence. 
:param Classification classification: Classification for a list of tokens - :param list errors: List of errors + :param List errors: List of errors :return: List of possible NC accessions for the variation """ nc_accessions = self.genomic_base.get_nc_accessions(classification) @@ -287,8 +308,9 @@ def get_genomic_transcripts(self, classification, errors)\ f'{self.variation_name()}') return nc_accessions - def get_classification_tokens(self, classification)\ - -> Optional[List[Classification]]: + def get_classification_tokens( + self, classification: Classification + ) -> List[Optional[Classification]]: """Get classification tokens for a given instance. :param Classification classification: A classification for a list of @@ -298,8 +320,9 @@ def get_classification_tokens(self, classification)\ return [t for t in classification.all_tokens if self.is_token_instance(t)] - def get_gene_symbol_tokens(self, classification)\ - -> Optional[List[GeneMatchToken]]: + def get_gene_symbol_tokens( + self, classification: Classification + ) -> List[Optional[GeneMatchToken]]: """Return tokens with GeneSymbol token type from a classification. :param Classification classification: Classification of input string @@ -308,27 +331,27 @@ def get_gene_symbol_tokens(self, classification)\ return [t for t in classification.all_tokens if t.token_type == 'GeneSymbol'] - def _add_gene_symbol_to_tokens(self, gene_symbol, gene_symbols, - gene_tokens) -> None: + def _add_gene_symbol_to_tokens(self, gene_symbol: str, gene_symbols: List, + gene_tokens: List) -> None: """Add a gene symbol to list of gene match tokens. 
:param str gene_symbol: Gene symbol - :param list gene_symbols: List of gene symbols matched - :param list gene_tokens: List of GeneMatchTokens + :param List gene_symbols: List of gene symbols matched + :param List gene_tokens: List of GeneMatchTokens """ if gene_symbol and gene_symbol not in gene_symbols: gene_symbols.append(gene_symbol) gene_tokens.append(self._gene_matcher.match( gene_symbol)) - def _get_gene_tokens(self, classification, mappings)\ - -> Optional[List[GeneMatchToken]]: + def _get_gene_tokens(self, classification: Classification, + mappings: List) -> List[Optional[GeneMatchToken]]: """Get gene symbol tokens for protein or transcript reference sequences. :param Classification classification: Classification for a list of tokens - :param list mappings: List of transcript mapping methods for + :param List mappings: List of transcript mapping methods for corresponding reference sequence :return: A list of gene match tokens """ @@ -355,8 +378,9 @@ def _get_gene_tokens(self, classification, mappings)\ break return gene_tokens - def get_protein_gene_symbol_tokens(self, classification)\ - -> Optional[List[GeneMatchToken]]: + def get_protein_gene_symbol_tokens( + self, classification: Classification + ) -> List[Optional[GeneMatchToken]]: """Return gene tokens for a classification with protein reference sequence. @@ -371,8 +395,9 @@ def get_protein_gene_symbol_tokens(self, classification)\ ] return self._get_gene_tokens(classification, mappings) - def get_coding_dna_gene_symbol_tokens(self, classification)\ - -> Optional[List[GeneMatchToken]]: + def get_coding_dna_gene_symbol_tokens( + self, classification: Classification + ) -> List[Optional[GeneMatchToken]]: """Return gene symbol tokens for classifications with coding dna reference sequence. 
@@ -386,37 +411,38 @@ def get_coding_dna_gene_symbol_tokens(self, classification)\ ] return self._get_gene_tokens(classification, mappings) - def get_accession(self, t, classification) -> str: + def get_accession(self, t: str, classification: Classification) -> str: """Return accession for a classification :param str t: Accession - :param Token classification: Classification for token + :param Classification classification: Classification for token :return: Accession """ if 'HGVS' in classification.matching_tokens or \ 'ReferenceSequence' in classification.matching_tokens: hgvs_token = [t for t in classification.all_tokens if - isinstance(t, token_schema.Token) and t.token_type + isinstance(t, Token) and t.token_type in ['HGVS', 'ReferenceSequence']][0] hgvs_expr = hgvs_token.input_string t = hgvs_expr.split(':')[0] return t - def add_validation_result(self, variation, valid_variations, results, - classification, s, t, gene_tokens, errors, - identifier=None, - is_mane_transcript=False) -> bool: + def add_validation_result( + self, variation: Dict, valid_variations: List, results: List, + classification: Classification, s: Token, t: str, + gene_tokens: List, errors: List, identifier: str = None, + is_mane_transcript: bool = False) -> bool: """Add validation result to list of results. 
- :param dict variation: A VRS Variation object - :param list valid_variations: A list containing current valid + :param Dict variation: A VRS Variation object + :param List valid_variations: A list containing current valid variations - :param list results: A list of validation results + :param List results: A list of validation results :param Classification classification: The classification for tokens :param Token s: The classification token :param string t: Transcript - :param list gene_tokens: List of GeneMatchTokens - :param list errors: A list of errors for the classification + :param List gene_tokens: List of GeneMatchTokens + :param List errors: A list of errors for the classification :param str identifier: Identifier for variation :param bool is_mane_transcript: `True` if result is MANE transcript. `False` otherwise. @@ -449,21 +475,61 @@ def add_validation_result(self, variation, valid_variations, results, ) return False - def to_vrs_allele(self, ac, start, end, coordinate, alt_type, errors, - cds_start=None, alt=None) -> Optional[Dict]: - """Translate accession and position to VRS Allele Object. 
+ def _get_start_indef_range(self, start: int) -> models.IndefiniteRange: + """Return indefinite range given start coordinate + + :param int start: Start position (assumes 1-based) + :return: Indefinite range model + """ + return models.IndefiniteRange(value=start - 1, comparator="<=") + + def _get_end_indef_range(self, end: int) -> models.IndefiniteRange: + """Return indefinite range given end coordinate + + :param int end: End position (assumes 1-based) + :return: Indefinite range model + """ + return models.IndefiniteRange(value=end, comparator=">=") + + def _get_ival_certain_range(self, start1: int, start2: int, end1: int, + end2: int) -> models.SequenceInterval: + """Return sequence interval + + :param int start1: Start left pos (assumes 1-based) + :param int start2: Start right pos (assumes 1-based) + :param int end1: End left pos (assumes 1-based) + :param int end2: End right pos (assumes 1-based) + :return: Sequence Interval model + """ + return models.SequenceInterval( + start=models.DefiniteRange(min=start1 - 1, max=start2 - 1), + end=models.DefiniteRange(min=end1 + 1, max=end2 + 1) + ) + + def _get_sequence_loc( + self, ac: str, interval: models.SequenceInterval + ) -> models.Location: + """Return VRS location :param str ac: Accession + :param models.SequenceInterval interval: VRS sequence interval + :return: VRS Location model + """ + return models.Location(sequence_id=coerce_namespace(ac), + interval=interval) + + def _get_ival_start_end( + self, coordinate: str, start: int, end: int, cds_start: int, + errors: List) -> Optional[Tuple[int, int]]: + """Get ival_start and ival_end coordinates. + + :param str coordinate: Coordinate used. Must be either `p`, `c`, or `g` :param int start: Start position change :param int end: End position change - :param str coordinate: Coordinate used. 
Must be either `p`, `c`, or `g` - :param str alt_type: Type of alteration - :param list errors: List of errors :param int cds_start: Coding start site - :param str alt: Alteration - :return: VRS Allele Object + :param List errors: List of errors + :return: Tuple[ival_start, ival_end] """ - sequence_id = coerce_namespace(ac) try: start = int(start) if end is None: @@ -477,51 +543,57 @@ def to_vrs_allele(self, ac, start, end, coordinate, alt_type, errors, if cds_start: start += cds_start end += cds_start + return start, end - ival_start = start - ival_end = end + def to_vrs_allele_ranges( + self, ac: str, coordinate: str, alt_type: str, errors: List, + ival: models.SequenceInterval) -> Optional[Dict]: + """Translate variation ranges to VRS Allele Object. - # Right now, this follows HGVS conventions - # This will change once we support other representations - if alt_type == 'uncertain_deletion': - interval = models.SequenceInterval( - start=models.IndefiniteRange( - value=ival_start - 1, - comparator="<=" - ), - end=models.IndefiniteRange( - value=ival_end, - comparator=">=" - ) - ) + :param str ac: Accession + :param str coordinate: Coordinate used. Must be either `p`, `c`, or `g` + :param str alt_type: Type of alteration + :param list errors: List of errors + :param models.SequenceInterval ival: Sequence Interval + :return: VRS Allele object + """ + if coordinate == 'c': + # TODO: Once we add support for ranges on c. 
coord + return None + if alt_type in ['uncertain_deletion', 'uncertain_duplication', + 'duplication_range', 'deletion_range']: sstate = models.LiteralSequenceExpression( sequence="" ) else: - if alt_type == 'insertion': - state = alt - ival_end = ival_start - elif alt_type in ['substitution', 'deletion', 'delins', - 'silent_mutation', 'nonsense']: - if alt_type == 'silent_mutation': - state = self.seqrepo_access.get_sequence( - ac, ival_start - ) - else: - state = alt or '' - ival_start -= 1 - else: - errors.append(f"alt_type not supported: {alt_type}") - return None + errors.append("No state") + return None + + return self._vrs_allele(ac, ival, sstate, alt_type, errors) - interval = models.SimpleInterval(start=ival_start, end=ival_end) - sstate = models.SequenceState(sequence=state) + def _vrs_allele(self, ac: str, interval: models.SequenceInterval, + sstate: Union[models.LiteralSequenceExpression, + models.DerivedSequenceExpression, + models.RepeatedSequenceExpression], + alt_type: str, errors: List) -> Optional[Dict]: + """Create a VRS Allele object. 
- location = models.Location(sequence_id=sequence_id, interval=interval) + :param str ac: Accession + :param SequenceInterval interval: Sequence Interval + :param sstate: State + :type sstate: models.LiteralSequenceExpression or + models.DerivedSequenceExpression or + models.RepeatedSequenceExpression + :param str alt_type: Type of alteration + :param List errors: List of errors + :return: VRS Allele object represented as a Dict + """ + location = self._get_sequence_loc(ac, interval) allele = models.Allele(location=location, state=sstate) # Ambiguous regions do not get normalized - if alt_type != "uncertain_deletion": + if alt_type not in ["uncertain_deletion", "uncertain_duplication", + "duplication_range", "deletion_range"]: try: allele = normalize(allele, self.dp) if alt_type == 'deletion': @@ -531,91 +603,150 @@ def to_vrs_allele(self, ac, start, end, coordinate, alt_type, errors, return None if not allele: - errors.append(f"Unable to find allele for accession, {ac}, " - f"and position ({start}, {end})") + errors.append("Unable to get allele") return None seq_id = self.dp.translate_sequence_identifier( allele.location.sequence_id._value, "ga4gh")[0] allele.location.sequence_id = seq_id + allele.location._id = ga4gh_identify(allele.location) allele._id = ga4gh_identify(allele) return allele.as_dict() - def _get_chr(self, ac) -> Optional[str]: - """Get chromosome for accession. - - :param str ac: Accession - :return: Chromosome - """ - aliases = self.seqrepo_access.aliases(ac) - return ([a.split(':')[-1] for a in aliases - if a.startswith('GRCh') and '.' not in a and 'chr' not in a] or [None])[0] # noqa: E501 - - def to_vrs_cnv(self, ac, allele, del_or_dup, chr=None)\ - -> Optional[CopyNumber]: - """Return a Copy Number Variation. + def to_vrs_allele( + self, ac: str, start: int, end: int, coordinate: str, + alt_type: str, errors: List, cds_start: int = None, + alt: str = None) -> Optional[Dict]: + """Translate accession and position to VRS Allele Object. 
:param str ac: Accession - :param dict allele: VRS Allele Object - :param str del_or_dup: `del` if deletion, `dup` if duplication - :param str chr: The chromosome the accession is located on - :return: VRS Copy Number object + :param int start: Start position change + :param int end: End position change + :param str coordinate: Coordinate used. Must be either `p`, `c`, or `g` + :param str alt_type: Type of alteration + :param List errors: List of errors + :param int cds_start: Coding start site + :param str alt: Alteration + :return: VRS Allele Object """ - if chr is None: - chr = self._get_chr(ac) - - if chr is None: - logger.warning(f"Unable to find chromosome on {ac}") + ival_coords = self._get_ival_start_end(coordinate, start, end, + cds_start, errors) + if not ival_coords: return None - - if chr == 'X': - copies = models.DefiniteRange( - min=0 if del_or_dup == 'del' else 2, - max=1 if del_or_dup == 'del' else 3 - ) - elif chr == 'Y': - copies = models.Number( - value=0 if del_or_dup == 'del' else 2 - ) + if ival_coords[0] > ival_coords[1]: + ival_end, ival_start = ival_coords else: - # Chr 1-22 - copies = models.Number( - value=1 if del_or_dup == 'del' else 3 - ) + ival_start, ival_end = ival_coords - cnv = models.CopyNumber( - subject=models.DerivedSequenceExpression( - location=allele['location'], - reverse_complement=False # TODO: CHANGE THIS - ), - copies=copies - ) - cnv._id = ga4gh_identify(cnv) - return cnv.as_dict() + # Right now, this follows HGVS conventions + # This will change once we support other representations + if alt_type == 'insertion': + state = alt + ival_end = ival_start + elif alt_type in ['substitution', 'deletion', 'delins', + 'silent_mutation', 'nonsense']: + if alt_type == 'silent_mutation': + state = self.seqrepo_access.get_sequence( + ac, ival_start + ) + if state is None: + errors.append(f"Unable to get sequence on {ac} from " + f"{ival_start}") + return None + else: + state = alt or '' + ival_start -= 1 + elif alt_type == 
'duplication': + ref = self.seqrepo_access.get_sequence(ac, ival_start, + ival_end) + if ref is not None: + state = ref + ref + else: + errors.append(f"Unable to get sequence on {ac} from " + f"{ival_start} to {ival_end}") + return None + ival_start -= 1 + else: + errors.append(f"alt_type not supported: {alt_type}") + return None - def add_mane_data(self, mane, mane_data, coordinate, alt_type, s, - gene_tokens, alt=None) -> None: - """Add mane transcript information to mane_data. + interval = models.SequenceInterval( + start=models.Number(value=ival_start), + end=models.Number(value=ival_end)) + sstate = models.LiteralSequenceExpression(sequence=state) + return self._vrs_allele(ac, interval, sstate, alt_type, errors) + + def _validate_gene_pos(self, gene: str, alt_ac: str, pos1: int, pos2: int, + errors: List, pos3: int = None, pos4: int = None, + residue_mode: str = "residue") -> None: + """Validate whether free text genomic query is valid input. + If invalid input, add error to list of errors + + :param str gene: Queried gene + :param str alt_ac: Genomic accession + :param int pos1: Queried genomic position + :param int pos2: Queried genomic position + :param int pos3: Queried genomic position + :param int pos4: Queried genomic position + :param str residue_mode: Must be either `inter-residue` or `residue` + :param List errors: List of errors + """ + gene_start_end = {"start": None, "end": None} + resp = self.gene_normalizer.search(gene, incl="Ensembl") + if resp.source_matches: + ensembl_resp = resp.source_matches[0] + if ensembl_resp.records[0].locations: + ensembl_loc = ensembl_resp.records[0].locations[0] + gene_start_end["start"] = ensembl_loc.interval.start.value + gene_start_end["end"] = ensembl_loc.interval.end.value - 1 + + if gene_start_end["start"] is None and gene_start_end["end"] is None: + errors.append(f"gene-normalizer unable to find Ensembl location" + f"for {gene}") + else: + assembly = self.uta.get_chr_assembly(alt_ac) + if assembly: + # Not in 
GRCh38 assembly. Gene normalizer only uses 38, so we + # need to liftover to GRCh37 coords + chromosome, assembly = assembly + for key in gene_start_end.keys(): + gene_pos = gene_start_end[key] + gene_pos_liftover = \ + self.uta.liftover_to_37.convert_coordinate(chromosome, + gene_pos) + if gene_pos_liftover is None or len(gene_pos_liftover) == 0: # noqa: E501 + errors.append(f"{gene_pos} does not" + f" exist on {chromosome}") + return None + else: + gene_start_end[key] = gene_pos_liftover[0][1] + + gene_start = gene_start_end["start"] + gene_end = gene_start_end["end"] + + for pos in [pos1, pos2, pos3, pos4]: + if pos not in ["?", None]: + if residue_mode == "residue": + pos -= 1 + if not (gene_start <= pos <= gene_end): + errors.append(f"Position {pos} out of index on " + f"{alt_ac} on gene, {gene}") + + def _get_coord_alt(self, coordinate: str, mane: Dict, + s_copy: Token) -> Optional[Tuple[str, str]]: + """Get coordinate and alteration - :param dict mane: MANE Transcript information - :param dict mane_data: MANE Transcript data found for given query :param str coordinate: Coordinate used. 
Must be either `p`, `c`, or `g` - :param str alt_type: Type of alteration - :param Token s: Classification token - :param list gene_tokens: List of GeneMatchTokens for a classification - :param str alt: Alteration + :param Dict mane: Mane data + :param Token s_copy: classification token + :return: Coordinate, alteration """ - if not mane: - return - - s_copy = copy.deepcopy(s) if coordinate == 'g' and mane['status'].lower() != 'grch38': s_copy.molecule_context = 'transcript' s_copy.reference_sequence = 'c' coordinate = s_copy.reference_sequence - if isinstance(s_copy, - token_schema.GenomicSubstitutionToken) and \ + if isinstance(s_copy, GenomicSubstitutionToken) and \ mane['strand'] == '-': ref_rev = s_copy.ref_nucleotide[::-1] alt_rev = s_copy.new_nucleotide[::-1] @@ -634,47 +765,84 @@ def add_mane_data(self, mane, mane_data, coordinate, alt_type, s, for nt in alt_rev: s_copy.new_nucleotide += complements[nt] alt = s_copy.new_nucleotide + else: + alt = None + return coordinate, alt + return None - new_allele = self.to_vrs_allele( - mane['refseq'], mane['pos'][0], mane['pos'][1], - coordinate, alt_type, [], - cds_start=mane.get('coding_start_site', None), alt=alt - ) + def add_mane_data( + self, mane: Dict, mane_data: Dict, coordinate: str, alt_type: str, + s: Token, alt: str = None, mane_variation: Dict = None) -> None: + """Add mane transcript information to mane_data. - if not new_allele: - return + :param Dict mane: MANE data + :param Dict mane_data: All MANE data found for given query + :param str coordinate: Coordinate used. 
Must be either `p`, `c`, or `g` + :param str alt_type: Type of alteration + :param Token s: Classification token + :param str alt: Alteration + :param Dict mane_variation: VRS Variation for mane data + """ + if not mane: + return None - if alt_type == 'uncertain_deletion': - variation = self.to_vrs_cnv(mane['refseq'], new_allele, 'del') - if not variation: - return None - _id = variation['_id'] - else: + s_copy = copy.deepcopy(s) + coord_alt = self._get_coord_alt(coordinate, mane, s_copy) + if coord_alt: + coordinate = coord_alt[0] if coord_alt[0] else coordinate + alt = coord_alt[1] if coord_alt[1] else alt + + if mane_variation is None: + new_allele = self.to_vrs_allele( + mane['refseq'], mane['pos'][0], mane['pos'][1], + coordinate, alt_type, [], + cds_start=mane.get('coding_start_site', None), alt=alt + ) variation = new_allele - _id = variation['_id'] + else: + variation = mane_variation + + if not variation: + return None + + self._add_dict_to_mane_data(mane['refseq'], s_copy, variation, + mane_data, mane['status']) - key = '_'.join(mane['status'].lower().split()) + def _add_dict_to_mane_data(self, ac: str, s: Token, variation: Dict, + mane_data: Dict, status: str) -> None: + """Add variation data to mane data for normalize endpoint. 
+ + :param str ac: Accession + :param Token s: Classification token + :param Dict variation: VRS Variation object + :param Dict mane_data: MANE Transcript data found for given query + :param str status: Status for variation (GRCh38, MANE Select, + MANE Plus Clinical) + """ + _id = variation['_id'] + key = '_'.join(status.lower().split()) if _id in mane_data[key].keys(): mane_data[key][_id]['count'] += 1 else: mane_data[key][_id] = { - 'classification_token': s_copy, - 'accession': mane['refseq'], + 'classification_token': s, + 'accession': ac, 'count': 1, 'variation': variation, - 'label': mane['refseq'] # TODO: Use VRS to translate + 'label': ac # TODO: Use VRS to translate } - def add_mane_to_validation_results(self, mane_data, valid_alleles, - results, classification, gene_tokens): + def add_mane_to_validation_results( + self, mane_data: Dict, valid_alleles: List, results: List, + classification: Classification, gene_tokens: List) -> None: """Add MANE Transcript data to list of validation results.
- :param dict mane_data: MANE Transcript data found for given query - :param list valid_alleles: A list containing current valid alleles - :param list results: A list of validation results + :param Dict mane_data: MANE Transcript data found for given query + :param List valid_alleles: A list containing current valid alleles + :param List results: A list of validation results :param Classification classification: The classification for tokens - :param list gene_tokens: List of GeneMatchTokens + :param List gene_tokens: List of GeneMatchTokens """ mane_data_keys = mane_data.keys() for key in ['mane_select', 'mane_plus_clinical', 'grch38', @@ -703,3 +871,41 @@ def add_mane_to_validation_results(self, mane_data, valid_alleles, identifier=identifier, is_mane_transcript=True ) return + + def _check_index(self, ac: str, pos: int, errors: List) -> Optional[str]: + """Check that index actually exists + + :param str ac: Accession + :param int pos: Position changes + :param List errors: List of errors + :return: Reference sequence + """ + seq = self.seqrepo_access.get_sequence(ac, pos) + if not seq: + errors.append(f"Pos {pos} not found on {ac}") + return None + return seq + + def _grch38_dict(self, ac: str, pos: Tuple[int, int]) -> Dict: + """Create dict for normalized concepts + + :param str ac: Accession + :param Tuple[int, int] pos: Position changes + :return: GRCh38 data + """ + return dict( + gene=None, + refseq=ac if ac.startswith('NC') else None, + ensembl=ac if ac.startswith('ENSG') else None, + pos=pos, + strand=None, + status='GRCh38' + ) + + def _is_grch38_assembly(self, t: str) -> bool: + """Return whether or not accession is GRCh38 assembly. + + :param str t: Accession + :return: `True` if accession is GRCh38 assembly.
`False` otherwise + """ + return 'GRCh38' in [a for a in self.dp.get_metadata(t)['aliases'] if a.startswith('GRCh')][0] # noqa: E501 diff --git a/variation/version.py b/variation/version.py index b5c9b6cb..11ef0928 100644 --- a/variation/version.py +++ b/variation/version.py @@ -1 +1 @@ -__version__ = "0.2.12" +__version__ = "0.2.13"