Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: add starting_assembly as argument when using genomic_to_tx_segment #391

Merged
merged 21 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 37 additions & 51 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ async def genomic_to_tx_segment(
transcript: str | None = None,
gene: str | None = None,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
starting_assembly: Assembly = Assembly.GRCH38,
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
) -> GenomicTxSegService:
"""Get transcript segment data for genomic data, lifted over to GRCh38.

Expand Down Expand Up @@ -452,6 +453,8 @@ async def genomic_to_tx_segment(
value is provided.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:param starting_assembly: The assembly that the supplied coordinate comes from. Set to
GRCh38 by default
:return: Genomic data (inter-residue coordinates)
"""
errors = []
Expand All @@ -477,6 +480,7 @@ async def genomic_to_tx_segment(
gene=gene,
is_seg_start=True,
coordinate_type=coordinate_type,
starting_assembly=starting_assembly,
)
if start_tx_seg_data.errors:
return _return_service_errors(start_tx_seg_data.errors)
Expand All @@ -497,6 +501,7 @@ async def genomic_to_tx_segment(
gene=gene,
is_seg_start=False,
coordinate_type=coordinate_type,
starting_assembly=starting_assembly,
)
if end_tx_seg_data.errors:
return _return_service_errors(end_tx_seg_data.errors)
Expand Down Expand Up @@ -727,6 +732,7 @@ async def _genomic_to_tx_segment(
gene: str | None = None,
is_seg_start: bool = True,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
starting_assembly: Assembly = Assembly.GRCH38,
) -> GenomicTxSeg:
"""Given genomic data, generate a boundary for a transcript segment.

Expand All @@ -752,6 +758,8 @@ async def _genomic_to_tx_segment(
``False`` if ``genomic_pos`` is where the transcript segment ends.
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
``seg_end_genomic``. Expects inter-residue coordinates by default
:param starting_assembly: The assembly that the supplied coordinate comes from. Set to
GRCh38 by default
:return: Data for a transcript segment boundary (inter-residue coordinates)
"""
params = {key: None for key in GenomicTxSeg.model_fields}
Expand All @@ -775,6 +783,9 @@ async def _genomic_to_tx_segment(
return GenomicTxSeg(
errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"]
)
if starting_assembly == Assembly.GRCH37:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
genomic_ac = grch38_ac[0]
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
else:
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)

Expand All @@ -784,13 +795,17 @@ async def _genomic_to_tx_segment(
)
genomic_ac = genomic_acs[0]

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
genomic_ac,
genomic_pos,
)
if err_msg:
return GenomicTxSeg(errors=[err_msg])
# Liftover to GRCh38 if the provided assembly is GRCh37
if starting_assembly == Assembly.GRCH37:
genomic_pos = await self._get_grch38_pos(
genomic_ac, genomic_pos, chromosome=chromosome if chromosome else None
)
if not genomic_pos:
return GenomicTxSeg(
errors=[
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful."
]
)

# Select a transcript if not provided
if not transcript:
Expand Down Expand Up @@ -903,59 +918,30 @@ async def _genomic_to_tx_segment(
),
)

async def _get_grch38_ac_pos(
async def _get_grch38_pos(
self,
genomic_ac: str,
genomic_pos: int,
grch38_ac: str | None = None,
) -> tuple[str | None, int | None, str | None]:
chromosome: str | None = None,
) -> int | None:
"""Get GRCh38 genomic representation for accession and position

:param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
:param genomic_pos: Genomic position on ``genomic_ac``
:param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
provided, will attempt to retrieve associated GRCh38 accession from UTA.
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
if unable to get GRCh38 representation
:param genomic_pos: A genomic coordinate in GRCh37
:param genomic_ac: The genomic accession in GRCh38
:param chromosome: The chromosome that genomic_pos occurs on
:return The genomic coordinate in GRCh38
"""
# Validate accession exists
if not grch38_ac:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return None, None, f"Unrecognized genomic accession: {genomic_ac}."

grch38_ac = grch38_ac[0]

if grch38_ac != genomic_ac:
# Ensure genomic_ac is GRCh37
if not chromosome:
chromosome, _ = self.seqrepo_access.translate_identifier(
genomic_ac, Assembly.GRCH37.value
genomic_ac, target_namespaces=Assembly.GRCH38.value
)
if not chromosome:
_logger.warning(
"SeqRepo could not find associated %s assembly for genomic accession %s.",
Assembly.GRCH37.value,
genomic_ac,
)
return (
None,
None,
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
)
chromosome = chromosome[-1].split(":")[-1]
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return (
None,
None,
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
)
genomic_pos = liftover_data[1]
genomic_ac = grch38_ac

return genomic_ac, genomic_pos, None
liftover_data = self.liftover.get_liftover(
chromosome, genomic_pos, Assembly.GRCH38
)
if liftover_data is None:
return None
return liftover_data[1]
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved

async def _validate_gene_coordinates(
self,
Expand Down
83 changes: 41 additions & 42 deletions tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
_ExonCoord,
)
from cool_seq_tool.schemas import (
Assembly,
CoordinateType,
Strand,
)
Expand Down Expand Up @@ -704,48 +705,20 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True):


@pytest.mark.asyncio()
async def test_get_grch38_ac_pos(test_egc_mapper):
"""Test that _get_grch38_ac_pos works correctly"""
grch38_ac = "NC_000001.11"
grch38_pos = 154192135
expected = grch38_ac, grch38_pos, None

# GRCh37 provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611)
assert grch38_data == expected

# GRCh38 provided, no grch38_ac
grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos)
assert grch38_data == expected

# GRCh38 and grch38_ac provided
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
grch38_ac, grch38_pos, grch38_ac=grch38_ac
)
assert grch38_data == expected

# Unrecognized accession
invalid_ac = "NC_0000026.10"
grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611)
assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.")

# GRCh36 used
grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611)
assert grch38_data == (
None,
None,
"`genomic_ac` must use GRCh37 or GRCh38 assembly.",
)
async def test_get_grch38_pos(test_egc_mapper):
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
"""Test that get_grch38_pos works correctly"""
genomic_pos = await test_egc_mapper._get_grch38_pos("NC_000011.10", 9609996)
assert genomic_pos == 9588449

# Unsuccessful liftover
grch38_data = await test_egc_mapper._get_grch38_ac_pos(
"NC_000001.10", 9999999999999999999
genomic_pos = await test_egc_mapper._get_grch38_pos(
"NC_000011.10", 9609996, "chr11"
)
assert grch38_data == (
None,
None,
"Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.",
assert genomic_pos == 9588449

genomic_pos = await test_egc_mapper._get_grch38_pos(
"NC_000011.10", 9609996999999999
)
assert genomic_pos is None


@pytest.mark.asyncio()
Expand Down Expand Up @@ -893,6 +866,21 @@ def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords
assert resp is True # Breakpoint does occur on an exon


def test_use_alt_start_i(test_egc_mapper):
"""Test when to use alt_start_i or alt_end_i from UTA"""
resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.POSITIVE)
assert resp

resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.NEGATIVE)
assert resp

resp = test_egc_mapper._use_alt_start_i(is_seg_start=True, strand=Strand.NEGATIVE)
assert not resp

resp = test_egc_mapper._use_alt_start_i(is_seg_start=False, strand=Strand.POSITIVE)
assert not resp


@pytest.mark.asyncio()
async def test_genomic_to_transcript_fusion_context(
test_egc_mapper,
Expand Down Expand Up @@ -1253,6 +1241,7 @@ async def test_braf(test_egc_mapper, mane_braf):
"seg_start_genomic": 140501359, # GRCh38 coords: 140801559
"seg_end_genomic": 140453136, # GRCh38 coords: 140753336
"gene": "BRAF",
"starting_assembly": Assembly.GRCH37.value,
}
# MANE
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
Expand All @@ -1276,6 +1265,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_start_genomic": 9597639,
"seg_end_genomic": 9609996,
"transcript": "NM_003390.3",
"starting_assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
Expand All @@ -1292,6 +1282,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_end_genomic": 9609996,
"transcript": "NM_003390.3",
"gene": "WEE1",
"starting_assembly": Assembly.GRCH37.value,
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11)
Expand All @@ -1307,6 +1298,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11):
"seg_start_genomic": 9597639, # GRCh38 coords: 9576092
"seg_end_genomic": 9609996, # GRCh38 coords: 9588449
"gene": "WEE1",
"starting_assembly": Assembly.GRCH37.value,
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
}
g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11)
Expand Down Expand Up @@ -1418,13 +1410,19 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
"gene": "WEE1",
"genomic_ac": "NC_000011.9",
"seg_end_genomic": 9609996,
"starting_assembly": Assembly.GRCH37.value,
}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

# inputs = {"gene": "WEE1", "chromosome": "11", "seg_end_genomic": 9588449}
# resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
# assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))
inputs = {
"gene": "WEE1",
"chromosome": "11",
"seg_end_genomic": 9609996,
"starting_assembly": Assembly.GRCH37.value,
}
resp = await test_egc_mapper.genomic_to_tx_segment(**inputs)
assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_end))

inputs = {"transcript": "NM_003390.3", "exon_start": 2}
resp = await test_egc_mapper.tx_segment_to_genomic(**inputs)
Expand Down Expand Up @@ -1456,6 +1454,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
seg_start_genomic=73442503,
seg_end_genomic=73457929, # not on an exon
gene="ELN",
starting_assembly=Assembly.GRCH37.value,
)
genomic_tx_seg_service_checks(resp, eln_grch38_intronic)

Expand Down
Loading