From 639c1df71bd10a47838ff9560dda41d4a73eb2fd Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 21 Aug 2024 12:13:09 -0400 Subject: [PATCH] fix: genomic accession mismatch when getting nearest tx junction (#356) close #329 * When lifting over to GRCh38 assembly, genomic accession and genomic position should be updated accordingly --- .../mappers/exon_genomic_coords.py | 91 +++++++++++++----- tests/mappers/test_exon_genomic_coords.py | 92 ++++++++++++++++++- 2 files changed, 161 insertions(+), 22 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 94d04b5..372da53 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -407,6 +407,8 @@ async def genomic_to_tx_segment( ) -> GenomicTxSegService: """Get transcript segment data for genomic data, lifted over to GRCh38. + If liftover to GRCh38 is unsuccessful, will return errors. + Must provide inter-residue coordinates. MANE Transcript data will be returned if and only if ``transcript`` is not @@ -667,6 +669,9 @@ async def _genomic_to_tx_segment( ) -> GenomicTxSeg: """Given genomic data, generate a boundary for a transcript segment. + Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return + errors. + :param genomic_pos: Genomic position where the transcript segment starts or ends (inter-residue based) :param chromosome: Chromosome. Must give chromosome without a prefix @@ -710,6 +715,13 @@ async def _genomic_to_tx_segment( ) genomic_ac = genomic_acs[0] + # Always liftover to GRCh38 + genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( + genomic_ac, genomic_pos + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) + if not transcript: # Select a transcript if not provided mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data( @@ -849,6 +861,58 @@ async def _genomic_to_tx_segment( genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript ) + async def _get_grch38_ac_pos( + self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None + ) -> tuple[str | None, int | None, str | None]: + """Get GRCh38 genomic representation for accession and position + + :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly) + :param genomic_pos: Genomic position on ``genomic_ac`` + :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not + provided, will attempt to retrieve associated GRCh38 accession from UTA. + :return: Tuple containing GRCh38 accession, GRCh38 position, and error message + if unable to get GRCh38 representation + """ + if not grch38_ac: + grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac) + if not grch38_ac: + return None, None, f"Unrecognized genomic accession: {genomic_ac}." + + grch38_ac = grch38_ac[0] + + if grch38_ac != genomic_ac: + # Ensure genomic_ac is GRCh37 + chromosome, _ = self.seqrepo_access.translate_identifier( + genomic_ac, Assembly.GRCH37.value + ) + if not chromosome: + _logger.warning( + "SeqRepo could not find associated %s assembly for genomic accession %s.", + Assembly.GRCH37.value, + genomic_ac, + ) + return ( + None, + None, + f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.", + ) + + chromosome = chromosome[-1].split(":")[-1] + liftover_data = self.liftover.get_liftover( + chromosome, genomic_pos, Assembly.GRCH38 + ) + if liftover_data is None: + return ( + None, + None, + f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.", + ) + + genomic_pos = liftover_data[1] + genomic_ac = grch38_ac + + return genomic_ac, genomic_pos, None + async def _get_genomic_ac_gene( self, pos: int, @@ -995,27 +1059,12 @@ async def _get_tx_seg_genomic_metadata( tx_ac = mane_data["RefSeq_nuc"] grch38_ac = mane_data["GRCh38_chr"] - if grch38_ac != genomic_ac: - # Ensure genomic_ac is GRCh37 - chromosome, _ = self.seqrepo_access.translate_identifier( - genomic_ac, Assembly.GRCH37.value - ) - if not chromosome: - return GenomicTxSeg(errors=["`genomic_ac` must use GRCh37 or GRCh38"]) - - chromosome = chromosome[-1].split(":")[-1] - liftover_data = self.liftover.get_liftover( - chromosome, genomic_pos, Assembly.GRCH38 - ) - if liftover_data is None: - return GenomicTxSeg( - errors=[ - f"Position {genomic_pos} does not exist on chromosome {chromosome}" - ] - ) - - genomic_pos = liftover_data[1] - genomic_ac = grch38_ac + # Always liftover to GRCh38 + genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos( + genomic_ac, genomic_pos, grch38_ac=grch38_ac + ) + if err_msg: + return GenomicTxSeg(errors=[err_msg]) tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac) if not tx_exons: diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 273227a..b2f5d8a 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -525,6 +525,41 @@ def gusbp3_exon2_end(): return GenomicTxSegService(**params) +@pytest.fixture(scope="module") +def eln_grch38_intronic(): + """Create test fixture for ELN (issue-329)""" + params = { + "gene": "ELN", + "genomic_ac": "NC_000007.14", + "tx_ac": "NM_000501.4", + "seg_start": { + "exon_ord": 0, + "offset": 1, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + }, + "start": 74028173, + }, + }, + "seg_end": { + "exon_ord": 7, + "offset": 431, + "genomic_location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + }, + "end": 74043599, + }, + }, + } + return GenomicTxSegService(**params) + + @pytest.fixture(scope="module") def gusbp3_exon5_start(): """Create test fixture for GUSBP3, start of exon 5 (negative strand)""" @@ -667,6 +702,51 @@ def genomic_tx_seg_checks(actual, expected=None, is_valid=True): assert len(actual.errors) > 0 +@pytest.mark.asyncio() +async def test_get_grch38_ac_pos(test_egc_mapper): + """Test that _get_grch38_ac_pos works correctly""" + grch38_ac = "NC_000001.11" + grch38_pos = 154192135 + expected = grch38_ac, grch38_pos, None + + # GRCh37 provided + grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.10", 154164611) + assert grch38_data == expected + + # GRCh38 provided, no grch38_ac + grch38_data = await test_egc_mapper._get_grch38_ac_pos(grch38_ac, grch38_pos) + assert grch38_data == expected + + # GRCh38 and grch38_ac provided + grch38_data = await test_egc_mapper._get_grch38_ac_pos( + grch38_ac, grch38_pos, grch38_ac=grch38_ac + ) + assert grch38_data == expected + + # Unrecognized accession + invalid_ac = "NC_0000026.10" + grch38_data = await test_egc_mapper._get_grch38_ac_pos(invalid_ac, 154164611) + assert grch38_data == (None, None, f"Unrecognized genomic accession: {invalid_ac}.") + + # GRCh36 used + grch38_data = await test_egc_mapper._get_grch38_ac_pos("NC_000001.9", 154164611) + assert grch38_data == ( + None, + None, + "`genomic_ac` must use GRCh37 or GRCh38 assembly.", + ) + + # Unsuccessful liftover + grch38_data = await test_egc_mapper._get_grch38_ac_pos( + "NC_000001.10", 9999999999999999999 + ) + assert grch38_data == ( + None, + None, + "Lifting over 9999999999999999999 on NC_000001.10 from GRCh37 to GRCh38 was unsuccessful.", + ) + + @pytest.mark.asyncio() async def test_get_all_exon_coords( test_egc_mapper, nm_152263_exons, nm_152263_exons_genomic_coords @@ -1194,7 +1274,7 @@ async def test_transcript_to_genomic( @pytest.mark.asyncio() -async def test_valid_inputs(test_egc_mapper): +async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic): """Test that valid inputs don"t return any errors""" inputs = { "gene": "TPM3", @@ -1240,6 +1320,16 @@ async def test_valid_inputs(test_egc_mapper): ) assert all((resp.gene, resp.genomic_ac, resp.tx_ac, resp.seg_start, resp.seg_end)) + # Liftover + intronic space + resp = await test_egc_mapper.genomic_to_tx_segment( + genomic_ac="NC_000007.13", # not latest AC for chr 7 + seg_start_genomic=73442503, + seg_end_genomic=73457929, # not on an exon + gene="ELN", + get_nearest_transcript_junction=True, + ) + genomic_tx_seg_service_checks(resp, eln_grch38_intronic) + @pytest.mark.asyncio() async def test_invalid(test_egc_mapper):