From 0673d98ba620e49eb447d5dac0d3c3143b15e8f1 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 26 Dec 2023 12:43:30 +1100 Subject: [PATCH 1/3] MAINT: make test more explicit --- tests/test_aligndb.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_aligndb.py b/tests/test_aligndb.py index 64d0f4d..638769b 100644 --- a/tests/test_aligndb.py +++ b/tests/test_aligndb.py @@ -177,15 +177,16 @@ def test_len_gapped(): def test_all_gaps_in_slice(): # slicing GapPositions # sample seq 1 - # AC--GTA-TG -> gp = GapPositions(gaps=[[2,2],[5,1]], seq_length=7) - gp = GapPositions( - gaps=numpy.array([[2, 2], [5, 1]], dtype=numpy.int32), seq_length=7 - ) - got = gp[1:9] - expect_gaps = numpy.array([[1, 2], [4, 1]], dtype=numpy.int32) + data = "AC--GTA-TG" + seq = make_seq(data, moltype="dna") + g, s = seq_to_gap_coords(seq) + gp = GapPositions(g, len(data.replace("-", ""))) + sl = slice(1, 9) + + got = gp[sl] + expect_gaps, expect_seq = seq_to_gap_coords(make_seq(data[sl], moltype="dna")) assert (got.gaps == expect_gaps).all() assert got.seq_length == 5 - # gp[1:9] -> C--GTA-T -> GapPositions(gaps=[[1,2],[4,1]], seq_length=5) @pytest.mark.parametrize( From 2150da432ee8e89f909b38be4c437b6b78e9804c Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 26 Dec 2023 12:47:00 +1100 Subject: [PATCH 2/3] MAINT: more robust build of GapPositions [CHANGED] slicing a 2D array as [:0] produces a result with shape (0, 2). We force it to (0, 0). --- src/ensembl_lite/_aligndb.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ensembl_lite/_aligndb.py b/src/ensembl_lite/_aligndb.py index 808007e..752ed41 100644 --- a/src/ensembl_lite/_aligndb.py +++ b/src/ensembl_lite/_aligndb.py @@ -242,6 +242,12 @@ class GapPositions: seq_length: int def __post_init__(self): + if not len(self.gaps): + # can get have a zero length array with shape != (0, 0) + # e.g. by slicing gaps[:0], but since there's no data + # we force it to have zero elements on both dimensions + self.gaps = self.gaps.reshape((0, 0)) + # make gap array immutable self.gaps.flags.writeable = False From cf17cae12e2e94eba853ccf4c84fb04fb24bf71d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 26 Dec 2023 12:56:22 +1100 Subject: [PATCH 3/3] TST: increase test cases for slicing GapPositions and fix errors --- src/ensembl_lite/_aligndb.py | 52 ++++++++++++++++++++++++++---------- tests/test_aligndb.py | 29 +++++++++++--------- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/src/ensembl_lite/_aligndb.py b/src/ensembl_lite/_aligndb.py index 752ed41..b477c02 100644 --- a/src/ensembl_lite/_aligndb.py +++ b/src/ensembl_lite/_aligndb.py @@ -180,15 +180,13 @@ def _ends_within_gap(gaps: numpy.ndarray, align_index: int) -> numpy.ndarray: gap_start = gap_index + total_gaps gap_end = gap_start + gap_length # align index can fall between gaps or within a gap - if gap_start == align_index: + if align_index <= gap_start: new_gaps = gaps[:i] break - elif align_index == gap_end: + elif gap_start < align_index <= gap_end: + # we end within a gap, so adjust the length of that gap new_gaps = gaps[: i + 1] - break - elif align_index < gap_start: - # align_index is before this gap - new_gaps = gaps[:i] + new_gaps[-1, 1] = align_index - gap_start break total_gaps += gap_length @@ -221,7 +219,10 @@ def _starts_within_gap(gaps: numpy.ndarray, align_index: int) -> numpy.ndarray: gap_start_diff = align_index - gap_index new_gaps[0, 1] = gap_length - gap_start_diff break - elif align_index == gap_end or align_index < gap_start: + elif align_index < gap_start: + new_gaps = gaps[i:] + break + elif align_index == gap_end: new_gaps = gaps[i + 1 :] break total_gaps += gap_length @@ -232,6 +233,26 @@ def _starts_within_gap(gaps: numpy.ndarray, align_index: int) -> numpy.ndarray: return new_gaps +def _within_a_gap(gaps: numpy.ndarray, start: int, stop: int) -> bool: + """return True if start/stop fall within a gap + + Parameters + ---------- + gaps + numpy 2D array + start, stop + start and stop are align indices + """ + # todo convert to numba + cumsum_gap_length = 0 + for gap_start, gap_length in gaps: + gap_start += cumsum_gap_length + if gap_start <= start < stop <= gap_start + gap_length: + return True + cumsum_gap_length += gap_length + return False + + @dataclass class GapPositions: # 2D numpy int array, @@ -264,28 +285,31 @@ def __getitem__(self, item: slice) -> typing.Self: f"{type(self).__name__!r} does not support negative indexes" ) # slice is before first gap or after last gap - if stop < gaps[0, 0] or start > gaps[-1].sum(): + total_gap_length = gaps[:, 1].sum() + if stop <= gaps[0, 0] or start > gaps[-1][0] + total_gap_length: gaps = numpy.empty(shape=(0, 0), dtype=gaps.dtype) return type(self)(gaps=gaps, seq_length=stop - start) - total_gaps = gaps[:, 1].sum() - if start < gaps[0, 0] and stop > gaps[-1, 0] + total_gaps: - # slice result contains all gaps, so we shift gaps left + if start < gaps[0, 0] and stop > gaps[-1, 0] + total_gap_length: + # slice result contains all gaps and shift gap left gaps = _adjust_gap_starts(gaps, start) seq_length = self.from_align_to_seq_index(stop) - start return type(self)(gaps=gaps, seq_length=seq_length) - - if start < gaps[0, 0]: + elif start < gaps[0, 0]: # start is in seq coords, ends within gaps seq_length = self.from_align_to_seq_index(stop) - start gaps = _ends_within_gap(gaps, stop) gaps = _adjust_gap_starts(gaps, start) - elif stop > total_gaps + gaps[-1, 0]: + elif stop > total_gap_length + gaps[-1, 0]: # slice starts within gaps gaps = _starts_within_gap(gaps, start) seq_start = self.from_align_to_seq_index(start) gaps = _adjust_gap_starts(gaps, seq_start) seq_length = self.from_align_to_seq_index(stop) - seq_start + elif _within_a_gap(gaps, start, stop): + # falls within a gap + gaps = numpy.array([[0, stop - start]], dtype=gaps.dtype) + return type(self)(gaps=gaps, seq_length=0) else: # slice is within the gaps gaps = _ends_within_gap(gaps, stop) diff --git a/tests/test_aligndb.py b/tests/test_aligndb.py index 638769b..8d20979 100644 --- a/tests/test_aligndb.py +++ b/tests/test_aligndb.py @@ -189,26 +189,29 @@ def test_all_gaps_in_slice(): assert got.seq_length == 5 +@pytest.mark.parametrize("data", ("----GTA-TG", "AC--GTA---", "AC--GTA-TG")) @pytest.mark.parametrize( "slice", ( - slice(1, 9), # slice spans all gaps - slice(0, 7), # slice ends within a gap - slice(0, 8), # slice ends within a gap - slice(0, 5), # slice ends within a gap - slice(2, 9), # slice starts within a gap - slice(3, 9), # variant starts within gap - slice(4, 9), # variant starts within gap - slice(6, 9), # variant starts within gap - slice(3, 8), # slice starts & ends within a gap - slice(2, 4), # result is all gaps + slice(0, 2), + slice(8, 10), + slice(1, 9), + slice(0, 7), + slice(0, 8), + slice(0, 5), + slice(2, 9), + slice(3, 9), + slice(4, 9), + slice(6, 9), + slice(3, 8), + slice(4, 6), + slice(2, 4), ), ) -def test_variant_slices(slice): - data = "AC--GTA-TG" +def test_variant_slices(data, slice): seq = make_seq(data, moltype="dna") g, s = seq_to_gap_coords(seq) - gaps = GapPositions(g, len(seq)) + gaps = GapPositions(g, len(s)) orig = gaps.gaps.copy() got = gaps[slice]