Skip to content

Commit

Permalink
Merge pull request #48 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
More test cases
  • Loading branch information
GavinHuttley authored Dec 26, 2023
2 parents 6929f4d + cf17cae commit bcfe75b
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 34 deletions.
58 changes: 44 additions & 14 deletions src/ensembl_lite/_aligndb.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,13 @@ def _ends_within_gap(gaps: numpy.ndarray, align_index: int) -> numpy.ndarray:
gap_start = gap_index + total_gaps
gap_end = gap_start + gap_length
# align index can fall between gaps or within a gap
if gap_start == align_index:
if align_index <= gap_start:
new_gaps = gaps[:i]
break
elif align_index == gap_end:
elif gap_start < align_index <= gap_end:
# we end within a gap, so adjust the length of that gap
new_gaps = gaps[: i + 1]
break
elif align_index < gap_start:
# align_index is before this gap
new_gaps = gaps[:i]
new_gaps[-1, 1] = align_index - gap_start
break

total_gaps += gap_length
Expand Down Expand Up @@ -221,7 +219,10 @@ def _starts_within_gap(gaps: numpy.ndarray, align_index: int) -> numpy.ndarray:
gap_start_diff = align_index - gap_index
new_gaps[0, 1] = gap_length - gap_start_diff
break
elif align_index == gap_end or align_index < gap_start:
elif align_index < gap_start:
new_gaps = gaps[i:]
break
elif align_index == gap_end:
new_gaps = gaps[i + 1 :]
break
total_gaps += gap_length
Expand All @@ -232,6 +233,26 @@ def _starts_within_gap(gaps: numpy.ndarray, align_index: int) -> numpy.ndarray:
return new_gaps


def _within_a_gap(gaps: numpy.ndarray, start: int, stop: int) -> bool:
"""return True if start/stop fall within a gap
Parameters
----------
gaps
numpy 2D array
start, stop
start and stop are align indices
"""
# todo convert to numba
cumsum_gap_length = 0
for gap_start, gap_length in gaps:
gap_start += cumsum_gap_length
if gap_start <= start < stop <= gap_start + gap_length:
return True
cumsum_gap_length += gap_length
return False


@dataclass
class GapPositions:
# 2D numpy int array,
Expand All @@ -242,6 +263,12 @@ class GapPositions:
seq_length: int

def __post_init__(self):
if not len(self.gaps):
# can get have a zero length array with shape != (0, 0)
# e.g. by slicing gaps[:0], but since there's no data
# we force it to have zero elements on both dimensions
self.gaps = self.gaps.reshape((0, 0))

# make gap array immutable
self.gaps.flags.writeable = False

Expand All @@ -258,28 +285,31 @@ def __getitem__(self, item: slice) -> typing.Self:
f"{type(self).__name__!r} does not support negative indexes"
)
# slice is before first gap or after last gap
if stop < gaps[0, 0] or start > gaps[-1].sum():
total_gap_length = gaps[:, 1].sum()
if stop <= gaps[0, 0] or start > gaps[-1][0] + total_gap_length:
gaps = numpy.empty(shape=(0, 0), dtype=gaps.dtype)
return type(self)(gaps=gaps, seq_length=stop - start)

total_gaps = gaps[:, 1].sum()
if start < gaps[0, 0] and stop > gaps[-1, 0] + total_gaps:
# slice result contains all gaps, so we shift gaps left
if start < gaps[0, 0] and stop > gaps[-1, 0] + total_gap_length:
# slice result contains all gaps and shift gap left
gaps = _adjust_gap_starts(gaps, start)
seq_length = self.from_align_to_seq_index(stop) - start
return type(self)(gaps=gaps, seq_length=seq_length)

if start < gaps[0, 0]:
elif start < gaps[0, 0]:
# start is in seq coords, ends within gaps
seq_length = self.from_align_to_seq_index(stop) - start
gaps = _ends_within_gap(gaps, stop)
gaps = _adjust_gap_starts(gaps, start)
elif stop > total_gaps + gaps[-1, 0]:
elif stop > total_gap_length + gaps[-1, 0]:
# slice starts within gaps
gaps = _starts_within_gap(gaps, start)
seq_start = self.from_align_to_seq_index(start)
gaps = _adjust_gap_starts(gaps, seq_start)
seq_length = self.from_align_to_seq_index(stop) - seq_start
elif _within_a_gap(gaps, start, stop):
# falls within a gap
gaps = numpy.array([[0, stop - start]], dtype=gaps.dtype)
return type(self)(gaps=gaps, seq_length=0)
else:
# slice is within the gaps
gaps = _ends_within_gap(gaps, stop)
Expand Down
44 changes: 24 additions & 20 deletions tests/test_aligndb.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,37 +177,41 @@ def test_len_gapped():
def test_all_gaps_in_slice():
# slicing GapPositions
# sample seq 1
# AC--GTA-TG -> gp = GapPositions(gaps=[[2,2],[5,1]], seq_length=7)
gp = GapPositions(
gaps=numpy.array([[2, 2], [5, 1]], dtype=numpy.int32), seq_length=7
)
got = gp[1:9]
expect_gaps = numpy.array([[1, 2], [4, 1]], dtype=numpy.int32)
data = "AC--GTA-TG"
seq = make_seq(data, moltype="dna")
g, s = seq_to_gap_coords(seq)
gp = GapPositions(g, len(data.replace("-", "")))
sl = slice(1, 9)

got = gp[sl]
expect_gaps, expect_seq = seq_to_gap_coords(make_seq(data[sl], moltype="dna"))
assert (got.gaps == expect_gaps).all()
assert got.seq_length == 5
# gp[1:9] -> C--GTA-T -> GapPositions(gaps=[[1,2],[4,1]], seq_length=5)


@pytest.mark.parametrize("data", ("----GTA-TG", "AC--GTA---", "AC--GTA-TG"))
@pytest.mark.parametrize(
"slice",
(
slice(1, 9), # slice spans all gaps
slice(0, 7), # slice ends within a gap
slice(0, 8), # slice ends within a gap
slice(0, 5), # slice ends within a gap
slice(2, 9), # slice starts within a gap
slice(3, 9), # variant starts within gap
slice(4, 9), # variant starts within gap
slice(6, 9), # variant starts within gap
slice(3, 8), # slice starts & ends within a gap
slice(2, 4), # result is all gaps
slice(0, 2),
slice(8, 10),
slice(1, 9),
slice(0, 7),
slice(0, 8),
slice(0, 5),
slice(2, 9),
slice(3, 9),
slice(4, 9),
slice(6, 9),
slice(3, 8),
slice(4, 6),
slice(2, 4),
),
)
def test_variant_slices(slice):
data = "AC--GTA-TG"
def test_variant_slices(data, slice):
seq = make_seq(data, moltype="dna")
g, s = seq_to_gap_coords(seq)
gaps = GapPositions(g, len(seq))
gaps = GapPositions(g, len(s))
orig = gaps.gaps.copy()
got = gaps[slice]

Expand Down

0 comments on commit bcfe75b

Please sign in to comment.