Skip to content

Commit

Permalink
Added utility to normalize subscripts for __getitem__.
Browse files Browse the repository at this point in the history
This mimics Bioconductor's normalizeSingleBracketSubscript function.
  • Loading branch information
LTLA committed Oct 27, 2023
1 parent 6276511 commit 537f3e2
Show file tree
Hide file tree
Showing 3 changed files with 247 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/biocutils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@
from .intersect import intersect
from .union import union
from .subset import subset
from .is_list_of_type import is_list_of_type
from .is_list_of_type import is_list_of_type
from .normalize_subscript import normalize_subscript
144 changes: 144 additions & 0 deletions src/biocutils/normalize_subscript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from typing import Union, Sequence, Optional, Tuple


def _raise_int(idx: int, length: int):
    """Raise an ``IndexError`` for an out-of-range integer subscript.

    Args:
        idx: The offending subscript, reported exactly as supplied.
        length: Length of the vector-like object being subscripted.

    Raises:
        IndexError: Always.
    """
    raise IndexError(f"subscript ({idx}) out of range for vector-like object of length {length}")


# numpy is optional: when it cannot be imported, the numpy-specific type
# checks below are skipped via the ``has_numpy`` flag.
has_numpy = False
try:
    import numpy

    has_numpy = True
except ImportError:
    pass


def _is_bool(x) -> bool:
    """Report whether ``x`` is a Python ``bool`` or (if available) a ``numpy.bool_``."""
    return isinstance(x, bool) or (has_numpy and isinstance(x, numpy.bool_))


def normalize_subscript(
    sub: Union[slice, range, Sequence, int, str, bool],
    length: int,
    names: Optional[Sequence[str]] = None,
) -> Tuple:
    """Normalize a ``__getitem__``-style subscript into integer indices.

    Args:
        sub:
            The subscript. This can be any of the following:

            - A slice of elements.
            - A range containing indices to elements. Negative values are
              allowed. An error is raised if the indices are out of range.
            - A single integer specifying the index of an element. A negative
              value is allowed. An error is raised if the index is out of
              range.
            - A single string that can be found in ``names``, which is
              converted to the index of the first occurrence of that string
              in ``names``. An error is raised if the string cannot be found.
            - A single boolean, which is converted into a list containing the
              first element if true, and an empty list if false.
            - A sequence of strings, integers and/or booleans. Strings are
              converted to indices based on first occurrence in ``names``, as
              described above. Integers should be indices to an element
              (negative values are allowed). Each truthy boolean is converted
              to an index equal to its position in ``sub``, and each falsy
              boolean is ignored.

        length:
            Length of the object.

        names:
            List of names for each entry in the object. If not None, this
            should have length equal to ``length``.

    Returns:
        A tuple containing (i) a sequence of integer indices specifying the
        subscript elements, and (ii) a boolean indicating whether ``sub`` was
        a scalar. A scalar ``False`` yields ``([], False)``.

        The first element may be ``sub`` itself (for a range or sequence that
        needs no conversion), so callers should not modify it in place.

    Raises:
        IndexError: If an integer index is out of range, or if a string is
            supplied while ``names`` is None.
        ValueError: If a scalar string is not present in ``names``.
        KeyError: If a string inside a sequence is not present in ``names``.
    """
    if _is_bool(sub):  # before ints, as bools are ints.
        if sub:
            return [0], True
        return [], False

    # Strings are checked before numbers so that numpy string scalars (which
    # are both ``str`` and ``numpy.generic``) are looked up by name instead of
    # being compared against ``length``.
    if isinstance(sub, str):
        if names is None:
            raise IndexError("failed to find subscript '" + sub + "' for vector-like object with no names")
        return [names.index(sub)], True

    if isinstance(sub, int) or (has_numpy and isinstance(sub, numpy.generic)):
        if sub < -length or sub >= length:
            _raise_int(sub, length)
        if sub < 0:
            sub += length
        return [int(sub)], True

    if isinstance(sub, slice):
        return range(*sub.indices(length)), False

    if isinstance(sub, range):
        if len(sub) == 0:
            return [], False

        # A range is monotonic, so only its two endpoints need checking.
        # Upper bounds are checked before lower bounds to keep the reported
        # subscript stable.
        endpoints = (sub[0], sub[-1])
        for value in endpoints:
            if value >= length:
                _raise_int(value, length)
        for value in endpoints:
            if value < -length:
                _raise_int(value, length)

        start_negative = sub.start < 0
        stop_negative = sub.stop < 0
        if start_negative and stop_negative:
            # Entirely negative: shifting both bounds keeps it a lazy range.
            return range(length + sub.start, length + sub.stop, sub.step), False
        if not start_negative and not stop_negative:
            # Entirely non-negative: nothing to convert.
            return sub, False
        # The bounds straddle zero, so wrap the negative elements one by one.
        return [x + length if x < 0 else x for x in sub], False

    # Fast path: a sequence holding only non-negative integers needs bounds
    # checking but no conversion, so it is handed back as-is.
    needs_conversion = any(isinstance(x, str) or _is_bool(x) or x < 0 for x in sub)
    if not needs_conversion:
        for x in sub:
            if x >= length:
                _raise_int(x, length)
        return sub, False

    output = []
    # (position in ``output``, name) for each string awaiting resolution.
    # Positions in ``output`` and ``sub`` diverge once a falsy boolean has
    # been dropped, so the name must be stored alongside the position.
    pending = []
    for i, x in enumerate(sub):
        if isinstance(x, str):
            pending.append((len(output), x))
            output.append(None)
        elif _is_bool(x):
            # NOTE(review): the position of a truthy boolean is not checked
            # against ``length``.
            if x:
                output.append(i)
        elif x < 0:
            if x < -length:
                _raise_int(x, length)
            output.append(x + length)
        else:
            if x >= length:
                _raise_int(x, length)
            output.append(x)

    if pending:
        if names is None:
            raise IndexError("cannot find string subscripts for vector-like object with no names")

        unresolved = {name for _, name in pending}
        first_index = {}
        for i, candidate in enumerate(names):
            if candidate in unresolved:
                first_index[candidate] = i
                unresolved.remove(candidate)  # remove it so we only consider the first.
                if not unresolved:
                    break

        for position, name in pending:
            output[position] = first_index[name]  # KeyError if the name is absent.

    return output, False
101 changes: 101 additions & 0 deletions tests/test_normalize_subscript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from biocutils import normalize_subscript
import pytest
import numpy


def test_normalize_subscript_scalars():
    """Scalar subscripts (int, bool, str) are normalized or rejected."""
    accepted = [
        ((10, 100), ([10], True)),
        ((-1, 100), ([99], True)),
        ((True, 100), ([0], True)),
        ((False, 100), ([], False)),
        (("C", 5, ["A", "B", "C", "D", "E"]), ([2], True)),
        # A duplicated name resolves to its first occurrence.
        (("B", 5, ["A", "B", "C", "B", "E"]), ([1], True)),
    ]
    for arguments, expected in accepted:
        assert normalize_subscript(*arguments) == expected

    # Integers outside [-length, length) are reported in the error message.
    for bad_index, fragment in [(100, "subscript (100)"), (-11, "subscript (-11)")]:
        with pytest.raises(IndexError) as caught:
            normalize_subscript(bad_index, 10)
        assert fragment in str(caught.value)

    # A string subscript without any names cannot be resolved.
    with pytest.raises(IndexError) as caught:
        normalize_subscript("foor", 10)
    assert "subscript 'foor'" in str(caught.value)

    # A string that is absent from the supplied names.
    with pytest.raises(ValueError):
        normalize_subscript("F", 5, ["A", "B", "C", "D", "E"])


def test_normalize_subscript_slice():
    """Slices are resolved against the length and returned as ranges."""
    forward = normalize_subscript(slice(10, 40), 100)
    assert forward == (range(10, 40), False)

    # Negative bounds are resolved relative to the end of the object.
    backward = normalize_subscript(slice(-10, -20, -1), 100)
    assert backward == (range(90, 80, -1), False)


def test_normalize_subscript_range():
    """Ranges are bounds-checked and any negative indices are wrapped."""
    conversions = [
        (range(5, 2), []),
        (range(10, 40), range(10, 40)),
        (range(-10, 40), list(range(90, 100)) + list(range(40))),
        (range(50, -10, -1), list(range(50, -1, -1)) + list(range(99, 90, -1))),
        (range(-10, -50, -1), range(90, 50, -1)),
    ]
    for subscript, expected in conversions:
        assert normalize_subscript(subscript, 100) == (expected, False)

    # Each entry pairs a range that just oversteps a bound (with the
    # subscript named in the error) with its in-bounds neighbour, which must
    # be accepted without raising.
    boundaries = [
        (range(10, 50), "subscript (49)", range(10, 20)),
        (range(20, 0, -1), "subscript (20)", range(19, 0, -1)),
        (range(-21, -10), "subscript (-21)", range(-20, -10)),
        (range(-10, -22, -1), "subscript (-21)", range(-10, -21, -1)),
    ]
    for rejected, fragment, permitted in boundaries:
        with pytest.raises(IndexError) as caught:
            normalize_subscript(rejected, 20)
        assert fragment in str(caught.value)
        normalize_subscript(permitted, 20)


def test_normalize_subscript_chaos():
    """Sequences mixing integers, booleans and strings are normalized."""
    # Non-negative integers pass through unchanged.
    evens = normalize_subscript([0, 2, 4, 6, 8], 50)
    assert evens == ([0, 2, 4, 6, 8], False)

    with pytest.raises(IndexError) as caught:
        normalize_subscript([0, 2, 50, 6, 8], 50)
    assert "subscript (50)" in str(caught.value)

    # Negative integers are wrapped around the end.
    wrapped = normalize_subscript([0, -1, 2, -3, 4, -5, 6, -7, 8], 50)
    assert wrapped == ([0, 49, 2, 47, 4, 45, 6, 43, 8], False)

    with pytest.raises(IndexError) as caught:
        normalize_subscript([0, 2, -51, 6, 8], 50)
    assert "subscript (-51)" in str(caught.value)

    # Truthy booleans contribute their own position; falsy ones are dropped.
    flagged = normalize_subscript([False, 10, True, 20, False, 30, True], 50)
    assert flagged == ([10, 2, 20, 30, 6], False)

    names = ["A", "B", "C", "D", "E", "F"]
    by_name = normalize_subscript(["B", 1, "D", 2, "F", 3, "A"], 6, names)
    assert by_name == ([1, 1, 3, 2, 5, 3, 0], False)

    # With duplicated names, the first occurrence wins.
    repeated = normalize_subscript(["B", 1, "A", 2, "B", 3, "A"], 6, ["A", "B", "A", "B", "A", "B"])
    assert repeated == ([1, 1, 0, 2, 1, 3, 0], False)

    # A name missing from ``names``.
    with pytest.raises(KeyError):
        normalize_subscript(["B", 1, "D", 2, "G", 3, "A"], 6, names)

    # String subscripts without any names at all.
    with pytest.raises(IndexError) as caught:
        normalize_subscript(["B", 1, "D", 2, "F", 3, "A"], 6)
    assert "vector-like object with no names" in str(caught.value)


def test_normalize_subscript_numpy():
    """NumPy arrays and NumPy scalars are accepted as subscripts."""
    positive = numpy.array([1, 3, 5])
    indices, _ = normalize_subscript(positive, 6)
    assert (indices == numpy.array([1, 3, 5])).all()

    negative = numpy.array([-1, -3, -5])
    indices, _ = normalize_subscript(negative, 6)
    assert (indices == numpy.array([5, 3, 1])).all()

    # NumPy scalars behave like their Python counterparts.
    assert normalize_subscript(numpy.int64(5), 6) == ([5], True)
    assert normalize_subscript(numpy.bool_(True), 6) == ([0], True)

    # The trickiest part: a boolean array must be treated as a mask.
    mask = numpy.array([True, False, True, False, True])
    assert normalize_subscript(mask, 5) == ([0, 2, 4], False)

0 comments on commit 537f3e2

Please sign in to comment.