Added utility to normalize subscripts for __getitem__.

This mimics Bioconductor's normalizeSingleBracketSubscript function.
BiocPy · Oct 27, 2023 · 537f3e2 · 537f3e2
1 parent 6276511
commit 537f3e2
Show file tree

Hide file tree

Showing 3 changed files with 247 additions and 1 deletion.
diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py
@@ -21,4 +21,5 @@
 from .intersect import intersect
 from .union import union
 from .subset import subset
-from .is_list_of_type import is_list_of_type
+from .is_list_of_type import is_list_of_type
+from .normalize_subscript import normalize_subscript
diff --git a/src/biocutils/normalize_subscript.py b/src/biocutils/normalize_subscript.py
@@ -0,0 +1,144 @@
+from typing import Union, Sequence, Optional, Tuple
+
+
+def _raise_int(idx: int, length):
+    """Raise an ``IndexError`` reporting that ``idx`` is out of range.
+
+    Args:
+        idx: The offending subscript value, as supplied by the caller.
+        length: Length of the vector-like object being subscripted.
+
+    Raises:
+        IndexError: Always.
+    """
+    message = f"subscript ({idx!s}) out of range for vector-like object of length {length!s}"
+    raise IndexError(message)
+
+
+# numpy is optional: record whether it can be imported so that the
+# numpy-specific isinstance checks below are skipped when it is absent.
+has_numpy = False
+try:
+    import numpy
+    has_numpy = True
+except ImportError:
+    # Only a missing/unimportable numpy is tolerated; a bare ``except`` would
+    # also have swallowed KeyboardInterrupt and SystemExit.
+    pass
+
+
+def normalize_subscript(sub: Union[slice, range, Sequence, int, str, bool], length: int, names: Optional[Sequence[str]] = None) -> Tuple:
+    """
+    Normalize a subscript for ``__getitem__`` or friends into a sequence of
+    integer indices, for consistent downstream use.
+
+    Args:
+        sub:
+            The subscript. This can be any of the following:
+
+            - A slice of elements.
+            - A range containing indices to elements. Negative values are
+              allowed. An error is raised if the indices are out of range.
+            - A single integer specifying the index of an element. A negative
+              value is allowed. An error is raised if the index is out of range.
+            - A single string that can be found in ``names``, which is
+              converted to the index of the first occurrence of that string in
+              ``names``. An error is raised if the string cannot be found.
+            - A single boolean, which is converted into a list containing the
+              first element if true, and an empty list if false.
+            - A sequence of strings, integers and/or booleans. Strings are
+              converted to indices based on first occurrence in ``names``,
+              as described above. Integers should be indices to an element.
+              Each truthy boolean is converted to an index equal to its
+              position in ``sub``, and each falsy boolean is ignored.
+
+        length:
+            Length of the object.
+
+        names:
+            List of names for each entry in the object. If not None, this
+            should have length equal to ``length``.
+
+    Returns:
+        A tuple containing (i) a sequence of integer indices in ``[0, length)``
+        specifying the subscript elements, and (ii) a boolean indicating whether
+        ``sub`` was a scalar. A scalar ``False`` yields ``([], False)``.
+
+        The first element is not always a new list: a range may be returned
+        for slice/range input, and a sequence that contains only non-negative
+        integers is returned as-is (the same object as ``sub``).
+
+    Raises:
+        IndexError:
+            If an integer subscript is outside ``[-length, length)``, or if a
+            string subscript is supplied while ``names`` is None.
+        ValueError:
+            If a single string subscript is not present in ``names``.
+        KeyError:
+            If a string inside a sequence subscript is not present in ``names``.
+    """
+    if isinstance(sub, bool) or (has_numpy and isinstance(sub, numpy.bool_)):  # before ints, as bools are ints.
+        if sub:
+            return [0], True
+        else:
+            return [], False
+
+    # Strings are handled before the integer branch because numpy.str_ is both
+    # a str and a numpy.generic; it must not reach the numeric comparisons.
+    if isinstance(sub, str):
+        if names is None:
+            raise IndexError("failed to find subscript '" + sub + "' for vector-like object with no names")
+        return [names.index(sub)], True
+
+    if isinstance(sub, int) or (has_numpy and isinstance(sub, numpy.generic)):
+        if sub < -length or sub >= length:
+            _raise_int(sub, length)
+        if sub < 0:
+            sub += length
+        return [int(sub)], True
+
+    if isinstance(sub, slice):
+        # slice.indices() already clips and resolves negative values.
+        return range(*sub.indices(length)), False
+
+    if isinstance(sub, range):
+        if len(sub) == 0:
+            return [], False
+
+        # A range is monotonic, so checking its two ends covers every element.
+        first = sub[0]
+        last = sub[-1]
+        for end in (first, last):
+            if end >= length:
+                _raise_int(end, length)
+        for end in (first, last):
+            if end < -length:
+                _raise_int(end, length)
+
+        negative_start = sub.start < 0
+        negative_stop = sub.stop < 0
+        if negative_start and negative_stop:
+            # Entirely negative: shift the whole range and keep it lazy.
+            return range(length + sub.start, length + sub.stop, sub.step), False
+        if not negative_start and not negative_stop:
+            # Entirely non-negative: already normalized.
+            return sub, False
+        # The range crosses zero, so each element is shifted individually.
+        return [x + length if x < 0 else x for x in sub], False
+
+    # Fast path: a sequence holding only non-negative integers needs no
+    # conversion, just a bounds check, and can be handed back unchanged.
+    can_return_early = True
+    for x in sub:
+        if isinstance(x, str) or isinstance(x, bool) or (has_numpy and isinstance(x, numpy.bool_)) or x < 0:
+            can_return_early = False
+            break
+
+    if can_return_early:
+        for x in sub:
+            if x >= length:
+                _raise_int(x, length)
+        return sub, False
+
+    output = []
+    pending_names = set()
+    # (position in ``output``, name) pairs. The name is stored alongside the
+    # position because falsy booleans are dropped, so positions in ``output``
+    # do not necessarily line up with positions in ``sub``.
+    string_slots = []
+    for i, x in enumerate(sub):
+        if isinstance(x, str):
+            pending_names.add(x)
+            string_slots.append((len(output), x))
+            output.append(None)  # placeholder, filled in once names are resolved.
+        elif isinstance(x, bool) or (has_numpy and isinstance(x, numpy.bool_)):
+            if x:
+                output.append(i)
+        elif x < 0:
+            if x < -length:
+                _raise_int(x, length)
+            output.append(x + length)
+        else:
+            if x >= length:
+                _raise_int(x, length)
+            output.append(x)
+
+    if pending_names:
+        if names is None:
+            raise IndexError("cannot find string subscripts for vector-like object with no names")
+
+        mapping = {}
+        for i, y in enumerate(names):
+            if y in pending_names:
+                mapping[y] = i
+                pending_names.remove(y)  # remove it so we only consider the first.
+                if not pending_names:
+                    break
+
+        for position, name in string_slots:
+            output[position] = mapping[name]  # KeyError if the name is absent.
+
+    return output, False
diff --git a/tests/test_normalize_subscript.py b/tests/test_normalize_subscript.py
@@ -0,0 +1,101 @@
+from biocutils import normalize_subscript
+import pytest
+import numpy
+
+
+def test_normalize_subscript_scalars():
+    """Scalar integer, boolean and string subscripts, plus their error cases."""
+    scalar_cases = [
+        (10, [10], True),
+        (-1, [99], True),
+        (True, [0], True),
+        (False, [], False),
+    ]
+    for subscript, indices, is_scalar in scalar_cases:
+        assert normalize_subscript(subscript, 100) == (indices, is_scalar)
+
+    letters = ["A", "B", "C", "D", "E"]
+    assert normalize_subscript("C", 5, letters) == ([2], True)
+    # A duplicated name resolves to its first occurrence.
+    assert normalize_subscript("B", 5, ["A", "B", "C", "B", "E"]) == ([1], True)
+
+    # Out-of-range integers, and a string without any names, give an IndexError.
+    index_errors = [
+        (100, "subscript (100)"),
+        (-11, "subscript (-11)"),
+        ("foor", "subscript 'foor'"),
+    ]
+    for subscript, fragment in index_errors:
+        with pytest.raises(IndexError) as ex:
+            normalize_subscript(subscript, 10)
+        assert fragment in str(ex.value)
+
+    # A string that is missing from the supplied names gives a ValueError.
+    with pytest.raises(ValueError):
+        normalize_subscript("F", 5, letters)
+
+
+def test_normalize_subscript_slice():
+    """Slices are resolved against the length into an equivalent range."""
+    slice_cases = [
+        (slice(10, 40), range(10, 40)),
+        (slice(-10, -20, -1), range(90, 80, -1)),
+    ]
+    for subscript, expected in slice_cases:
+        assert normalize_subscript(subscript, 100) == (expected, False)
+
+
+def test_normalize_subscript_range():
+    """Ranges: empty, positive, negative and zero-crossing, plus bounds errors."""
+    wrapped_up = list(range(90, 100)) + list(range(40))
+    wrapped_down = list(range(50, -1, -1)) + list(range(99, 90, -1))
+    valid_cases = [
+        (range(5, 2), []),
+        (range(10, 40), range(10, 40)),
+        (range(-10, 40), wrapped_up),
+        (range(50, -10, -1), wrapped_down),
+        (range(-10, -50, -1), range(90, 50, -1)),
+    ]
+    for subscript, expected in valid_cases:
+        assert normalize_subscript(subscript, 100) == (expected, False)
+
+    # Each entry pairs a range that just exceeds the bounds of a length-20
+    # object with the reported subscript and a neighbouring range that fits.
+    boundary_cases = [
+        (range(10, 50), "subscript (49)", range(10, 20)),
+        (range(20, 0, -1), "subscript (20)", range(19, 0, -1)),
+        (range(-21, -10), "subscript (-21)", range(-20, -10)),
+        (range(-10, -22, -1), "subscript (-21)", range(-10, -21, -1)),
+    ]
+    for too_far, fragment, just_fits in boundary_cases:
+        with pytest.raises(IndexError) as ex:
+            normalize_subscript(too_far, 20)
+        assert fragment in str(ex.value)
+        normalize_subscript(just_fits, 20)
+
+
+def test_normalize_subscript_chaos():
+    """Sequences mixing integers, negative integers, booleans and names."""
+    evens = [0, 2, 4, 6, 8]
+    assert normalize_subscript(evens, 50) == ([0, 2, 4, 6, 8], False)
+
+    # Negative entries wrap around the end of the object.
+    mixed_sign = [0, -1, 2, -3, 4, -5, 6, -7, 8]
+    assert normalize_subscript(mixed_sign, 50) == ([0, 49, 2, 47, 4, 45, 6, 43, 8], False)
+
+    # Out-of-range entries are reported with their original value.
+    out_of_range = [
+        ([0, 2, 50, 6, 8], "subscript (50)"),
+        ([0, 2, -51, 6, 8], "subscript (-51)"),
+    ]
+    for subscript, fragment in out_of_range:
+        with pytest.raises(IndexError) as ex:
+            normalize_subscript(subscript, 50)
+        assert fragment in str(ex.value)
+
+    # True contributes its own position; False contributes nothing.
+    with_flags = [False, 10, True, 20, False, 30, True]
+    assert normalize_subscript(with_flags, 50) == ([10, 2, 20, 30, 6], False)
+
+    names = ["A", "B", "C", "D", "E", "F"]
+    named = ["B", 1, "D", 2, "F", 3, "A"]
+    assert normalize_subscript(named, 6, names) == ([1, 1, 3, 2, 5, 3, 0], False)
+
+    # With duplicated names, the first occurrence wins.
+    repeated_names = ["A", "B", "A", "B", "A", "B"]
+    repeated = ["B", 1, "A", 2, "B", 3, "A"]
+    assert normalize_subscript(repeated, 6, repeated_names) == ([1, 1, 0, 2, 1, 3, 0], False)
+
+    # An unknown name inside a sequence surfaces as a KeyError.
+    with pytest.raises(KeyError):
+        normalize_subscript(["B", 1, "D", 2, "G", 3, "A"], 6, names)
+
+    # Names in the subscript but none on the object is an IndexError.
+    with pytest.raises(IndexError) as ex:
+        normalize_subscript(named, 6)
+    assert "vector-like object with no names" in str(ex.value)
+
+
+def test_normalize_subscript_numpy():
+    """NumPy arrays and NumPy scalars are accepted as subscripts."""
+    array_cases = [
+        (numpy.array([1, 3, 5]), numpy.array([1, 3, 5])),
+        (numpy.array([-1, -3, -5]), numpy.array([5, 3, 1])),
+    ]
+    for subscript, expected in array_cases:
+        indices, _ = normalize_subscript(subscript, 6)
+        assert (indices == expected).all()
+
+    # NumPy scalars behave like their built-in counterparts.
+    assert normalize_subscript(numpy.int64(5), 6) == ([5], True)
+    assert normalize_subscript(numpy.bool_(True), 6) == ([0], True)
+
+    # Now the trickiest part - are booleans converted correctly?
+    mask = numpy.array([True, False, True, False, True])
+    assert normalize_subscript(mask, 5) == ([0, 2, 4], False)