# Reconstructed from a collapsed git patch that touches three files. Each
# section below is introduced by the path of the file it belongs to.

# ---------------------------------------------------------------------------
# src/biocutils/__init__.py  (tail of the import list; the last line is new)
# ---------------------------------------------------------------------------
# from .intersect import intersect
# from .union import union
# from .subset import subset
# from .is_list_of_type import is_list_of_type
# from .normalize_subscript import normalize_subscript

# ---------------------------------------------------------------------------
# src/biocutils/normalize_subscript.py
# ---------------------------------------------------------------------------
from typing import Union, Sequence, Optional, Tuple


def _raise_int(idx: int, length):
    """Raise an ``IndexError`` reporting that ``idx`` is out of range."""
    raise IndexError(
        f"subscript ({idx}) out of range for vector-like object of length {length}"
    )


# numpy is an optional dependency: only a failed import should be tolerated,
# so catch ImportError rather than using a bare ``except``.
has_numpy = False
try:
    import numpy

    has_numpy = True
except ImportError:
    pass


def _is_bool(x) -> bool:
    """Return whether ``x`` is a Python bool or (if available) a numpy bool."""
    return isinstance(x, bool) or (has_numpy and isinstance(x, numpy.bool_))


def _normalize_range(sub: range, length: int):
    """Validate a ``range`` subscript and map negative entries into ``[0, length)``."""
    if len(sub) == 0:
        return []

    # A range is monotonic, so checking its two ends covers every element.
    # Upper-bound violations are reported before lower-bound ones.
    ends = (sub[0], sub[-1])
    for end in ends:
        if end >= length:
            _raise_int(end, length)
    for end in ends:
        if end < -length:
            _raise_int(end, length)

    if sub.start >= 0 and sub.stop >= 0:
        # Already entirely non-negative; hand it back untouched.
        return sub
    if sub.start < 0 and sub.stop < 0:
        # Entirely negative; shifting both ends keeps it a cheap range object.
        return range(length + sub.start, length + sub.stop, sub.step)

    # The range crosses zero, so it has to be expanded element by element.
    return [x + length if x < 0 else x for x in sub]


def _normalize_sequence(sub: Sequence, length: int, names: Optional[Sequence[str]]):
    """Normalize a sequence of integers, strings and/or booleans into indices."""
    # Fast path: nothing but non-negative integers, so only range-check and
    # return the input as-is instead of building a new list.
    needs_rewrite = any(isinstance(x, str) or _is_bool(x) or x < 0 for x in sub)
    if not needs_rewrite:
        for x in sub:
            if x >= length:
                _raise_int(x, length)
        return sub

    output = []
    # (slot in ``output``, requested name). The name is stored alongside the
    # slot because falsey booleans are dropped from ``output``, so a slot in
    # ``output`` is not, in general, a valid position in ``sub``.
    pending = []
    for i, x in enumerate(sub):
        if isinstance(x, str):
            pending.append((len(output), x))
            output.append(None)
        elif _is_bool(x):  # before ints, as bools are ints.
            if x:
                output.append(i)
        elif x < 0:
            if x < -length:
                _raise_int(x, length)
            output.append(x + length)
        else:
            if x >= length:
                _raise_int(x, length)
            output.append(x)

    if pending:
        if names is None:
            raise IndexError("cannot find string subscripts for vector-like object with no names")

        wanted = {name for _, name in pending}
        first_seen = {}
        for pos, name in enumerate(names):
            if name in wanted and name not in first_seen:
                first_seen[name] = pos  # only the first occurrence counts.
                if len(first_seen) == len(wanted):
                    break  # every requested name is resolved.

        for slot, name in pending:
            output[slot] = first_seen[name]  # KeyError if the name is absent.

    return output


def normalize_subscript(sub: Union[slice, range, Sequence, int, str, bool], length: int, names: Optional[Sequence[str]] = None) -> Tuple:
    """
    Normalize a subscript for ``__getitem__`` or friends into a sequence of
    integer indices, for consistent downstream use.

    Args:
        sub:
            The subscript. This can be any of the following:

            - A slice of elements.
            - A range containing indices to elements. Negative values are
              allowed. An error is raised if the indices are out of range.
            - A single integer specifying the index of an element. A negative
              value is allowed. An error is raised if the index is out of range.
            - A single string that can be found in ``names``, which is
              converted to the index of the first occurrence of that string in
              ``names``. An error is raised if the string cannot be found.
            - A single boolean, which is converted into a list containing the
              first element if true, and an empty list if false.
            - A sequence of strings, integers and/or booleans. Strings are
              converted to indices based on first occurrence in ``names``,
              as described above. Integers should be indices to an element.
              Each truthy boolean is converted to an index equal to its
              position in ``sub``, and each Falsey boolean is ignored.

        length:
            Length of the object.

        names:
            List of names for each entry in the object. If not None, this
            should have length equal to ``length``.

    Returns:
        A tuple containing (i) a sequence of integer indices in ``[0, length)``
        specifying the subscript elements, and (ii) a boolean indicating whether
        ``sub`` was a scalar. A scalar ``False`` is the exception: it yields
        ``([], False)``. For a sequence consisting solely of non-negative
        integers, the first element is ``sub`` itself rather than a copy.

    Raises:
        IndexError: If an integer index is out of range, or if a string is
            supplied while ``names`` is None.
        ValueError: If a scalar string is not present in ``names``.
        KeyError: If a string inside a sequence is not present in ``names``.
    """
    if _is_bool(sub):  # before ints, as bools are ints.
        if sub:
            return [0], True
        return [], False

    if isinstance(sub, int) or (has_numpy and isinstance(sub, numpy.generic)):
        if sub < -length or sub >= length:
            _raise_int(sub, length)
        if sub < 0:
            sub += length
        return [int(sub)], True

    if isinstance(sub, str):
        if names is None:
            raise IndexError("failed to find subscript '" + sub + "' for vector-like object with no names")
        return [names.index(sub)], True

    if isinstance(sub, slice):
        return range(*sub.indices(length)), False

    if isinstance(sub, range):
        return _normalize_range(sub, length), False

    return _normalize_sequence(sub, length, names), False


# ---------------------------------------------------------------------------
# tests/test_normalize_subscript.py
# ---------------------------------------------------------------------------
# from biocutils import normalize_subscript  # defined above in this listing
import pytest
import numpy


def test_normalize_subscript_scalars():
    assert normalize_subscript(10, 100) == ([10], True)
    assert normalize_subscript(-1, 100) == ([99], True)
    assert normalize_subscript(True, 100) == ([0], True)
    assert normalize_subscript(False, 100) == ([], False)
    assert normalize_subscript("C", 5, ["A", "B", "C", "D", "E"]) == ([2], True)
    assert normalize_subscript("B", 5, ["A", "B", "C", "B", "E"]) == ([1], True)  # takes first occurrence.

    with pytest.raises(IndexError) as ex:
        normalize_subscript(100, 10)
    assert str(ex.value).find("subscript (100)") >= 0

    with pytest.raises(IndexError) as ex:
        normalize_subscript(-11, 10)
    assert str(ex.value).find("subscript (-11)") >= 0

    with pytest.raises(IndexError) as ex:
        normalize_subscript("foor", 10)
    assert str(ex.value).find("subscript 'foor'") >= 0

    with pytest.raises(ValueError) as ex:
        normalize_subscript("F", 5, ["A", "B", "C", "D", "E"])


def test_normalize_subscript_slice():
    assert normalize_subscript(slice(10, 40), 100) == (range(10, 40), False)
    assert normalize_subscript(slice(-10, -20, -1), 100) == (range(90, 80, -1), False)


def test_normalize_subscript_range():
    assert normalize_subscript(range(5, 2), 100) == ([], False)
    assert normalize_subscript(range(10, 40), 100) == (range(10, 40), False)
    assert normalize_subscript(range(-10, 40), 100) == (list(range(90, 100)) + list(range(40)), False)
    assert normalize_subscript(range(50, -10, -1), 100) == (list(range(50, -1, -1)) + list(range(99, 90, -1)), False)
    assert normalize_subscript(range(-10, -50, -1), 100) == (range(90, 50, -1), False)

    with pytest.raises(IndexError) as ex:
        normalize_subscript(range(10, 50), 20)
    assert str(ex.value).find("subscript (49)") >= 0
    normalize_subscript(range(10, 20), 20)

    with pytest.raises(IndexError) as ex:
        normalize_subscript(range(20, 0, -1), 20)
    assert str(ex.value).find("subscript (20)") >= 0
    normalize_subscript(range(19, 0, -1), 20)

    with pytest.raises(IndexError) as ex:
        normalize_subscript(range(-21, -10), 20)
    assert str(ex.value).find("subscript (-21)") >= 0
    normalize_subscript(range(-20, -10), 20)

    with pytest.raises(IndexError) as ex:
        normalize_subscript(range(-10, -22, -1), 20)
    assert str(ex.value).find("subscript (-21)") >= 0
    normalize_subscript(range(-10, -21, -1), 20)


def test_normalize_subscript_chaos():
    assert normalize_subscript([0, 2, 4, 6, 8], 50) == ([0, 2, 4, 6, 8], False)

    with pytest.raises(IndexError) as ex:
        normalize_subscript([0, 2, 50, 6, 8], 50)
    assert str(ex.value).find("subscript (50)") >= 0

    assert normalize_subscript([0, -1, 2, -3, 4, -5, 6, -7, 8], 50) == ([0, 49, 2, 47, 4, 45, 6, 43, 8], False)

    with pytest.raises(IndexError) as ex:
        normalize_subscript([0, 2, -51, 6, 8], 50)
    assert str(ex.value).find("subscript (-51)") >= 0

    assert normalize_subscript([False, 10, True, 20, False, 30, True], 50) == ([10, 2, 20, 30, 6], False)

    names = ["A", "B", "C", "D", "E", "F"]
    assert normalize_subscript(["B", 1, "D", 2, "F", 3, "A"], 6, names) == ([1, 1, 3, 2, 5, 3, 0], False)
    assert normalize_subscript(["B", 1, "A", 2, "B", 3, "A"], 6, ["A", "B", "A", "B", "A", "B"]) == ([1, 1, 0, 2, 1, 3, 0], False)  # Takes the first occurrence.

    # Regression: a dropped False ahead of a string must not shift the lookup.
    assert normalize_subscript([False, "B", True, "A"], 6, names) == ([1, 2, 0], False)

    # Regression: plain non-negative integers take the no-copy fast path.
    plain = [0, 2, 4]
    assert normalize_subscript(plain, 50)[0] is plain

    with pytest.raises(KeyError) as ex:
        normalize_subscript(["B", 1, "D", 2, "G", 3, "A"], 6, names)

    with pytest.raises(IndexError) as ex:
        normalize_subscript(["B", 1, "D", 2, "F", 3, "A"], 6)
    assert str(ex.value).find("vector-like object with no names") >= 0


def test_normalize_subscript_numpy():
    out, x = normalize_subscript(numpy.array([1, 3, 5]), 6)
    assert (out == numpy.array([1, 3, 5])).all()

    out, x = normalize_subscript(numpy.array([-1, -3, -5]), 6)
    assert (out == numpy.array([5, 3, 1])).all()

    assert normalize_subscript(numpy.int64(5), 6) == ([5], True)
    assert normalize_subscript(numpy.bool_(True), 6) == ([0], True)

    # Now the trickiest part - are booleans converted correctly?
    assert normalize_subscript(numpy.array([True, False, True, False, True]), 5) == ([0, 2, 4], False)