-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added utility to normalize subscripts for __getitem__.
This mimics Bioconductor's normalizeSingleBracketSubscript function.
- Loading branch information
Showing
3 changed files
with
247 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
from typing import Union, Sequence, Optional, Tuple | ||
|
||
|
||
def _raise_int(idx: int, length): | ||
raise IndexError("subscript (" + str(idx) + ") out of range for vector-like object of length " + str(length)) | ||
|
||
|
||
# numpy is optional: when it cannot be imported, only built-in scalar types
# are recognised by the checks below.
has_numpy = False
try:
    import numpy

    has_numpy = True
except ImportError:
    pass


def _is_scalar_bool(x) -> bool:
    """Check whether ``x`` is a built-in bool or, if numpy is available, a ``numpy.bool_``."""
    return isinstance(x, bool) or (has_numpy and isinstance(x, numpy.bool_))


def normalize_subscript(
    sub: Union[slice, range, Sequence, int, str, bool],
    length: int,
    names: Optional[Sequence[str]] = None,
) -> Tuple:
    """
    Normalize a subscript for ``__getitem__`` or friends into a sequence of
    integer indices, for consistent downstream use.

    Args:
        sub:
            The subscript. This can be any of the following:

            - A slice of elements.
            - A range containing indices to elements. Negative values are
              allowed. An error is raised if the indices are out of range.
            - A single integer specifying the index of an element. A negative
              value is allowed. An error is raised if the index is out of range.
            - A single string that can be found in ``names``, which is
              converted to the index of the first occurrence of that string in
              ``names``. An error is raised if the string cannot be found.
            - A single boolean, which is converted into a list containing the
              first element if true, and an empty list if false. Note that a
              false boolean is reported as non-scalar.
            - A sequence of strings, integers and/or booleans. Strings are
              converted to indices based on first occurrence in ``names``,
              as described above. Integers should be indices to an element.
              Each truthy boolean is converted to an index equal to its
              position in ``sub``, and each falsey boolean is ignored.

        length:
            Length of the object.

        names:
            List of names for each entry in the object. If not None, this
            should have length equal to ``length``.

    Returns:
        A tuple containing (i) a sequence of integer indices specifying the
        subscript elements, and (ii) a boolean indicating whether ``sub`` was
        a scalar. Integer and string subscripts are normalized into
        ``[0, length)``. A sequence that contains only non-negative,
        in-range integers is returned as-is rather than copied.

    Raises:
        IndexError: If an integer subscript is out of range, or if a string
            subscript is supplied while ``names`` is None.
        ValueError: If a single string subscript is not present in ``names``.
        KeyError: If a string inside a sequence is not present in ``names``.
    """
    if _is_scalar_bool(sub):  # before ints, as bools are ints.
        if sub:
            return [0], True
        return [], False

    # Strings are checked before integers because numpy string scalars are
    # also ``numpy.generic`` and would otherwise be compared against ``length``.
    if isinstance(sub, str):
        if names is None:
            raise IndexError("failed to find subscript '" + sub + "' for vector-like object with no names")
        return [names.index(sub)], True

    if isinstance(sub, int) or (has_numpy and isinstance(sub, numpy.generic)):
        if sub < -length or sub >= length:
            _raise_int(sub, length)
        if sub < 0:
            sub += length
        return [int(sub)], True

    if isinstance(sub, slice):
        return range(*sub.indices(length)), False

    if isinstance(sub, range):
        if len(sub) == 0:
            return [], False

        # A range is monotonic, so validating both ends validates everything.
        # Upper-bound violations are reported before lower-bound ones.
        ends = (sub[0], sub[-1])
        for end in ends:
            if end >= length:
                _raise_int(end, length)
        for end in ends:
            if end < -length:
                _raise_int(end, length)

        if sub.start >= 0 and sub.stop >= 0:
            return sub, False
        if sub.start < 0 and sub.stop < 0:
            # Entirely negative: shift the whole range without materialising it.
            return range(length + sub.start, length + sub.stop, sub.step), False
        # The range crosses zero, so each element is wrapped individually.
        return [x + length if x < 0 else x for x in sub], False

    # Fast path: a sequence holding only non-negative integers needs no
    # conversion, only a bounds check, and can be handed back unchanged.
    needs_conversion = any(isinstance(x, str) or _is_scalar_bool(x) or x < 0 for x in sub)
    if not needs_conversion:
        for x in sub:
            if x >= length:
                _raise_int(x, length)
        return sub, False

    output = []
    # (position in ``output``, name) for every string subscript. The name is
    # stored alongside the position because falsey booleans are dropped, so
    # positions in ``output`` do not generally line up with positions in ``sub``.
    pending_names = []
    for i, x in enumerate(sub):
        if isinstance(x, str):
            pending_names.append((len(output), x))
            output.append(None)
        elif _is_scalar_bool(x):
            if x:
                output.append(i)
        elif x < 0:
            if x < -length:
                _raise_int(x, length)
            output.append(x + length)
        else:
            if x >= length:
                _raise_int(x, length)
            output.append(x)

    if pending_names:
        if names is None:
            raise IndexError("cannot find string subscripts for vector-like object with no names")

        wanted = {name for _, name in pending_names}
        first_index = {}
        for i, y in enumerate(names):
            if y in wanted and y not in first_index:  # only the first occurrence counts.
                first_index[y] = i

        for position, name in pending_names:
            output[position] = first_index[name]  # KeyError if the name is absent.

    return output, False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
from biocutils import normalize_subscript | ||
import pytest | ||
import numpy | ||
|
||
|
||
def test_normalize_subscript_scalars():
    """Scalar integers, booleans and strings are normalized to a single index."""
    letters = ["A", "B", "C", "D", "E"]

    assert normalize_subscript(10, 100) == ([10], True)
    assert normalize_subscript(-1, 100) == ([99], True)
    assert normalize_subscript(True, 100) == ([0], True)
    assert normalize_subscript(False, 100) == ([], False)
    assert normalize_subscript("C", 5, letters) == ([2], True)
    # A duplicated name resolves to its first occurrence.
    assert normalize_subscript("B", 5, ["A", "B", "C", "B", "E"]) == ([1], True)

    # Integer too large for the object.
    with pytest.raises(IndexError) as err:
        normalize_subscript(100, 10)
    assert "subscript (100)" in str(err.value)

    # Integer too negative for the object.
    with pytest.raises(IndexError) as err:
        normalize_subscript(-11, 10)
    assert "subscript (-11)" in str(err.value)

    # String subscript without any names to search.
    with pytest.raises(IndexError) as err:
        normalize_subscript("foor", 10)
    assert "subscript 'foor'" in str(err.value)

    # String subscript that is absent from the supplied names.
    with pytest.raises(ValueError):
        normalize_subscript("F", 5, letters)
|
||
|
||
def test_normalize_subscript_slice():
    """Slices are resolved against the length and come back as ranges."""
    forward = normalize_subscript(slice(10, 40), 100)
    assert forward == (range(10, 40), False)

    # Negative bounds with a negative step are wrapped into positive indices.
    backward = normalize_subscript(slice(-10, -20, -1), 100)
    assert backward == (range(90, 80, -1), False)
|
||
|
||
def test_normalize_subscript_range():
    """Ranges are wrapped into non-negative indices and bounds-checked at both ends."""
    assert normalize_subscript(range(5, 2), 100) == ([], False)
    assert normalize_subscript(range(10, 40), 100) == (range(10, 40), False)

    # Ranges that cross zero are expanded element by element.
    wrapped_up = list(range(90, 100)) + list(range(40))
    assert normalize_subscript(range(-10, 40), 100) == (wrapped_up, False)
    wrapped_down = list(range(50, -1, -1)) + list(range(99, 90, -1))
    assert normalize_subscript(range(50, -10, -1), 100) == (wrapped_down, False)

    # Entirely negative ranges stay as ranges.
    assert normalize_subscript(range(-10, -50, -1), 100) == (range(90, 50, -1), False)

    # Each entry: an out-of-range input, the expected message fragment, and the
    # closest in-range input, which must be accepted without raising.
    boundary_cases = [
        (range(10, 50), "subscript (49)", range(10, 20)),
        (range(20, 0, -1), "subscript (20)", range(19, 0, -1)),
        (range(-21, -10), "subscript (-21)", range(-20, -10)),
        (range(-10, -22, -1), "subscript (-21)", range(-10, -21, -1)),
    ]
    for too_far, fragment, just_inside in boundary_cases:
        with pytest.raises(IndexError) as err:
            normalize_subscript(too_far, 20)
        assert fragment in str(err.value)
        normalize_subscript(just_inside, 20)
|
||
|
||
def test_normalize_subscript_chaos():
    """Sequences mixing integers, booleans and strings are normalized element-wise."""
    evens = [0, 2, 4, 6, 8]
    assert normalize_subscript(evens, 50) == ([0, 2, 4, 6, 8], False)

    with pytest.raises(IndexError) as err:
        normalize_subscript([0, 2, 50, 6, 8], 50)
    assert "subscript (50)" in str(err.value)

    # Negative entries are wrapped around the end.
    mixed_sign = [0, -1, 2, -3, 4, -5, 6, -7, 8]
    assert normalize_subscript(mixed_sign, 50) == ([0, 49, 2, 47, 4, 45, 6, 43, 8], False)

    with pytest.raises(IndexError) as err:
        normalize_subscript([0, 2, -51, 6, 8], 50)
    assert "subscript (-51)" in str(err.value)

    # Truthy booleans contribute their own position; falsey ones are dropped.
    with_bools = [False, 10, True, 20, False, 30, True]
    assert normalize_subscript(with_bools, 50) == ([10, 2, 20, 30, 6], False)

    names = ["A", "B", "C", "D", "E", "F"]
    assert normalize_subscript(["B", 1, "D", 2, "F", 3, "A"], 6, names) == ([1, 1, 3, 2, 5, 3, 0], False)

    # Duplicated names resolve to the first occurrence.
    repeated = ["A", "B", "A", "B", "A", "B"]
    assert normalize_subscript(["B", 1, "A", 2, "B", 3, "A"], 6, repeated) == ([1, 1, 0, 2, 1, 3, 0], False)

    # A name that is missing from ``names``.
    with pytest.raises(KeyError):
        normalize_subscript(["B", 1, "D", 2, "G", 3, "A"], 6, names)

    # String subscripts without any names at all.
    with pytest.raises(IndexError) as err:
        normalize_subscript(["B", 1, "D", 2, "F", 3, "A"], 6)
    assert "vector-like object with no names" in str(err.value)
|
||
|
||
def test_normalize_subscript_numpy():
    """numpy arrays and numpy scalars are accepted as subscripts."""
    indices, _ = normalize_subscript(numpy.array([1, 3, 5]), 6)
    assert (indices == numpy.array([1, 3, 5])).all()

    indices, _ = normalize_subscript(numpy.array([-1, -3, -5]), 6)
    assert (indices == numpy.array([5, 3, 1])).all()

    assert normalize_subscript(numpy.int64(5), 6) == ([5], True)
    assert normalize_subscript(numpy.bool_(True), 6) == ([0], True)

    # Now the trickiest part - are booleans converted correctly?
    mask = numpy.array([True, False, True, False, True])
    assert normalize_subscript(mask, 5) == ([0, 2, 4], False)