Skip to content

Commit

Permalink
Merge branch 'master' into py38-eol
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Dec 18, 2024
2 parents e18a1a3 + 693c169 commit 7fc2cb6
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 8 deletions.
7 changes: 6 additions & 1 deletion src/biocutils/factorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def factorize(
levels: Optional[Sequence] = None,
sort_levels: bool = False,
dtype: Optional[numpy.dtype] = None,
fail_missing: Optional[bool] = None,
) -> Tuple[list, numpy.ndarray]:
"""Convert a sequence of hashable values into a factor.
Expand All @@ -32,6 +33,10 @@ def factorize(
NumPy type of the array of indices, see
:py:func:`~biocutils.match.match` for details.
fail_missing:
Whether to raise an error upon encountering missing levels in
``x``, see :py:func:`~biocutils.match.match` for details.
Returns:
Tuple where the first element is a list of unique levels and the second
element is a NumPy array containing integer codes, i.e., indices into
Expand All @@ -51,5 +56,5 @@ def factorize(
if sort_levels:
levels.sort()

codes = match(x, levels, dtype=dtype)
codes = match(x, levels, dtype=dtype, fail_missing=fail_missing)
return levels, codes
29 changes: 23 additions & 6 deletions src/biocutils/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ def match(
targets: Union[dict, Sequence],
duplicate_method: DUPLICATE_METHOD = "first",
dtype: Optional[numpy.ndarray] = None,
fail_missing: Optional[bool] = None,
) -> numpy.ndarray:
"""Find a matching value of each element of ``x`` in ``target``.
Args:
x: Squence of values to match.
x:
Sequence of values to match.
targets:
Sequence of targets to be matched against. Alternatively, a
Expand All @@ -27,7 +29,12 @@ def match(
dtype:
NumPy type of the output array. This should be an integer type; if
missing values are expected, the type should be a signed integer.
If None, a suitable type is automatically determined.
If None, a suitable signed type is automatically determined.
fail_missing:
Whether to raise an error if ``x`` cannot be found in ``targets``.
If ``None``, this defaults to ``True`` if ``dtype`` is an unsigned
type, otherwise it defaults to ``False``.
Returns:
Array of length equal to ``x``, containing the integer position of each
Expand All @@ -41,10 +48,20 @@ def match(
dtype = numpy.min_scalar_type(-len(targets)) # get a signed type
indices = numpy.zeros(len(x), dtype=dtype)

for i, y in enumerate(x):
if y not in targets:
indices[i] = -1
else:
if fail_missing is None:
fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger)

# Separate loops to reduce branching in the tight inner loop.
if not fail_missing:
for i, y in enumerate(x):
if y in targets:
indices[i] = targets[y]
else:
indices[i] = -1
else:
for i, y in enumerate(x):
if not y in targets:
raise ValueError("cannot find '" + str(y) + "' in 'targets'")
indices[i] = targets[y]

return indices
9 changes: 8 additions & 1 deletion src/biocutils/subset_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def subset_sequence(x: Any, indices: Sequence[int]) -> Any:
indices:
Sequence of non-negative integers specifying the positions of interest.
All indices should be less than ``len(x)``.
Returns:
The result of slicing ``x`` by ``indices``. The exact type
Expand All @@ -30,6 +31,12 @@ def _subset_sequence_list(x: list, indices: Sequence) -> list:
@subset_sequence.register
def _subset_sequence_range(x: range, indices: Sequence) -> Union[list, range]:
if isinstance(indices, range):
return x[slice(indices.start, indices.stop, indices.step)]
# We can just assume that all 'indices' are in [0, len(x)),
# so no need to handle out-of-range indices.
return range(
x.start + x.step * indices.start,
x.start + x.step * indices.stop,
x.step * indices.step
)
else:
return [x[i] for i in indices]
15 changes: 15 additions & 0 deletions tests/test_match.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from biocutils import match, map_to_index
import numpy
import pytest


def test_match_simple():
Expand Down Expand Up @@ -39,3 +40,17 @@ def test_match_dtype():
mm = match(["A", "B", "D", "A", "C", "B"], ["D", "C", "B", "A"], dtype=numpy.dtype("uint32"))
assert list(mm) == [3, 2, 0, 3, 1, 2]
assert mm.dtype == numpy.dtype("uint32")


def test_match_fail_missing():
x = match(["A", "E", "B", "D", "E"], ["D", "C", "B", "A"])
assert list(x) == [3, -1, 2, 0, -1]

with pytest.raises(ValueError, match="cannot find"):
match(["A", "E", "B", "D", "E"], ["D", "C", "B", "A"], fail_missing=True)

with pytest.raises(ValueError, match="cannot find"):
match(["A", "E", "B", "D", "E"], ["D", "C", "B", "A"], dtype=numpy.uint32)

x = match(["A", "C", "B", "D", "C"], ["D", "C", "B", "A"], fail_missing=True)
assert list(x) == [3, 1, 2, 0, 1]
11 changes: 11 additions & 0 deletions tests/test_subset_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,14 @@ def test_subset_range():
x = range(10, 20)
assert subset_sequence(x, range(2, 8, 2)) == range(12, 18, 2)
assert subset_sequence(x, [0, 1, 5, 9]) == [10, 11, 15, 19]
assert subset_sequence(x, range(9, -1, -1)) == range(19, 9, -1)

x = range(10, 30, 3)
assert subset_sequence(x, range(2, 7, 2)) == x[2:7:2]
assert subset_sequence(x, range(5, 0, -2)) == x[5:0:-2]
assert subset_sequence(x, range(len(x) - 1, -1, -1)) == x[::-1]

x = range(100, 21, -6)
assert subset_sequence(x, range(3, 10, 2)) == x[3:10:2]
assert subset_sequence(x, range(7, 1, -1)) == x[7:1:-1]
assert subset_sequence(x, range(len(x) - 1, -1, -1)) == x[::-1]

0 comments on commit 7fc2cb6

Please sign in to comment.