Merge branch 'master' into py38-eol

BiocPy · Dec 18, 2024 · 7fc2cb6 · 7fc2cb6
2 parents e18a1a3 + 693c169
commit 7fc2cb6
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 8 deletions.
diff --git a/src/biocutils/factorize.py b/src/biocutils/factorize.py
@@ -11,6 +11,7 @@ def factorize(
     levels: Optional[Sequence] = None,
     sort_levels: bool = False,
     dtype: Optional[numpy.dtype] = None,
+    fail_missing: Optional[bool] = None,
 ) -> Tuple[list, numpy.ndarray]:
     """Convert a sequence of hashable values into a factor.
 
@@ -32,6 +33,10 @@ def factorize(
             NumPy type of the array of indices, see
             :py:func:`~biocutils.match.match` for details.
 
+        fail_missing:
+            Whether to raise an error if a value in ``x`` is absent from
+            ``levels``, see :py:func:`~biocutils.match.match` for details.
+
     Returns:
         Tuple where the first element is a list of unique levels and the second
         element in a NumPy array containing integer codes, i.e., indices into
@@ -51,5 +56,5 @@ def factorize(
         if sort_levels:
             levels.sort()
 
-    codes = match(x, levels, dtype=dtype)
+    codes = match(x, levels, dtype=dtype, fail_missing=fail_missing)
     return levels, codes
diff --git a/src/biocutils/match.py b/src/biocutils/match.py
@@ -9,11 +9,13 @@ def match(
     targets: Union[dict, Sequence],
     duplicate_method: DUPLICATE_METHOD = "first",
     dtype: Optional[numpy.ndarray] = None,
+    fail_missing: Optional[bool] = None,
 ) -> numpy.ndarray:
     """Find a matching value of each element of ``x`` in ``target``.
 
     Args:
-        x: Squence of values to match.
+        x:
+            Sequence of values to match.
 
         targets:
             Sequence of targets to be matched against. Alternatively, a
@@ -27,7 +29,12 @@ def match(
         dtype:
             NumPy type of the output array. This should be an integer type; if
             missing values are expected, the type should be a signed integer.
-            If None, a suitable type is automatically determined.
+            If None, a suitable signed type is automatically determined.
+
+        fail_missing:
+            Whether to raise an error if any element of ``x`` cannot be found
+            in ``targets``. If ``None``, this defaults to ``True`` if ``dtype``
+            is an unsigned type, otherwise it defaults to ``False``.
 
     Returns:
         Array of length equal to ``x``, containing the integer position of each
@@ -41,10 +48,20 @@ def match(
         dtype = numpy.min_scalar_type(-len(targets)) # get a signed type
     indices = numpy.zeros(len(x), dtype=dtype)
 
-    for i, y in enumerate(x):
-        if y not in targets:
-            indices[i] = -1
-        else:
+    if fail_missing is None:
+        fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger)
+
+    # Separate loops to reduce branching in the tight inner loop.
+    if not fail_missing:
+        for i, y in enumerate(x):
+            if y in targets:
+                indices[i] = targets[y]
+            else:
+                indices[i] = -1
+    else:
+        for i, y in enumerate(x):
+            if not y in targets:
+                raise ValueError("cannot find '" + str(y) + "' in 'targets'")
             indices[i] = targets[y]
 
     return indices
diff --git a/src/biocutils/subset_sequence.py b/src/biocutils/subset_sequence.py
@@ -14,6 +14,7 @@ def subset_sequence(x: Any, indices: Sequence[int]) -> Any:
 
         indices:
             Sequence of non-negative integers specifying the integers of interest.
+            All indices should be less than ``len(x)``.
 
     Returns:
         The result of slicing ``x`` by ``indices``. The exact type
@@ -30,6 +31,12 @@ def _subset_sequence_list(x: list, indices: Sequence) -> list:
 @subset_sequence.register
 def _subset_sequence_range(x: range, indices: Sequence) -> Union[list, range]:
     if isinstance(indices, range):
-        return x[slice(indices.start, indices.stop, indices.step)]
+        # We assume that every element of 'indices' lies in [0, len(x)),
+        # so there is no need to handle out-of-range indices here.
+        return range(
+            x.start + x.step * indices.start,
+            x.start + x.step * indices.stop,
+            x.step * indices.step
+        )
     else:
         return [x[i] for i in indices]
diff --git a/tests/test_match.py b/tests/test_match.py
@@ -1,5 +1,6 @@
 from biocutils import match, map_to_index
 import numpy
+import pytest
 
 
 def test_match_simple():
@@ -39,3 +40,17 @@ def test_match_dtype():
     mm = match(["A", "B", "D", "A", "C", "B"], ["D", "C", "B", "A"], dtype=numpy.dtype("uint32"))
     assert list(mm) == [3, 2, 0, 3, 1, 2]
     assert mm.dtype == numpy.dtype("uint32")
+
+
+def test_match_fail_missing():
+    x = match(["A", "E", "B", "D", "E"], ["D", "C", "B", "A"])
+    assert list(x) == [3, -1, 2, 0, -1]
+
+    with pytest.raises(ValueError, match="cannot find"):
+        match(["A", "E", "B", "D", "E"], ["D", "C", "B", "A"], fail_missing=True)
+
+    with pytest.raises(ValueError, match="cannot find"):
+        match(["A", "E", "B", "D", "E"], ["D", "C", "B", "A"], dtype=numpy.uint32)
+
+    x = match(["A", "C", "B", "D", "C"], ["D", "C", "B", "A"], fail_missing=True)
+    assert list(x) == [3, 1, 2, 0, 1]
diff --git a/tests/test_subset_sequence.py b/tests/test_subset_sequence.py
@@ -25,3 +25,14 @@ def test_subset_range():
     x = range(10, 20)
     assert subset_sequence(x, range(2, 8, 2)) == range(12, 18, 2)
     assert subset_sequence(x, [0, 1, 5, 9]) == [10, 11, 15, 19]
+    assert subset_sequence(x, range(9, -1, -1)) == range(19, 9, -1)
+
+    x = range(10, 30, 3)
+    assert subset_sequence(x, range(2, 7, 2)) == x[2:7:2]
+    assert subset_sequence(x, range(5, 0, -2)) == x[5:0:-2]
+    assert subset_sequence(x, range(len(x) - 1, -1, -1)) == x[::-1]
+
+    x = range(100, 21, -6)
+    assert subset_sequence(x, range(3, 10, 2)) == x[3:10:2]
+    assert subset_sequence(x, range(7, 1, -1)) == x[7:1:-1]
+    assert subset_sequence(x, range(len(x) - 1, -1, -1)) == x[::-1]