From 47aba98708951cb60c05423cfab31faf63bc93ec Mon Sep 17 00:00:00 2001
From: Aaron Lun <infinite.monkeys.with.keyboards@gmail.com>
Date: Tue, 7 Nov 2023 13:06:39 -0800
Subject: [PATCH] Migrated the Factor class from BiocFrame. (#7)

This has been slightly modified so that the codes are now a (signed integer)
NumPy array, and the levels are a string NumPy array. The idea is to always
use type-enforcing NumPy arrays internally instead of lists of arbitrary stuff.
---
 setup.cfg                          |   1 +
 src/biocutils/Factor.py            | 410 +++++++++++++++++++++++++++++
 src/biocutils/__init__.py          |   4 +-
 src/biocutils/factor.py            |  51 ----
 src/biocutils/factorize.py         |  43 +++
 src/biocutils/is_missing_scalar.py |  12 +
 src/biocutils/match.py             |  19 +-
 tests/test_Factor.py               | 199 ++++++++++++++
 tests/test_factor.py               |  49 ----
 tests/test_factorize.py            |  49 ++++
 tests/test_match.py                |  10 +-
 tests/test_package_utils.py        |   5 -
 12 files changed, 731 insertions(+), 121 deletions(-)
 create mode 100644 src/biocutils/Factor.py
 delete mode 100644 src/biocutils/factor.py
 create mode 100644 src/biocutils/factorize.py
 create mode 100644 src/biocutils/is_missing_scalar.py
 create mode 100644 tests/test_Factor.py
 delete mode 100644 tests/test_factor.py
 create mode 100644 tests/test_factorize.py

diff --git a/setup.cfg b/setup.cfg
index 7f77f53..26ce5e1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -67,6 +67,7 @@ testing =
     setuptools
     pytest
     pytest-cov
+    pandas
 
 [options.entry_points]
 # Add here console scripts like:
diff --git a/src/biocutils/Factor.py b/src/biocutils/Factor.py
new file mode 100644
index 0000000..d75c252
--- /dev/null
+++ b/src/biocutils/Factor.py
@@ -0,0 +1,410 @@
+from copy import deepcopy
+from typing import List, Sequence, Union, Optional
+from warnings import warn
+import numpy
+
+from .match import match
+from .factorize import factorize
+from .normalize_subscript import normalize_subscript
+from .is_missing_scalar import is_missing_scalar
+from .print_truncated import print_truncated_list
+
+
+def _check_levels_type(levels: numpy.ndarray):  # raises TypeError unless 'levels' is a 1-d, unmasked string array
+    if not numpy.issubdtype(levels.dtype, numpy.str_):
+        raise TypeError("all entries of 'levels' should be strings")
+    if numpy.ma.is_masked(levels):
+        raise TypeError("all entries of 'levels' should be non-missing")
+    if len(levels.shape) != 1:  # this helper inspects 'levels', so the message has to name 'levels'
+        raise TypeError("'levels' should be a 1-dimensional array")
+
+
+class Factor:
+    """Factor class, equivalent to R's ``factor``.
+
+    This is a vector of integer codes, each of which is an index into a list of
+    unique strings. The aim is to encode a list of strings as integers for
+    easier numerical analysis.
+    """
+
+    def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool = False, validate: bool = True):
+        """Initialize a Factor object.
+
+        Args:
+            codes:
+                Sequence of codes. Each value should be a non-negative integer
+                that refers to an entry of ``levels``. Negative or None entries
+                are assumed to refer to missing values.
+
+            levels:
+                List of levels containing unique strings.
+
+            ordered:
+                Whether the levels are ordered.
+
+            validate:
+                Whether to validate the arguments. Internal use only.
+        """
+        if not isinstance(codes, numpy.ndarray):
+            replacement = numpy.ndarray(len(codes), dtype=numpy.min_scalar_type(-len(levels) - 1))  # '- 1' keeps the type signed even for empty levels
+            for i, x in enumerate(codes):
+                if is_missing_scalar(x) or x < 0:
+                    replacement[i] = -1
+                else:
+                    replacement[i] = x
+            codes = replacement
+        self._codes = codes
+
+        if not isinstance(levels, numpy.ndarray):
+            levels = numpy.array(levels, dtype=str)
+        self._levels = levels
+
+        self._ordered = bool(ordered)
+
+        if validate:
+            if not numpy.issubdtype(self._codes.dtype, numpy.signedinteger):
+                raise TypeError("all entries of 'codes' should be signed integers")
+            if len(self._codes.shape) != 1:
+                raise TypeError("'codes' should be a 1-dimensional array")
+
+            _check_levels_type(self._levels)
+
+            for x in codes:
+                if x >= len(self._levels):
+                    raise ValueError("all entries of 'codes' should refer to an entry of 'levels'")
+
+            if len(set(self._levels)) < len(self._levels):
+                raise ValueError("all entries of 'levels' should be unique")
+
+    def get_codes(self) -> numpy.ndarray:
+        """
+        Returns:
+            Array of integer codes, used as indices into the levels from
+            :py:meth:`~get_levels`. Negative codes denote missing entries;
+            the stored array itself is returned, not a copy.
+        """
+        return self._codes
+
+    @property
+    def codes(self) -> numpy.ndarray:
+        """Property alias for :py:meth:`~get_codes`."""
+        return self.get_codes()
+
+    def get_levels(self) -> numpy.ndarray:
+        """
+        Returns:
+            The stored factor levels (strings); no copy is made.
+        """
+        return self._levels
+
+    @property
+    def levels(self) -> numpy.ndarray:
+        """Property alias for :py:meth:`~get_levels`."""
+        return self.get_levels()
+
+    def get_ordered(self) -> bool:
+        """
+        Returns:
+            The ``ordered`` flag stored at construction: True if the levels are ordered.
+        """
+        return self._ordered
+
+    @property
+    def ordered(self) -> bool:
+        """Property alias for :py:meth:`~get_ordered`."""
+        return self.get_ordered()
+
+    def __len__(self) -> int:
+        """
+        Returns:
+            Number of codes held by the factor, i.e. its number of elements.
+        """
+        return len(self._codes)
+
+    def __repr__(self) -> str:
+        """
+        Returns:
+            Constructor-like string with codes and levels rendered by ``print_truncated_list``.
+        """
+        shown_codes = print_truncated_list(self._codes)
+        shown_levels = print_truncated_list(self._levels)
+        ordered_part = ", ordered=True" if self._ordered else ""
+        pieces = ["Factor(codes=", shown_codes, ", levels=", shown_levels, ordered_part, ")"]
+        return "".join(pieces)
+
+    def __str__(self) -> str:
+        """
+        Returns:
+            A pretty-printed, multi-line summary; missing values are shown as None.
+        """
+        message = "Factor of length " + str(len(self._codes)) + " with " + str(len(self._levels)) + " level"
+        if len(self._levels) != 1:  # pluralize for every count except exactly one
+            message += "s"
+        message += "\n"
+        message += "values: " + print_truncated_list(self._codes, transform=lambda i: self._levels[i] if i >= 0 else "None") + "\n"
+        message += "levels: " + print_truncated_list(self._levels, transform=lambda x: x) + "\n"
+        message += "ordered: " + str(self._ordered)
+        return message
+
+    def __getitem__(self, sub: Union[int, bool, Sequence]) -> Union[str, "Factor"]:
+        """Extract one element, or a sub-factor, from this ``Factor``.
+
+        Args:
+            sub:
+                Sequence of integers or booleans specifying the elements of
+                interest. Alternatively, an integer/boolean scalar specifying a
+                single element.
+
+        Returns:
+            If ``sub`` is a sequence, an object of the caller's type (a new
+            ``Factor``) holding only the selected codes and the same levels.
+
+            If ``sub`` is a scalar, the level (a string) for the code at
+            position ``sub`` is returned instead; this is None when that code
+            is negative, i.e. missing.
+        """
+        indices, is_scalar = normalize_subscript(sub, len(self), None)
+        if not is_scalar:
+            subset_codes = self._codes[indices]
+            return type(self)(subset_codes, self._levels, self._ordered, validate=False)
+        code = self._codes[indices[0]]
+        if code < 0:  # negative codes mark missing values, which have no level
+            return None
+        return self._levels[code]
+
+    def replace(self, sub: Sequence, value: Union[str, "Factor"], in_place: bool = False):
+        """
+        Replace items in the ``Factor`` list.  The ``sub`` elements in the
+        current object are replaced with the corresponding values in ``value``.
+        This is performed by finding the level for each entry of the
+        replacement ``value``, matching it to a level in the current object,
+        and replacing the entry of ``codes`` with the code of the matched
+        level. If there is no matching level, a missing value (-1) is inserted.
+
+        Args:
+            sub:
+                Sequence of integers or booleans specifying the items to be
+                replaced.
+
+            value:
+                A ``Factor`` with one entry per element selected by ``sub``,
+                containing the replacement values.
+
+            in_place:
+                Whether the replacement should be performed on the current
+                object.
+
+        Returns:
+            If ``in_place = False``, a new ``Factor`` is returned containing the
+            contents of the current object after replacement by ``value``.
+
+            If ``in_place = True``, the current object is returned after its
+            items have been replaced.
+        """
+        sub, scalar = normalize_subscript(sub, len(self), None)
+        codes = self._codes
+        if not in_place:
+            codes = codes.copy()
+
+        if len(self._levels) == len(value._levels) and (self._levels == value._levels).all():
+            for i, x in enumerate(sub):  # identical levels: codes carry over unchanged
+                codes[x] = value._codes[i]
+        else:
+            mapping = match(value._levels, self._levels)  # position of each of value's levels in our levels
+            for i, x in enumerate(sub):
+                v = value._codes[i]
+                if v >= 0:
+                    codes[x] = mapping[v]
+                else:
+                    codes[x] = -1
+
+        if in_place:
+            self._codes = codes
+            return self
+        else:
+            return type(self)(codes, self._levels, self._ordered, validate=False)
+
+    def __setitem__(self, args: Sequence[int], value: "Factor"):
+        """In-place replacement; see :py:meth:`~replace` for details."""
+        return self.replace(args, value, in_place=True)
+
+    def drop_unused_levels(self, in_place: bool = False) -> "Factor":
+        """Drop levels that no code refers to, renumbering the codes to match.
+
+        Args:
+            in_place: Whether to perform this modification in-place.
+
+        Returns:
+            If ``in_place = False``, returns same type as caller (a new ``Factor`` object)
+            where all unused levels have been removed.
+
+            If ``in_place = True``, unused levels are removed from the
+            current object; a reference to the current object is returned.
+        """
+        if in_place:
+            new_codes = self._codes
+        else:
+            new_codes = self._codes.copy()
+
+        in_use = [False] * len(self._levels)
+        for x in self._codes:
+            if x >= 0:  # negative codes are missing values and use no level
+                in_use[x] = True
+
+        new_levels = []
+        reindex = [-1] * len(in_use)  # old code -> new code, for levels that survive
+        for i, x in enumerate(in_use):
+            if x:
+                reindex[i] = len(new_levels)
+                new_levels.append(self._levels[i])
+        new_levels = numpy.array(new_levels, dtype=str)  # explicit dtype: an empty list would otherwise become float64
+
+        for i, x in enumerate(self._codes):
+            if x >= 0:
+                new_codes[i] = reindex[x]
+
+        if in_place:
+            self._codes = new_codes
+            self._levels = new_levels
+            return self
+        else:
+            current_class_const = type(self)
+            return current_class_const(new_codes, new_levels, self._ordered, validate=False)
+
+    def set_levels(self, levels: Union[str, List[str]], in_place: bool = False) -> "Factor":
+        """Set or replace levels.
+
+        Args:
+            levels:
+                A list of replacement levels. These should be unique strings
+                with no missing values.
+
+                Alternatively a single string containing an existing level in
+                this object. The new levels are defined as a permutation of the
+                existing levels where the provided string is now the first
+                level. The order of all other levels is preserved.
+
+            in_place:
+                Whether to perform this modification in-place.
+
+        Returns:
+            If ``in_place = False``, returns same type as caller (a new
+            ``Factor`` object) where the levels have been replaced. This will
+            automatically update the codes so that they still refer to the same
+            string in the new ``levels``. If a code refers to a level that is
+            not present in the new ``levels``, it is replaced with -1 (missing).
+
+            If ``in_place = True``, the levels are replaced in the current
+            object, and a reference to the current object is returned.
+        """
+        lmapping = {}
+        if isinstance(levels, str):
+            new_levels = [levels]
+            for x in self._levels:
+                if x == levels:
+                    lmapping[x] = 0
+                else:
+                    lmapping[x] = len(new_levels)
+                    new_levels.append(x)
+            if levels not in lmapping:
+                raise ValueError("string 'levels' should already be present among object levels")
+            # Store a string array, not a list, so array-style comparisons of the levels keep working.
+            new_levels = numpy.array(new_levels, dtype=str)
+        else:
+            new_levels = numpy.array(levels)
+            _check_levels_type(new_levels)
+            for i, x in enumerate(new_levels):
+                if x in lmapping:
+                    raise ValueError("levels should be unique")
+                lmapping[x] = i
+
+        mapping = [-1] * len(self._levels)
+        for i, x in enumerate(self._levels):
+            if x in lmapping:
+                mapping[i] = lmapping[x]
+
+        if in_place:
+            new_codes = self._codes
+        else:
+            new_codes = self._codes.copy()
+        for i, x in enumerate(new_codes):
+            if x >= 0:
+                new_codes[i] = mapping[x]
+            else:
+                new_codes[i] = -1
+
+        if in_place:
+            self._codes = new_codes
+            self._levels = new_levels
+            return self
+        else:
+            current_class_const = type(self)
+            return current_class_const(new_codes, new_levels, self._ordered, validate=False)
+
+    @levels.setter
+    def levels(self, levels: Union[str, List[str]]):
+        """Replace the levels in-place with a warning; see :py:meth:`~set_levels`."""
+        warn("Setting property 'levels' is an in-place operation, use 'set_levels' instead", UserWarning)
+        self.set_levels(levels, in_place=True)
+
+    def __copy__(self) -> "Factor":
+        """
+        Returns:
+            A new object of the caller's type that shares this object's codes and levels.
+        """
+        shared_state = (self._codes, self._levels, self._ordered)
+        return type(self)(*shared_state, validate=False)
+
+    def __deepcopy__(self, memo) -> "Factor":
+        """
+        Returns:
+            A new object of the caller's type whose codes and levels are deep
+            copies of this object's arrays.
+        """
+        # Copy codes first, then levels, threading 'memo' through both calls.
+        codes_copy = deepcopy(self._codes, memo)
+        levels_copy = deepcopy(self._levels, memo)
+        # The state being duplicated was accepted already, so validation is skipped.
+        cls = type(self)
+        return cls(codes_copy, levels_copy, self._ordered, validate=False)
+
+    def to_pandas(self):
+        """Coerce to a :py:class:`~pandas.Categorical`, keeping every level as a category.
+
+        Returns:
+            Categorical: Categories follow the level order; missing codes become NaN.
+        """
+        from pandas import Categorical
+        return Categorical.from_codes(
+            numpy.where(self._codes < 0, -1, self._codes),  # pandas accepts only -1 as the missing sentinel
+            categories=self._levels, ordered=self._ordered,
+        )
+
+    @staticmethod
+    def from_sequence(x: Sequence[str], levels: Optional[Sequence[str]] = None, sort_levels: bool = True, ordered: bool = False) -> "Factor":
+        """Build a ``Factor`` from a sequence of strings via ``factorize``.
+
+        Args:
+            x:
+                A sequence of strings. Any value may be None to indicate
+                missingness.
+
+            levels:
+                Sequence of reference levels that the entries of ``x`` are matched against.
+                If None, the unique values of ``x`` are used as the levels.
+
+            sort_levels:
+                Whether to sort the automatically-determined levels. If False,
+                the levels are kept in order of their appearance in ``x``.  Not
+                used if ``levels`` is explicitly supplied.
+
+            ordered:
+                Whether the levels should be assumed to be ordered.  Note that
+                this refers to their importance and has nothing to do with
+                their sorting order or with the setting of ``sort_levels``.
+
+        Returns:
+            A ``Factor`` object.
+        """
+        found_levels, found_codes = factorize(x, levels=levels, sort_levels=sort_levels)
+        return Factor(found_codes, levels=found_levels, ordered=ordered)
diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py
index 390188b..28c74c6 100644
--- a/src/biocutils/__init__.py
+++ b/src/biocutils/__init__.py
@@ -15,9 +15,11 @@
 finally:
     del version, PackageNotFoundError
 
-from .factor import factor
+from .Factor import Factor
+from .factorize import factorize
 from .intersect import intersect
 from .is_list_of_type import is_list_of_type
+from .is_missing_scalar import is_missing_scalar
 from .map_to_index import map_to_index
 from .match import match
 from .normalize_subscript import normalize_subscript
diff --git a/src/biocutils/factor.py b/src/biocutils/factor.py
deleted file mode 100644
index 3fc7391..0000000
--- a/src/biocutils/factor.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from typing import Optional, Sequence, Tuple
-
-
-def factor(
-    x: Sequence, levels: Optional[Sequence] = None, sort_levels: bool = False
-) -> Tuple[list, list]:
-    """Convert a sequence of hashable values into a factor.
-
-    Args:
-        x (Sequence): A sequence of hashable values.
-            Any value may be None to indicate missingness.
-
-        levels (Sequence, optional):
-            Sequence of reference levels, against which the entries in ``x`` are compared.
-            If None, this defaults to all unique values of ``x``.
-
-        sort_levels (bool):
-            Whether to sort the automatically-determined levels.
-            If False, the levels are kept in order of their appearance in ``x``.
-            Not used if ``levels`` is explicitly supplied.
-
-    Returns:
-        Tuple[list, list]: Tuple where the first list contains the unique levels
-        and the second list contains the integer index into the first list.
-        Indexing the first list by the second list will recover ``x``, except
-        for any None values in ``x``, which will be None in the second list.
-    """
-
-    if levels is None:
-        present = set()
-        levels = []
-        for val in x:
-            if val is not None and val not in present:
-                levels.append(val)
-                present.add(val)
-        if sort_levels:
-            levels.sort()
-
-    mapping = {}
-    for i, lev in enumerate(levels):
-        if lev is not None and lev not in mapping:
-            mapping[lev] = i
-
-    indices = []
-    for i, val in enumerate(x):
-        if val is None or val not in mapping:
-            indices.append(None)
-        else:
-            indices.append(mapping[val])
-
-    return levels, indices
diff --git a/src/biocutils/factorize.py b/src/biocutils/factorize.py
new file mode 100644
index 0000000..ed70e01
--- /dev/null
+++ b/src/biocutils/factorize.py
@@ -0,0 +1,43 @@
+from typing import Optional, Sequence, Tuple
+import numpy
+
+from .match import match
+from .is_missing_scalar import is_missing_scalar
+
+
+def factorize(x: Sequence, levels: Optional[Sequence] = None, sort_levels: bool = False) -> Tuple[list, numpy.ndarray]:
+    """Split a sequence of hashable values into levels and integer codes.
+
+    Args:
+        x:
+            A sequence of hashable values.
+            Any value may be None to indicate missingness.
+
+        levels:
+            Sequence of reference levels that the entries of ``x`` are matched against.
+            If None, the unique non-missing values of ``x`` are used.
+
+        sort_levels:
+            Whether to sort the automatically-determined levels.
+            If False, the levels are kept in order of their appearance in ``x``.
+            Not used if ``levels`` is explicitly supplied.
+
+    Returns:
+        Tuple of the levels and the codes produced by ``match(x, levels)``.
+        Indexing the levels by the codes will recover ``x``; except for any
+        None or masked values in ``x``, which will be -1 in the codes. When
+        ``levels`` is supplied, it is returned as given.
+    """
+
+    if levels is None:
+        seen = {}  # dict keys keep first-appearance order
+        for val in x:
+            if is_missing_scalar(val):
+                continue
+            seen.setdefault(val, None)
+        levels = list(seen)
+        if sort_levels:
+            levels.sort()
+
+    codes = match(x, levels)
+    return levels, codes
diff --git a/src/biocutils/is_missing_scalar.py b/src/biocutils/is_missing_scalar.py
new file mode 100644
index 0000000..ea68aec
--- /dev/null
+++ b/src/biocutils/is_missing_scalar.py
@@ -0,0 +1,12 @@
+import numpy
+
+
+def is_missing_scalar(x) -> bool:
+    """Report whether ``x`` is None or is flagged as masked by ``numpy.ma.is_masked``.
+
+    Args:
+        x: Any scalar value.
+    """
+    if x is None:
+        return True
+    return numpy.ma.is_masked(x)
diff --git a/src/biocutils/match.py b/src/biocutils/match.py
index 8db953a..9244b10 100644
--- a/src/biocutils/match.py
+++ b/src/biocutils/match.py
@@ -1,13 +1,10 @@
 from typing import List, Sequence, Union
+import numpy
 
 from .map_to_index import DUPLICATE_METHOD, map_to_index
 
 
-def match(
-    x: Sequence,
-    targets: Union[dict, Sequence],
-    duplicate_method: DUPLICATE_METHOD = "first",
-) -> List[Union[int, None]]:
+def match(x: Sequence, targets: Union[dict, Sequence], duplicate_method: DUPLICATE_METHOD = "first") -> numpy.ndarray:
     """Find a matching value of each element of ``x`` in ``target``.
 
     Args:
@@ -23,12 +20,14 @@ def match(
         integer position of each entry of ``x`` inside ``target``; or None,
         if the entry of ``x`` is None or cannot be found in ``target``.
     """
-    if isinstance(targets, Sequence):
+    if not isinstance(targets, dict):
         targets = map_to_index(targets, duplicate_method=duplicate_method)
-    indices = []
+
+    indices = numpy.zeros(len(x), dtype=numpy.min_scalar_type(-len(targets))) # get a signed type
     for i, y in enumerate(x):
-        if y is None or y not in targets:
-            indices.append(None)
+        if y not in targets:
+            indices[i] = -1
         else:
-            indices.append(targets[y])
+            indices[i] = targets[y]
+
     return indices
diff --git a/tests/test_Factor.py b/tests/test_Factor.py
new file mode 100644
index 0000000..99b9dad
--- /dev/null
+++ b/tests/test_Factor.py
@@ -0,0 +1,199 @@
+from biocutils import Factor
+import pytest
+import copy
+
+
+def test_Factor_basics():
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    assert len(f) == 6
+    assert list(f) == ["A", "B", "C", "A", "C", "E"]
+    assert f.get_codes() == [0, 1, 2, 0, 2, 4]
+    assert f.get_levels() == ["A", "B", "C", "D", "E"]
+    assert not f.get_ordered()
+
+    with pytest.raises(TypeError) as ex:
+        Factor([0, "WHEE"], ["A", "B"])
+    assert str(ex.value).find("should be integers") >= 0
+
+    with pytest.raises(TypeError) as ex:
+        Factor([0, 1], ["A", None, "B"])
+    assert str(ex.value).find("non-missing strings") >= 0
+
+    with pytest.raises(ValueError) as ex:
+        Factor([0, 1, -1], ["A"])
+    assert str(ex.value).find("refer to an entry") >= 0
+
+    with pytest.raises(ValueError) as ex:
+        Factor([0, 1], ["A", "B", "A"])
+    assert str(ex.value).find("should be unique") >= 0
+
+    f = Factor([None] * 10, levels=["A", "B", "C", "D", "E"])
+    assert list(f) == [None] * 10
+
+
+def test_Factor_basics():  # NOTE(review): same name as the test defined earlier in this file, so it shadows that one; consider renaming
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    assert repr(f).startswith("Factor(")
+    assert str(f).startswith("Factor of length")
+
+    f = Factor([0, 1, 4, 2, 0, 3, 1, 3, 2, 4], levels=["A", "B", "C", "D", "E"])
+    assert repr(f).startswith("Factor(")
+    assert str(f).startswith("Factor of length")
+
+    f = Factor([], levels=["A", "B", "C", "D", "E"])
+    assert repr(f).startswith("Factor(")
+    assert str(f).startswith("Factor of length")
+
+    f = Factor([1], levels=["A", "B", "C", "D", "E"])
+    assert repr(f).startswith("Factor(")
+    assert str(f).startswith("Factor of length")
+
+    f = Factor([i % 5 for i in range(100)], levels=["A", "B", "C", "D", "E"])
+    assert repr(f).startswith("Factor(")
+    assert str(f).startswith("Factor of length")
+
+
+def test_Factor_getitem():
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    assert f[0] == "A"
+    assert f[2] == "C"
+    assert f[-1] == "E"
+
+    f2 = f[2:4]
+    assert list(f2.get_codes()) == [2, 0]
+    assert (f2.get_levels() == f.get_levels()).all()
+
+    f2 = f[[1, 3, 5]]
+    assert list(f2.get_codes()) == [1, 0, 4]
+    assert (f2.get_levels() == f.get_levels()).all()
+
+    f2 = f[[-1, -2, -3]]
+    assert list(f2.get_codes()) == [4, 2, 0]
+    assert (f2.get_levels() == f.get_levels()).all()
+
+
+def test_Factor_setitem():
+    f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"])
+    f2 = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"])
+
+    f[0:2] = f2[2:4]
+    assert list(f.get_codes()) == [2, 3, 2, 3, 2, 1]
+    assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
+
+    f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"])
+    f2 = Factor([0, 1, 2, 3, 2, 1], levels=["E", "D", "C", "B", "A"])
+    f[[-3, -2, -1]] = f2[0:3]
+    assert list(f.get_codes()) == [0, 1, 2, 4, 3, 2]
+    assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
+
+    f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"])
+    f2 = Factor([0, 1, 2, 3, 2, 1], levels=["e", "d", "c", "b", "a"])
+    f[:] = f2[:]
+    assert list(f.get_codes()) == [-1] * 6
+    assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
+
+
+def test_Factor_drop_unused_levels():
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    f2 = f.drop_unused_levels()
+    assert list(f2.get_levels()) == ["A", "B", "C", "E"]
+    assert list(f2) == list(f)
+
+    f = Factor([3, 4, 2, 3, 2, 4], levels=["A", "B", "C", "D", "E"])
+    f2 = f.drop_unused_levels(in_place=True)
+    assert list(f2.get_levels()) == ["C", "D", "E"]
+    assert list(f2) == ["D", "E", "C", "D", "C", "E"]
+
+
+def test_Factor_set_levels():
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    f2 = f.set_levels(["E", "D", "C", "B", "A"])
+    assert list(f2.get_levels()) == ["E", "D", "C", "B", "A"]
+    assert list(f2.get_codes()) == [4, 3, 2, 4, 2, 0]
+    assert list(f2) == list(f)
+
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    f2 = f.set_levels(["E", "C", "A"], in_place=True)
+    assert list(f2.get_levels()) == ["E", "C", "A"]
+    assert list(f2.get_codes()) == [2, -1, 1, 2, 1, 0]
+
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    f2 = f.set_levels("E")  # reorders
+    assert list(f2.get_levels()) == ["E", "A", "B", "C", "D"]
+    assert list(f2.get_codes()) == [1, 2, 3, 1, 3, 0]
+
+    with pytest.raises(ValueError) as ex:
+        f.set_levels("F")
+    assert str(ex.value).find("should already be present") >= 0
+
+    with pytest.raises(TypeError) as ex:
+        f.set_levels([None, "A"])
+    assert str(ex.value).find("should be strings") >= 0
+
+    with pytest.raises(ValueError) as ex:
+        f.set_levels(["A", "A"])
+    assert str(ex.value).find("should be unique") >= 0
+
+
+def test_Factor_copy():
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    out = copy.copy(f)
+    assert (f.get_codes() == out.get_codes()).all()
+    assert (f.get_levels() == out.get_levels()).all()
+
+    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
+    out = copy.deepcopy(f)
+    assert (f.get_codes() == out.get_codes()).all()
+    assert (f.get_levels() == out.get_levels()).all()
+
+
+#def test_Factor_combine():
+#    # Same levels.
+#    f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"])
+#    f2 = Factor([1, 3, 1], levels=["A", "B", "C", "D", "E"])
+#    out = combine(f1, f2)
+#    assert out.get_levels() == f2.get_levels()
+#    assert out.get_codes() == [0, 2, 4, 2, 0, 1, 3, 1]
+#
+#    # Different levels.
+#    f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"])
+#    f2 = Factor([1, 3, 1], levels=["D", "E", "F", "G"])
+#    out = combine(f1, f2)
+#    assert out.get_levels() == ["A", "B", "C", "D", "E", "F", "G"]
+#    assert out.get_codes() == [0, 2, 4, 2, 0, 4, 6, 4]
+#
+#    f2 = Factor([1, 3, None], levels=["D", "E", "F", "G"])
+#    out = combine(f1, f2)
+#    assert out.get_codes() == [0, 2, 4, 2, 0, 4, 6, None]
+#
+#    # Ordering is preserved for the same levels, lost otherwise.
+#    f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"], ordered=True)
+#    f2 = Factor([1, 3, 1], levels=["A", "B", "C", "D", "E"], ordered=True)
+#    out = combine(f1, f2)
+#    assert out.get_ordered()
+#
+#    f2 = Factor([1, 3, 2], levels=["D", "E", "F", "G"], ordered=True)
+#    out = combine(f1, f2)
+#    assert not out.get_ordered()
+
+
+def test_Factor_pandas():
+    import pandas as pd
+    f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"])
+    pcat = f1.to_pandas()
+    assert pcat is not None
+    assert len(pcat) == len(f1)
+
+    f2 = Factor([1, 3, 2], levels=["D", "E", "F", "G"], ordered=True)
+    pcat = f2.to_pandas()
+    assert pcat is not None
+    assert len(pcat) == len(f2)
+    assert pcat.ordered == f2.get_ordered()
+
+
+def test_Factor_init_from_list():
+    f1 = Factor.from_sequence(["A", "B", "A", "B", "E"])
+
+    assert isinstance(f1, Factor)
+    assert len(f1) == 5
+    assert len(f1.get_levels()) == 3
diff --git a/tests/test_factor.py b/tests/test_factor.py
deleted file mode 100644
index 44e6337..0000000
--- a/tests/test_factor.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from biocutils import factor
-
-
-def test_factor_simple():
-    lev, ind = factor([1, 3, 5, 5, 3, 1])
-    assert lev == [1, 3, 5]
-    assert ind == [0, 1, 2, 2, 1, 0]
-
-    # Preserves the order.
-    lev, ind = factor(["C", "D", "A", "B", "C", "A"])
-    assert lev == ["C", "D", "A", "B"]
-    assert ind == [0, 1, 2, 3, 0, 2]
-
-    # Handles None-ness.
-    lev, ind = factor([1, None, 5, None, 3, None])
-    assert lev == [1, 5, 3]
-    assert ind == [0, None, 1, None, 2, None]
-
-
-def test_factor_levels():
-    revlev = [5, 4, 3, 2, 1]
-    lev, ind = factor([1, 3, 5, 5, 3, 1], levels=revlev)
-    assert lev == revlev
-    assert ind == [4, 2, 0, 0, 2, 4]
-
-    # Preserves duplicates.
-    duplicated = [5, 4, 5, 4, 3, 4, 2, 3, 1, 1, 2]
-    lev, ind = factor([1, 3, 5, 5, 3, 1], levels=duplicated)
-    assert lev == duplicated
-    assert ind == [8, 4, 0, 0, 4, 8]
-
-    # Ignores None.
-    noney = [None, 1, 2, 3, 4, 5, None]
-    lev, ind = factor([1, 3, 5, 5, 3, 1], levels=noney)
-    assert lev == noney
-    assert ind == [1, 3, 5, 5, 3, 1]
-
-
-def test_factor_sorted():
-    lev, ind = factor(["C", "D", "A", "B", "C", "A"], sort_levels=True)
-    assert lev == ["A", "B", "C", "D"]
-    assert ind == [2, 3, 0, 1, 2, 0]
-
-    # Not affected if you supply the levels directly.
-    lev, ind = factor(
-        ["C", "D", "A", "B", "C", "A"], levels=["D", "C", "B", "A"], sort_levels=True
-    )
-    assert lev == ["D", "C", "B", "A"]
-    assert ind == [1, 0, 3, 2, 1, 3]
diff --git a/tests/test_factorize.py b/tests/test_factorize.py
new file mode 100644
index 0000000..b3ba479
--- /dev/null
+++ b/tests/test_factorize.py
@@ -0,0 +1,49 @@
+from biocutils import factorize
+
+
+def test_factor_simple():
+    lev, ind = factorize([1, 3, 5, 5, 3, 1])
+    assert lev == [1, 3, 5]
+    assert list(ind) == [0, 1, 2, 2, 1, 0]
+
+    # Preserves the order.
+    lev, ind = factorize(["C", "D", "A", "B", "C", "A"])
+    assert lev == ["C", "D", "A", "B"]
+    assert list(ind) == [0, 1, 2, 3, 0, 2]
+
+    # Handles None-ness.
+    lev, ind = factorize([1, None, 5, None, 3, None])
+    assert lev == [1, 5, 3]
+    assert list(ind) == [0, -1, 1, -1, 2, -1]
+
+
+def test_factor_levels():
+    revlev = [5, 4, 3, 2, 1]
+    lev, ind = factorize([1, 3, 5, 5, 3, 1], levels=revlev)
+    assert lev == revlev
+    assert list(ind) == [4, 2, 0, 0, 2, 4]
+
+    # Preserves duplicates.
+    duplicated = [5, 4, 5, 4, 3, 4, 2, 3, 1, 1, 2]
+    lev, ind = factorize([1, 3, 5, 5, 3, 1], levels=duplicated)
+    assert lev == duplicated
+    assert list(ind) == [8, 4, 0, 0, 4, 8]
+
+    # Ignores None.
+    noney = [None, 1, 2, 3, 4, 5, None]
+    lev, ind = factorize([1, 3, 5, 5, 3, 1], levels=noney)
+    assert lev == noney
+    assert list(ind) == [1, 3, 5, 5, 3, 1]
+
+
+def test_factor_sorted():
+    lev, ind = factorize(["C", "D", "A", "B", "C", "A"], sort_levels=True)
+    assert lev == ["A", "B", "C", "D"]
+    assert list(ind) == [2, 3, 0, 1, 2, 0]
+
+    # Not affected if you supply the levels directly.
+    lev, ind = factorize(
+        ["C", "D", "A", "B", "C", "A"], levels=["D", "C", "B", "A"], sort_levels=True
+    )
+    assert lev == ["D", "C", "B", "A"]
+    assert list(ind) == [1, 0, 3, 2, 1, 3]
diff --git a/tests/test_match.py b/tests/test_match.py
index 23af59b..55de7da 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -6,24 +6,24 @@ def test_match_simple():
     levels = ["D", "C", "B", "A"]
 
     mm = match(x, levels)
-    assert mm == [3, 1, 2, 0, 3, 3, 1, 0, 2]
+    assert list(mm) == [3, 1, 2, 0, 3, 3, 1, 0, 2]
 
     mm2 = match(x, map_to_index(levels))
-    assert mm == mm2
+    assert (mm == mm2).all()
 
 
 def test_match_duplicates():
     x = [5, 1, 2, 3, 5, 6, 7, 7, 2, 1]
     mm = match(x, [1, 2, 3, 3, 5, 6, 1, 7, 6])
-    assert mm == [4, 0, 1, 2, 4, 5, 7, 7, 1, 0]
+    assert list(mm) == [4, 0, 1, 2, 4, 5, 7, 7, 1, 0]
 
     mm = match(x, [1, 2, 3, 3, 5, 6, 1, 7, 6], duplicate_method="last")
-    assert mm == [4, 6, 1, 3, 4, 8, 7, 7, 1, 6]
+    assert list(mm) == [4, 6, 1, 3, 4, 8, 7, 7, 1, 6]
 
 
 def test_match_none():
     mm = match(["A", None, "B", "D", None, "A", "C", None, "B"], ["D", "C", "B", "A"])
-    assert list(mm) == [3, None, 2, 0, None, 3, 1, None, 2]
+    assert list(mm) == [3, -1, 2, 0, -1, 3, 1, -1, 2]
 
     mm = match(["A", "B", "D", "A", "C", "B"], ["D", None, "C", "B", None, "A"])
     assert list(mm) == [5, 3, 0, 5, 2, 3]
diff --git a/tests/test_package_utils.py b/tests/test_package_utils.py
index 46a571c..2e023a8 100644
--- a/tests/test_package_utils.py
+++ b/tests/test_package_utils.py
@@ -5,11 +5,6 @@
 __license__ = "MIT"
 
 
-def test_for_pandas():
-    pkg = is_package_installed("pandas")
-
-    assert pkg is False
-
 
 def test_for_scipy():
     pkg = is_package_installed("scipy")