From 47aba98708951cb60c05423cfab31faf63bc93ec Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Tue, 7 Nov 2023 13:06:39 -0800 Subject: [PATCH] Migrated the Factor class from BiocFrame. (#7) This has been slightly modified so that the codes are now a (signed integer) NumPy array, and the levels are a string NumPy array. The idea is to always use type-enforcing NumPy arrays internally instead of lists of arbitrary stuff. --- setup.cfg | 1 + src/biocutils/Factor.py | 410 +++++++++++++++++++++++++++++ src/biocutils/__init__.py | 4 +- src/biocutils/factor.py | 51 ---- src/biocutils/factorize.py | 43 +++ src/biocutils/is_missing_scalar.py | 12 + src/biocutils/match.py | 19 +- tests/test_Factor.py | 199 ++++++++++++++ tests/test_factor.py | 49 ---- tests/test_factorize.py | 49 ++++ tests/test_match.py | 10 +- tests/test_package_utils.py | 5 - 12 files changed, 731 insertions(+), 121 deletions(-) create mode 100644 src/biocutils/Factor.py delete mode 100644 src/biocutils/factor.py create mode 100644 src/biocutils/factorize.py create mode 100644 src/biocutils/is_missing_scalar.py create mode 100644 tests/test_Factor.py delete mode 100644 tests/test_factor.py create mode 100644 tests/test_factorize.py diff --git a/setup.cfg b/setup.cfg index 7f77f53..26ce5e1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,6 +67,7 @@ testing = setuptools pytest pytest-cov + pandas [options.entry_points] # Add here console scripts like: diff --git a/src/biocutils/Factor.py b/src/biocutils/Factor.py new file mode 100644 index 0000000..d75c252 --- /dev/null +++ b/src/biocutils/Factor.py @@ -0,0 +1,410 @@ +from copy import deepcopy +from typing import List, Sequence, Union, Optional +from warnings import warn +import numpy + +from .match import match +from .factorize import factorize +from .normalize_subscript import normalize_subscript +from .is_missing_scalar import is_missing_scalar +from .print_truncated import print_truncated_list + + +def _check_levels_type(levels: numpy.ndarray): + if not 
numpy.issubdtype(levels.dtype, numpy.str_): + raise TypeError("all entries of 'levels' should be strings") + if numpy.ma.is_masked(levels): + raise TypeError("all entries of 'levels' should be non-missing") + if len(levels.shape) != 1: + raise TypeError("'codes' should be a 1-dimensional array") + + +class Factor: + """Factor class, equivalent to R's ``factor``. + + This is a vector of integer codes, each of which is an index into a list of + unique strings. The aim is to encode a list of strings as integers for + easier numerical analysis. + """ + + def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool = False, validate: bool = True): + """Initialize a Factor object. + + Args: + codes: + Sequence of codes. Each value should be a non-negative integer + that refers to an entry in ``levels``. Negative or None entries + are assumed to refer to missing values. + + levels: + List of levels containing unique strings. + + ordered: + Whether the levels are ordered. + + validate: + Whether to validate the arguments. Internal use only. + """ + if not isinstance(codes, numpy.ndarray): + replacement = numpy.ndarray(len(codes), dtype=numpy.min_scalar_type(-len(levels))) # get a signed type.
+ for i, x in enumerate(codes): + if is_missing_scalar(x) or x < 0: + replacement[i] = -1 + else: + replacement[i] = x + codes = replacement + self._codes = codes + + if not isinstance(levels, numpy.ndarray): + levels = numpy.array(levels, dtype=str) + self._levels = levels + + self._ordered = bool(ordered) + + if validate: + if not numpy.issubdtype(self._codes.dtype, numpy.signedinteger): + raise TypeError("all entries of 'codes' should be signed integers") + if len(self._codes.shape) != 1: + raise TypeError("'codes' should be a 1-dimensional array") + + _check_levels_type(self._levels) + + for x in codes: + if x >= len(self._levels): + raise ValueError("all entries of 'codes' should refer to an entry of 'levels'") + + if len(set(self._levels)) < len(self._levels): + raise ValueError("all entries of 'levels' should be unique") + + def get_codes(self) -> numpy.ndarray: + """ + Returns: + Array of integer codes, used as indices into the levels from + :py:attr:`~get_levels`. A masked array may also be returned if + any of the entries are missing. + """ + return self._codes + + @property + def codes(self) -> numpy.ndarray: + """See :py:attr:`~get_codes`.""" + return self.get_codes() + + def get_levels(self) -> numpy.ndarray: + """ + Returns: + Array of strings containing the factor levels. + """ + return self._levels + + @property + def levels(self) -> numpy.ndarray: + """See :py:attr:`~get_levels`.""" + return self.get_levels() + + def get_ordered(self) -> bool: + """ + Returns: + True if the levels are ordered, otherwise False. + """ + return self._ordered + + @property + def ordered(self) -> bool: + """See :py:attr:`~get_ordered`.""" + return self.get_ordered() + + def __len__(self) -> int: + """ + Returns: + Length of the factor in terms of the number of codes. + """ + return len(self._codes) + + def __repr__(self) -> str: + """ + Returns: + A stringified representation of this object. 
+ """ + tmp = "Factor(codes=" + print_truncated_list(self._codes) + ", levels=" + print_truncated_list(self._levels) + if self._ordered: + tmp += ", ordered=True" + tmp += ")" + return tmp + + def __str__(self) -> str: + """ + Returns: + A pretty-printed representation of this object. + """ + message = "Factor of length " + str(len(self._codes)) + " with " + str(len(self._levels)) + " level" + if len(self._levels) != 0: + message += "s" + message += "\n" + message += "values: " + print_truncated_list(self._codes, transform=lambda i: self._levels[i]) + "\n" + message += "levels: " + print_truncated_list(self._levels, transform=lambda x: x) + "\n" + message += "ordered: " + str(self._ordered) + return message + + def __getitem__(self, sub: Union[int, bool, Sequence]) -> Union[str, "Factor"]: + """Subset the ``Factor`` to the specified subset of indices. + + Args: + sub: + Sequence of integers or booleans specifying the elements of + interest. Alternatively, an integer/boolean scalar specifying a + single element. + + Returns: + If ``sub`` is a sequence, returns same type as caller (a new + ``Factor``) containing only the elements of interest from ``sub``. + + If ``sub`` is a scalar, a string is returned containing the + level corresponding to the code at position ``sub``. This may + also be None if the code is missing. + """ + sub, scalar = normalize_subscript(sub, len(self), None) + if scalar: + x = self._codes[sub[0]] + if x >= 0: + return self._levels[x] + else: + return None + return type(self)(self._codes[sub], self._levels, self._ordered, validate=False) + + def replace(self, sub: Sequence, value: Union[str, "Factor"], in_place: bool = False): + """ + Replace items in the ``Factor`` list. The ``sub`` elements in the + current object are replaced with the corresponding values in ``value``.
+ This is performed by finding the level for each entry of the + replacement ``value``, matching it to a level in the current object, + and replacing the entry of ``codes`` with the code of the matched + level. If there is no matching level, a missing value is inserted. + + Args: + sub: + Sequence of integers or booleans specifying the items to be + replaced. + + value: + If ``sub`` is a sequence, a ``Factor`` of the same length + containing the replacement values. + + in_place: + Whether the replacement should be performed on the current + object. + + Returns: + If ``in_place = False``, a new ``Factor`` is returned containing the + contents of the current object after replacement by ``value``. + + If ``in_place = True``, the current object is returned after its + items have been replaced. + """ + sub, scalar = normalize_subscript(sub, len(self), None) + codes = self._codes + if not in_place: + codes = codes.copy() + + if len(self._levels) == len(value._levels) and (self._levels == value._levels).all(): + for i, x in enumerate(sub): + codes[x] = value._codes[i] + else: + mapping = match(value._levels, self._levels) + for i, x in enumerate(sub): + v = value._codes[i] + if v >= 0: + codes[x] = mapping[v] + else: + codes[x] = -1 + + if in_place: + self._codes = codes + return self + else: + return type(self)(codes, self._levels, self._ordered, validate=False) + + def __setitem__(self, args: Sequence[int], value: "Factor"): + """See :py:attr:`~replace` for details.""" + return self.replace(args, value, in_place=True) + + def drop_unused_levels(self, in_place: bool = False) -> "Factor": + """Drop unused levels. + + Args: + in_place: Whether to perform this modification in-place. + + Returns: + If ``in_place = False``, returns same type as caller (a new ``Factor`` object) + where all unused levels have been removed. + + If ``in_place = True``, unused levels are removed from the + current object; a reference to the current object is returned. 
+ """ + if in_place: + new_codes = self._codes + else: + new_codes = self._codes.copy() + + in_use = [False] * len(self._levels) + for x in self._codes: + if x >= 0: + in_use[x] = True + + new_levels = [] + reindex = [-1] * len(in_use) + for i, x in enumerate(in_use): + if x: + reindex[i] = len(new_levels) + new_levels.append(self._levels[i]) + new_levels = numpy.array(new_levels) + + for i, x in enumerate(self._codes): + if x >= 0: + new_codes[i] = reindex[x] + + if in_place: + self._codes = new_codes + self._levels = new_levels + return self + else: + current_class_const = type(self) + return current_class_const(new_codes, new_levels, self._ordered, validate=False) + + def set_levels(self, levels: Union[str, List[str]], in_place: bool = False) -> "Factor": + """Set or replace levels. + + Args: + levels: + A list of replacement levels. These should be unique strings + with no missing values. + + Alternatively a single string containing an existing level in + this object. The new levels are defined as a permutation of the + existing levels where the provided string is now the first + level. The order of all other levels is preserved. + + in_place: + Whether to perform this modification in-place. + + Returns: + If ``in_place = False``, returns same type as caller (a new + ``Factor`` object) where the levels have been replaced. This will + automatically update the codes so that they still refer to the same + string in the new ``levels``. If a code refers to a level that is + not present in the new ``levels``, it is replaced with -1 (missing). + + If ``in_place = True``, the levels are replaced in the current + object, and a reference to the current object is returned.
+ """ + lmapping = {} + if isinstance(levels, str): + new_levels = [levels] + for x in self._levels: + if x == levels: + lmapping[x] = 0 + else: + lmapping[x] = len(new_levels) + new_levels.append(x) + if levels not in lmapping: + raise ValueError( + "string 'levels' should already be present among object levels" + ) + else: + new_levels = numpy.array(levels) + _check_levels_type(new_levels) + for i, x in enumerate(new_levels): + if x in lmapping: + raise ValueError("levels should be unique") + lmapping[x] = i + + mapping = [-1] * len(self._levels) + for i, x in enumerate(self._levels): + if x in lmapping: + mapping[i] = lmapping[x] + + if in_place: + new_codes = self._codes + else: + new_codes = self._codes.copy() + for i, x in enumerate(new_codes): + if x >= 0: + new_codes[i] = mapping[x] + else: + new_codes[i] = -1 + + if in_place: + self._codes = new_codes + self._levels = new_levels + return self + else: + current_class_const = type(self) + return current_class_const(new_codes, new_levels, self._ordered, validate=False) + + @levels.setter + def levels(self, levels: Union[str, List[str]]): + """See :py:attr:`~set_levels`.""" + warn("Setting property 'levels'is an in-place operation, use 'set_levels' instead", UserWarning) + self.set_levels(levels, in_place=True) + + def __copy__(self) -> "Factor": + """ + Returns: + A shallow copy of the ``Factor`` object. + """ + current_class_const = type(self) + return current_class_const(self._codes, self._levels, self._ordered, validate=False) + + def __deepcopy__(self, memo) -> "Factor": + """ + Returns: + A deep copy of the ``Factor`` object. + """ + current_class_const = type(self) + return current_class_const( + deepcopy(self._codes, memo), + deepcopy(self._levels, memo), + self._ordered, + validate=False, + ) + + def to_pandas(self): + """Coerce to :py:class:`~pandas.Categorical` object. + + Returns: + Categorical: A :py:class:`~pandas.Categorical` object. 
+ """ + from pandas import Categorical + return Categorical( + values=[self._levels[c] for c in self._codes], + ordered=self._ordered, + ) + + @staticmethod + def from_sequence(x: Sequence[str], levels: Optional[Sequence[str]] = None, sort_levels: bool = True, ordered: bool = False) -> "Factor": + """Convert a sequence of hashable values into a factor. + + Args: + x: + A sequence of strings. Any value may be None to indicate + missingness. + + levels: + Sequence of reference levels, against which the entries in ``x`` are compared. + If None, this defaults to all unique values of ``x``. + + sort_levels: + Whether to sort the automatically-determined levels. If False, + the levels are kept in order of their appearance in ``x``. Not + used if ``levels`` is explicitly supplied. + + ordered (bool): + Whether the levels should be assumed to be ordered. Note that + this refers to their importance and has nothing to do with + their sorting order or with the setting of ``sort_levels``. + + Returns: + A ``Factor`` object. 
+ """ + levels, indices = factorize(x, levels=levels, sort_levels=sort_levels) + return Factor(indices, levels=levels, ordered=ordered) diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py index 390188b..28c74c6 100644 --- a/src/biocutils/__init__.py +++ b/src/biocutils/__init__.py @@ -15,9 +15,11 @@ finally: del version, PackageNotFoundError -from .factor import factor +from .Factor import Factor +from .factorize import factorize from .intersect import intersect from .is_list_of_type import is_list_of_type +from .is_missing_scalar import is_missing_scalar from .map_to_index import map_to_index from .match import match from .normalize_subscript import normalize_subscript diff --git a/src/biocutils/factor.py b/src/biocutils/factor.py deleted file mode 100644 index 3fc7391..0000000 --- a/src/biocutils/factor.py +++ /dev/null @@ -1,51 +0,0 @@ -from typing import Optional, Sequence, Tuple - - -def factor( - x: Sequence, levels: Optional[Sequence] = None, sort_levels: bool = False -) -> Tuple[list, list]: - """Convert a sequence of hashable values into a factor. - - Args: - x (Sequence): A sequence of hashable values. - Any value may be None to indicate missingness. - - levels (Sequence, optional): - Sequence of reference levels, against which the entries in ``x`` are compared. - If None, this defaults to all unique values of ``x``. - - sort_levels (bool): - Whether to sort the automatically-determined levels. - If False, the levels are kept in order of their appearance in ``x``. - Not used if ``levels`` is explicitly supplied. - - Returns: - Tuple[list, list]: Tuple where the first list contains the unique levels - and the second list contains the integer index into the first list. - Indexing the first list by the second list will recover ``x``, except - for any None values in ``x``, which will be None in the second list. 
- """ - - if levels is None: - present = set() - levels = [] - for val in x: - if val is not None and val not in present: - levels.append(val) - present.add(val) - if sort_levels: - levels.sort() - - mapping = {} - for i, lev in enumerate(levels): - if lev is not None and lev not in mapping: - mapping[lev] = i - - indices = [] - for i, val in enumerate(x): - if val is None or val not in mapping: - indices.append(None) - else: - indices.append(mapping[val]) - - return levels, indices diff --git a/src/biocutils/factorize.py b/src/biocutils/factorize.py new file mode 100644 index 0000000..ed70e01 --- /dev/null +++ b/src/biocutils/factorize.py @@ -0,0 +1,43 @@ +from typing import Optional, Sequence, Tuple +import numpy + +from .match import match +from .is_missing_scalar import is_missing_scalar + + +def factorize(x: Sequence, levels: Optional[Sequence] = None, sort_levels: bool = False) -> Tuple[list, numpy.ndarray]: + """Convert a sequence of hashable values into a factor. + + Args: + x: + A sequence of hashable values. + Any value may be None to indicate missingness. + + levels: + Sequence of reference levels, against which the entries in ``x`` are compared. + If None, this defaults to all unique values of ``x``. + + sort_levels: + Whether to sort the automatically-determined levels. + If False, the levels are kept in order of their appearance in ``x``. + Not used if ``levels`` is explicitly supplied. + + Returns: + Tuple where the first list contains the unique levels and the second + array contains the integer index into the first list. Indexing the + first list by the second array will recover ``x``; except for any None + or masked values in ``x``, which will be -1 in the second array. 
+ """ + + if levels is None: + present = set() + levels = [] + for val in x: + if not is_missing_scalar(val) and val not in present: + levels.append(val) + present.add(val) + if sort_levels: + levels.sort() + + codes = match(x, levels) + return levels, codes diff --git a/src/biocutils/is_missing_scalar.py b/src/biocutils/is_missing_scalar.py new file mode 100644 index 0000000..ea68aec --- /dev/null +++ b/src/biocutils/is_missing_scalar.py @@ -0,0 +1,12 @@ +import numpy + + +def is_missing_scalar(x) -> bool: + """ + Args: + x: + Any scalar value. + Returns: + Whether ``x`` is None or a NumPy masked constant. + """ + return x is None or numpy.ma.is_masked(x) diff --git a/src/biocutils/match.py b/src/biocutils/match.py index 8db953a..9244b10 100644 --- a/src/biocutils/match.py +++ b/src/biocutils/match.py @@ -1,13 +1,10 @@ from typing import List, Sequence, Union +import numpy from .map_to_index import DUPLICATE_METHOD, map_to_index -def match( - x: Sequence, - targets: Union[dict, Sequence], - duplicate_method: DUPLICATE_METHOD = "first", -) -> List[Union[int, None]]: +def match(x: Sequence, targets: Union[dict, Sequence], duplicate_method: DUPLICATE_METHOD = "first") -> numpy.ndarray: """Find a matching value of each element of ``x`` in ``target``. Args: @@ -23,12 +20,14 @@ def match( integer position of each entry of ``x`` inside ``target``; or None, if the entry of ``x`` is None or cannot be found in ``target``. 
""" - if isinstance(targets, Sequence): + if not isinstance(targets, dict): targets = map_to_index(targets, duplicate_method=duplicate_method) - indices = [] + + indices = numpy.zeros(len(x), dtype=numpy.min_scalar_type(-len(targets))) # get a signed type for i, y in enumerate(x): - if y is None or y not in targets: - indices.append(None) + if y not in targets: + indices[i] = -1 else: - indices.append(targets[y]) + indices[i] = targets[y] + return indices diff --git a/tests/test_Factor.py b/tests/test_Factor.py new file mode 100644 index 0000000..99b9dad --- /dev/null +++ b/tests/test_Factor.py @@ -0,0 +1,199 @@ +from biocutils import Factor +import pytest +import copy + + +def test_Factor_basics(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + assert len(f) == 6 + assert list(f) == ["A", "B", "C", "A", "C", "E"] + assert f.get_codes() == [0, 1, 2, 0, 2, 4] + assert f.get_levels() == ["A", "B", "C", "D", "E"] + assert not f.get_ordered() + + with pytest.raises(TypeError) as ex: + Factor([0, "WHEE"], ["A", "B"]) + assert str(ex.value).find("should be integers") >= 0 + + with pytest.raises(TypeError) as ex: + Factor([0, 1], ["A", None, "B"]) + assert str(ex.value).find("non-missing strings") >= 0 + + with pytest.raises(ValueError) as ex: + Factor([0, 1, -1], ["A"]) + assert str(ex.value).find("refer to an entry") >= 0 + + with pytest.raises(ValueError) as ex: + Factor([0, 1], ["A", "B", "A"]) + assert str(ex.value).find("should be unique") >= 0 + + f = Factor([None] * 10, levels=["A", "B", "C", "D", "E"]) + assert list(f) == [None] * 10 + + +def test_Factor_basics(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + assert repr(f).startswith("Factor(") + assert str(f).startswith("Factor of length") + + f = Factor([0, 1, 4, 2, 0, 3, 1, 3, 2, 4], levels=["A", "B", "C", "D", "E"]) + assert repr(f).startswith("Factor(") + assert str(f).startswith("Factor of length") + + f = Factor([], levels=["A", "B", "C", "D", "E"]) + assert 
repr(f).startswith("Factor(") + assert str(f).startswith("Factor of length") + + f = Factor([1], levels=["A", "B", "C", "D", "E"]) + assert repr(f).startswith("Factor(") + assert str(f).startswith("Factor of length") + + f = Factor([i % 5 for i in range(100)], levels=["A", "B", "C", "D", "E"]) + assert repr(f).startswith("Factor(") + assert str(f).startswith("Factor of length") + + +def test_Factor_getitem(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + assert f[0] == "A" + assert f[2] == "C" + assert f[-1] == "E" + + f2 = f[2:4] + assert list(f2.get_codes()) == [2, 0] + assert (f2.get_levels() == f.get_levels()).all() + + f2 = f[[1, 3, 5]] + assert list(f2.get_codes()) == [1, 0, 4] + assert (f2.get_levels() == f.get_levels()).all() + + f2 = f[[-1, -2, -3]] + assert list(f2.get_codes()) == [4, 2, 0] + assert (f2.get_levels() == f.get_levels()).all() + + +def test_Factor_setitem(): + f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) + f2 = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) + + f[0:2] = f2[2:4] + assert list(f.get_codes()) == [2, 3, 2, 3, 2, 1] + assert list(f.get_levels()) == ["A", "B", "C", "D", "E"] + + f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) + f2 = Factor([0, 1, 2, 3, 2, 1], levels=["E", "D", "C", "B", "A"]) + f[[-3, -2, -1]] = f2[0:3] + assert list(f.get_codes()) == [0, 1, 2, 4, 3, 2] + assert list(f.get_levels()) == ["A", "B", "C", "D", "E"] + + f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) + f2 = Factor([0, 1, 2, 3, 2, 1], levels=["e", "d", "c", "b", "a"]) + f[:] = f2[:] + assert list(f.get_codes()) == [-1] * 6 + assert list(f.get_levels()) == ["A", "B", "C", "D", "E"] + + +def test_Factor_drop_unused_levels(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f2 = f.drop_unused_levels() + assert list(f2.get_levels()) == ["A", "B", "C", "E"] + assert list(f2) == list(f) + + f = Factor([3, 4, 2, 3, 2, 4], levels=["A", "B", "C", "D", 
"E"]) + f2 = f.drop_unused_levels(in_place=True) + assert list(f2.get_levels()) == ["C", "D", "E"] + assert list(f2) == ["D", "E", "C", "D", "C", "E"] + + +def test_Factor_set_levels(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f2 = f.set_levels(["E", "D", "C", "B", "A"]) + assert list(f2.get_levels()) == ["E", "D", "C", "B", "A"] + assert list(f2.get_codes()) == [4, 3, 2, 4, 2, 0] + assert list(f2) == list(f) + + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f2 = f.set_levels(["E", "C", "A"], in_place=True) + assert list(f2.get_levels()) == ["E", "C", "A"] + assert list(f2.get_codes()) == [2, -1, 1, 2, 1, 0] + + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f2 = f.set_levels("E") # reorders + assert list(f2.get_levels()) == ["E", "A", "B", "C", "D"] + assert list(f2.get_codes()) == [1, 2, 3, 1, 3, 0] + + with pytest.raises(ValueError) as ex: + f.set_levels("F") + assert str(ex.value).find("should already be present") >= 0 + + with pytest.raises(TypeError) as ex: + f.set_levels([None, "A"]) + assert str(ex.value).find("should be strings") >= 0 + + with pytest.raises(ValueError) as ex: + f.set_levels(["A", "A"]) + assert str(ex.value).find("should be unique") >= 0 + + +def test_Factor_copy(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + out = copy.copy(f) + assert (f.get_codes() == out.get_codes()).all() + assert (f.get_levels() == out.get_levels()).all() + + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + out = copy.deepcopy(f) + assert (f.get_codes() == out.get_codes()).all() + assert (f.get_levels() == out.get_levels()).all() + + +#def test_Factor_combine(): +# # Same levels. +# f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"]) +# f2 = Factor([1, 3, 1], levels=["A", "B", "C", "D", "E"]) +# out = combine(f1, f2) +# assert out.get_levels() == f2.get_levels() +# assert out.get_codes() == [0, 2, 4, 2, 0, 1, 3, 1] +# +# # Different levels. 
+# f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"]) +# f2 = Factor([1, 3, 1], levels=["D", "E", "F", "G"]) +# out = combine(f1, f2) +# assert out.get_levels() == ["A", "B", "C", "D", "E", "F", "G"] +# assert out.get_codes() == [0, 2, 4, 2, 0, 4, 6, 4] +# +# f2 = Factor([1, 3, None], levels=["D", "E", "F", "G"]) +# out = combine(f1, f2) +# assert out.get_codes() == [0, 2, 4, 2, 0, 4, 6, None] +# +# # Ordering is preserved for the same levels, lost otherwise. +# f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"], ordered=True) +# f2 = Factor([1, 3, 1], levels=["A", "B", "C", "D", "E"], ordered=True) +# out = combine(f1, f2) +# assert out.get_ordered() +# +# f2 = Factor([1, 3, 2], levels=["D", "E", "F", "G"], ordered=True) +# out = combine(f1, f2) +# assert not out.get_ordered() + + +def test_Factor_pandas(): + import pandas as pd + f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"]) + pcat = f1.to_pandas() + assert pcat is not None + assert len(pcat) == len(f1) + + f2 = Factor([1, 3, 2], levels=["D", "E", "F", "G"], ordered=True) + pcat = f2.to_pandas() + assert pcat is not None + assert len(pcat) == len(f2) + assert pcat.ordered == f2.get_ordered() + + +def test_Factor_init_from_list(): + f1 = Factor.from_sequence(["A", "B", "A", "B", "E"]) + + assert isinstance(f1, Factor) + assert len(f1) == 5 + assert len(f1.get_levels()) == 3 diff --git a/tests/test_factor.py b/tests/test_factor.py deleted file mode 100644 index 44e6337..0000000 --- a/tests/test_factor.py +++ /dev/null @@ -1,49 +0,0 @@ -from biocutils import factor - - -def test_factor_simple(): - lev, ind = factor([1, 3, 5, 5, 3, 1]) - assert lev == [1, 3, 5] - assert ind == [0, 1, 2, 2, 1, 0] - - # Preserves the order. - lev, ind = factor(["C", "D", "A", "B", "C", "A"]) - assert lev == ["C", "D", "A", "B"] - assert ind == [0, 1, 2, 3, 0, 2] - - # Handles None-ness. 
- lev, ind = factor([1, None, 5, None, 3, None]) - assert lev == [1, 5, 3] - assert ind == [0, None, 1, None, 2, None] - - -def test_factor_levels(): - revlev = [5, 4, 3, 2, 1] - lev, ind = factor([1, 3, 5, 5, 3, 1], levels=revlev) - assert lev == revlev - assert ind == [4, 2, 0, 0, 2, 4] - - # Preserves duplicates. - duplicated = [5, 4, 5, 4, 3, 4, 2, 3, 1, 1, 2] - lev, ind = factor([1, 3, 5, 5, 3, 1], levels=duplicated) - assert lev == duplicated - assert ind == [8, 4, 0, 0, 4, 8] - - # Ignores None. - noney = [None, 1, 2, 3, 4, 5, None] - lev, ind = factor([1, 3, 5, 5, 3, 1], levels=noney) - assert lev == noney - assert ind == [1, 3, 5, 5, 3, 1] - - -def test_factor_sorted(): - lev, ind = factor(["C", "D", "A", "B", "C", "A"], sort_levels=True) - assert lev == ["A", "B", "C", "D"] - assert ind == [2, 3, 0, 1, 2, 0] - - # Not affected if you supply the levels directly. - lev, ind = factor( - ["C", "D", "A", "B", "C", "A"], levels=["D", "C", "B", "A"], sort_levels=True - ) - assert lev == ["D", "C", "B", "A"] - assert ind == [1, 0, 3, 2, 1, 3] diff --git a/tests/test_factorize.py b/tests/test_factorize.py new file mode 100644 index 0000000..b3ba479 --- /dev/null +++ b/tests/test_factorize.py @@ -0,0 +1,49 @@ +from biocutils import factorize + + +def test_factor_simple(): + lev, ind = factorize([1, 3, 5, 5, 3, 1]) + assert lev == [1, 3, 5] + assert list(ind) == [0, 1, 2, 2, 1, 0] + + # Preserves the order. + lev, ind = factorize(["C", "D", "A", "B", "C", "A"]) + assert lev == ["C", "D", "A", "B"] + assert list(ind) == [0, 1, 2, 3, 0, 2] + + # Handles None-ness. + lev, ind = factorize([1, None, 5, None, 3, None]) + assert lev == [1, 5, 3] + assert list(ind) == [0, -1, 1, -1, 2, -1] + + +def test_factor_levels(): + revlev = [5, 4, 3, 2, 1] + lev, ind = factorize([1, 3, 5, 5, 3, 1], levels=revlev) + assert lev == revlev + assert list(ind) == [4, 2, 0, 0, 2, 4] + + # Preserves duplicates. 
+ duplicated = [5, 4, 5, 4, 3, 4, 2, 3, 1, 1, 2] + lev, ind = factorize([1, 3, 5, 5, 3, 1], levels=duplicated) + assert lev == duplicated + assert list(ind) == [8, 4, 0, 0, 4, 8] + + # Ignores None. + noney = [None, 1, 2, 3, 4, 5, None] + lev, ind = factorize([1, 3, 5, 5, 3, 1], levels=noney) + assert lev == noney + assert list(ind) == [1, 3, 5, 5, 3, 1] + + +def test_factor_sorted(): + lev, ind = factorize(["C", "D", "A", "B", "C", "A"], sort_levels=True) + assert lev == ["A", "B", "C", "D"] + assert list(ind) == [2, 3, 0, 1, 2, 0] + + # Not affected if you supply the levels directly. + lev, ind = factorize( + ["C", "D", "A", "B", "C", "A"], levels=["D", "C", "B", "A"], sort_levels=True + ) + assert lev == ["D", "C", "B", "A"] + assert list(ind) == [1, 0, 3, 2, 1, 3] diff --git a/tests/test_match.py b/tests/test_match.py index 23af59b..55de7da 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -6,24 +6,24 @@ def test_match_simple(): levels = ["D", "C", "B", "A"] mm = match(x, levels) - assert mm == [3, 1, 2, 0, 3, 3, 1, 0, 2] + assert list(mm) == [3, 1, 2, 0, 3, 3, 1, 0, 2] mm2 = match(x, map_to_index(levels)) - assert mm == mm2 + assert (mm == mm2).all() def test_match_duplicates(): x = [5, 1, 2, 3, 5, 6, 7, 7, 2, 1] mm = match(x, [1, 2, 3, 3, 5, 6, 1, 7, 6]) - assert mm == [4, 0, 1, 2, 4, 5, 7, 7, 1, 0] + assert list(mm) == [4, 0, 1, 2, 4, 5, 7, 7, 1, 0] mm = match(x, [1, 2, 3, 3, 5, 6, 1, 7, 6], duplicate_method="last") - assert mm == [4, 6, 1, 3, 4, 8, 7, 7, 1, 6] + assert list(mm) == [4, 6, 1, 3, 4, 8, 7, 7, 1, 6] def test_match_none(): mm = match(["A", None, "B", "D", None, "A", "C", None, "B"], ["D", "C", "B", "A"]) - assert list(mm) == [3, None, 2, 0, None, 3, 1, None, 2] + assert list(mm) == [3, -1, 2, 0, -1, 3, 1, -1, 2] mm = match(["A", "B", "D", "A", "C", "B"], ["D", None, "C", "B", None, "A"]) assert list(mm) == [5, 3, 0, 5, 2, 3] diff --git a/tests/test_package_utils.py b/tests/test_package_utils.py index 46a571c..2e023a8 100644 --- 
a/tests/test_package_utils.py +++ b/tests/test_package_utils.py @@ -5,11 +5,6 @@ __license__ = "MIT" -def test_for_pandas(): - pkg = is_package_installed("pandas") - - assert pkg is False - def test_for_scipy(): pkg = is_package_installed("scipy")