Skip to content

Commit

Permalink
Updated Factor to use a StringList for the levels.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Nov 8, 2023
1 parent abee243 commit 4eca0a3
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 55 deletions.
66 changes: 30 additions & 36 deletions src/biocutils/Factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from warnings import warn
import numpy

from .StringList import StringList
from .match import match
from .factorize import factorize
from .normalize_subscript import normalize_subscript
Expand All @@ -12,15 +13,6 @@
from .is_list_of_type import is_list_of_type


def _check_levels_type(levels: numpy.ndarray):
if not numpy.issubdtype(levels.dtype, numpy.str_):
raise TypeError("all entries of 'levels' should be strings")
if numpy.ma.is_masked(levels):
raise TypeError("all entries of 'levels' should be non-missing")
if len(levels.shape) != 1:
raise TypeError("'codes' should be a 1-dimensional array")


class Factor:
"""Factor class, equivalent to R's ``factor``.
Expand Down Expand Up @@ -57,28 +49,28 @@ def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool =
else:
replacement[i] = x
codes = replacement
elif not numpy.issubdtype(codes.dtype, numpy.signedinteger): # force it to be signed.
codes = codes.astype(numpy.min_scalar_type(-len(levels)))
self._codes = codes
else:
if len(codes.shape) != 1:
raise ValueError("'codes' should be a 1-dimensional array")
if not numpy.issubdtype(codes.dtype, numpy.signedinteger): # force it to be signed.
codes = codes.astype(numpy.min_scalar_type(-len(levels)))

if not isinstance(levels, numpy.ndarray):
levels = numpy.array(levels, dtype=str)
self._levels = levels
if not isinstance(levels, StringList):
levels = StringList(levels)

self._codes = codes
self._levels = levels
self._ordered = bool(ordered)

if validate:
if len(self._codes.shape) != 1:
raise TypeError("'codes' should be a 1-dimensional array")

_check_levels_type(self._levels)

if any(x is None for x in levels):
raise TypeError("all entries of 'levels' should be non-missing")
if len(set(levels)) < len(levels):
raise ValueError("all entries of 'levels' should be unique")
for x in codes:
if x >= len(self._levels):
if x < -1 or x >= len(self._levels):
raise ValueError("all entries of 'codes' should refer to an entry of 'levels'")

if len(set(self._levels)) < len(self._levels):
raise ValueError("all entries of 'levels' should be unique")

def get_codes(self) -> numpy.ndarray:
"""
Expand All @@ -94,15 +86,15 @@ def codes(self) -> numpy.ndarray:
"""See :py:attr:`~get_codes`."""
return self.get_codes()

def get_levels(self) -> numpy.ndarray:
def get_levels(self) -> StringList:
"""
Returns:
Array of strings containing the factor levels.
List of strings containing the factor levels.
"""
return self._levels

@property
def levels(self) -> numpy.ndarray:
def levels(self) -> StringList:
"""See :py:attr:`~get_levels`."""
return self.get_levels()

Expand Down Expand Up @@ -210,7 +202,7 @@ def replace(self, sub: Sequence, value: Union[str, "Factor"], in_place: bool = F
if not in_place:
codes = codes.copy()

if len(self._levels) == len(value._levels) and (self._levels == value._levels).all():
if self._levels == value._levels:
for i, x in enumerate(sub):
codes[x] = value._codes[i]
else:
Expand Down Expand Up @@ -255,13 +247,12 @@ def drop_unused_levels(self, in_place: bool = False) -> "Factor":
if x >= 0:
in_use[x] = True

new_levels = []
new_levels = StringList([])
reindex = [-1] * len(in_use)
for i, x in enumerate(in_use):
if x:
reindex[i] = len(new_levels)
new_levels.append(self._levels[i])
new_levels = numpy.array(new_levels)

for i, x in enumerate(self._codes):
if x >= 0:
Expand All @@ -275,13 +266,13 @@ def drop_unused_levels(self, in_place: bool = False) -> "Factor":
current_class_const = type(self)
return current_class_const(new_codes, new_levels, self._ordered, validate=False)

def set_levels(self, levels: Union[str, List[str]], in_place: bool = False) -> "Factor":
def set_levels(self, levels: Union[str, Sequence[str]], in_place: bool = False) -> "Factor":
"""Set or replace levels.
Args:
levels:
A list of replacement levels. These should be unique strings
with no missing values.
A sequence of replacement levels. These should be unique
strings with no missing values.
Alternatively a single string containing an existing level in
this object. The new levels are defined as a permutation of the
Expand Down Expand Up @@ -315,11 +306,14 @@ def set_levels(self, levels: Union[str, List[str]], in_place: bool = False) -> "
"string 'levels' should already be present among object levels"
)
else:
new_levels = numpy.array(levels)
_check_levels_type(new_levels)
new_levels = levels
if not isinstance(new_levels, StringList):
new_levels = StringList(levels)
for i, x in enumerate(new_levels):
if x is None:
raise TypeError("all entries of 'levels' should be non-missing")
if x in lmapping:
raise ValueError("levels should be unique")
raise ValueError("all entries of 'levels' should be unique")
lmapping[x] = i

mapping = [-1] * len(self._levels)
Expand Down Expand Up @@ -424,7 +418,7 @@ def _combine_factors(*x: Factor):
all_same = True
for f in x[1:]:
cur_levels = f._levels
if len(cur_levels) != len(first_levels) or (cur_levels != first_levels).any() or f._ordered != first._ordered:
if cur_levels != first_levels or f._ordered != first._ordered:
all_same = False
break

Expand Down
38 changes: 19 additions & 19 deletions tests/test_Factor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from biocutils import Factor, combine
from biocutils import Factor, combine, StringList
import pytest
import copy
import numpy
Expand All @@ -9,7 +9,7 @@ def test_Factor_basics():
assert len(f) == 6
assert list(f) == ["A", "B", "C", "A", "C", "E"]
assert list(f.get_codes()) == [0, 1, 2, 0, 2, 4]
assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
assert f.get_levels() == ["A", "B", "C", "D", "E"]
assert not f.get_ordered()

# Works with missing values.
Expand All @@ -25,7 +25,7 @@ def test_Factor_basics():
f = Factor(numpy.array([4,3,2,1,0], dtype=numpy.uint8), levels=numpy.array(["A", "B", "C", "D", "E"]))
assert len(f) == 5
assert f.get_codes().dtype == numpy.int8
assert numpy.issubdtype(f.get_levels().dtype, numpy.str_)
assert isinstance(f.get_levels(), StringList)

with pytest.raises(ValueError) as ex:
Factor([0, 1, 100], ["A"])
Expand Down Expand Up @@ -66,15 +66,15 @@ def test_Factor_getitem():

f2 = f[2:4]
assert list(f2.get_codes()) == [2, 0]
assert (f2.get_levels() == f.get_levels()).all()
assert f2.get_levels() == f.get_levels()

f2 = f[[1, 3, 5]]
assert list(f2.get_codes()) == [1, 0, 4]
assert (f2.get_levels() == f.get_levels()).all()
assert f2.get_levels() == f.get_levels()

f2 = f[[-1, -2, -3]]
assert list(f2.get_codes()) == [4, 2, 0]
assert (f2.get_levels() == f.get_levels()).all()
assert f2.get_levels() == f.get_levels()


def test_Factor_setitem():
Expand All @@ -83,48 +83,48 @@ def test_Factor_setitem():

f[0:2] = f2[2:4]
assert list(f.get_codes()) == [2, 3, 2, 3, 2, 1]
assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
assert f.get_levels() == ["A", "B", "C", "D", "E"]

f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"])
f2 = Factor([0, 1, 2, 3, 2, 1], levels=["E", "D", "C", "B", "A"])
f[[-3, -2, -1]] = f2[0:3]
assert list(f.get_codes()) == [0, 1, 2, 4, 3, 2]
assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
assert f.get_levels() == ["A", "B", "C", "D", "E"]

f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"])
f2 = Factor([0, 1, 2, 3, 2, 1], levels=["e", "d", "c", "b", "a"])
f[:] = f2[:]
assert list(f.get_codes()) == [-1] * 6
assert list(f.get_levels()) == ["A", "B", "C", "D", "E"]
assert f.get_levels() == ["A", "B", "C", "D", "E"]


def test_Factor_drop_unused_levels():
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.drop_unused_levels()
assert list(f2.get_levels()) == ["A", "B", "C", "E"]
assert f2.get_levels() == ["A", "B", "C", "E"]
assert list(f2) == list(f)

f = Factor([3, 4, 2, 3, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.drop_unused_levels(in_place=True)
assert list(f2.get_levels()) == ["C", "D", "E"]
assert f2.get_levels() == ["C", "D", "E"]
assert list(f2) == ["D", "E", "C", "D", "C", "E"]


def test_Factor_set_levels():
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.set_levels(["E", "D", "C", "B", "A"])
assert list(f2.get_levels()) == ["E", "D", "C", "B", "A"]
assert f2.get_levels() == ["E", "D", "C", "B", "A"]
assert list(f2.get_codes()) == [4, 3, 2, 4, 2, 0]
assert list(f2) == list(f)

f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.set_levels(["E", "C", "A"], in_place=True)
assert list(f2.get_levels()) == ["E", "C", "A"]
assert f2.get_levels() == ["E", "C", "A"]
assert list(f2.get_codes()) == [2, -1, 1, 2, 1, 0]

f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.set_levels("E") # reorders
assert list(f2.get_levels()) == ["E", "A", "B", "C", "D"]
assert f2.get_levels() == ["E", "A", "B", "C", "D"]
assert list(f2.get_codes()) == [1, 2, 3, 1, 3, 0]

with pytest.raises(ValueError) as ex:
Expand All @@ -133,7 +133,7 @@ def test_Factor_set_levels():

with pytest.raises(TypeError) as ex:
f.set_levels([None, "A"])
assert str(ex.value).find("should be strings") >= 0
assert str(ex.value).find("non-missing") >= 0

with pytest.raises(ValueError) as ex:
f.set_levels(["A", "A"])
Expand All @@ -144,27 +144,27 @@ def test_Factor_copy():
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
out = copy.copy(f)
assert (f.get_codes() == out.get_codes()).all()
assert (f.get_levels() == out.get_levels()).all()
assert f.get_levels() == out.get_levels()

f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
out = copy.deepcopy(f)
assert (f.get_codes() == out.get_codes()).all()
assert (f.get_levels() == out.get_levels()).all()
assert f.get_levels() == out.get_levels()


def test_Factor_combine():
# Same levels.
f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"])
f2 = Factor([1, 3, 1], levels=["A", "B", "C", "D", "E"])
out = combine(f1, f2)
assert (out.get_levels() == f2.get_levels()).all()
assert out.get_levels() == f2.get_levels()
assert list(out.get_codes()) == [0, 2, 4, 2, 0, 1, 3, 1]

# Different levels.
f1 = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"])
f2 = Factor([1, 3, 1], levels=["D", "E", "F", "G"])
out = combine(f1, f2)
assert list(out.get_levels()) == ["A", "B", "C", "D", "E", "F", "G"]
assert out.get_levels() == ["A", "B", "C", "D", "E", "F", "G"]
assert list(out.get_codes()) == [0, 2, 4, 2, 0, 4, 6, 4]

f2 = Factor([1, 3, None], levels=["D", "E", "F", "G"])
Expand Down

0 comments on commit 4eca0a3

Please sign in to comment.