Skip to content

Commit

Permalink
Added new Factor method to replace levels without remapping codes.
Browse files Browse the repository at this point in the history
The new replace_levels() method is much simpler and more consistent with the
levels()<- method in R. The existing set_levels() functionality is moved to the
new remap_levels() to avoid confusion; set_levels() itself is now an alias to
either remap_levels() or replace_levels(), depending on the remap= argument.
The default remap=True has a deprecation warning as I hope to change it to
False, such that set_levels() is eventually the same as replace_levels().
  • Loading branch information
LTLA committed Dec 16, 2024
1 parent 40be075 commit e2c1c7f
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 18 deletions.
81 changes: 77 additions & 4 deletions src/biocutils/Factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, Sequence, Union

import numpy
import warnings

from .assign_sequence import assign_sequence
from .combine_sequences import combine_sequences
Expand Down Expand Up @@ -198,7 +199,7 @@ def get_levels(self) -> StringList:
List of strings containing the factor levels.
This should be treated as a read-only reference. To modify the
levels, use :py:meth:`~set_levels` instead.
levels, use :py:meth:`~replace_levels` instead.
"""
return self._levels

Expand Down Expand Up @@ -562,10 +563,82 @@ def drop_unused_levels(self, in_place: bool = False) -> "Factor":
output._levels = new_levels
return output

def replace_levels(
    self,
    levels: Sequence[str],
    in_place: bool = False,
) -> "Factor":
    """Swap in a new list of levels while leaving the codes untouched.

    The codes keep their current values and index into the replacement
    ``levels``, so an element of the ``Factor`` may refer to a different
    string afterwards. (To change the levels while keeping each element
    tied to the same string, use :py:meth:`~remap_levels` instead.)

    Args:
        levels:
            A sequence of replacement levels. These should be unique
            strings with no missing values, and there should be at least
            as many of them as there are existing levels.

        in_place:
            Whether to perform this modification in-place.

    Returns:
        If ``in_place = False``, a shallow copy of this object carrying
        the replacement levels; the codes are unchanged and may refer to
        different strings.

        If ``in_place = True``, the levels are replaced in the current
        object, and a reference to the current object is returned.

    Raises:
        ValueError: If ``levels`` is shorter than the existing levels,
            contains a missing value, or contains a duplicate.
    """
    # A StringList is used as-is; any other sequence is converted to one.
    replacement = levels if isinstance(levels, StringList) else StringList(levels)

    if len(self._levels) > len(replacement):
        raise ValueError("'levels' should be at least as long as the existing levels")

    # Single pass: report whichever problem (missing or duplicated entry)
    # is encountered first.
    seen = set()
    for entry in replacement:
        if entry is None:
            raise ValueError("all entries of 'levels' should be non-missing")
        if entry in seen:
            raise ValueError("all entries of 'levels' should be unique")
        seen.add(entry)

    target = self if in_place else copy(self)
    target._levels = replacement
    return target

def set_levels(
    self,
    levels: Union[str, Sequence[str]],
    remap: bool = True,
    in_place: bool = False
) -> "Factor":
    """Dispatch to :py:meth:`~remap_levels` or :py:meth:`~replace_levels`.

    Alias for :py:meth:`~remap_levels` if ``remap = True``, otherwise an
    alias for :py:meth:`~replace_levels`. The first alias is deprecated and
    :py:meth:`~remap_levels` should be used directly if that is the intent.

    Args:
        levels:
            Replacement levels, forwarded unchanged to the selected method.

        remap:
            Whether to call :py:meth:`~remap_levels` (``True``, deprecated)
            or :py:meth:`~replace_levels` (``False``).

        in_place:
            Whether to perform this modification in-place; forwarded
            unchanged to the selected method.

    Returns:
        The return value of the selected method.
    """
    if remap:
        # stacklevel=2 attributes the warning to the code that called
        # set_levels() rather than to this module. Python's default filters
        # only display a DeprecationWarning attributed to ``__main__``, so
        # without this the deprecation would never be shown to users.
        warnings.warn(
            "'remap=True' is deprecated, use 'remap_levels()' instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return self.remap_levels(levels, in_place=in_place)
    return self.replace_levels(levels, in_place=in_place)

def remap_levels(
self, levels: Union[str, Sequence[str]], in_place: bool = False
) -> "Factor":
"""Set or replace levels.
"""Remap codes to a replacement list of levels. Each entry of the
remapped ``Factor`` will refer to the same string across the old and
new levels, provided that string is present in both sets of levels.
(To change the levels without altering the codes of the ``Factor``, use
:py:meth:`~replace_levels` instead.)
Args:
levels:
Expand All @@ -585,7 +658,7 @@ def set_levels(
``Factor`` object) where the levels have been replaced. This will
automatically update the codes so that they still refer to the same
string in the new ``levels``. If a code refers to a level that is
not present in the new ``levels``, it is replaced with None.
not present in the new ``levels``, it is set to a missing value.
If ``in_place = True``, the levels are replaced in the current
object, and a reference to the current object is returned.
Expand Down Expand Up @@ -615,7 +688,7 @@ def set_levels(
new_levels = StringList(levels)
for i, x in enumerate(new_levels):
if x is None:
raise TypeError("all entries of 'levels' should be non-missing")
raise ValueError("all entries of 'levels' should be non-missing")
if x in lmapping:
raise ValueError("all entries of 'levels' should be unique")
lmapping[x] = i
Expand Down
61 changes: 47 additions & 14 deletions tests/test_Factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_factor_comparisons():
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
assert f == f
assert f != []
f2 = f.set_levels(["E", "C", "D", "B", "A"])
f2 = f.replace_levels(["E", "C", "D", "B", "A"])
assert f != f2
f2 = f.set_ordered(True)
assert f != f2
Expand Down Expand Up @@ -193,34 +193,67 @@ def test_Factor_drop_unused_levels():
assert list(f2) == ["D", "E", "C", "D", "C", "E"]


def test_Factor_set_levels():
def test_Factor_replace_levels():
    """replace_levels() swaps the level list and leaves the codes untouched."""
    # Out-of-place: same codes, new levels, so the decoded strings differ.
    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
    f2 = f.replace_levels(["E", "D", "C", "B", "A"])
    assert f2.get_levels().as_list() == ["E", "D", "C", "B", "A"]
    assert (f2.get_codes() == f.get_codes()).all()
    assert list(f2) != list(f)

    # In-place: the same object comes back. The codes are snapshotted first,
    # because comparing f2's codes with f's afterwards would compare the
    # object with itself and could never fail.
    f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
    codes_before = list(f.get_codes())
    f2 = f.replace_levels(["G", "F", "E", "D", "C", "B", "A"], in_place=True)
    assert f2 is f
    assert f2.get_levels().as_list() == ["G", "F", "E", "D", "C", "B", "A"]
    assert list(f2.get_codes()) == codes_before

    # Fewer replacement levels than existing levels is rejected.
    with pytest.raises(ValueError, match="at least as long"):
        f.replace_levels(["F"])

    # The inputs below are repeated so they are long enough to get past the
    # length check and reach the per-entry validation.
    with pytest.raises(ValueError, match="non-missing"):
        f.replace_levels([None, "A"] * 10)

    with pytest.raises(ValueError, match="should be unique"):
        f.replace_levels(["A"] * 10)


# Exercises remap_levels(): codes are rewritten so that every element keeps
# referring to the same string under the new levels.
#
# NOTE(review): each `set_levels(...)` statement below sits directly above an
# otherwise identical `remap_levels(...)` statement, and the statement
# indentation is missing. This looks like the removed and added sides of a
# diff shown together - confirm before treating this text as runnable code.
def test_Factor_remap_levels():
# Case 1: all five levels supplied in reverse order; codes are re-indexed and
# the decoded strings are unchanged.
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.set_levels(["E", "D", "C", "B", "A"])
f2 = f.remap_levels(["E", "D", "C", "B", "A"])
assert f2.get_levels().as_list() == ["E", "D", "C", "B", "A"]
assert list(f2.get_codes()) == [4, 3, 2, 4, 2, 0]
assert list(f2) == list(f)

# Case 2: in-place remap onto a subset of the levels; the element that
# referred to "B" (absent from the new levels) is expected to get code -1.
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.set_levels(["E", "C", "A"], in_place=True)
f2 = f.remap_levels(["E", "C", "A"], in_place=True)
assert f2.get_levels().as_list() == ["E", "C", "A"]
assert list(f2.get_codes()) == [2, -1, 1, 2, 1, 0]

# Case 3: a single string argument; the expected levels have "E" first with
# the remaining levels in their previous order.
f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
f2 = f.set_levels("E") # reorders
f2 = f.remap_levels("E") # reorders
assert f2.get_levels().as_list() == ["E", "A", "B", "C", "D"]
assert list(f2.get_codes()) == [1, 2, 3, 1, 3, 0]

# Error case: a single string that is not an existing level.
with pytest.raises(ValueError) as ex:
f.set_levels("F")
assert str(ex.value).find("should already be present") >= 0
with pytest.raises(ValueError, match="should already be present"):
f.remap_levels("F")

# Error case: a missing (None) level. The first form expects TypeError and
# the second expects ValueError for the same input.
with pytest.raises(TypeError) as ex:
f.set_levels([None, "A"])
assert str(ex.value).find("non-missing") >= 0
with pytest.raises(ValueError, match="non-missing") as ex:
f.remap_levels([None, "A"])

# Error case: duplicated levels.
with pytest.raises(ValueError) as ex:
f.set_levels(["A", "A"])
assert str(ex.value).find("should be unique") >= 0
with pytest.raises(ValueError, match="should be unique") as ex:
f.remap_levels(["A", "A"])


def test_Factor_set_levels():
    """set_levels() forwards to replace_levels() or remap_levels() based on ``remap``."""
    original = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"])
    reversed_levels = ["E", "D", "C", "B", "A"]

    # remap=False: levels are swapped and the codes stay as they were.
    replaced = original.set_levels(reversed_levels, remap=False)
    assert replaced.get_levels().as_list() == ["E", "D", "C", "B", "A"]
    assert (replaced.get_codes() == original.get_codes()).all()

    # remap=True: deprecated path; the decoded strings are preserved.
    with pytest.warns(DeprecationWarning):
        remapped = original.set_levels(reversed_levels, remap=True)
    assert remapped.get_levels().as_list() == ["E", "D", "C", "B", "A"]
    assert list(remapped) == list(original)


def test_Factor_copy():
Expand Down

0 comments on commit e2c1c7f

Please sign in to comment.