From e2c1c7fcdae2c13319319b19ad551d1ac02f5d05 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sun, 15 Dec 2024 23:33:36 -0800 Subject: [PATCH] Added new Factor method to replace levels without remapping codes. The new replace_levels() method is much simpler and more consistent with the levels()<- method in R. The existing set_levels() functionality is moved to the new remap_levels() to avoid confusion; set_levels() itself is now an alias to either remap_levels() or replace_levels(), depending on the remap= argument. The default remap=True has a deprecation warning as I hope to change it to False, such that set_levels() is eventually the same as replace_levels(). --- src/biocutils/Factor.py | 81 +++++++++++++++++++++++++++++++++++++++-- tests/test_Factor.py | 61 ++++++++++++++++++++++++------- 2 files changed, 124 insertions(+), 18 deletions(-) diff --git a/src/biocutils/Factor.py b/src/biocutils/Factor.py index cdfa58f..374b34e 100644 --- a/src/biocutils/Factor.py +++ b/src/biocutils/Factor.py @@ -2,6 +2,7 @@ from typing import Optional, Sequence, Union import numpy +import warnings from .assign_sequence import assign_sequence from .combine_sequences import combine_sequences @@ -198,7 +199,7 @@ def get_levels(self) -> StringList: List of strings containing the factor levels. This should be treated as a read-only reference. To modify the - levels, use :py:meth:`~set_levels` instead. + levels, use :py:meth:`~replace_levels` instead. """ return self._levels @@ -562,10 +563,82 @@ def drop_unused_levels(self, in_place: bool = False) -> "Factor": output._levels = new_levels return output + def replace_levels( + self, + levels: Sequence[str], + in_place: bool = False, + ) -> "Factor": + """Replace the existing levels with a new list. The codes of the + returned ``Factor`` are unchanged by this method and will index into + the replacement ``levels``, so each element of the ``Factor`` may refer + to a different string after the levels are replaced. 
(To change the + levels while ensuring that each element of the ``Factor`` refers to the + same string, use :py:meth:`~remap_levels` instead.) + + Args: + levels: + A sequence of replacement levels. These should be unique + strings with no missing values. The length of this sequence + should be no less than the current number of levels. + + in_place: + Whether to perform this modification in-place. + + Returns: + If ``in_place = False``, returns same type as caller (a new + ``Factor`` object) where the levels have been replaced. Codes + are unchanged and may refer to different strings. + + If ``in_place = True``, the levels are replaced in the current + object, and a reference to the current object is returned. + """ + new_levels = levels + if not isinstance(new_levels, StringList): + new_levels = StringList(levels) + if len(new_levels) < len(self._levels): + raise ValueError("'levels' should be at least as long as the existing levels") + + present = set() + for x in new_levels: + if x is None: + raise ValueError("all entries of 'levels' should be non-missing") + if x in present: + raise ValueError("all entries of 'levels' should be unique") + present.add(x) + + if in_place: + output = self + else: + output = copy(self) + + output._levels = new_levels + return output + def set_levels( + self, + levels: Union[str, Sequence[str]], + remap: bool = True, + in_place: bool = False + ) -> "Factor": + """ + Alias for :py:meth:`~remap_levels` if ``remap = True``, otherwise an + alias for :py:meth:`~replace_levels`. The first alias is deprecated and + :py:meth:`~remap_levels` should be used directly if that is the intent. 
+ """ + if remap: + warnings.warn("'remap=True' is deprecated, use 'remap_levels()' instead", category=DeprecationWarning) + return self.remap_levels(levels, in_place=in_place) + else: + return self.replace_levels(levels, in_place=in_place) + + def remap_levels( self, levels: Union[str, Sequence[str]], in_place: bool = False ) -> "Factor": - """Set or replace levels. + """Remap codes to a replacement list of levels. Each entry of the + remapped ``Factor`` will refer to the same string across the old and + new levels, provided that string is present in both sets of levels. + (To change the levels without altering the codes of the ``Factor``, use + :py:meth:`~replace_levels` instead.) Args: levels: @@ -585,7 +658,7 @@ def set_levels( ``Factor`` object) where the levels have been replaced. This will automatically update the codes so that they still refer to the same string in the new ``levels``. If a code refers to a level that is - not present in the new ``levels``, it is replaced with None. + not present in the new ``levels``, it is set to a missing value. If ``in_place = True``, the levels are replaced in the current object, and a reference to the current object is returned. 
@@ -615,7 +688,7 @@ def set_levels( new_levels = StringList(levels) for i, x in enumerate(new_levels): if x is None: - raise TypeError("all entries of 'levels' should be non-missing") + raise ValueError("all entries of 'levels' should be non-missing") if x in lmapping: raise ValueError("all entries of 'levels' should be unique") lmapping[x] = i diff --git a/tests/test_Factor.py b/tests/test_Factor.py index 7484a23..fdcb004 100644 --- a/tests/test_Factor.py +++ b/tests/test_Factor.py @@ -52,7 +52,7 @@ def test_factor_comparisons(): f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) assert f == f assert f != [] - f2 = f.set_levels(["E", "C", "D", "B", "A"]) + f2 = f.replace_levels(["E", "C", "D", "B", "A"]) assert f != f2 f2 = f.set_ordered(True) assert f != f2 @@ -193,34 +193,67 @@ def test_Factor_drop_unused_levels(): assert list(f2) == ["D", "E", "C", "D", "C", "E"] -def test_Factor_set_levels(): +def test_Factor_replace_levels(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f2 = f.replace_levels(["E", "D", "C", "B", "A"]) + assert f2.get_levels().as_list() == ["E", "D", "C", "B", "A"] + assert (f2.get_codes() == f.get_codes()).all() + assert list(f2) != list(f) + + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f2 = f.replace_levels(["G", "F", "E", "D", "C", "B", "A"], in_place=True) + assert f2.get_levels().as_list() == ["G", "F", "E", "D", "C", "B", "A"] + assert (f2.get_codes() == f.get_codes()).all() + + with pytest.raises(ValueError, match="at least as long") as ex: + f.replace_levels(["F"]) + + with pytest.raises(ValueError, match="non-missing") as ex: + f.replace_levels([None, "A"] * 10) + assert str(ex.value).find("non-missing") >= 0 + + with pytest.raises(ValueError, match="should be unique") as ex: + f.replace_levels(["A"] * 10) + + +def test_Factor_remap_levels(): f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) - f2 = f.set_levels(["E", "D", "C", "B", "A"]) + f2 = 
f.remap_levels(["E", "D", "C", "B", "A"]) assert f2.get_levels().as_list() == ["E", "D", "C", "B", "A"] assert list(f2.get_codes()) == [4, 3, 2, 4, 2, 0] assert list(f2) == list(f) f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) - f2 = f.set_levels(["E", "C", "A"], in_place=True) + f2 = f.remap_levels(["E", "C", "A"], in_place=True) assert f2.get_levels().as_list() == ["E", "C", "A"] assert list(f2.get_codes()) == [2, -1, 1, 2, 1, 0] f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) - f2 = f.set_levels("E") # reorders + f2 = f.remap_levels("E") # reorders assert f2.get_levels().as_list() == ["E", "A", "B", "C", "D"] assert list(f2.get_codes()) == [1, 2, 3, 1, 3, 0] - with pytest.raises(ValueError) as ex: - f.set_levels("F") - assert str(ex.value).find("should already be present") >= 0 + with pytest.raises(ValueError, match="should already be present"): + f.remap_levels("F") - with pytest.raises(TypeError) as ex: - f.set_levels([None, "A"]) - assert str(ex.value).find("non-missing") >= 0 + with pytest.raises(ValueError, match="non-missing") as ex: + f.remap_levels([None, "A"]) - with pytest.raises(ValueError) as ex: - f.set_levels(["A", "A"]) - assert str(ex.value).find("should be unique") >= 0 + with pytest.raises(ValueError, match="should be unique") as ex: + f.remap_levels(["A", "A"]) + + +def test_Factor_set_levels(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + + f2 = f.set_levels(["E", "D", "C", "B", "A"], remap=False) + assert f2.get_levels().as_list() == ["E", "D", "C", "B", "A"] + assert (f2.get_codes() == f.get_codes()).all() + + with pytest.warns(DeprecationWarning) as ex: + f2 = f.set_levels(["E", "D", "C", "B", "A"], remap=True) + assert f2.get_levels().as_list() == ["E", "D", "C", "B", "A"] + assert list(f2) == list(f) def test_Factor_copy():