diff --git a/src/biocutils/Factor.py b/src/biocutils/Factor.py index 656d3f3..3c3509c 100644 --- a/src/biocutils/Factor.py +++ b/src/biocutils/Factor.py @@ -1,18 +1,59 @@ -from copy import deepcopy +from copy import copy, deepcopy from typing import List, Sequence, Union, Optional from warnings import warn import numpy from .StringList import StringList +from .Names import Names, _name_to_position, _sanitize_names, _combine_names from .match import match from .factorize import factorize from .normalize_subscript import normalize_subscript, SubscriptTypes from .is_missing_scalar import is_missing_scalar from .print_truncated import print_truncated_list + +from .subset_sequence import subset_sequence +from .assign_sequence import assign_sequence from .combine_sequences import combine_sequences from .is_list_of_type import is_list_of_type +def _sanitize_codes(codes: Sequence[int], num_levels: int) -> numpy.ndarray: + if not isinstance(codes, numpy.ndarray): + replacement = numpy.ndarray(len(codes), dtype=numpy.min_scalar_type(-num_levels)) # get a signed type. + for i, x in enumerate(codes): + if is_missing_scalar(x) or x < 0: + replacement[i] = -1 + else: + replacement[i] = x + codes = replacement + else: + if len(codes.shape) != 1: + raise ValueError("'codes' should be a 1-dimensional array") + if not numpy.issubdtype(codes.dtype, numpy.signedinteger): # force it to be signed. + codes = codes.astype(numpy.min_scalar_type(-num_levels)) + + for x in codes: + if x < -1 or x >= num_levels: + raise ValueError("all entries of 'codes' should refer to an entry of 'levels'") + + return codes + + +def _sanitize_levels(levels: Sequence[str], check: bool = True) -> StringList: + if not isinstance(levels, StringList): + levels = StringList(levels) + if levels.get_names() is not None: + levels = levels.set_names(None) + + if check: + if any(x is None for x in levels): + raise TypeError("all entries of 'levels' should be non-missing") + if len(set(levels)) < len(levels): + raise ValueError("all entries of 'levels' should be unique") + + return levels + + class Factor: """Factor class, equivalent to R's ``factor``. @@ -21,7 +62,7 @@ class Factor: easier numerical analysis. """ - def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool = False, validate: bool = True): + def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool = False, names: Optional[Names] = None, _validate: bool = True): """Initialize a Factor object. Args: @@ -38,56 +79,66 @@ def __init__(self, codes: Sequence[int], levels: Sequence[str], ordered: bool = ordered: Whether the levels are ordered. + names: + List of names. This should have same length as ``codes``. + Alternatively None, if the factor has no names yet. + validate: Whether to validate the arguments. Internal use only. """ - if not isinstance(codes, numpy.ndarray): - replacement = numpy.ndarray(len(codes), dtype=numpy.min_scalar_type(-len(levels))) # get a signed type. - for i, x in enumerate(codes): - if is_missing_scalar(x) or x < 0: - replacement[i] = -1 - else: - replacement[i] = x - codes = replacement - else: - if len(codes.shape) != 1: - raise ValueError("'codes' should be a 1-dimensional array") - if not numpy.issubdtype(codes.dtype, numpy.signedinteger): # force it to be signed. - codes = codes.astype(numpy.min_scalar_type(-len(levels))) - - if not isinstance(levels, StringList): - levels = StringList(levels) - if levels.get_names() is not None: - levels = levels.set_names(None) + if _validate: + levels = _sanitize_levels(levels) + codes = _sanitize_codes(codes, len(levels)) + names = _sanitize_names(names, len(codes)) self._codes = codes self._levels = levels self._ordered = bool(ordered) + self._names = names - if validate: - if any(x is None for x in levels): - raise TypeError("all entries of 'levels' should be non-missing") - if len(set(levels)) < len(levels): - raise ValueError("all entries of 'levels' should be unique") - for x in codes: - if x < -1 or x >= len(self._levels): - raise ValueError("all entries of 'codes' should refer to an entry of 'levels'") + ################################## + #####>>>> Simple getters <<<<##### + ################################## + def _define_output(self, in_place: bool) -> "Factor": + if in_place: + return self + else: + return copy(self) def get_codes(self) -> numpy.ndarray: """ Returns: Array of integer codes, used as indices into the levels from - :py:attr:`~get_levels`. A masked array may also be returned if - any of the entries are missing. + :py:attr:`~get_levels`. Missing values are marked with -1. """ return self._codes @property def codes(self) -> numpy.ndarray: - """See :py:attr:`~get_codes`.""" + """Alias for :py:attr:`~get_codes`.""" return self.get_codes() + def set_codes(self, codes: Sequence[int], in_place: bool = False) -> "Factor": + """ + Args: + codes: + Integer codes referencing the factor levels. This should + have the same length as the current object. + + in_place: + Whether to modify this object in-place. + + Returns: + A modified ``Factor`` object with the new codes, either as a + new object or as a reference to the current object. + """ + output = self._define_output(in_place) + if len(codes) != len(self): + raise ValueError("length of 'codes' should be equal to that of the current object") + output._codes = _sanitize_codes(codes, len(self._levels)) + return output + def get_levels(self) -> StringList: """ Returns: @@ -97,7 +148,7 @@ def get_levels(self) -> StringList: @property def levels(self) -> StringList: - """See :py:attr:`~get_levels`.""" + """Alias for :py:attr:`~get_levels`.""" return self.get_levels() def get_ordered(self) -> bool: @@ -109,9 +160,59 @@ def get_ordered(self) -> bool: @property def ordered(self) -> bool: - """See :py:attr:`~get_ordered`.""" + """Alias for :py:attr:`~get_ordered`.""" return self.get_ordered() + def set_ordered(self, ordered: bool, in_place: bool = False) -> "Factor": + """ + Args: + ordered: + Whether to treat the levels as being ordered. + + in_place: + Whether to modify this object in-place. + + Returns: + A modified ``Factor`` object with the new ordered status, either as + a new object or as a reference to the current object. + """ + output = self._define_output(in_place) + output._ordered = bool(ordered) + return output + + def get_names(self) -> Names: + """ + Returns: + Names for the factor elements. + """ + return self._names + + @property + def names(self) -> Names: + """Alias for :py:attr:`~get_names`.""" + return self.get_names() + + def set_names(self, names: Optional[Names], in_place: bool = False) -> "NamedList": + """ + Args: + names: + List of names, of the same length as this list. + + in_place: + Whether to perform this modification in-place. + + Returns: + A modified ``Factor`` with the new names, either as a new object or + as a reference to the current object. + """ + output = self._define_output(in_place) + output._names = _sanitize_names(names, len(self)) + return output + + ################################# + #####>>>> Miscellaneous <<<<##### + ################################# + def __len__(self) -> int: """ Returns: @@ -127,6 +228,8 @@ def __repr__(self) -> str: tmp = "Factor(codes=" + print_truncated_list(self._codes) + ", levels=" + print_truncated_list(self._levels) if self._ordered: tmp += ", ordered=True" + if self._names: + tmp += ", names=" + print_truncated_list(self._names) tmp += ")" return tmp @@ -139,40 +242,110 @@ def __str__(self) -> str: if len(self._levels) != 0: message += "s" message += "\n" - message += "values: " + print_truncated_list(self._codes, transform=lambda i: self._levels[i]) + "\n" - message += "levels: " + print_truncated_list(self._levels, transform=lambda x: x) + "\n" + message += "values: " + print_truncated_list(self._codes, transform=lambda i: self._levels[i], include_brackets=False) + "\n" + if self._names is not None: + message += "names: " + print_truncated_list(self._names, transform=lambda x: x, include_brackets=False) + "\n" + message += "levels: " + print_truncated_list(self._levels, transform=lambda x: x, include_brackets=False) + "\n" message += "ordered: " + str(self._ordered) return message - def __getitem__(self, sub: SubscriptTypes) -> Union[str, "Factor"]: - """Subset the ``Factor`` to the specified subset of indices. + ########################### + #####>>>> Slicing <<<<##### + ########################### + def get_value(self, index: Union[str, int]) -> Union[str, None]: + """ Args: - sub: - Sequence of integers or booleans specifying the elements of - interest. Alternatively, an integer/boolean scalar specifying a - single element. + index: + Integer index of the element to obtain. Alternatively, a string + containing the name of the element, using the first occurrence + if duplicate names are present. Returns: - If ``sub`` is a sequence, returns same type as caller (a new - ``Factor``) containing only the elements of interest from ``sub``. + The factor level for the code at the specified position, or None if + the entry is missing. + """ + if isinstance(index, str): + index = _name_to_position(self._names, index) + i = self._codes[index] + if i < 0: + return None + return self._levels[i] + + def get_slice(self, index: SubscriptTypes) -> "Factor": + """ + Args: + index: + Subset of elements to obtain, see + :py:func:`~biocutils.normalize_subscript.normalize_subscript` + for details. Strings are matched to names in the current + object, using the first occurrence if duplicate names are + present. Scalars are treated as length-1 sequences. - If ``sub`` is a scalar, a string is returned containing the - level corresponding to the code at position ``sub``. This may - also be None if the code is missing. + Returns: + A ``Factor`` is returned containing the specified subset. + """ + index, scalar = normalize_subscript(index, len(self), self._names) + output = copy(self) + output._codes = self._codes[index] + if output._names is not None: + output._names = subset_sequence(self._names, index) + return output + + def __getitem__(self, index: SubscriptTypes) -> Union[str, "Factor"]: """ - sub, scalar = normalize_subscript(sub, len(self), None) + If ``index`` is a scalar, this is an alias for :py:attr:`~get_value`. + + If ``index`` is a sequence, this is an alias for :py:attr:`~get_slice`. + """ + index, scalar = normalize_subscript(index, len(self), self._names) if scalar: - x = self._codes[sub[0]] - if x >= 0: - return self._levels[x] - else: - return None - return type(self)(self._codes[sub], self._levels, self._ordered, validate=False) + return self.get_value(index[0]) + else: + return self.get_slice(index) - def replace(self, sub: SubscriptTypes, value: Union[str, "Factor"], in_place: bool = False): + def set_value(self, index: Union[str, int], value: Union[str, None], in_place: bool = False) -> "Factor": """ - Replace items in the ``Factor`` list. The ``subs`` elements in the + Args: + index: + Integer index of the element to replace. Alternatively, a string + containing the name of the element, using the first occurrence + if duplicate names are present. + + value: + Replacement value. This should be a string corresponding to a + factor level, or None if missing. + + in_place: + Whether to perform the modification in place. + + Returns: + A ``Factor`` object with the modified entry at ``index``. This is either + a new object or a reference to the current object. + """ + if in_place: + output = self + else: + output = copy(self) + output._codes = copy(self._codes) + + if isinstance(index, str): + index = _name_to_position(self._names, index) + + if value is None: + output._codes[index] = -1 + return output + + for i, l in enumerate(output._levels): + if l == value: + output._codes[index] = i + return output + + raise IndexError("failed to find level '" + str(value) + "'") + + def set_slice(self, index: SubscriptTypes, value: "Factor", in_place: bool = False): + """ + Replace items in the ``Factor`` list. The ``index`` elements in the current object are replaced with the corresponding values in ``value``. This is performed by finding the level for each entry of the replacement ``value``, matching it to a level in the current object, @@ -180,51 +353,62 @@ def replace(self, sub: SubscriptTypes, value: Union[str, "Factor"], in_place: bo level. If there is no matching level, a missing value is inserted. Args: - sub: - Sequence of integers or booleans specifying the items to be - replaced. + index: + Subset of elements to replace, see + :py:func:`~biocutils.normalize_subscript.normalize_subscript` + for details. Strings are matched to names in the current + object, using the first occurrence if duplicate names are + present. Scalars are treated as length-1 sequences. value: - If ``sub`` is a sequence, a ``Factor`` of the same length - containing the replacement values. + A ``Factor`` of the same length containing the replacement values. in_place: - Whether the replacement should be performed on the current - object. + Whether the replacement should be performed in place. Returns: - If ``in_place = False``, a new ``Factor`` is returned containing the - contents of the current object after replacement by ``value``. - - If ``in_place = True``, the current object is returned after its - items have been replaced. + A ``Factor`` object with values at ``index`` replaced by ``value``. + This is either a new object or a reference to the current object, + depending on ``in_place``. """ - sub, scalar = normalize_subscript(sub, len(self), None) - codes = self._codes - if not in_place: - codes = codes.copy() + if in_place: + output = self + else: + output = copy(self) + output._codes = copy(self._codes) + new_codes = output._codes + + index, scalar = normalize_subscript(index, len(self), self._names) if self._levels == value._levels: - for i, x in enumerate(sub): - codes[x] = value._codes[i] + for i, x in enumerate(index): + new_codes[x] = value._codes[i] else: mapping = match(value._levels, self._levels) - for i, x in enumerate(sub): + for i, x in enumerate(index): v = value._codes[i] if v >= 0: - codes[x] = mapping[v] + new_codes[x] = mapping[v] else: - codes[x] = -1 + new_codes[x] = -1 - if in_place: - self._codes = codes - return self + return output + + def __setitem__(self, index: SubscriptTypes, value: Union[str, "Factor"]): + """ + If ``index`` is a scalar, this is an alias for :py:attr:`~set_value`. + + If ``index`` is a sequence, this is an alias for :py:attr:`~set_slice`. + """ + index, scalar = normalize_subscript(index, len(self), self._names) + if scalar: + self.set_value(index, value, in_place=True) else: - return type(self)(codes, self._levels, self._ordered, validate=False) + self.set_slice(index, value, in_place=True) - def __setitem__(self, args: Sequence[int], value: "Factor"): - """See :py:attr:`~replace` for details.""" - return self.replace(args, value, in_place=True) + ################################# + #####>>>> Level setting <<<<##### + ################################# def drop_unused_levels(self, in_place: bool = False) -> "Factor": """Drop unused levels. @@ -240,9 +424,10 @@ def drop_unused_levels(self, in_place: bool = False) -> "Factor": current object; a reference to the current object is returned. """ if in_place: - new_codes = self._codes + output = self else: - new_codes = self._codes.copy() + output = copy(self) + output._codes = copy(self._codes) in_use = [False] * len(self._levels) for x in self._codes: @@ -256,17 +441,13 @@ def drop_unused_levels(self, in_place: bool = False) -> "Factor": reindex[i] = len(new_levels) new_levels.append(self._levels[i]) + new_codes = output._codes for i, x in enumerate(self._codes): if x >= 0: new_codes[i] = reindex[x] - if in_place: - self._codes = new_codes - self._levels = new_levels - return self - else: - current_class_const = type(self) - return current_class_const(new_codes, new_levels, self._ordered, validate=False) + output._levels = new_levels + return output def set_levels(self, levels: Union[str, Sequence[str]], in_place: bool = False) -> "Factor": """Set or replace levels. @@ -294,9 +475,15 @@ def set_levels(self, levels: Union[str, Sequence[str]], in_place: bool = False) If ``in_place = True``, the levels are replaced in the current object, and a reference to the current object is returned. """ + if in_place: + output = self + else: + output = copy(self) + output._codes = copy(self._codes) + lmapping = {} if isinstance(levels, str): - new_levels = [levels] + new_levels = StringList([levels]) for x in self._levels: if x == levels: lmapping[x] = 0 @@ -323,51 +510,50 @@ def set_levels(self, levels: Union[str, Sequence[str]], in_place: bool = False) if x in lmapping: mapping[i] = lmapping[x] - if in_place: - new_codes = self._codes - else: - new_codes = self._codes.copy() + new_codes = output._codes for i, x in enumerate(new_codes): if x >= 0: new_codes[i] = mapping[x] else: new_codes[i] = -1 - if in_place: - self._codes = new_codes - self._levels = new_levels - return self - else: - current_class_const = type(self) - return current_class_const(new_codes, new_levels, self._ordered, validate=False) + output._levels = new_levels + return output - @levels.setter - def levels(self, levels: Union[str, List[str]]): - """See :py:attr:`~set_levels`.""" - warn("Setting property 'levels'is an in-place operation, use 'set_levels' instead", UserWarning) - self.set_levels(levels, in_place=True) + ########################### + #####>>>> Copying <<<<##### + ########################### def __copy__(self) -> "Factor": """ Returns: A shallow copy of the ``Factor`` object. """ - current_class_const = type(self) - return current_class_const(self._codes, self._levels, self._ordered, validate=False) + return type(self)( + self._codes, + levels=self._levels, + ordered=self._ordered, + names=self._names, + _validate=False, + ) def __deepcopy__(self, memo) -> "Factor": """ Returns: A deep copy of the ``Factor`` object. """ - current_class_const = type(self) - return current_class_const( + return type(self)( deepcopy(self._codes, memo), - deepcopy(self._levels, memo), - self._ordered, - validate=False, + levels=deepcopy(self._levels, memo), + ordered=self._ordered, + names=deepcopy(self._names, memo), + _validate=False, ) + ############################# + #####>>>> Coercions <<<<##### + ############################# + def to_pandas(self): """Coerce to :py:class:`~pandas.Categorical` object. @@ -410,6 +596,16 @@ def from_sequence(x: Sequence[str], levels: Optional[Sequence[str]] = None, sort return Factor(indices, levels=levels, ordered=ordered) +@subset_sequence.register +def _subset_sequence_Factor(x: Factor, indices: Sequence[int]) -> Factor: + return x.get_slice(indices) + + +@assign_sequence.register +def _assign_sequence_Factor(x: Factor, indices: Sequence[int], other: Factor) -> Factor: + return x.set_slice(indices, other) + + @combine_sequences.register(Factor) def _combine_factors(*x: Factor): if not is_list_of_type(x, Factor): @@ -432,7 +628,7 @@ def _combine_factors(*x: Factor): new_ordered = first._ordered else: all_levels_map = {} - new_levels = [] + new_levels = StringList() for f in x: mapping = [] for i, y in enumerate(f._levels): @@ -450,4 +646,10 @@ def _combine_factors(*x: Factor): new_codes.append(curout) new_ordered = False - return Factor(combine_sequences(*new_codes), new_levels, new_ordered, validate=False) + return type(x[0])( + codes=combine_sequences(*new_codes), + levels=new_levels, + ordered=new_ordered, + names=_combine_names(*x, get_names=lambda x : x.get_names()), + _validate=False, + ) diff --git a/src/biocutils/NamedList.py b/src/biocutils/NamedList.py index 2c79537..2c9e3f6 100644 --- a/src/biocutils/NamedList.py +++ b/src/biocutils/NamedList.py @@ -1,32 +1,13 @@ from typing import Sequence, Optional, Iterable, Union, Any, Dict from copy import deepcopy -from .Names import Names +from .Names import Names, _name_to_position, _sanitize_names from .normalize_subscript import normalize_subscript, SubscriptTypes from .subset_sequence import subset_sequence from .combine_sequences import combine_sequences from .assign_sequence import assign_sequence -def _name_to_position(names: Optional[Names], index: str) -> int: - i = -1 - if names is not None: - i = names.map(index) - if i < 0: - raise KeyError("failed to find entry with name '" + index + "'") - return i - - -def _sanitize_names(names: Optional[Names], length: int) -> Union[None, Names]: - if names is None: - return names - if not isinstance(names, Names): - names = Names(names) - if len(names) != length: - raise ValueError("length of 'names' must be equal to number of entries (" + str(length) + ")") - return names - - class NamedList: """ A list-like object that could have names for each element, equivalent to R's @@ -44,7 +25,7 @@ def __init__(self, data: Optional[Iterable] = None, names: Optional[Names] = Non names: List of names. This should have same length as ``data``. - Alternatively None, if the list has no valid names yet. + Alternatively None, if the list has no names yet. _validate: Internal use only. diff --git a/src/biocutils/Names.py b/src/biocutils/Names.py index 804bed6..b6649c5 100644 --- a/src/biocutils/Names.py +++ b/src/biocutils/Names.py @@ -1,4 +1,4 @@ -from typing import Sequence, Optional, Iterable, Union, Any +from typing import Sequence, Optional, Iterable, Union, Any, Callable class Names(list): @@ -176,3 +176,45 @@ def copy(self): A copy of the current object. """ return Names(self, coerce=False) + + +def _name_to_position(names: Optional[Names], index: str) -> int: + i = -1 + if names is not None: + i = names.map(index) + if i < 0: + raise KeyError("failed to find entry with name '" + index + "'") + return i + + +def _sanitize_names(names: Optional[Names], length: int) -> Union[None, Names]: + if names is None: + return names + if not isinstance(names, Names): + names = Names(names) + if len(names) != length: + raise ValueError("length of 'names' must be equal to number of entries (" + str(length) + ")") + return names + + +def _combine_names(*x: Any, get_names: Callable) -> Union[Names, None]: + all_names = [] + has_names = False + for y in x: + n = get_names(y) + if n is None: + all_names.append(len(y)) + else: + has_names = True + all_names.append(n) + + if not has_names: + return None + else: + output = Names() + for i, n in enumerate(all_names): + if not isinstance(n, Names): + output.extend([""] * n) + else: + output.extend(n) + return output diff --git a/tests/test_Factor.py b/tests/test_Factor.py index bf8421d..d2aedc4 100644 --- a/tests/test_Factor.py +++ b/tests/test_Factor.py @@ -1,10 +1,10 @@ -from biocutils import Factor, combine, StringList +from biocutils import Factor, combine, StringList, subset_sequence, assign_sequence import pytest import copy import numpy -def test_Factor_basics(): +def test_Factor_init(): f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) assert len(f) == 6 assert list(f) == ["A", "B", "C", "A", "C", "E"] @@ -35,6 +35,10 @@ def test_Factor_basics(): Factor([0, 1], ["A", "B", "A"]) assert str(ex.value).find("should be unique") >= 0 + # Works with names. + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"], names=["1", "2", "3", "4", "5", "6"]) + assert f.get_names() == ["1", "2", "3", "4", "5", "6"] + def test_Factor_print(): f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) @@ -58,6 +62,35 @@ def test_Factor_print(): assert str(f).startswith("Factor of length") +def test_Factor_get_value(): + f = Factor([0, 1, 2, -1, 2, 4], levels=["A", "B", "C", "D", "E"]) + assert f.get_value(0) == "A" + assert f.get_value(2) == "C" + assert f.get_value(3) == None + + f.set_names(["1", "2", "3", "4", "5", "6"], in_place=True) + assert f.get_value("1") == "A" + assert f.get_value("2") == "B" + + +def test_Factor_get_slice(): + f = Factor([0, 1, 2, -1, 2, 4], levels=["A", "B", "C", "D", "E"]) + + sub = f.get_slice([0, 1]) + assert list(sub) == ["A", "B"] + assert sub.get_levels() == f.get_levels() + + sub = f.get_slice([True, False] * 3) + assert list(sub) == ["A", "C", "C"] + assert sub.get_levels() == f.get_levels() + + f.set_names(["1", "2", "3", "4", "5", "6"], in_place=True) + sub = f.get_slice(["4", "3", "2", "1"]) + assert list(sub) == [None, "C", "B", "A"] + assert sub.get_levels() == f.get_levels() + assert sub.get_names() == [ "4", "3", "2", "1" ] + + def test_Factor_getitem(): f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) assert f[0] == "A" @@ -77,25 +110,53 @@ def test_Factor_getitem(): assert f2.get_levels() == f.get_levels() -def test_Factor_setitem(): +def test_Factor_set_value(): + f = Factor([0, 1, 2, -1, 2, 4], levels=["A", "B", "C", "D", "E"]) + y = f.set_value(3, "D") + assert y.get_value(3) == "D" + + f.set_names(["1", "2", "3", "4", "5", "6"], in_place=True) + y = f.set_value("4", None) + assert f.get_value(3) == None + assert f.get_value("4") == None + + +def test_Factor_set_slice(): f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) f2 = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) - f[0:2] = f2[2:4] - assert list(f.get_codes()) == [2, 3, 2, 3, 2, 1] - assert f.get_levels().get_data() == ["A", "B", "C", "D", "E"] + y = f.set_slice(slice(2), f2[2:4]) + assert list(y.get_codes()) == [2, 3, 2, 3, 2, 1] + assert y.get_levels() == f.get_levels() - f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) f2 = Factor([0, 1, 2, 3, 2, 1], levels=["E", "D", "C", "B", "A"]) - f[[-3, -2, -1]] = f2[0:3] - assert list(f.get_codes()) == [0, 1, 2, 4, 3, 2] - assert f.get_levels().get_data() == ["A", "B", "C", "D", "E"] + y = f.set_slice([-3, -2, -1], f2[0:3]) + assert list(y.get_codes()) == [0, 1, 2, 4, 3, 2] + assert y.get_levels() == f.get_levels() - f = Factor([0, 1, 2, 3, 2, 1], levels=["A", "B", "C", "D", "E"]) f2 = Factor([0, 1, 2, 3, 2, 1], levels=["e", "d", "c", "b", "a"]) - f[:] = f2[:] - assert list(f.get_codes()) == [-1] * 6 - assert f.get_levels().get_data() == ["A", "B", "C", "D", "E"] + y = f.set_slice(range(6), f2) + assert list(y.get_codes()) == [-1] * 6 + assert y.get_levels() == f.get_levels() + + # Now throwing in some names. + f.set_names(["alpha", "bravo", "charlie", "delta", "echo", "foxtrot"], in_place=True) + y = f.set_slice(["bravo", "charlie", "delta"], f[3:6]) + assert list(y.get_codes()) == [ 0, 3, 2, 1, 2, 1 ] + assert y.get_levels() == f.get_levels() + assert y.get_names() == f.get_names() + + +def test_Factor_setitem(): + f = Factor([0, 1, 2, 0, 2, 4], levels=["A", "B", "C", "D", "E"]) + f[0] = "B" + f[2] = "A" + f[-1] = "D" + assert list(f.get_codes()) == [1, 1, 0, 0, 2, 3] + + f[2:5] = Factor([4, 3, 1], levels=["A", "B", "C", "D", "E"]) + assert list(f.get_codes()) == [1, 1, 4, 3, 1, 3] + assert f.get_levels() == f.get_levels() def test_Factor_drop_unused_levels(): @@ -151,6 +212,21 @@ def test_Factor_copy(): assert (f.get_codes() == out.get_codes()).all() assert f.get_levels() == out.get_levels() + f.set_names(["alpha", "bravo", "charlie", "delta", "echo", "foxtrot"], in_place=True) + out = copy.copy(f) + assert f.get_names() == out.get_names() + + +def test_Factor_generics(): + f = Factor([0,1,2,3,4], levels=["A", "B", "C", "D", "E"]) + sub = subset_sequence(f, range(2, 4)) + assert list(sub._codes) == [2, 3] + assert sub.get_levels() == f.get_levels() + + ass = assign_sequence(f, range(2, 4), f[1:3]) + assert list(ass._codes) == [0, 1, 1, 2, 4] + assert ass.get_levels() == f.get_levels() + def test_Factor_combine(): # Same levels. @@ -181,6 +257,13 @@ def test_Factor_combine(): out = combine(f1, f2) assert not out.get_ordered() + # Checking that names are correctly combined. + print(f1) + named = f2.set_names(["alpha", "bravo", "charlie"]) + out = combine(f1, named) + print(out) + assert out.get_names() == ["", "", "", "", "", "alpha", "bravo", "charlie"] + def test_Factor_pandas(): import pandas as pd