From da2e25209d2086108d8069ade6797ae10e882550 Mon Sep 17 00:00:00 2001
From: Aaron Lun
Date: Mon, 13 Nov 2023 15:25:20 -0800
Subject: [PATCH] Added a dedicated NamedList class to mimic R's named lists.
 (#10)

This allows integer indexing, slicing, etc. while also adding dict-like
behavior with getting/setting by (string) key. It also supports multiple
instances of the same name, which is consistent with R's behavior.

This new class is supported by an additional Names class, which is
basically a StringList but disallowing Nones and having more
behind-the-scenes code for efficiently mapping a name to a positional
index.
---
 src/biocutils/NamedList.py | 274 +++++++++++++++++++++++++++++++++++++
 src/biocutils/Names.py     | 172 +++++++++++++++++++++++
 src/biocutils/__init__.py  |   2 +
 tests/test_NamedList.py    | 125 +++++++++++++++++
 tests/test_Names.py        | 109 +++++++++++++++
 5 files changed, 682 insertions(+)
 create mode 100644 src/biocutils/NamedList.py
 create mode 100644 src/biocutils/Names.py
 create mode 100644 tests/test_NamedList.py
 create mode 100644 tests/test_Names.py

diff --git a/src/biocutils/NamedList.py b/src/biocutils/NamedList.py
new file mode 100644
index 0000000..bf4bb5a
--- /dev/null
+++ b/src/biocutils/NamedList.py
@@ -0,0 +1,274 @@
+from typing import Sequence, Optional, Iterable, Union, Any
+from copy import deepcopy
+
+from .Names import Names
+from .subset_sequence import subset_sequence
+from .combine_sequences import combine_sequences
+from .assign_sequence import assign_sequence
+
+
+class NamedList(list):
+    """
+    A Python list with a name for each element, equivalent to R's named list.
+    This provides some dict-like behavior - namely, getting or setting entries
+    by an existing name, adding entries with a new name. Of course, it's still
+    a list, so it can be indexed as usual by integer positions or slices.
+    """
+
+    def __init__(self, iterable: Optional[Iterable] = None, names: Optional[Names] = None):
+        """
+        Args:
+            iterable:
+                Some iterable object. Alternatively None, for an empty list.
+
+            names:
+                List of names, of the same length as ``iterable``. If None, names
+                are copied from ``iterable`` if it is a ``NamedList``, else empty.
+        """
+        if iterable is None:
+            super().__init__()
+        else:
+            super().__init__(iterable)
+
+        if names is None:
+            if isinstance(iterable, NamedList):
+                names = iterable._names.copy()
+            else:
+                names = Names()
+        elif not isinstance(names, Names):
+            names = Names(names)
+        self._names = names
+        if len(self) != len(self._names):
+            raise ValueError("length of 'names' should equal the length of 'data'")
+
+    def __repr__(self):
+        return "NamedList(data=" + super().__repr__() + ", names=" + repr(self._names) + ")"
+
+    def __str__(self):
+        return "[" + ", ".join(repr(self._names[i]) + "=" + repr(x) for i, x in enumerate(self)) + "]"
+
+    def get_names(self) -> Names:
+        """
+        Returns:
+            Names for the list elements.
+        """
+        return self._names
+
+    @property
+    def names(self) -> Names:
+        """Alias for :py:attr:`~get_names`."""
+        return self.get_names()
+
+    def set_names(self, names: Names, in_place: bool = False) -> "NamedList":
+        """
+        Args:
+            names:
+                List of names, of the same length as this list.
+
+            in_place:
+                Whether to perform this modification in-place.
+
+        Returns:
+            A modified ``NamedList`` with the new names. If ``in_place =
+            False``, this is a new ``NamedList``, otherwise it is a reference
+            to the current ``NamedList``.
+        """
+        if not isinstance(names, Names):
+            names = Names(names)
+        if in_place:
+            if len(names) != len(self):
+                raise ValueError("length of 'names' should equal the length of 'data'")
+            self._names = names
+            return self
+        else:
+            return NamedList(self, names)
+
+    def __getitem__(self, index: Union[str, int, slice]):
+        """
+        Args:
+            index:
+                An integer index containing a position to extract, a string
+                specifying the name of the value to extract, or a slice
+                specifying multiple positions to extract.
+
+        Returns:
+            If ``index`` is an integer, the value at the specified position.
+
+            If ``index`` is a string, the value with the specified name. If
+            multiple values have the same name, the first is returned.
+
+            If ``index`` is a slice, a new ``NamedList`` is returned
+            containing the items at the specified positions.
+        """
+        if isinstance(index, str):
+            i = self._names.map(index)
+            if i < 0:
+                raise KeyError("no list element named '" + index + "'")
+            return super().__getitem__(i)
+
+        output = super().__getitem__(index)
+        if isinstance(index, slice):
+            return NamedList(output, self._names[index])
+        return output
+
+    def __setitem__(self, index: Union[int, str, slice], item: Any):
+        """
+        Args:
+            index:
+                An integer index containing a position to set, a string
+                specifying the name of the value to set, or a slice specifying
+                multiple positions to set.
+
+            item:
+                If ``index`` is an integer or string, a value to be set at the
+                corresponding position of this ``NamedList``.
+
+                If ``index`` is a slice, an iterable of the same length
+                containing values to be set at the sliced positions. If
+                ``item`` is a ``NamedList``, the names are also transferred.
+
+        Returns:
+            In the current object, the specified item(s) at ``index`` are
+            replaced with the contents of ``item``.
+
+            If ``index`` is a string that does not exist in the names, it is
+            appended to the names and ``item`` is appended to the list.
+        """
+        if isinstance(index, slice):
+            super().__setitem__(index, item)
+            if isinstance(item, type(self)):
+                self._names[index] = item._names
+        elif isinstance(index, str):
+            i = self._names.map(index)
+            if i >= 0:
+                return super().__setitem__(i, item)
+            else:
+                super().append(item)
+                self._names.append(index)
+        else:
+            super().__setitem__(index, item)
+
+    def insert(self, index: Union[int, str], item: Any):
+        """
+        Args:
+            index:
+                An integer index containing a position to insert at.
+                Alternatively, the name of the value to insert at (the first
+                occurrence of each name is used).
+
+            item:
+                Any value; its name is set to an empty string.
+
+        Returns:
+            ``item`` is inserted at ``index`` in the current object.
+        """
+        if isinstance(index, str):
+            i = self._names.map(index)
+            if i < 0:
+                raise KeyError("no list element named '" + index + "'")
+            index = i
+        super().insert(index, item)
+        self._names.insert(index, "")
+
+    def append(self, item: Any):
+        """
+        Args:
+            item:
+                Any value.
+
+        Returns:
+            ``item`` is added to the end of the current object, with its name
+            set to an empty string.
+        """
+        self._names.append("")
+        super().append(item)
+
+    def extend(self, iterable: Iterable):
+        """
+        Args:
+            iterable:
+                Some iterable object. If this is a ``NamedList``, its names are
+                used to extend the names of the current object; otherwise the
+                extended names are set to empty strings.
+
+        Returns:
+            Items in ``iterable`` are added to the end of the current object.
+        """
+        # Count what was added via the length change, as generators have no len().
+        before = len(self)
+        super().extend(iterable)
+        named = isinstance(iterable, NamedList)
+        self._names.extend(iterable._names if named else [""] * (len(self) - before))
+
+    def __add__(self, other: list) -> "NamedList":
+        """
+        Args:
+            other:
+                A list of items to be added to the right of the current object.
+
+        Returns:
+            A new ``NamedList`` containing the concatenation of the
+            current object's items and those of ``other``.
+        """
+        output = self.copy()
+        output.extend(other)
+        return output
+
+    def __iadd__(self, other: list):
+        """
+        Extend an existing ``NamedList`` with a new list.
+
+        Args:
+            other:
+                A list of items.
+
+        Returns:
+            The current object is extended with the contents of ``other``. If
+            ``other`` is a ``NamedList``, its names are used for extension;
+            otherwise the extension is performed with empty strings.
+        """
+        self.extend(other)
+        return self
+
+    def copy(self) -> "NamedList":
+        """
+        Returns:
+            A shallow copy of a ``NamedList`` with the same contents.
+        """
+        return NamedList(self, names=self._names.copy())
+
+    def __deepcopy__(self, memo=None, _nil=[]) -> "NamedList":
+        """
+        Args:
+            memo:
+                See :py:func:`~copy.deepcopy` for details.
+
+            _nil:
+                See :py:func:`~copy.deepcopy` for details.
+
+        Returns:
+            A deep copy of a ``NamedList`` with the same contents.
+        """
+        # Copy a plain list view; deepcopy(self) would re-enter this method forever.
+        return NamedList(deepcopy(list(self), memo, _nil), names=deepcopy(self._names, memo, _nil))
+
+
+@subset_sequence.register
+def _subset_sequence_NamedList(x: NamedList, indices: Sequence[int]) -> NamedList:
+    return NamedList((x[i] for i in indices), names=subset_sequence(x._names, indices))
+
+
+@combine_sequences.register
+def _combine_sequences_NamedList(*x: NamedList) -> NamedList:
+    output = x[0].copy()
+    for other in x[1:]:
+        output.extend(other)
+    return output
+
+
+@assign_sequence.register
+def _assign_sequence_NamedList(x: NamedList, indices: Sequence[int], other) -> NamedList:
+    output = assign_sequence.registry[list](x, indices, other)
+    if isinstance(other, NamedList):
+        output._names = assign_sequence(output._names, indices, other._names)
+    return output
diff --git a/src/biocutils/Names.py b/src/biocutils/Names.py
new file mode 100644
index 0000000..496bc31
--- /dev/null
+++ b/src/biocutils/Names.py
@@ -0,0 +1,172 @@
+from typing import Sequence, Optional, Iterable, Union, Any
+
+
+class Names(list):
+    """
+    List of strings containing names. Typically used to decorate sequences,
+    such that callers can get or set elements by name instead of position.
+    """
+
+    def __init__(self, iterable: Optional[Iterable] = None, coerce: bool = True):
+        """
+        Args:
+            iterable:
+                Some iterable object containing strings, or values that can
+                be coerced into strings.
+
+            coerce:
+                Whether to coerce values of ``iterable`` into strings.
+        """
+        if iterable is None:
+            super().__init__()
+        elif coerce and not isinstance(iterable, type(self)):
+            super().__init__(str(y) for y in iterable)
+        else:
+            super().__init__(iterable)
+        self._reverse = None
+
+    # Enable fast indexing by name, but only on demand. This reverse mapping
+    # field is strictly internal.
+    def _populate_reverse_index(self):
+        if self._reverse is None:
+            self._reverse = {}
+            for i, n in enumerate(self):
+                if n not in self._reverse:
+                    self._reverse[n] = i
+
+    def _wipe_reverse_index(self):
+        self._reverse = None
+
+    def __getitem__(self, index: Union[int, slice]) -> Union[str, "Names"]:
+        """
+        Args:
+            index:
+                Integer specifying the position of interest, or a slice
+                specifying multiple such positions.
+
+        Returns:
+            If ``index`` is a slice, a new ``Names`` object is returned
+            containing names from the specified positions.
+
+            If ``index`` is an integer, the name at that position is returned.
+        """
+        output = super().__getitem__(index)
+        if isinstance(index, slice):
+            return Names(output, coerce=False)
+        return output
+
+    def __setitem__(self, index: Union[int, slice], item: Any):
+        """
+        Args:
+            index:
+                Integer specifying the position of interest, or a slice
+                specifying multiple such positions.
+
+            item:
+                If ``index`` is an integer, a string containing a name.
+
+                If ``index`` is a slice, an iterable object of the appropriate
+                length, containing strings to use as replacement names.
+
+        Returns:
+            The current object is modified with the replacement names.
+        """
+        self._wipe_reverse_index()
+        if isinstance(index, slice):
+            new_it = item
+            if not isinstance(item, type(self)):
+                new_it = (str(x) for x in item)
+            super().__setitem__(index, new_it)
+        else:
+            super().__setitem__(index, str(item))
+
+    def map(self, name: str) -> int:
+        """
+        Args:
+            name: Name of interest.
+
+        Returns:
+            Index containing the position of the first occurrence of ``name``;
+            or -1, if ``name`` is not present in this object.
+        """
+        self._populate_reverse_index()
+        if name in self._reverse:
+            return self._reverse[name]
+        else:
+            return -1
+
+    def append(self, name: Any):
+        """
+        Args:
+            name: Name to be added.
+
+        Returns:
+            ``name`` is added to the current object.
+        """
+        name = str(name)
+        if self._reverse is not None and name not in self._reverse:
+            self._reverse[name] = len(self)
+        super().append(name)
+
+    def insert(self, index: int, name: str):
+        """
+        Args:
+            index: Position on the object to insert at.
+
+            name: Name to be added.
+
+        Returns:
+            ``name`` is inserted into the current object before ``index``.
+        """
+        self._wipe_reverse_index()
+        super().insert(index, str(name))
+
+    def extend(self, names: Sequence[str]):
+        """
+        Args:
+            names: Names to add to the current object.
+
+        Returns:
+            ``names`` are added to the current object.
+        """
+        if self._reverse is not None:
+            # append() records each new name at its true position, so no offset
+            # arithmetic is needed. Iterate over a snapshot so that extending a
+            # Names object with itself terminates.
+            for n in list(names):
+                self.append(n)
+        elif isinstance(names, Names):
+            super().extend(names)
+        else:
+            super().extend(str(y) for y in names)
+
+    def __add__(self, other: list):
+        """
+        Args:
+            other: List of names.
+
+        Returns:
+            A new ``Names`` containing the combined contents
+            of the current object and ``other``.
+        """
+        output = self.copy()
+        output.extend(other)
+        return output
+
+    def __iadd__(self, other: list):
+        """
+        Args:
+            other: List of names.
+
+        Returns:
+            The current object is modified by adding ``other`` to its names.
+        """
+        self.extend(other)
+        return self
+
+    def copy(self):
+        """
+        Returns:
+            A copy of the current object.
+        """
+        return Names(self, coerce=False)
diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py
index 90b6742..9eaf7ad 100644
--- a/src/biocutils/__init__.py
+++ b/src/biocutils/__init__.py
@@ -17,6 +17,8 @@
 
 from .Factor import Factor
 from .StringList import StringList
+from .Names import Names
+from .NamedList import NamedList
 
 from .factorize import factorize
 from .intersect import intersect
diff --git a/tests/test_NamedList.py b/tests/test_NamedList.py
new file mode 100644
index 0000000..3c99860
--- /dev/null
+++ b/tests/test_NamedList.py
@@ -0,0 +1,125 @@
+import biocutils
+import pytest
+from biocutils import NamedList
+
+
+def test_NamedList_basics():
+    x = NamedList([1,2,3,4], names=['a', 'b', 'c', 'd'])
+    assert isinstance(x, NamedList)
+    assert x == [ 1,2,3,4 ]
+    assert x.get_names() == ["a", "b", "c", "d"]
+
+    assert x["a"] == 1
+    assert x["b"] == 2
+    with pytest.raises(KeyError) as ex:
+        x["Aaron"]
+    assert str(ex.value).find("Aaron") >= 0
+
+    # Constructor works with other NamedList objects.
+    y = NamedList(x)
+    assert y == x
+    assert y.get_names() == ["a", "b", "c", "d"]
+
+    empty = NamedList()
+    assert empty == []
+    assert isinstance(empty, NamedList)
+    assert empty.get_names() == []
+
+    # Slicing works correctly.
+    sub = x[1:3]
+    assert isinstance(sub, NamedList)
+    assert sub == [2, 3]
+    assert sub.get_names() == ["b", "c"]
+
+    # Copying works.
+    z = x.copy()
+    z[0] = "Aaron"
+    assert z == [ "Aaron", 2, 3, 4 ]
+    assert x == [ 1, 2, 3, 4 ]
+    assert z.get_names() == [ "a", "b", "c", "d" ]
+
+
+def test_NamedList_setitem():
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x[0] = None
+    assert x == [None, 2, 3, 4]
+    assert x["A"] is None
+
+    # Replacing by name.
+    x["B"] = "FOO"
+    assert x[1] == "FOO"
+
+    # Replacing slices.
+    x[1:3] = [10, 20]
+    assert x == [None, 10, 20, 4]
+    x[1:3] = NamedList([4,5], names=["YAY", "BAR"])
+    assert x == [None, 4, 5, 4]
+    assert x.get_names() == [ "A", "YAY", "BAR", "D" ]
+
+    # Appending by name.
+    x["Aaron"] = "BAR"
+    assert x["Aaron"] == "BAR"
+
+
+def test_NamedList_mutations():
+    # Insertion:
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.insert(2, "FOO")
+    assert x == [1, 2, "FOO", 3, 4]
+    assert x.get_names() == [ "A", "B", "", "C", "D"]
+    x.insert("D", None)
+    assert x == [1, 2, "FOO", 3, None, 4]
+    assert x.get_names() == [ "A", "B", "", "C", "", "D"]
+
+    # Extension:
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.extend([None, 1, True])
+    assert x == [ 1, 2, 3, 4, None, 1, True ]
+    assert x.get_names() == [ "A", "B", "C", "D", "", "", "" ]
+    x.extend(NamedList([False, 2, None], names=[ "E", "F", "G" ]))
+    assert x == [ 1, 2, 3, 4, None, 1, True, False, 2, None ]
+    assert x.get_names() == [ "A", "B", "C", "D", "", "", "", "E", "F", "G" ]
+
+    # Appending:
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.append(1)
+    assert x == [ 1,2,3,4,1 ]
+    assert x.get_names() == [ "A", "B", "C", "D", "" ]
+
+
+def test_NamedList_addition():
+    x1 = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    summed = x1 + [5,6,7]
+    assert summed == [1, 2, 3, 4, 5, 6, 7]
+    assert summed.get_names() == [ "A", "B", "C", "D", "", "", "" ]
+
+    x2 = NamedList([5,6,7], names=["E", "F", "G"])
+    summed = x1 + x2
+    assert summed == [1, 2, 3, 4, 5, 6, 7]
+    assert summed.get_names() == ["A", "B", "C", "D", "E", "F", "G"]
+
+    x1 += x2
+    assert x1 == [1, 2, 3, 4, 5, 6, 7]
+    assert x1.get_names() == ["A", "B", "C", "D", "E", "F", "G"]
+
+
+def test_NamedList_generics():
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    sub = biocutils.subset_sequence(x, [0,3,2,1])
+    assert isinstance(sub, NamedList)
+    assert sub == [1, 4, 3, 2]
+    assert sub.get_names() == [ "A", "D", "C", "B" ]
+
+    y = ["a", "b", "c", "d"]
+    com = biocutils.combine_sequences(x, y)
+    assert isinstance(com, NamedList)
+    assert com == [1, 2, 3, 4, "a", "b", "c", "d"]
+    assert com.get_names() == [ "A", "B", "C", "D", "", "", "", "" ]
+
+    y = biocutils.assign_sequence(x, [1, 3], [ 20, 40 ])
+    assert y == [ 1, 20, 3, 40 ]
+    assert y.get_names() == [ "A", "B", "C", "D" ]
+
+    y = biocutils.assign_sequence(x, [1, 3], NamedList([ 20, 40 ], names=["b", "d" ]))
+    assert y == [ 1, 20, 3, 40 ]
+    assert y.get_names() == [ "A", "b", "C", "d" ]
diff --git a/tests/test_Names.py b/tests/test_Names.py
new file mode 100644
index 0000000..1b2b791
--- /dev/null
+++ b/tests/test_Names.py
@@ -0,0 +1,109 @@
+import biocutils
+import pytest
+from biocutils import Names
+
+
+def test_Names_basics():
+    x = Names([1,2,3,4])
+    assert isinstance(x, Names)
+    assert x == [ "1","2","3","4" ]
+
+    assert x.map("1") == 0
+    assert x.map("4") == 3
+    assert x.map("Aaron") == -1
+
+    # Constructor works with other Names objects.
+    y = Names(x)
+    assert y == x
+
+    empty = Names()
+    assert empty == []
+    assert isinstance(empty, Names)
+
+    # Slicing works correctly.
+    sub = x[1:3]
+    assert isinstance(sub, Names)
+    assert sub == ["2", "3"]
+
+    # Copying works.
+    z = x.copy()
+    z[0] = "Aaron"
+    assert z == [ "Aaron", "2", "3", "4" ]
+    assert x == [ "1", "2", "3", "4" ]
+
+
+def test_Names_setitem():
+    x = Names([1,2,3,4])
+    x[0] = None
+    assert x == ["None", "2", "3", "4"]
+    assert x.map("None") == 0
+    assert x.map("1") == -1
+
+    x[0] = 12345
+    assert x == ["12345", "2", "3", "4"]
+    assert x.map("None") == -1
+    assert x.map("12345") == 0
+
+    x[1:3] = [10, 20]
+    assert x == ["12345", "10", "20", "4"]
+
+    alt = Names([ "YAY", "FOO", "BAR", "WHEE" ])
+    x[:] = alt
+    assert x == alt
+
+
+def test_Names_mutations():
+    # Insertion:
+    x = Names([1,2,3,4])
+    assert x.map("3") == 2
+    x.insert(2, None)
+    assert x.map("1") == 0
+    assert x.map("3") == 3
+    x.insert(1, "FOO")
+    assert x.map("3") == 4
+    assert x == [ "1", "FOO", "2", "None", "3", "4" ]
+
+    # Extension:
+    x = Names([1,2,3,4])
+    x.extend([None, 1, True])
+    assert x == [ "1", "2", "3", "4", "None", "1", "True" ]
+    assert x.map("None") == 4
+    assert x.map("1") == 0
+    x.extend([False, 2, None, "FOO"])
+    assert x == [ "1", "2", "3", "4", "None", "1", "True", "False", "2", "None", "FOO" ]
+    assert x.map("None") == 4
+    assert x.map("False") == 7
+    assert x.map("2") == 1
+    assert x.map("FOO") == 10
+
+    # Appending:
+    x = Names([1,2,3,4])
+    x.append(1)
+    assert x[-1] == "1"
+    assert x.map("1") == 0
+    x.append(None)
+    assert x[-1] == "None"
+    assert x.map("None") == 5
+
+
+def test_Names_addition():
+    x1 = Names([1,2,3,4])
+    assert x1 + [5,6,7] == ["1", "2", "3", "4", "5", "6", "7"]
+
+    x2 = Names([5,6,7])
+    assert x1 + x2 == ["1", "2", "3", "4", "5", "6", "7"]
+
+    x1 += x2
+    assert x1 == ["1", "2", "3", "4", "5", "6", "7"]
+
+
+def test_Names_generics():
+    x = Names([1,2,3,4])
+    sub = biocutils.subset_sequence(x, [0,3,2,1])
+    assert isinstance(sub, Names)
+    assert sub == ["1", "4", "3", "2"]
+
+    y = ["a", "b", "c", "d"]
+    com = biocutils.combine_sequences(x, y)
+    assert isinstance(com, Names)
+    assert com == ["1", "2", "3", "4", "a", "b", "c", "d"]