diff --git a/src/biocutils/NamedList.py b/src/biocutils/NamedList.py
new file mode 100644
index 0000000..bf4bb5a
--- /dev/null
+++ b/src/biocutils/NamedList.py
@@ -0,0 +1,314 @@
+from typing import Sequence, Optional, Iterable, Union, Any
+from copy import deepcopy
+
+from .Names import Names
+from .subset_sequence import subset_sequence
+from .combine_sequences import combine_sequences
+from .assign_sequence import assign_sequence
+
+
+class NamedList(list):
+    """
+    A Python list with a name for each element, equivalent to R's named list.
+    This provides some dict-like behavior - namely, getting or setting entries
+    by an existing name, adding entries with a new name. Of course, it's still
+    a list, so it can be indexed as usual by integer positions or slices.
+    """
+
+    def __init__(self, iterable: Optional[Iterable] = None, names: Optional[Names] = None):
+        """
+        Args:
+            iterable:
+                Some iterable object. Alternatively None, for an empty list.
+
+            names:
+                List of names. This should have same length as ``iterable``.
+                If None, the names are copied from ``iterable`` when it is a
+                ``NamedList``; otherwise they default to an empty list.
+
+        Raises:
+            ValueError: If the number of names differs from the number of
+                items.
+        """
+        if iterable is None:
+            super().__init__()
+        else:
+            super().__init__(iterable)
+
+        if names is None:
+            if isinstance(iterable, NamedList):
+                # Copy, so that mutating one list cannot desynchronise the
+                # names of the other.
+                names = iterable._names.copy()
+            else:
+                names = Names()
+        elif not isinstance(names, Names):
+            names = Names(names)
+        self._names = names
+        if len(self) != len(self._names):
+            raise ValueError("length of 'names' should equal the length of 'data'")
+
+    def __repr__(self):
+        return "NamedList(data=" + super().__repr__() + ", names=" + repr(self._names) + ")"
+
+    def __str__(self):
+        return "[" + ", ".join(repr(self._names[i]) + "=" + repr(x) for i, x in enumerate(self)) + "]"
+
+    def get_names(self) -> Names:
+        """
+        Returns:
+            Names for the list elements.
+        """
+        return self._names
+
+    @property
+    def names(self) -> Names:
+        """Alias for :py:meth:`~get_names`."""
+        return self.get_names()
+
+    def set_names(self, names: Names, in_place: bool = False) -> "NamedList":
+        """
+        Args:
+            names:
+                List of names, of the same length as this list.
+
+            in_place:
+                Whether to perform this modification in-place.
+
+        Returns:
+            A modified ``NamedList`` with the new names. If ``in_place =
+            False``, this is a new ``NamedList``, otherwise it is a reference
+            to the current ``NamedList``.
+
+        Raises:
+            ValueError: If ``names`` is not of the same length as this list.
+        """
+        # Coerce anything that is not already a Names, e.g., a list of strings.
+        if not isinstance(names, Names):
+            names = Names(names)
+        if in_place:
+            if len(names) != len(self):
+                raise ValueError("length of 'names' should equal the length of 'data'")
+            self._names = names
+            return self
+        else:
+            return NamedList(self, names)
+
+    def __getitem__(self, index: Union[str, int, slice]):
+        """
+        Args:
+            index:
+                An integer index containing a position to extract, a string
+                specifying the name of the value to extract, or a slice
+                specifying multiple positions to extract.
+
+        Returns:
+            If ``index`` is an integer, the value at the specified position.
+
+            If ``index`` is a string, the value with the specified name. If
+            multiple values have the same name, the first is returned.
+
+            If ``index`` is a slice, a new ``NamedList`` is returned
+            containing the items at the specified positions.
+        """
+        if isinstance(index, str):
+            i = self._names.map(index)
+            if i < 0:
+                raise KeyError("no list element named '" + index + "'")
+            return super().__getitem__(i)
+
+        output = super().__getitem__(index)
+        if isinstance(index, slice):
+            return NamedList(output, self._names[index])
+        return output
+
+    def __setitem__(self, index: Union[int, str, slice], item: Any):
+        """
+        Args:
+            index:
+                An integer index containing a position to set, a string
+                specifying the name of the value to set, or a slice specifying
+                multiple positions to set.
+
+            item:
+                If ``index`` is an integer or string, a value to be set at the
+                corresponding position of this ``NamedList``.
+
+                If ``index`` is a slice, an iterable of the same length
+                containing values to be set at the sliced positions. If
+                ``item`` is a ``NamedList``, the names are also transferred.
+                If ``item`` is of a different length, the replaced positions
+                are named with empty strings.
+
+        Returns:
+            In the current object, the specified item(s) at ``index`` are
+            replaced with the contents of ``item``.
+
+            If ``index`` is a string that does not exist in the names, it is
+            appended to the names and ``item`` is appended to the list.
+        """
+        if isinstance(index, slice):
+            if isinstance(item, NamedList):
+                new_names = item._names
+            else:
+                # Materialize once, as 'item' may be a one-shot iterator.
+                item = list(item)
+                replaced = len(range(*index.indices(len(self))))
+                # Names only need touching if the slice changes length;
+                # otherwise the existing names are retained.
+                new_names = None if replaced == len(item) else [""] * len(item)
+            super().__setitem__(index, item)
+            if new_names is not None:
+                self._names[index] = new_names
+        elif isinstance(index, str):
+            i = self._names.map(index)
+            if i >= 0:
+                super().__setitem__(i, item)
+            else:
+                super().append(item)
+                self._names.append(index)
+        else:
+            super().__setitem__(index, item)
+
+    def insert(self, index: Union[int, str], item: Any):
+        """
+        Args:
+            index:
+                An integer index containing a position to insert at.
+                Alternatively, the name of the value to insert at (the first
+                occurrence of each name is used).
+
+            item:
+                Any value.
+
+        Returns:
+            ``item`` is inserted at ``index`` in the current object, with its
+            name set to an empty string.
+
+        Raises:
+            KeyError: If ``index`` is a string that is not present in the names.
+        """
+        if isinstance(index, str):
+            i = self._names.map(index)
+            if i < 0:
+                raise KeyError("no list element named '" + index + "'")
+            index = i
+        super().insert(index, item)
+        self._names.insert(index, "")
+
+    def append(self, item: Any):
+        """
+        Args:
+            item:
+                Any value.
+
+        Returns:
+            ``item`` is added to the end of the current object, with its name
+            set to an empty string.
+        """
+        self._names.append("")
+        super().append(item)
+
+    def extend(self, iterable: Iterable):
+        """
+        Args:
+            iterable:
+                Some iterable object. If this is a ``NamedList``, its names are
+                used to extend the names of the current object; otherwise the
+                extended names are set to empty strings.
+
+        Returns:
+            Items in ``iterable`` are added to the end of the current object.
+        """
+        if isinstance(iterable, NamedList):
+            new_names = iterable._names
+        else:
+            # Materialize first, so that generators and other unsized
+            # iterables can be counted without being consumed twice.
+            iterable = list(iterable)
+            new_names = [""] * len(iterable)
+        super().extend(iterable)
+        self._names.extend(new_names)
+
+    def __add__(self, other: list) -> "NamedList":
+        """
+        Args:
+            other:
+                A list of items to be added to the right of the current object.
+
+        Returns:
+            A new ``NamedList`` containing the concatenation of the
+            current object's items and those of ``other``.
+        """
+        output = self.copy()
+        output.extend(other)
+        return output
+
+    def __iadd__(self, other: list):
+        """
+        Extend an existing ``NamedList`` with a new list.
+
+        Args:
+            other:
+                A list of items.
+
+        Returns:
+            The current object is extended with the contents of ``other``. If
+            ``other`` is a ``NamedList``, its names are used for extension;
+            otherwise the extension is performed with empty strings.
+        """
+        self.extend(other)
+        return self
+
+    def copy(self) -> "NamedList":
+        """
+        Returns:
+            A shallow copy of a ``NamedList`` with the same contents.
+        """
+        return NamedList(self, names=self._names.copy())
+
+    def __deepcopy__(self, memo=None, _nil=[]) -> "NamedList":
+        """
+        Args:
+            memo:
+                See :py:func:`~copy.deepcopy` for details.
+
+            _nil:
+                See :py:func:`~copy.deepcopy` for details.
+
+        Returns:
+            A deep copy of a ``NamedList`` with the same contents.
+        """
+        if memo is None:
+            memo = {}
+        output = NamedList()
+        # Register the copy before descending, so that any element referring
+        # back to this list resolves to the copy rather than recursing forever.
+        memo[id(self)] = output
+        # Elements are copied one by one; calling deepcopy() on 'self' would
+        # just dispatch back to this method.
+        list.extend(output, (deepcopy(v, memo, _nil) for v in self))
+        # Names are expected to hold strings, so a shallow copy suffices.
+        output._names = self._names.copy()
+        return output
+
+
+@subset_sequence.register
+def _subset_sequence_NamedList(x: NamedList, indices: Sequence[int]) -> NamedList:
+    return NamedList((x[i] for i in indices), names=subset_sequence(x._names, indices))
+
+
+@combine_sequences.register
+def _combine_sequences_NamedList(*x: NamedList) -> NamedList:
+    output = x[0].copy()
+    for other in x[1:]:
+        output.extend(other)
+    return output
+
+
+@assign_sequence.register
+def _assign_sequence_NamedList(x: NamedList, indices: Sequence[int], other) -> NamedList:
+    output = assign_sequence.registry[list](x, indices, other)
+    if isinstance(other, NamedList):
+        output._names = assign_sequence(output._names, indices, other._names)
+    return output
diff --git a/src/biocutils/Names.py b/src/biocutils/Names.py
new file mode 100644
index 0000000..496bc31
--- /dev/null
+++ b/src/biocutils/Names.py
@@ -0,0 +1,170 @@
+from typing import Sequence, Optional, Iterable, Union, Any
+
+
+class Names(list):
+    """
+    List of strings containing names. Typically used to decorate sequences,
+    such that callers can get or set elements by name instead of position.
+    """
+
+    def __init__(self, iterable: Optional[Iterable] = None, coerce: bool = True):
+        """
+        Args:
+            iterable:
+                Some iterable object containing strings, or values that can
+                be coerced into strings.
+
+            coerce:
+                Whether to coerce values of ``iterable`` into strings.
+        """
+        if iterable is None:
+            super().__init__()
+        elif coerce and not isinstance(iterable, type(self)):
+            super().__init__(str(y) for y in iterable)
+        else:
+            super().__init__(iterable)
+        self._reverse = None
+
+    # Enable fast indexing by name, but only on demand. This reverse mapping
+    # field is strictly internal.
+    # NOTE(review): only the mutators overridden in this class keep the index
+    # in sync; inherited ones (pop, remove, sort, ...) do not invalidate it.
+    def _populate_reverse_index(self):
+        if self._reverse is None:
+            self._reverse = {}
+            for i, n in enumerate(self):
+                if n not in self._reverse:
+                    self._reverse[n] = i
+
+    def _wipe_reverse_index(self):
+        self._reverse = None
+
+    def __getitem__(self, index: Union[int, slice]) -> Union[str, "Names"]:
+        """
+        Args:
+            index:
+                Integer specifying the position of interest, or a slice
+                specifying multiple such positions.
+
+        Returns:
+            If ``index`` is a slice, a new ``Names`` object is returned
+            containing names from the specified positions.
+
+            If ``index`` is an integer, the name at that position is returned.
+        """
+        output = super().__getitem__(index)
+        if isinstance(index, slice):
+            return Names(output, coerce=False)
+        return output
+
+    def __setitem__(self, index: Union[int, slice], item: Any):
+        """
+        Args:
+            index:
+                Integer specifying the position of interest, or a slice
+                specifying multiple such positions.
+
+            item:
+                If ``index`` is an integer, a string containing a name.
+
+                If ``index`` is a slice, an iterable object of the appropriate
+                length, containing strings to use as replacement names.
+
+        Returns:
+            The current object is modified with the replacement names.
+        """
+        self._wipe_reverse_index()
+        if isinstance(index, slice):
+            new_it = item
+            if not isinstance(item, type(self)):
+                new_it = (str(x) for x in item)
+            super().__setitem__(index, new_it)
+        else:
+            super().__setitem__(index, str(item))
+
+    def map(self, name: str) -> int:
+        """
+        Args:
+            name: Name of interest.
+
+        Returns:
+            Index containing the position of the first occurrence of ``name``;
+            or -1, if ``name`` is not present in this object.
+        """
+        self._populate_reverse_index()
+        return self._reverse.get(name, -1)
+
+    def append(self, name: Any):
+        """
+        Args:
+            name: Name to be added.
+
+        Returns:
+            ``name`` is added to the current object.
+        """
+        name = str(name)
+        if self._reverse is not None and name not in self._reverse:
+            self._reverse[name] = len(self)
+        super().append(name)
+
+    def insert(self, index: int, name: str):
+        """
+        Args:
+            index: Position on the object to insert at.
+
+            name: Name to be added.
+
+        Returns:
+            ``name`` is inserted into the current object before ``index``.
+        """
+        self._wipe_reverse_index()
+        super().insert(index, str(name))
+
+    def extend(self, names: Sequence[str]):
+        """
+        Args:
+            names: Names to add to the current object.
+
+        Returns:
+            ``names`` are added to the current object.
+        """
+        if not isinstance(names, Names):
+            names = [str(y) for y in names]
+        if self._reverse is not None:
+            # Positions are offset by the length *before* extension; the
+            # first occurrence of each name wins, as in map().
+            start = len(self)
+            for i, n in enumerate(names):
+                self._reverse.setdefault(n, start + i)
+        super().extend(names)
+
+    def __add__(self, other: list):
+        """
+        Args:
+            other: List of names.
+
+        Returns:
+            A new ``Names`` containing the combined contents
+            of the current object and ``other``.
+        """
+        output = self.copy()
+        output.extend(other)
+        return output
+
+    def __iadd__(self, other: list):
+        """
+        Args:
+            other: List of names.
+
+        Returns:
+            The current object is modified by adding ``other`` to its names.
+        """
+        self.extend(other)
+        return self
+
+    def copy(self):
+        """
+        Returns:
+            A copy of the current object.
+        """
+        return Names(self, coerce=False)
diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py
index 90b6742..9eaf7ad 100644
--- a/src/biocutils/__init__.py
+++ b/src/biocutils/__init__.py
@@ -17,6 +17,8 @@
 
 from .Factor import Factor
 from .StringList import StringList
+from .Names import Names
+from .NamedList import NamedList
 
 from .factorize import factorize
 from .intersect import intersect
diff --git a/tests/test_NamedList.py b/tests/test_NamedList.py
new file mode 100644
index 0000000..3c99860
--- /dev/null
+++ b/tests/test_NamedList.py
@@ -0,0 +1,168 @@
+from copy import deepcopy
+import biocutils
+import pytest
+from biocutils import NamedList
+
+
+def test_NamedList_basics():
+    x = NamedList([1,2,3,4], names=['a', 'b', 'c', 'd'])
+    assert isinstance(x, NamedList)
+    assert x == [ 1,2,3,4 ]
+    assert x.get_names() == ["a", "b", "c", "d"]
+
+    assert x["a"] == 1
+    assert x["b"] == 2
+    with pytest.raises(KeyError) as ex:
+        x["Aaron"]
+    assert str(ex.value).find("Aaron") >= 0
+
+    # Constructor works with other NamedList objects.
+    y = NamedList(x)
+    assert y == x
+    assert y.get_names() == ["a", "b", "c", "d"]
+
+    empty = NamedList()
+    assert empty == []
+    assert isinstance(empty, NamedList)
+    assert empty.get_names() == []
+
+    # Slicing works correctly.
+    sub = x[1:3]
+    assert isinstance(sub, NamedList)
+    assert sub == [2, 3]
+    assert sub.get_names() == ["b", "c"]
+
+    # Copying works.
+    z = x.copy()
+    z[0] = "Aaron"
+    assert z == [ "Aaron", 2, 3, 4 ]
+    assert x == [ 1, 2, 3, 4 ]
+    assert z.get_names() == [ "a", "b", "c", "d" ]
+
+
+def test_NamedList_setitem():
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x[0] = None
+    assert x == [None, 2, 3, 4]
+    assert x["A"] == None
+
+    # Replacing by name.
+    x["B"] = "FOO"
+    assert x[1] == "FOO"
+
+    # Replacing slices.
+    x[1:3] = [10, 20]
+    assert x == [None, 10, 20, 4]
+    x[1:3] = NamedList([4,5], names=["YAY", "BAR"])
+    assert x == [None, 4, 5, 4]
+    assert x.get_names() == [ "A", "YAY", "BAR", "D" ]
+
+    # Appending by name.
+    x["Aaron"] = "BAR"
+    assert x["Aaron"] == "BAR"
+
+
+def test_NamedList_mutations():
+    # Insertion:
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.insert(2, "FOO")
+    assert x == [1, 2, "FOO", 3, 4]
+    assert x.get_names() == [ "A", "B", "", "C", "D"]
+    x.insert("D", None)
+    assert x == [1, 2, "FOO", 3, None, 4]
+    assert x.get_names() == [ "A", "B", "", "C", "", "D"]
+
+    # Extension:
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.extend([None, 1, True])
+    assert x == [ 1, 2, 3, 4, None, 1, True ]
+    assert x.get_names() == [ "A", "B", "C", "D", "", "", "" ]
+    x.extend(NamedList([False, 2, None], names=[ "E", "F", "G" ]))
+    assert x == [ 1, 2, 3, 4, None, 1, True, False, 2, None ]
+    assert x.get_names() == [ "A", "B", "C", "D", "", "", "", "E", "F", "G" ]
+
+    # Appending:
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.append(1)
+    assert x == [ 1,2,3,4,1 ]
+    assert x.get_names() == [ "A", "B", "C", "D", "" ]
+
+
+def test_NamedList_addition():
+    x1 = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    summed = x1 + [5,6,7]
+    assert summed == [1, 2, 3, 4, 5, 6, 7]
+    assert summed.get_names() == [ "A", "B", "C", "D", "", "", "" ]
+
+    x2 = NamedList([5,6,7], names=["E", "F", "G"])
+    summed = x1 + x2
+    assert summed == [1, 2, 3, 4, 5, 6, 7]
+    assert summed.get_names() == ["A", "B", "C", "D", "E", "F", "G"]
+
+    x1 += x2
+    assert x1 == [1, 2, 3, 4, 5, 6, 7]
+    assert x1.get_names() == ["A", "B", "C", "D", "E", "F", "G"]
+
+
+def test_NamedList_generics():
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    sub = biocutils.subset_sequence(x, [0,3,2,1])
+    assert isinstance(sub, NamedList)
+    assert sub == [1, 4, 3, 2]
+    assert sub.get_names() == [ "A", "D", "C", "B" ]
+
+    y = ["a", "b", "c", "d"]
+    com = biocutils.combine_sequences(x, y)
+    assert isinstance(com, NamedList)
+    assert com == [1, 2, 3, 4, "a", "b", "c", "d"]
+    assert com.get_names() == [ "A", "B", "C", "D", "", "", "", "" ]
+
+    y = biocutils.assign_sequence(x, [1, 3], [ 20, 40 ])
+    assert y == [ 1, 20, 3, 40 ]
+    assert y.get_names() == [ "A", "B", "C", "D" ]
+
+    y = biocutils.assign_sequence(x, [1, 3], NamedList([ 20, 40 ], names=["b", "d" ]))
+    assert y == [ 1, 20, 3, 40 ]
+    assert y.get_names() == [ "A", "b", "C", "d" ]
+
+
+def test_NamedList_set_names():
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    y = x.set_names(["a", "b", "c", "d"])
+    assert y.get_names() == ["a", "b", "c", "d"]
+    assert x.get_names() == ["A", "B", "C", "D"]
+
+    out = x.set_names(["a", "b", "c", "d"], in_place=True)
+    assert out is x
+    assert x["a"] == 1
+    with pytest.raises(ValueError):
+        x.set_names(["a", "b"], in_place=True)
+
+
+def test_NamedList_copies():
+    x = NamedList([[1], [2]], names=["A", "B"])
+
+    # Constructing from another NamedList does not share its names.
+    y = NamedList(x)
+    y.append([3])
+    assert x.get_names() == ["A", "B"]
+    assert y.get_names() == ["A", "B", ""]
+
+    # Deep copies are fully independent.
+    z = deepcopy(x)
+    assert isinstance(z, NamedList)
+    assert z == x
+    assert z.get_names() == ["A", "B"]
+    z[0].append(100)
+    assert x[0] == [1]
+
+
+def test_NamedList_names_stay_in_sync():
+    x = NamedList([1,2,3,4], names=["A", "B", "C", "D"])
+    x.extend(i for i in [5, 6])
+    assert x == [1, 2, 3, 4, 5, 6]
+    assert x.get_names() == ["A", "B", "C", "D", "", ""]
+
+    x[1:3] = [10, 20, 30]
+    assert x == [1, 10, 20, 30, 4, 5, 6]
+    assert x.get_names() == ["A", "", "", "", "D", "", ""]
diff --git a/tests/test_Names.py b/tests/test_Names.py
new file mode 100644
index 0000000..1b2b791
--- /dev/null
+++ b/tests/test_Names.py
@@ -0,0 +1,118 @@
+import biocutils
+import pytest
+from biocutils import Names
+
+
+def test_Names_basics():
+    x = Names([1,2,3,4])
+    assert isinstance(x, Names)
+    assert x == [ "1","2","3","4" ]
+
+    assert x.map("1") == 0
+    assert x.map("4") == 3
+    assert x.map("Aaron") == -1
+
+    # Constructor works with other Names objects.
+    y = Names(x)
+    assert y == x
+
+    empty = Names()
+    assert empty == []
+    assert isinstance(empty, Names)
+
+    # Slicing works correctly.
+    sub = x[1:3]
+    assert isinstance(sub, Names)
+    assert sub == ["2", "3"]
+
+    # Copying works.
+    z = x.copy()
+    z[0] = "Aaron"
+    assert z == [ "Aaron", "2", "3", "4" ]
+    assert x == [ "1", "2", "3", "4" ]
+
+
+def test_Names_setitem():
+    x = Names([1,2,3,4])
+    x[0] = None
+    assert x == ["None", "2", "3", "4"]
+    assert x.map("None") == 0
+    assert x.map("1") == -1
+
+    x[0] = 12345
+    assert x == ["12345", "2", "3", "4"]
+    assert x.map("None") == -1
+    assert x.map("12345") == 0
+
+    x[1:3] = [10, 20]
+    assert x == ["12345", "10", "20", "4"]
+
+    alt = Names([ "YAY", "FOO", "BAR", "WHEE" ])
+    x[:] = alt
+    assert x == alt
+
+
+def test_Names_mutations():
+    # Insertion:
+    x = Names([1,2,3,4])
+    assert x.map("3") == 2
+    x.insert(2, None)
+    assert x.map("1") == 0
+    assert x.map("3") == 3
+    x.insert(1, "FOO")
+    assert x.map("3") == 4
+    assert x == [ "1", "FOO", "2", "None", "3", "4" ]
+
+    # Extension:
+    x = Names([1,2,3,4])
+    x.extend([None, 1, True])
+    assert x == [ "1", "2", "3", "4", "None", "1", "True" ]
+    assert x.map("None") == 4
+    assert x.map("1") == 0
+    x.extend([False, 2, None])
+    assert x == [ "1", "2", "3", "4", "None", "1", "True", "False", "2", "None" ]
+    assert x.map("None") == 4
+    assert x.map("False") == 7
+    assert x.map("2") == 1
+
+    # Appending:
+    x = Names([1,2,3,4])
+    x.append(1)
+    assert x[-1] == "1"
+    assert x.map("1") == 0
+    x.append(None)
+    assert x[-1] == "None"
+    assert x.map("None") == 5
+
+
+def test_Names_addition():
+    x1 = Names([1,2,3,4])
+    assert x1 + [5,6,7] == ["1", "2", "3", "4", "5", "6", "7"]
+
+    x2 = Names([5,6,7])
+    assert x1 + x2 == ["1", "2", "3", "4", "5", "6", "7"]
+
+    x1 += x2
+    assert x1 == ["1", "2", "3", "4", "5", "6", "7"]
+
+
+def test_Names_generics():
+    x = Names([1,2,3,4])
+    sub = biocutils.subset_sequence(x, [0,3,2,1])
+    assert isinstance(sub, Names)
+    assert sub == ["1", "4", "3", "2"]
+
+    y = ["a", "b", "c", "d"]
+    com = biocutils.combine_sequences(x, y)
+    assert isinstance(com, Names)
+    assert com == ["1", "2", "3", "4", "a", "b", "c", "d"]
+
+
+def test_Names_extend_with_reverse_index():
+    x = Names([1,2,3,4])
+    assert x.map("1") == 0  # populates the reverse index.
+    x.extend(["a", "b", "c"])
+    assert x == ["1", "2", "3", "4", "a", "b", "c"]
+    assert x.map("a") == 4
+    assert x.map("b") == 5
+    assert x.map("c") == 6