From 0c2a69dfd600067446307edadd2686a3e9c5ee34 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 17:52:03 -0400 Subject: [PATCH 01/16] add a multi-type array class --- lenskit/lenskit/data/__init__.py | 1 + lenskit/lenskit/data/mtarray.py | 126 +++++++++++++++++++++++++++++++ lenskit/lenskit/data/vocab.py | 10 ++- lenskit/tests/test_mtarray.py | 69 +++++++++++++++++ 4 files changed, 204 insertions(+), 2 deletions(-) create mode 100644 lenskit/lenskit/data/mtarray.py create mode 100644 lenskit/tests/test_mtarray.py diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py index 6ddefed5d..493d1fec7 100644 --- a/lenskit/lenskit/data/__init__.py +++ b/lenskit/lenskit/data/__init__.py @@ -13,3 +13,4 @@ from .dataset import Dataset, from_interactions_df # noqa: F401, E402 from .movielens import load_movielens # noqa: F401, E402 +from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray # noqa: F401, E402 diff --git a/lenskit/lenskit/data/mtarray.py b/lenskit/lenskit/data/mtarray.py new file mode 100644 index 000000000..d9e97fe58 --- /dev/null +++ b/lenskit/lenskit/data/mtarray.py @@ -0,0 +1,126 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +# pyright: basic +from __future__ import annotations + +from typing import Generic, Literal, LiteralString, Sequence, TypeVar, overload + +import numpy as np +import torch +from numpy.typing import ArrayLike, NDArray + +NPT = TypeVar("NPT", bound=np.generic) + + +class MTArray(Generic[NPT]): + """ + Multi-typed array class, allowing arrays to be easily converted between + NumPy, PyTorch, and other supported backends, caching the conversion result. 
+ + We use this class instead of one canonical format so that data can be + converted lazily, and can be left in-place when passing from one component + to another that use the same computational engine. + + .. note:: + + This class is only intended for read-only arrays. It is **not defined** + whether the different arrays share storage, and modifying one format may + or may not modify another. For example, PyTorch and NumPy usually share + storage when both are on CPU, but a GPU tensor and CPU ndarray do not. + """ + + _shape: tuple[int, ...] | None = None + _unknown: object = None + _numpy: NDArray[NPT] | None = None + _torch: torch.Tensor | None = None + + def __init__(self, array: NDArray[NPT] | torch.Tensor | Sequence | ArrayLike): + """ + Construct a new MTArray around an array. + """ + # TODO: support DLpack + if isinstance(array, torch.Tensor): + # torch might not be on-device + self._torch = array + self._shape = array.shape + else: + # stash it in the common-format field for lazy conversion + self._unknown = array + + @property + def shape(self) -> tuple[int, ...]: + if self._shape is None: + self._shape = self.numpy().shape + + return self._shape + + def numpy(self) -> NDArray[NPT]: + """ + Get the array as a NumPy array. + """ + if self._numpy is None: + self._numpy = np.asarray(self._convertible()) + + assert self._numpy is not None + return self._numpy + + def torch(self, *, device: str | None = None) -> torch.Tensor: + """ + Get the array as a PyTorch tensor. + + Args: + device: + The device on which the Torch tensor should reside. + """ + if self._torch is None: + self._torch = torch.as_tensor(self._convertible()) + + if device: + return self._torch.to(device) + else: + return self._torch + + @overload + def to(self, format: Literal["numpy"]) -> NDArray[NPT]: ... + @overload + def to(self, format: Literal["torch"], *, device: str | None) -> torch.Tensor: ... 
+ @overload + def to(self, format: LiteralString, *, device: str | None = None) -> ArrayLike: ... + def to(self, format: str, *, device: str | None = None) -> ArrayLike: + """ + Obtain the array in the specified format (dynamic version). + """ + if format == "numpy": + return self.numpy() + elif format == "torch": + return self.torch(device=device) + else: + raise RuntimeError(f"unsupported array format {format}") + + def _convertible(self) -> object: + """ + Get the data suitable for passing to an ``as_array`` method. + """ + # look for a good format for the data. if we've already made a numpy, + # use that; otherwise, try unknown, and fall back to torch (moved to + # CPU). end result: convertible data. + if self._numpy is not None: + return self._numpy + elif self._unknown is not None: + return self._unknown + elif self._torch is not None: + return self._torch.cpu() + else: + raise RuntimeError("cannot find array data") + + def __array__(self) -> NDArray[NPT]: + return self.numpy() + + +MTFloatArray = MTArray[np.floating] +MTIntArray = MTArray[np.integer] +MTGenericArray = MTArray[np.generic] diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py index 7dbc8069e..f4bdf7ca3 100644 --- a/lenskit/lenskit/data/vocab.py +++ b/lenskit/lenskit/data/vocab.py @@ -19,6 +19,8 @@ EntityId: TypeAlias = int | str | bytes "Allowable entity identifier types." +NPEntityId: TypeAlias = np.integer | np.str_ | np.bytes_ +"Allowable entity identifier types (NumPy version)" VT = TypeVar("VT", bound=Hashable) "Term type in a vocabulary." @@ -100,7 +102,9 @@ def term(self, num: int) -> VT: raise IndexError("negative numbers not supported") return self._index[num] - def terms(self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None) -> np.ndarray: + def terms( + self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None + ) -> NDArray[NPEntityId]: """ Get a list of terms, optionally for an array of term numbers. 
@@ -125,7 +129,9 @@ def id(self, num: int) -> VT: "Alias for :meth:`term` for greater readability for entity ID vocabularies." return self.term(num) - def ids(self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None) -> np.ndarray: + def ids( + self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None + ) -> NDArray[NPEntityId]: "Alias for :meth:`terms` for greater readability for entity ID vocabularies." return self.terms(nums) diff --git a/lenskit/tests/test_mtarray.py b/lenskit/tests/test_mtarray.py new file mode 100644 index 000000000..8eb29134f --- /dev/null +++ b/lenskit/tests/test_mtarray.py @@ -0,0 +1,69 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import numpy as np +import torch +from numpy.typing import NDArray + +import hypothesis.extra.numpy as nph +import hypothesis.strategies as st +from hypothesis import assume, given + +from lenskit.data.mtarray import MTArray, MTIntArray + + +@given( + nph.arrays( + dtype=st.one_of(nph.integer_dtypes(endianness="="), nph.floating_dtypes(endianness="=")), + shape=nph.array_shapes(), + ) +) +def test_from_numpy(arr: NDArray[np.generic]): + # limit to data types that match + assume(np.all(np.isfinite(arr))) + mta = MTArray(arr) + assert mta.shape == arr.shape + npa = mta.numpy() + assert npa is arr + + tensor = mta.torch() + assert tensor.shape == arr.shape + assert np.all(tensor.numpy() == arr) + + assert np.asarray(mta) is arr + + +@given( + nph.arrays( + dtype=st.one_of(nph.integer_dtypes(endianness="="), nph.floating_dtypes(endianness="=")), + shape=nph.array_shapes(), + ) +) +def test_from_torch(arr: NDArray[np.generic]): + # limit to data types that match + assume(np.all(np.isfinite(arr))) + ot = torch.from_numpy(arr) + mta = MTArray(ot) + assert mta.shape == arr.shape + tensor = mta.torch() + assert 
tensor is ot + + npa = mta.numpy() + assert npa.shape == arr.shape + assert np.all(npa == arr) + + +@given(st.lists(st.integers(min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max))) +def test_from_list(xs: list[int]): + # limit to data types that match + mta = MTIntArray(xs) + assert mta.shape == (len(xs),) + + npa = mta.numpy() + assert np.all(npa == xs) + + tensor = mta.torch() + assert np.all(tensor.numpy() == xs) From 0af303e64de292c94c156dd9e4c5456290c19d19 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 19:06:34 -0400 Subject: [PATCH 02/16] initial item list (without scores) --- lenskit/lenskit/data/__init__.py | 1 + lenskit/lenskit/data/items.py | 120 +++++++++++++++++++++++++++++++ lenskit/tests/test_itemlist.py | 57 +++++++++++++++ 3 files changed, 178 insertions(+) create mode 100644 lenskit/lenskit/data/items.py create mode 100644 lenskit/tests/test_itemlist.py diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py index 493d1fec7..549b39f49 100644 --- a/lenskit/lenskit/data/__init__.py +++ b/lenskit/lenskit/data/__init__.py @@ -12,5 +12,6 @@ "Types of feedback supported." from .dataset import Dataset, from_interactions_df # noqa: F401, E402 +from .items import ItemList # noqa: F401, E402 from .movielens import load_movielens # noqa: F401, E402 from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray # noqa: F401, E402 diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py new file mode 100644 index 000000000..b77be4a3b --- /dev/null +++ b/lenskit/lenskit/data/items.py @@ -0,0 +1,120 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +""" +Primary item-list abstraction. 
+""" + +from __future__ import annotations + +from typing import Literal, LiteralString, Sequence, TypeAlias, overload + +import numpy as np +import pandas as pd +import torch +from numpy.typing import ArrayLike, NDArray + +from lenskit.data.mtarray import MTArray, MTGenericArray +from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary + +Backend: TypeAlias = Literal["numpy", "torch"] + + +class ItemList: + """ + Representation of a (usually ordered) list of items, possibly with scores + and other associated data. + """ + + _len: int + _ids: np.ndarray[int, np.dtype[NPEntityId]] | None = None + _numbers: MTArray[np.int32] | None = None + _vocab: Vocabulary[EntityId] | None = None + _fields: dict[str, MTGenericArray] + + def __init__( + self, + *, + item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None, + item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None, + vocabulary: Vocabulary[EntityId] | None = None, + ): + self._vocab = vocabulary + self._fields = {} + + if item_ids is None and item_nums is None: + self._ids = np.ndarray(0, dtype=np.int32) + self._numbers = MTArray(np.ndarray(0, dtype=np.int32)) + self._len = 0 + + if item_ids is not None: + self._ids = np.asarray(item_ids) + if len(self._ids.shape) > 1: + raise TypeError("item lists must be 1-dimensional") + self._len = len(item_ids) + if item_nums is not None: + self._numbers = MTArray(item_nums) + if hasattr(self, "_len"): + if self._numbers.shape != (self._len,): + nl = self._numbers.shape[0] + raise ValueError( + f"item ID and number lists have different lengths ({self._len} != {nl})" + ) + else: + self._len = self._numbers.shape[0] + + def clone(self) -> ItemList: + """ + Make a shallow copy of the item list. + """ + return ItemList(item_ids=self._ids, item_nums=self._numbers, vocabulary=self._vocab) + + def ids(self) -> NDArray[NPEntityId]: + """ + Get the item IDs. + + Returns: + An array of item identifiers. 
+ + Raises: + RuntimeError: if the item list was not created with IDs or a :class:`Vocabulary`. + """ + if self._ids is None: + if self._vocab is None: + raise RuntimeError("item IDs not available (no IDs or vocabulary provided)") + assert self._numbers is not None + self._ids = self._vocab.ids(self._numbers.numpy()) + + return self._ids + + @overload + def numbers(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32]: ... + @overload + def numbers(self, format: Literal["torch"]) -> torch.Tensor: ... + def numbers(self, format: LiteralString = "numpy") -> ArrayLike: + """ + Get the item numbers. + + Args: + format: + The array format to use. + + Returns: + An array of item numbers. + + Raises: + RuntimeError: if the item list was not created with numbers or a :class:`Vocabulary`. + """ + if self._numbers is None: + if self._vocab is None: + raise RuntimeError("item numbers not available (no IDs or vocabulary provided)") + assert self._ids is not None + self._numbers = MTArray(self._vocab.numbers(self._ids)) + + return self._numbers.to(format) + + def __len__(self): + return self._len diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py new file mode 100644 index 000000000..2872c0b73 --- /dev/null +++ b/lenskit/tests/test_itemlist.py @@ -0,0 +1,57 @@ +import numpy as np + +from pytest import raises + +from lenskit.data import ItemList +from lenskit.data.vocab import Vocabulary + + +def test_empty(): + il = ItemList() + + assert len(il) == 0 + assert il.numbers().shape == (0,) + assert il.ids().shape == (0,) + + +def test_item_list(): + il = ItemList(item_ids=["one", "two"]) + + assert len(il) == 2 + assert il.ids().shape == (2,) + + with raises(RuntimeError, match="item numbers not available"): + il.numbers() + + +def test_item_num_list(): + il = ItemList(item_nums=np.arange(5)) + + assert len(il) == 5 + assert il.numbers().shape == (5,) + + with raises(RuntimeError, match="item IDs not available"): + il.ids() + + +def 
test_item_num_list_vocab(): + il = ItemList(item_nums=np.arange(5), vocabulary=Vocabulary(["a", "b", "c", "d", "e"])) + + assert len(il) == 5 + assert il.numbers().shape == (5,) + assert il.ids().shape == (5,) + + assert all(il.numbers() == np.arange(5)) + assert all(il.ids() == ["a", "b", "c", "d", "e"]) + + +def test_item_id_list_vocab(): + idl = ["a", "b", "c", "d", "e"] + il = ItemList(item_ids=idl, vocabulary=Vocabulary(idl)) + + assert len(il) == 5 + assert il.numbers().shape == (5,) + assert il.ids().shape == (5,) + + assert all(il.numbers() == np.arange(5)) + assert all(il.ids() == ["a", "b", "c", "d", "e"]) From 6c6e22616f0c03496b48651865583ff432e46a4f Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 19:42:30 -0400 Subject: [PATCH 03/16] make vocab covariant --- lenskit/lenskit/data/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py index f4bdf7ca3..48a90dc5e 100644 --- a/lenskit/lenskit/data/vocab.py +++ b/lenskit/lenskit/data/vocab.py @@ -22,7 +22,7 @@ NPEntityId: TypeAlias = np.integer | np.str_ | np.bytes_ "Allowable entity identifier types (NumPy version)" -VT = TypeVar("VT", bound=Hashable) +VT = TypeVar("VT", bound=Hashable, covariant=True) "Term type in a vocabulary." 
From db35ec15552ee5cc790f4b3efb469eeed9c24d21 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 19:44:00 -0400 Subject: [PATCH 04/16] remove vocab covariance --- lenskit/lenskit/data/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py index 48a90dc5e..f4bdf7ca3 100644 --- a/lenskit/lenskit/data/vocab.py +++ b/lenskit/lenskit/data/vocab.py @@ -22,7 +22,7 @@ NPEntityId: TypeAlias = np.integer | np.str_ | np.bytes_ "Allowable entity identifier types (NumPy version)" -VT = TypeVar("VT", bound=Hashable, covariant=True) +VT = TypeVar("VT", bound=Hashable) "Term type in a vocabulary." From af51cac8a9a8f7c0d20fa9050d4e1e0261565b08 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 19:44:52 -0400 Subject: [PATCH 05/16] fix vocabulary type variance --- lenskit/lenskit/data/items.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index b77be4a3b..2a2590956 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -10,7 +10,7 @@ from __future__ import annotations -from typing import Literal, LiteralString, Sequence, TypeAlias, overload +from typing import Literal, LiteralString, Sequence, TypeAlias, TypeVar, overload import numpy as np import pandas as pd @@ -21,6 +21,7 @@ from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary Backend: TypeAlias = Literal["numpy", "torch"] +EID = TypeVar("EID", bound=EntityId) class ItemList: @@ -40,7 +41,7 @@ def __init__( *, item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None, item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None, - vocabulary: Vocabulary[EntityId] | None = None, + vocabulary: Vocabulary[EID] | None = None, ): self._vocab = vocabulary self._fields = {} From 71a9a517b21c24bae3bcb9f13af9ece59e6e8d4e Mon Sep 17 00:00:00 
2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 19:49:56 -0400 Subject: [PATCH 06/16] initial working scores --- lenskit/lenskit/data/items.py | 44 +++++++++++++++++++++++++++++++++- lenskit/tests/test_itemlist.py | 29 ++++++++++++++++++---- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 2a2590956..8175cd5ec 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -28,6 +28,16 @@ class ItemList: """ Representation of a (usually ordered) list of items, possibly with scores and other associated data. + + Args: + item_ids: + A list or array of item identifiers. + item_nums: + A list or array of item numbers. + vocabulary: + A vocabulary to translate between item IDs and numbers. + fields: + Additional fields, such as ``score`` or ``rating``. """ _len: int @@ -42,9 +52,9 @@ def __init__( item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None, item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None, vocabulary: Vocabulary[EID] | None = None, + **fields: NDArray[np.generic] | torch.Tensor | ArrayLike, ): self._vocab = vocabulary - self._fields = {} if item_ids is None and item_nums is None: self._ids = np.ndarray(0, dtype=np.int32) @@ -56,6 +66,7 @@ def __init__( if len(self._ids.shape) > 1: raise TypeError("item lists must be 1-dimensional") self._len = len(item_ids) + if item_nums is not None: self._numbers = MTArray(item_nums) if hasattr(self, "_len"): @@ -67,6 +78,8 @@ def __init__( else: self._len = self._numbers.shape[0] + self._fields = {name: MTArray(data) for (name, data) in fields.items()} + def clone(self) -> ItemList: """ Make a shallow copy of the item list. @@ -95,6 +108,8 @@ def ids(self) -> NDArray[NPEntityId]: def numbers(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32]: ... @overload def numbers(self, format: Literal["torch"]) -> torch.Tensor: ... 
+ @overload + def numbers(self, format: LiteralString = "numpy") -> ArrayLike: ... def numbers(self, format: LiteralString = "numpy") -> ArrayLike: """ Get the item numbers. @@ -117,5 +132,32 @@ def numbers(self, format: LiteralString = "numpy") -> ArrayLike: return self._numbers.to(format) + @overload + def scores(self, format: Literal["numpy"] = "numpy") -> NDArray[np.floating] | None: ... + @overload + def scores(self, format: Literal["torch"]) -> torch.Tensor | None: ... + @overload + def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: ... + def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: + """ + Get the item scores (if available). + """ + return self.field("scores", format) + + @overload + def field( + self, name: str, format: Literal["numpy"] = "numpy" + ) -> NDArray[np.floating] | None: ... + @overload + def field(self, name: str, format: Literal["torch"]) -> torch.Tensor | None: ... + @overload + def field(self, name: str, format: LiteralString) -> ArrayLike | None: ... 
+ def field(self, name: str, format: LiteralString = "numpy") -> ArrayLike | None: + val = self._fields.get(name, None) + if val is None: + return None + else: + return val.to(format) + def __len__(self): return self._len diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 2872c0b73..6668dc2fd 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -1,10 +1,14 @@ import numpy as np +import torch from pytest import raises from lenskit.data import ItemList from lenskit.data.vocab import Vocabulary +ITEMS = ["a", "b", "c", "d", "e"] +VOCAB = Vocabulary(ITEMS) + def test_empty(): il = ItemList() @@ -12,6 +16,7 @@ def test_empty(): assert len(il) == 0 assert il.numbers().shape == (0,) assert il.ids().shape == (0,) + assert il.scores() is None def test_item_list(): @@ -35,23 +40,37 @@ def test_item_num_list(): def test_item_num_list_vocab(): - il = ItemList(item_nums=np.arange(5), vocabulary=Vocabulary(["a", "b", "c", "d", "e"])) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB) assert len(il) == 5 assert il.numbers().shape == (5,) assert il.ids().shape == (5,) assert all(il.numbers() == np.arange(5)) - assert all(il.ids() == ["a", "b", "c", "d", "e"]) + assert all(il.ids() == ITEMS) def test_item_id_list_vocab(): - idl = ["a", "b", "c", "d", "e"] - il = ItemList(item_ids=idl, vocabulary=Vocabulary(idl)) + il = ItemList(item_ids=ITEMS, vocabulary=VOCAB) assert len(il) == 5 assert il.numbers().shape == (5,) assert il.ids().shape == (5,) assert all(il.numbers() == np.arange(5)) - assert all(il.ids() == ["a", "b", "c", "d", "e"]) + assert all(il.ids() == ITEMS) + + +def test_scores(): + data = np.random.randn(5) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data) + + scores = il.scores() + assert scores is not None + assert scores.shape == (5,) + assert np.all(scores == data) + + st = il.scores("torch") + assert isinstance(st, torch.Tensor) + assert st.shape == (5,) + assert 
np.all(st.numpy() == data) From 72b9ff7509d9cf9258a9c7a9e7916bdad60816a8 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 10:48:04 -0400 Subject: [PATCH 07/16] add data check routines --- lenskit/lenskit/data/checks.py | 150 ++++++++++++++++++++++++++++++ lenskit/tests/test_data_checks.py | 98 +++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 lenskit/lenskit/data/checks.py create mode 100644 lenskit/tests/test_data_checks.py diff --git a/lenskit/lenskit/data/checks.py b/lenskit/lenskit/data/checks.py new file mode 100644 index 000000000..2cde04cc6 --- /dev/null +++ b/lenskit/lenskit/data/checks.py @@ -0,0 +1,150 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +"Data check functions for LensKit." + +# pyright: strict +from __future__ import annotations + +from typing import Any, Literal, Protocol, TypeVar, overload + +import numpy as np +from numpy.typing import NDArray + + +class HasShape(Protocol): + @property + def shape(self) -> tuple[int, ...]: ... + + +A = TypeVar("A", bound=HasShape) +NPT = TypeVar("NPT", bound=np.generic) + + +@overload +def check_1d( + arr: A, + size: int | None = None, + *, + label: str = "array", + error: Literal["raise"] = "raise", +) -> A: ... +@overload +def check_1d( + arr: HasShape, + size: int | None = None, + *, + error: Literal["return"], +) -> bool: ... +def check_1d( + arr: A, + size: int | None = None, + *, + label: str = "array", + error: Literal["raise", "return"] = "raise", +) -> bool | A: + """ + Check that an array is one-dimensional, optionally checking that it has the + expected length. + + This check function has 2 modes: + + * If ``error="raise"`` (the default), it will raise a :class:`TypeError` + if the array shape is incorrect, and return the array otherwise. 
+ * If ``error="return"``, it will return ``True`` or ``False`` depending on + whether the size is correct. + + Args: + arr: + The array to check. + size: + The expected size of the array. If unspecified, this function simply + checks that the array is 1-dimensional, but does not check the size + of that dimension. + label: + A label to use in the exception message. + error: + The behavior when an array fails the test. + + Returns: + The array, if ``error="raise"`` and the array passes the check, or a + boolean indicating whether it passes the check. + + Raises: + TypeError: if ``error="raise"`` and the array fails the check. + """ + if size is None and len(arr.shape) > 1: + if error == "raise": + raise TypeError(f"{label} must be 1D (has shape {arr.shape})") + else: + return False + elif size is not None and arr.shape != (size,): + if error == "raise": + raise TypeError(f"{label} has incorrect shape (found {arr.shape}, expected {size})") + else: + return False + + if error == "raise": + return arr + else: + return True + + +@overload +def check_type( + arr: NDArray[Any], + *types: type[NPT], + label: str = "array", + error: Literal["raise"] = "raise", +) -> NDArray[NPT]: ... +@overload +def check_type( + arr: NDArray[Any], + *types: type[NPT], + error: Literal["return"], +) -> bool: ... +def check_type( + arr: NDArray[Any], + *types: type[NPT], + label: str = "array", + error: Literal["raise", "return"] = "raise", +) -> bool | NDArray[Any]: + """ + Check that an array array is of an acceptable type. + + This check function has 2 modes: + + * If ``error="raise"`` (the default), it will raise a :class:`TypeError` + if the array shape is incorrect, and return the array otherwise. + * If ``error="return"``, it will return ``True`` or ``False`` depending on + whether the size is correct. + + Args: + arr: + The array to check. + types: + The acceptable types for the array. + label: + A label to use in the exception message. 
+ error: + The behavior when an array fails the test. + + Returns: + The array, if ``error="raise"`` and the array passes the check, or a + boolean indicating whether it passes the check. + + Raises: + TypeError: if ``error="raise"`` and the array fails the check. + """ + if issubclass(arr.dtype.type, types): + if error == "raise": + return arr + else: + return True + elif error == "raise": + raise TypeError(f"{label} has incorrect type {arr.dtype} (allowed: {types})") + else: + return False diff --git a/lenskit/tests/test_data_checks.py b/lenskit/tests/test_data_checks.py new file mode 100644 index 000000000..5f07c97ae --- /dev/null +++ b/lenskit/tests/test_data_checks.py @@ -0,0 +1,98 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import numpy as np + +import hypothesis.extra.numpy as nph +import hypothesis.strategies as st +from hypothesis import given +from pytest import raises + +from lenskit.data.checks import check_1d, check_type + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=1, max_dims=1))) +def test_check_1d_ok(arr): + check_1d(arr) + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=1, max_dims=1))) +def test_check_1d_ok_return(arr): + assert check_1d(arr, error="return") + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=2))) +def test_check_1d_bad(arr): + with raises(TypeError, match="must be 1D"): + check_1d(arr) + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=2))) +def test_check_1d_bad_return(arr): + assert not check_1d(arr, error="return") + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes()), st.integers(min_value=0)) +def test_check_expected_size(arr, exp): + if arr.shape == (exp,): + check_1d(arr, exp) + else: + with raises(TypeError): + check_1d(arr, exp) + + 
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes()), st.integers(min_value=0)) +def test_check_expected_size_return(arr, exp): + if arr.shape == (exp,): + assert check_1d(arr, exp, error="return") + else: + assert not check_1d(arr, exp, error="return") + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes())) +def test_check_type_ok(arr): + check_type(arr, arr.dtype.type) + + +@given(nph.arrays(nph.floating_dtypes(), nph.array_shapes())) +def test_check_type_ok_subclass(arr): + check_type(arr, np.floating) + + +@given(nph.arrays(st.one_of(nph.integer_dtypes(), nph.floating_dtypes()), nph.array_shapes())) +def test_check_type_ok_multi(arr): + check_type(arr, np.integer, np.floating) + + +@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes())) +def test_check_type_ok_return(arr): + assert check_type(arr, arr.dtype.type, error="return") + + +@given(nph.arrays(nph.floating_dtypes(), nph.array_shapes())) +def test_check_type_bad_float(arr): + with raises(TypeError): + check_type(arr, np.integer) + + +@given(nph.arrays(nph.floating_dtypes(), nph.array_shapes())) +def test_check_type_bad_float_return(arr): + assert not check_type(arr, np.integer, error="return") + + +@given(nph.arrays(nph.integer_dtypes(), nph.array_shapes())) +def test_check_type_bad_int(arr): + with raises(TypeError): + check_type(arr, np.floating) + + +@given(nph.arrays(nph.unicode_string_dtypes(), nph.array_shapes())) +def test_check_type_bad_str(arr): + with raises(TypeError): + check_type(arr, np.number) + + with raises(TypeError): + check_type(arr, np.integer, np.floating) From a84f8d80752ddb757a10ab01486d9b94c033da44 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 10:48:17 -0400 Subject: [PATCH 08/16] document item list fields and get them working --- lenskit/tests/test_itemlist.py | 53 +++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 
6668dc2fd..a43d592e1 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -1,3 +1,9 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + import numpy as np import torch @@ -29,7 +35,27 @@ def test_item_list(): il.numbers() -def test_item_num_list(): +def test_item_list_alias(): + il = ItemList(item_id=["one", "two"]) + + assert len(il) == 2 + assert il.ids().shape == (2,) + + with raises(RuntimeError, match="item numbers not available"): + il.numbers() + + +def test_item_list_bad_type(): + with raises(TypeError): + ItemList(item_id=[3.4, 7.2]) + + +def test_item_list_bad_dimension(): + with raises(TypeError): + ItemList(item_id=[["one", "two"]]) + + +def test_item_num_array(): il = ItemList(item_nums=np.arange(5)) assert len(il) == 5 @@ -39,6 +65,31 @@ def test_item_num_list(): il.ids() +def test_item_num_alias(): + il = ItemList(item_num=np.arange(5)) + + assert len(il) == 5 + assert il.numbers().shape == (5,) + + with raises(RuntimeError, match="item IDs not available"): + il.ids() + + +def test_item_num_bad_type(): + with raises(TypeError): + ItemList(item_num=np.random.randn(5)) + + +def test_item_num_bad_dims(): + with raises(TypeError): + ItemList(item_num=[[1, 3, 8, 4]]) + + +def test_item_ids_num_mismatch_sizes(): + with raises(TypeError, match="has incorrect shape"): + ItemList(item_ids=ITEMS, item_num=np.arange(4)) + + def test_item_num_list_vocab(): il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB) From 7b74ed06f4692a5ed9d9017e972e1388cd0d4720 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 10:57:45 -0400 Subject: [PATCH 09/16] implement ranks for the item list --- lenskit/lenskit/data/items.py | 124 ++++++++++++++++++++++++++++----- lenskit/tests/test_itemlist.py | 12 ++++ 2 files changed, 118 insertions(+), 18 
deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 8175cd5ec..4ee7c666f 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -10,13 +10,14 @@ from __future__ import annotations -from typing import Literal, LiteralString, Sequence, TypeAlias, TypeVar, overload +from typing import Any, Literal, LiteralString, Sequence, TypeAlias, TypeVar, cast, overload import numpy as np import pandas as pd import torch from numpy.typing import ArrayLike, NDArray +from lenskit.data.checks import check_1d from lenskit.data.mtarray import MTArray, MTGenericArray from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary @@ -27,23 +28,61 @@ class ItemList: """ Representation of a (usually ordered) list of items, possibly with scores - and other associated data. + and other associated data. Item lists are to be treated as **immutable** — + create a new list with modified data, do not do in-place modifications of + the list itself or the arrays or data frame it returns. + + An item list logically a list of rows, each of which is an item, like a + :class:`~pandas.DataFrame` but supporting multiple array backends. + + .. note:: + + Naming for fields and accessor methods is tricky, because the usual + convention for a data frame is to use singular column names (e.g. + “item_id”, “score”) instead of plural (“item_ids”, “scores”) — the data + frame, like a database table, is a list of instances, and the column + names are best interpreted as naming attributes of individual instances. + + However, when working with a list of e.g. item IDs, it is more natural — + at least to this author — to use plural names: ``item_ids``. Since this + class is doing somewhat double-duty, representing a list of items along + with associated data, as well as a data frame of columns representing + items, the appropriate naming is not entirely clear. 
The naming + convention in this class is therefore as follows: + + * Field names are singular (``item_id``, ``score``). + * Named accessor methods are plural (:meth:`item_ids`, :meth:`scores`). + * Both singular and plural forms are accepted for item IDs, numbers, and + scores in the keyword arguments. Other field names should be + singular. Args: item_ids: - A list or array of item identifiers. + A list or array of item identifiers. ``item_id`` is accepted as an + alternate name. item_nums: - A list or array of item numbers. + A list or array of item numbers. ``item_num`` is accepted as an + alternate name. vocabulary: A vocabulary to translate between item IDs and numbers. + ordered: + Whether the list has a meaningful order. + scores: + An array of scores for the items. fields: - Additional fields, such as ``score`` or ``rating``. + Additional fields, such as ``score`` or ``rating``. Field names + should generally be singular; the named keyword arguments and + accessor methods are plural for readability (“get the list of item + IDs”) """ + ordered: bool + "Whether this list has a meaningful order."
_len: int _ids: np.ndarray[int, np.dtype[NPEntityId]] | None = None _numbers: MTArray[np.int32] | None = None _vocab: Vocabulary[EntityId] | None = None + _ranks: MTArray[np.int32] | None = None _fields: dict[str, MTGenericArray] def __init__( @@ -52,10 +91,21 @@ def __init__( item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None, item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None, vocabulary: Vocabulary[EID] | None = None, + ordered: bool = False, + scores: NDArray[np.generic] | torch.Tensor | ArrayLike | None = None, **fields: NDArray[np.generic] | torch.Tensor | ArrayLike, ): + self.ordered = ordered self._vocab = vocabulary + if item_ids is None and "item_id" in fields: + item_ids = np.asarray(cast(Any, fields["item_id"])) + + if item_nums is None and "item_num" in fields: + item_nums = np.asarray(cast(Any, fields["item_num"])) + if not issubclass(item_nums.dtype.type, np.integer): + raise TypeError("item numbers not integers") + if item_ids is None and item_nums is None: self._ids = np.ndarray(0, dtype=np.int32) self._numbers = MTArray(np.ndarray(0, dtype=np.int32)) @@ -63,28 +113,40 @@ def __init__( if item_ids is not None: self._ids = np.asarray(item_ids) - if len(self._ids.shape) > 1: - raise TypeError("item lists must be 1-dimensional") + if not issubclass(self._ids.dtype.type, (np.integer, np.str_, np.bytes_)): + raise TypeError(f"item IDs not integers or bytes (type: {self._ids.dtype})") + + check_1d(self._ids, label="item_ids") self._len = len(item_ids) if item_nums is not None: self._numbers = MTArray(item_nums) - if hasattr(self, "_len"): - if self._numbers.shape != (self._len,): - nl = self._numbers.shape[0] - raise ValueError( - f"item ID and number lists have different lengths ({self._len} != {nl})" - ) - else: - self._len = self._numbers.shape[0] + check_1d(self._numbers, getattr(self, "_len", None), label="item_nums") + self._len = self._numbers.shape[0] + + # convert fields 
and drop singular ID/number aliases + self._fields = { + name: check_1d(MTArray(data), self._len, label=name) + for (name, data) in fields.items() + if name not in ("item_id", "item_num") + } - self._fields = {name: MTArray(data) for (name, data) in fields.items()} + if scores is not None: + if "score" in fields: # pragma: nocover + raise ValueError("cannot specify both scores= and score=") + self._fields["score"] = MTArray(scores) def clone(self) -> ItemList: """ Make a shallow copy of the item list. """ - return ItemList(item_ids=self._ids, item_nums=self._numbers, vocabulary=self._vocab) + return ItemList( + item_ids=self._ids, + item_nums=self._numbers, + vocabulary=self._vocab, + ordered=self.ordered, + **self._fields, + ) def ids(self) -> NDArray[NPEntityId]: """ @@ -142,7 +204,33 @@ def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: """ Get the item scores (if available). """ - return self.field("scores", format) + return self.field("score", format) + + @overload + def ranks(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32] | None: ... + @overload + def ranks(self, format: Literal["torch"]) -> torch.Tensor | None: ... + @overload + def ranks(self, format: LiteralString = "numpy") -> ArrayLike | None: ... + def ranks(self, format: LiteralString = "numpy") -> ArrayLike | None: + """ + Get an array of ranks for the items in this list, if it is ordered. + Unordered lists have no ranks. The ranks are based on the order in the + list, **not** on the score. + + Item ranks start with **1**, for compatibility with common practice in + mathematically defining information retrieval metrics and operations. + + Returns: + An array of item ranks, or ``None`` if the list is unordered. 
+ """ + if not self.ordered: + return None + + if self._ranks is None: + self._ranks = MTArray(np.arange(1, self._len + 1, dtype=np.int32)) + + return self._ranks.to(format) @overload def field( diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index a43d592e1..ee87ca99b 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -125,3 +125,15 @@ def test_scores(): assert isinstance(st, torch.Tensor) assert st.shape == (5,) assert np.all(st.numpy() == data) + + assert il.ranks() is None + + +def test_ranks(): + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, ordered=True) + assert il.ordered + + ranks = il.ranks() + assert ranks is not None + assert ranks.shape == (5,) + assert np.all(ranks == np.arange(1, 6)) From 406a6789379feef5c85c1671bb6940c6b85dbde0 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 11:11:31 -0400 Subject: [PATCH 10/16] improve dataset documentation --- docs/data.rst | 37 ++++++++++++++++++++++----------- lenskit/lenskit/data/dataset.py | 24 ++++++++++++++------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/docs/data.rst b/docs/data.rst index 76aea5c72..3cd22835c 100644 --- a/docs/data.rst +++ b/docs/data.rst @@ -73,9 +73,9 @@ abstract class with implementations covering various scenarios. Creating Datasets ~~~~~~~~~~~~~~~~~ -Several functions create :class:`Dataset`s from different input data sources. +Several functions can create a :class:`Dataset` from different input data sources. -.. autofunction:: from_interaction_df +.. autofunction:: from_interactions_df Loading Common Datasets ~~~~~~~~~~~~~~~~~~~~~~~ @@ -89,13 +89,32 @@ LensKit uses *vocabularies* to record user/item IDs, tags, terms, etc. in a way that facilitates easy mapping to 0-based contiguous indexes for use in matrix and tensor data structures. -.. module:: lenskit.data.vocab +.. module:: lenskit.data .. 
autoclass:: Vocabulary -Dataset implementations + +Item Lists +~~~~~~~~~~ + +LensKit uses *item lists* to represent collections of items that may be scored, +ranked, etc. + +.. autoclass:: ItemList + +User-Item Data Tables +~~~~~~~~~~~~~~~~~~~~~ + +.. module:: lenskit.data.tables + +.. autoclass:: NumpyUserItemTable +.. autoclass:: TorchUserItemTable + +Dataset Implementations ~~~~~~~~~~~~~~~~~~~~~~~ +.. module:: lenskit.data.dataset + Matrix Dataset -------------- @@ -103,6 +122,7 @@ The :class:`MatrixDataset` provides an in-memory dataset implementation backed by a ratings matrix or implicit-feedback matrix. .. autoclass:: MatrixDataset + :no-members: Lazy Dataset ------------ @@ -111,11 +131,4 @@ The lazy data set takes a function that loads a data set (of any type), and lazily uses that function to load an underlying data set when needed. .. autoclass:: LazyDataset - -User-Item Data Tables -~~~~~~~~~~~~~~~~~~~~~ - -.. module:: lenskit.data.tables - -.. autoclass:: NumpyUserItemTable -.. autoclass:: TorchUserItemTable + :members: delegate diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py index 8b5a06be8..51f4987f0 100644 --- a/lenskit/lenskit/data/dataset.py +++ b/lenskit/lenskit/data/dataset.py @@ -469,7 +469,13 @@ def user_stats(self) -> pd.DataFrame: class MatrixDataset(Dataset): """ - Dataset implementaiton using an in-memory rating or implicit-feedback matrix. + Dataset implementation using an in-memory rating or implicit-feedback + matrix. + + .. note:: + Client code generally should not construct this class directly. Instead + use the various ``from_`` and ``load_`` functions in + :mod:`lenskit.data`. """ _users: Vocabulary[EntityId] @@ -713,6 +719,10 @@ class LazyDataset(Dataset): """ A data set with an underlying load function, that doesn't call the function until data is actually needed. + + Args: + loader: + The function that will load the dataset when needed. 
""" _delegate: Dataset | None = None @@ -720,11 +730,7 @@ class LazyDataset(Dataset): def __init__(self, loader: Callable[[], Dataset]): """ - Construct a dataset. - - .. note:: - Client code generally should not call this constructor. Instead use the - various ``from_`` and ``load_`` functions in :mod:`lenskit.data`. + Construct a lazy dataset. """ self._loader = loader @@ -779,9 +785,11 @@ def from_interactions_df( The user-item interactions (e.g. ratings). The dataset code takes ownership of this data frame and may modify it. user_col: - The name of the user ID column. + The name of the user ID column. By default, looks for columns named + ``user``, ``user_id``, or ``userId``, with several case variants. item_col: - The name of the item ID column. + The name of the item ID column. By default, looks for columns named + ``item``, ``item_id``, or ``itemId``, with several case variants. rating_col: The name of the rating column. timestamp_col: From 3d22b74de75694bad92a39254b66425a9aa1e528 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 11:31:25 -0400 Subject: [PATCH 11/16] improve some item list docs --- docs/releases/2024.rst | 23 ++++++++++++++++------- lenskit/lenskit/data/items.py | 7 ++++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst index c99c8626b..23e92cb05 100644 --- a/docs/releases/2024.rst +++ b/docs/releases/2024.rst @@ -24,13 +24,6 @@ Significant Changes 2024.1 brings substantial changes to LensKit. -* **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms, - instead of Numba-accelerated NumPy code. Algorithms using PyTorch are: - - * :py:class:`~lenskit.algorithms.knn.ItemItem` - * :py:class:`~lenskit.algorithms.als.ImplicitMF` - * :py:class:`~lenskit.algorithms.als.BiasedMF` - * :class:`~lenskit.data.Dataset`. 
LensKit now provides an abstraction for training data instead of working with Pandas data frames directly, that allows components to reduce code duplication and recomputation, access data @@ -39,6 +32,22 @@ Significant Changes supersedes the old bespoke dataset loading support, with functions like :func:`~lenskit.data.load_movielens` to load standard datasets. +* New classes like :class:`~lenskit.data.ItemList` for routing item data + instead of using Pandas data frames and series. This makes component return + types more self-documenting (rather than requiring developers to remember + what is on the index, what the column names are, etc.), and facilitates more + efficient data transfer between components that do not use Pandas (e.g. data + passed between components using PyTorch can leave the data in tensors + without round-tripping through Pandas and NumPy, and keep this transparent + to client code). + +* **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms, + instead of Numba-accelerated NumPy code. Algorithms using PyTorch are: + + * :py:class:`~lenskit.algorithms.knn.ItemItem` + * :py:class:`~lenskit.algorithms.als.ImplicitMF` + * :py:class:`~lenskit.algorithms.als.BiasedMF` + * Many LensKit components (batch running, model training, etc.) now report progress with :py:mod:`progress_api`, and can be connected to TQDM or Enlighten. diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 4ee7c666f..86186edb3 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -28,9 +28,10 @@ class ItemList: """ Representation of a (usually ordered) list of items, possibly with scores - and other associated data. Item lists are to be treated as **immutable** — - create a new list with modified data, do not do in-place modifications of - the list itself or the arrays or data frame it returns. + and other associated data; many components take and return item lists. 
Item + lists are to be treated as **immutable** — create a new list with modified + data, do not do in-place modifications of the list itself or the arrays or + data frame it returns. An item list logically a list of rows, each of which is an item, like a :class:`~pandas.DataFrame` but supporting multiple array backends. From ace2e4145f5cb4905703f45319aa4fdeed4c0e68 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 11:56:27 -0400 Subject: [PATCH 12/16] Add user_row method to get a single user's data. --- lenskit/lenskit/data/dataset.py | 58 +++++++++++++++++++++++++--- lenskit/tests/test_dataset_matrix.py | 41 +++++++++++++++++++- 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py index 51f4987f0..abff11283 100644 --- a/lenskit/lenskit/data/dataset.py +++ b/lenskit/lenskit/data/dataset.py @@ -32,6 +32,7 @@ override, ) +from lenskit.data.items import ItemList from lenskit.data.matrix import CSRStructure, InteractionMatrix from lenskit.data.vocab import Vocabulary @@ -87,7 +88,7 @@ def items(self) -> Vocabulary[EntityId]: """ The items known by this dataset. """ - ... + raise NotImplementedError() @property @abstractmethod @@ -95,7 +96,7 @@ def users(self) -> Vocabulary[EntityId]: """ The users known by this dataset. """ - ... + raise NotImplementedError() @abstractmethod def count(self, what: str) -> int: @@ -118,7 +119,7 @@ def count(self, what: str) -> int: * interactions * ratings """ - ... + raise NotImplementedError() @property def item_count(self) -> int: @@ -212,7 +213,7 @@ def interaction_log( Returns: The user-item interaction log in the specified format. """ - ... + raise NotImplementedError() @overload @abstractmethod @@ -359,7 +360,25 @@ def interaction_matrix( ``True`` to return user and item IDs instead of numbers in ``pandas``-format matrix. """ - ... 
+ raise NotImplementedError() + + @abstractmethod + @overload + def user_row(self, user_id: EntityId) -> ItemList | None: ... + @abstractmethod + @overload + def user_row(self, *, user_num: int) -> ItemList: ... + @abstractmethod + def user_row( + self, user_id: EntityId | None = None, *, user_num: int | None = None + ) -> ItemList | None: + """ + Get a user's row from the interaction matrix. Available fields are + returned as fields. If the dataset has ratings, these are provided as a + ``rating`` field, **not** as the item scores. The item list is unordered, + but items are returned in order by item number. + """ + raise NotImplementedError() def item_stats(self) -> pd.DataFrame: """ @@ -714,6 +733,31 @@ def _int_log_torch(self, fields: list[str]) -> TorchUserItemTable: tbl.timestamps = torch.from_numpy(self._matrix.timestamps) return tbl + @override + def user_row( + self, user_id: EntityId | None = None, *, user_num: int | None = None + ) -> ItemList | None: + if user_num is None: + if user_id is None: # pragma: nocover + raise ValueError("most provide one of user_id and user_num") + + user_num = self.users.number(user_id, "none") + if user_num is None: + return None + + elif user_id is not None: # pragma: nocover + raise ValueError("most provide one of user_id and user_num") + + sp = self._matrix.user_ptrs[user_num] + ep = self._matrix.user_ptrs[user_num + 1] + inums = self._matrix.item_nums[sp:ep] + fields = {} + if self._matrix.ratings is not None: + fields["rating"] = self._matrix.ratings[sp:ep] + if self._matrix.timestamps is not None: + fields["timestamp"] = self._matrix.timestamps[sp:ep] + return ItemList(item_nums=inums, vocabulary=self.items, **fields) + class LazyDataset(Dataset): """ @@ -764,6 +808,10 @@ def interaction_matrix(self, *args, **kwargs) -> Any: def interaction_log(self, *args, **kwargs) -> Any: return self.delegate().interaction_log(*args, **kwargs) + @override + def user_row(self, *args, **kwargs) -> ItemList | None: + return 
self.delegate().user_row(*args, **kwargs) + def from_interactions_df( df: pd.DataFrame, diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py index f753c6c5e..fa940b70f 100644 --- a/lenskit/tests/test_dataset_matrix.py +++ b/lenskit/tests/test_dataset_matrix.py @@ -19,7 +19,7 @@ from pytest import mark, raises from lenskit.data import Dataset -from lenskit.data.dataset import FieldError, from_interactions_df +from lenskit.data.dataset import FieldError, MatrixDataset, from_interactions_df from lenskit.data.matrix import CSRStructure from lenskit.util.test import ml_ds, ml_ratings # noqa: F401 @@ -64,6 +64,7 @@ def _check_timestamp(ml_ds: Dataset, ml_ratings: pd.DataFrame, ts: ArrayLike): def test_internals(ml_ds: Dataset): "Test internal matrix structures" + assert isinstance(ml_ds, MatrixDataset) assert ml_ds._matrix.user_nums.dtype == np.int32 assert ml_ds._matrix.user_ptrs.dtype == np.int32 assert ml_ds._matrix.item_nums.dtype == np.int32 @@ -342,3 +343,41 @@ def test_matrix_torch_timestamp(ml_ratings: pd.DataFrame, ml_ds: Dataset): _check_item_number_counts(ml_ds, ml_ratings, log.col_indices()) _check_item_ids(ml_ds, ml_ratings, log.col_indices()) _check_timestamp(ml_ds, ml_ratings, log.values().numpy()) + + +def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset): + users = rng.choice(ml_ds.users.ids(), 50) + + for user in users: + row = ml_ds.user_row(user) + assert row is not None + urows = ml_ratings[ml_ratings["user"] == user].sort_values("item") + assert set(row.ids()) == set(urows["item"]) + assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"])) + + ratings = row.field("rating") + assert ratings is not None + assert np.all(ratings == urows["rating"]) + + timestamps = row.field("timestamp") + assert timestamps is not None + assert np.all(timestamps == urows["timestamp"]) + + +def test_matrix_rows_by_num(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset): 
+ users = rng.choice(ml_ds.user_count, 50) + + for user in users: + row = ml_ds.user_row(user_num=user) + assert row is not None + urows = ml_ratings[ml_ratings["user"] == ml_ds.users.id(user)].sort_values("item") + assert set(row.ids()) == set(urows["item"]) + assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"])) + + ratings = row.field("rating") + assert ratings is not None + assert np.all(ratings == urows["rating"]) + + timestamps = row.field("timestamp") + assert timestamps is not None + assert np.all(timestamps == urows["timestamp"]) From 3cd5111da31af266e22bb610a646cd9c5ce922a5 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 12:04:59 -0400 Subject: [PATCH 13/16] support getting fields and scores as Pandas series --- lenskit/lenskit/data/items.py | 30 +++++++++++++++++++++++++++--- lenskit/tests/test_itemlist.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 86186edb3..6e089cce4 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -200,12 +200,16 @@ def scores(self, format: Literal["numpy"] = "numpy") -> NDArray[np.floating] | N @overload def scores(self, format: Literal["torch"]) -> torch.Tensor | None: ... @overload + def scores( + self, format: Literal["pandas"], *, index: Literal["ids", "numbers"] | None = None + ) -> pd.Series | None: ... + @overload def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: ... - def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: + def scores(self, format: LiteralString = "numpy", **kwargs) -> ArrayLike | None: """ Get the item scores (if available). """ - return self.field("score", format) + return self.field("score", format, **kwargs) @overload def ranks(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32] | None: ... 
@@ -240,11 +244,31 @@ def field( @overload def field(self, name: str, format: Literal["torch"]) -> torch.Tensor | None: ... @overload + def field( + self, + name: str, + format: Literal["pandas"], + *, + index: Literal["ids", "numbers"] | None = None, + ) -> pd.Series | None: ... + @overload def field(self, name: str, format: LiteralString) -> ArrayLike | None: ... - def field(self, name: str, format: LiteralString = "numpy") -> ArrayLike | None: + def field( + self, name: str, format: LiteralString = "numpy", *, index: LiteralString | None = None + ) -> ArrayLike | None: val = self._fields.get(name, None) if val is None: return None + elif format == "pandas": + idx = None + vs = val.to("numpy") + if index == "ids": + idx = pd.Index(self.ids(), name="item_id") + elif index == "numbers": + idx = pd.Index(self.numbers(), name="item_num") + elif index: # pragma: nocover + raise ValueError(f"unsupported Pandas index {index}") + return pd.Series(vs, index=idx) else: return val.to(format) diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index ee87ca99b..292ccc0ba 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -129,6 +129,38 @@ def test_scores(): assert il.ranks() is None +def test_scores_pandas_no_index(): + data = np.random.randn(5) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data) + + scores = il.scores("pandas") + assert scores is not None + assert scores.shape == (5,) + assert np.all(scores == data) + + +def test_scores_pandas_id_index(): + data = np.random.randn(5) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data) + scores = il.scores("pandas", index="ids") + assert scores is not None + assert scores.shape == (5,) + assert np.all(scores == data) + assert scores.index.name == "item_id" + assert np.all(scores.index.values == ITEMS) + + +def test_scores_pandas_num_index(): + data = np.random.randn(5) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, 
scores=data) + scores = il.scores("pandas", index="numbers") + assert scores is not None + assert scores.shape == (5,) + assert np.all(scores == data) + assert scores.index.name == "item_num" + assert np.all(scores.index.values == np.arange(5)) + + def test_ranks(): il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, ordered=True) assert il.ordered From 4c97e810ff990ca3310d5d3d625a814762904032 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 12:12:02 -0400 Subject: [PATCH 14/16] support alternative vocabularies when retrieving item numbes --- lenskit/lenskit/data/items.py | 28 +++++++++++++++++++++++----- lenskit/tests/test_itemlist.py | 8 ++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 6e089cce4..6666a1fe4 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -168,25 +168,43 @@ def ids(self) -> NDArray[NPEntityId]: return self._ids @overload - def numbers(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32]: ... + def numbers( + self, format: Literal["numpy"] = "numpy", *, vocabulary: Vocabulary[EID] | None = None + ) -> NDArray[np.int32]: ... @overload - def numbers(self, format: Literal["torch"]) -> torch.Tensor: ... + def numbers( + self, format: Literal["torch"], *, vocabulary: Vocabulary[EID] | None = None + ) -> torch.Tensor: ... @overload - def numbers(self, format: LiteralString = "numpy") -> ArrayLike: ... - def numbers(self, format: LiteralString = "numpy") -> ArrayLike: + def numbers( + self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None + ) -> ArrayLike: ... + def numbers( + self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None + ) -> ArrayLike: """ Get the item numbers. Args: format: The array format to use. + vocabulary: + An alternate vocabulary for mapping IDs to numbers.
If provided, + then the item list must have IDs (either stored, or through a + vocabulary). Returns: An array of item numbers. Raises: - RuntimeError: if the item list was not created with numbers or a :class:`Vocabulary`. + RuntimeError: if the item list was not created with numbers or a + :class:`Vocabulary`. """ + if vocabulary is not None and vocabulary is not self._vocab: + # we need to translate vocabulary + ids = self.ids() + return vocabulary.numbers(ids) + if self._numbers is None: if self._vocab is None: raise RuntimeError("item numbers not available (no IDs or vocabulary provided)") diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 292ccc0ba..a37d58aa6 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -169,3 +169,11 @@ def test_ranks(): assert ranks is not None assert ranks.shape == (5,) assert np.all(ranks == np.arange(1, 6)) + + +def test_numbers_alt_vocab(): + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB) + + av = Vocabulary(["A", "B"] + ITEMS) + nums = il.numbers(vocabulary=av) + assert np.all(nums == np.arange(2, 7)) From e5db4b8ee1bb93e5e39b2b79bb2feb25b96427e0 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 12:42:47 -0400 Subject: [PATCH 15/16] support converting item lists to data frames --- lenskit/lenskit/data/items.py | 23 +++++++++++++++++++++++ lenskit/tests/test_dataset_matrix.py | 5 +++++ lenskit/tests/test_itemlist.py | 22 ++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 6666a1fe4..96059ffa5 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -290,5 +290,28 @@ def field( else: return val.to(format) + def to_df(self) -> pd.DataFrame: + """ + Convert this item list to a Pandas data frame. 
It has the following columns: + + * ``item_id`` — the item IDs (if available) + * ``item_num`` — the item numbers (if available) + * ``score`` — the item scores + * ``rank`` — the item ranks (if the list is ordered) + * all other defined fields, using their field names + """ + cols = {} + if self._ids is not None or self._vocab is not None: + cols["item_id"] = self.ids() + if self._numbers is not None or self._vocab is not None: + cols["item_num"] = self.numbers() + if "score" in self._fields: + cols["score"] = self.scores() + if self.ordered: + cols["rank"] = self.ranks() + # add remaining fields + cols.update((k, v.numpy()) for (k, v) in self._fields.items() if k != "score") + return pd.DataFrame(cols) + def __len__(self): return self._len diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py index fa940b70f..eb994d763 100644 --- a/lenskit/tests/test_dataset_matrix.py +++ b/lenskit/tests/test_dataset_matrix.py @@ -352,6 +352,7 @@ def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, m row = ml_ds.user_row(user) assert row is not None urows = ml_ratings[ml_ratings["user"] == user].sort_values("item") + urows = urows.reset_index(drop=True) assert set(row.ids()) == set(urows["item"]) assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"])) @@ -363,6 +364,10 @@ def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, m assert timestamps is not None assert np.all(timestamps == urows["timestamp"]) + # we'll quick check additional fields on the item list here + df = row.to_df() + assert np.all(df["timestamp"] == urows["timestamp"]) + def test_matrix_rows_by_num(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset): users = rng.choice(ml_ds.user_count, 50) diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index a37d58aa6..a5bbbee8f 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -177,3 +177,25 @@ def
test_numbers_alt_vocab(): av = Vocabulary(["A", "B"] + ITEMS) nums = il.numbers(vocabulary=av) assert np.all(nums == np.arange(2, 7)) + + +def test_pandas_df(): + data = np.random.randn(5) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data) + + df = il.to_df() + assert np.all(df["item_id"] == ITEMS) + assert np.all(df["item_num"] == np.arange(5)) + assert np.all(df["score"] == data) + assert "rank" not in df.columns + + +def test_pandas_df_ordered(): + data = np.random.randn(5) + il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data, ordered=True) + + df = il.to_df() + assert np.all(df["item_id"] == ITEMS) + assert np.all(df["item_num"] == np.arange(5)) + assert np.all(df["score"] == data) + assert np.all(df["rank"] == np.arange(1, 6)) From f2a3ea6240ba48a3dd8c3ecdc37104734da49914 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 13:17:54 -0400 Subject: [PATCH 16/16] fix LiteralString import --- lenskit/lenskit/data/items.py | 12 ++++++++++-- lenskit/lenskit/data/mtarray.py | 3 +-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 96059ffa5..595ddac07 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -10,12 +10,20 @@ from __future__ import annotations -from typing import Any, Literal, LiteralString, Sequence, TypeAlias, TypeVar, cast, overload - import numpy as np import pandas as pd import torch from numpy.typing import ArrayLike, NDArray +from typing_extensions import ( + Any, + Literal, + LiteralString, + Sequence, + TypeAlias, + TypeVar, + cast, + overload, +) from lenskit.data.checks import check_1d from lenskit.data.mtarray import MTArray, MTGenericArray diff --git a/lenskit/lenskit/data/mtarray.py b/lenskit/lenskit/data/mtarray.py index d9e97fe58..de1f73713 100644 --- a/lenskit/lenskit/data/mtarray.py +++ b/lenskit/lenskit/data/mtarray.py @@ -7,11 +7,10 @@ # pyright: basic from __future__ 
import annotations -from typing import Generic, Literal, LiteralString, Sequence, TypeVar, overload - import numpy as np import torch from numpy.typing import ArrayLike, NDArray +from typing_extensions import Generic, Literal, LiteralString, Sequence, TypeVar, overload NPT = TypeVar("NPT", bound=np.generic)