From 0c2a69dfd600067446307edadd2686a3e9c5ee34 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Tue, 30 Jul 2024 17:52:03 -0400
Subject: [PATCH 01/16] add a multi-type array class

---
 lenskit/lenskit/data/__init__.py |   1 +
 lenskit/lenskit/data/mtarray.py  | 126 +++++++++++++++++++++++++++++++
 lenskit/lenskit/data/vocab.py    |  10 ++-
 lenskit/tests/test_mtarray.py    |  69 +++++++++++++++++
 4 files changed, 204 insertions(+), 2 deletions(-)
 create mode 100644 lenskit/lenskit/data/mtarray.py
 create mode 100644 lenskit/tests/test_mtarray.py

diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py
index 6ddefed5d..493d1fec7 100644
--- a/lenskit/lenskit/data/__init__.py
+++ b/lenskit/lenskit/data/__init__.py
@@ -13,3 +13,4 @@
 
 from .dataset import Dataset, from_interactions_df  # noqa: F401, E402
 from .movielens import load_movielens  # noqa: F401, E402
+from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray  # noqa: F401, E402
diff --git a/lenskit/lenskit/data/mtarray.py b/lenskit/lenskit/data/mtarray.py
new file mode 100644
index 000000000..d9e97fe58
--- /dev/null
+++ b/lenskit/lenskit/data/mtarray.py
@@ -0,0 +1,126 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+# pyright: basic
+from __future__ import annotations
+
+from typing import Generic, Literal, LiteralString, Sequence, TypeVar, overload
+
+import numpy as np
+import torch
+from numpy.typing import ArrayLike, NDArray
+
+NPT = TypeVar("NPT", bound=np.generic)
+
+
+class MTArray(Generic[NPT]):
+    """
+    Multi-typed array class, allowing arrays to be easily converted between
+    NumPy, PyTorch, and other supported backends, caching the conversion result.
+
+    We use this class instead of one canonical format so that data can be
+    converted lazily, and can be left in-place when passing from one component
+    to another that use the same computational engine.
+
+    .. note::
+
+        This class is only intended for read-only arrays.  It is **not defined**
+        whether the different arrays share storage, and modifying one format may
+        or may not modify another.  For example, PyTorch and NumPy usually share
+        storage when both are on CPU, but a GPU tensor and CPU ndarray do not.
+    """
+
+    _shape: tuple[int, ...] | None = None
+    _unknown: object = None
+    _numpy: NDArray[NPT] | None = None
+    _torch: torch.Tensor | None = None
+
+    def __init__(self, array: NDArray[NPT] | torch.Tensor | Sequence | ArrayLike):
+        """
+        Construct a new MTArray around an array.
+        """
+        # TODO: support DLpack
+        if isinstance(array, torch.Tensor):
+            # torch might not be on-device
+            self._torch = array
+            self._shape = array.shape
+        else:
+            # stash it in the common-format field for lazy conversion
+            self._unknown = array
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        if self._shape is None:
+            self._shape = self.numpy().shape
+
+        return self._shape
+
+    def numpy(self) -> NDArray[NPT]:
+        """
+        Get the array as a NumPy array.
+        """
+        if self._numpy is None:
+            self._numpy = np.asarray(self._convertible())
+
+        assert self._numpy is not None
+        return self._numpy
+
+    def torch(self, *, device: str | None = None) -> torch.Tensor:
+        """
+        Get the array as a PyTorch tensor.
+
+        Args:
+            device:
+                The device on which the Torch tensor should reside.
+        """
+        if self._torch is None:
+            self._torch = torch.as_tensor(self._convertible())
+
+        if device:
+            return self._torch.to(device)
+        else:
+            return self._torch
+
+    @overload
+    def to(self, format: Literal["numpy"]) -> NDArray[NPT]: ...
+    @overload
+    def to(self, format: Literal["torch"], *, device: str | None = None) -> torch.Tensor: ...
+    @overload
+    def to(self, format: LiteralString, *, device: str | None = None) -> ArrayLike: ...
+    def to(self, format: str, *, device: str | None = None) -> ArrayLike:
+        """
+        Get the array as ``format`` ("numpy" or "torch"); ``device`` applies to torch only.
+        """
+        if format == "numpy":
+            return self.numpy()
+        elif format == "torch":
+            return self.torch(device=device)
+        else:
+            raise RuntimeError(f"unsupported array format {format}")
+
+    def _convertible(self) -> object:
+        """
+        Get the data suitable for passing to an ``as_array`` method.
+        """
+        # look for a good format for the data. if we've already made a numpy,
+        # use that; otherwise, try unknown, and fall back to torch (moved to
+        # CPU). end result: convertible data.
+        if self._numpy is not None:
+            return self._numpy
+        elif self._unknown is not None:
+            return self._unknown
+        elif self._torch is not None:
+            return self._torch.cpu()
+        else:
+            raise RuntimeError("cannot find array data")
+
+    def __array__(self) -> NDArray[NPT]:
+        return self.numpy()
+
+
+MTFloatArray = MTArray[np.floating]
+MTIntArray = MTArray[np.integer]
+MTGenericArray = MTArray[np.generic]
diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py
index 7dbc8069e..f4bdf7ca3 100644
--- a/lenskit/lenskit/data/vocab.py
+++ b/lenskit/lenskit/data/vocab.py
@@ -19,6 +19,8 @@
 
 EntityId: TypeAlias = int | str | bytes
 "Allowable entity identifier types."
+NPEntityId: TypeAlias = np.integer | np.str_ | np.bytes_
+"Allowable entity identifier types (NumPy version)"
 
 VT = TypeVar("VT", bound=Hashable)
 "Term type in a vocabulary."
@@ -100,7 +102,9 @@ def term(self, num: int) -> VT:
             raise IndexError("negative numbers not supported")
         return self._index[num]
 
-    def terms(self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None) -> np.ndarray:
+    def terms(
+        self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None
+    ) -> NDArray[NPEntityId]:
         """
         Get a list of terms, optionally for an array of term numbers.
 
@@ -125,7 +129,9 @@ def id(self, num: int) -> VT:
         "Alias for :meth:`term`  for greater readability for entity ID vocabularies."
         return self.term(num)
 
-    def ids(self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None) -> np.ndarray:
+    def ids(
+        self, nums: list[int] | NDArray[np.integer] | pd.Series | None = None
+    ) -> NDArray[NPEntityId]:
         "Alias for :meth:`terms` for greater readability for entity ID vocabularies."
         return self.terms(nums)
 
diff --git a/lenskit/tests/test_mtarray.py b/lenskit/tests/test_mtarray.py
new file mode 100644
index 000000000..8eb29134f
--- /dev/null
+++ b/lenskit/tests/test_mtarray.py
@@ -0,0 +1,69 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+import torch
+from numpy.typing import NDArray
+
+import hypothesis.extra.numpy as nph
+import hypothesis.strategies as st
+from hypothesis import assume, given
+
+from lenskit.data.mtarray import MTArray, MTIntArray
+
+
+@given(
+    nph.arrays(
+        dtype=st.one_of(nph.integer_dtypes(endianness="="), nph.floating_dtypes(endianness="=")),
+        shape=nph.array_shapes(),
+    )
+)
+def test_from_numpy(arr: NDArray[np.generic]):
+    # limit to data types that match
+    assume(np.all(np.isfinite(arr)))
+    mta = MTArray(arr)
+    assert mta.shape == arr.shape
+    npa = mta.numpy()
+    assert npa is arr
+
+    tensor = mta.torch()
+    assert tensor.shape == arr.shape
+    assert np.all(tensor.numpy() == arr)
+
+    assert np.asarray(mta) is arr
+
+
+@given(
+    nph.arrays(
+        dtype=st.one_of(nph.integer_dtypes(endianness="="), nph.floating_dtypes(endianness="=")),
+        shape=nph.array_shapes(),
+    )
+)
+def test_from_torch(arr: NDArray[np.generic]):
+    # limit to data types that match
+    assume(np.all(np.isfinite(arr)))
+    ot = torch.from_numpy(arr)
+    mta = MTArray(ot)
+    assert mta.shape == arr.shape
+    tensor = mta.torch()
+    assert tensor is ot
+
+    npa = mta.numpy()
+    assert npa.shape == arr.shape
+    assert np.all(npa == arr)
+
+
+@given(st.lists(st.integers(min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max)))
+def test_from_list(xs: list[int]):
+    # limit to data types that match
+    mta = MTIntArray(xs)
+    assert mta.shape == (len(xs),)
+
+    npa = mta.numpy()
+    assert np.all(npa == xs)
+
+    tensor = mta.torch()
+    assert np.all(tensor.numpy() == xs)

From 0af303e64de292c94c156dd9e4c5456290c19d19 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Tue, 30 Jul 2024 19:06:34 -0400
Subject: [PATCH 02/16] initial item list (without scores)

---
 lenskit/lenskit/data/__init__.py |   1 +
 lenskit/lenskit/data/items.py    | 120 +++++++++++++++++++++++++++++++
 lenskit/tests/test_itemlist.py   |  57 +++++++++++++++
 3 files changed, 178 insertions(+)
 create mode 100644 lenskit/lenskit/data/items.py
 create mode 100644 lenskit/tests/test_itemlist.py

diff --git a/lenskit/lenskit/data/__init__.py b/lenskit/lenskit/data/__init__.py
index 493d1fec7..549b39f49 100644
--- a/lenskit/lenskit/data/__init__.py
+++ b/lenskit/lenskit/data/__init__.py
@@ -12,5 +12,6 @@
 "Types of feedback supported."
 
 from .dataset import Dataset, from_interactions_df  # noqa: F401, E402
+from .items import ItemList  # noqa: F401, E402
 from .movielens import load_movielens  # noqa: F401, E402
 from .mtarray import MTArray, MTFloatArray, MTGenericArray, MTIntArray  # noqa: F401, E402
diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
new file mode 100644
index 000000000..b77be4a3b
--- /dev/null
+++ b/lenskit/lenskit/data/items.py
@@ -0,0 +1,120 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+"""
+Primary item-list abstraction.
+"""
+
+from __future__ import annotations
+
+from typing import Literal, LiteralString, Sequence, TypeAlias, overload
+
+import numpy as np
+import pandas as pd
+import torch
+from numpy.typing import ArrayLike, NDArray
+
+from lenskit.data.mtarray import MTArray, MTGenericArray
+from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary
+
+Backend: TypeAlias = Literal["numpy", "torch"]
+
+
+class ItemList:
+    """
+    Representation of a (usually ordered) list of items, possibly with scores
+    and other associated data.
+    """
+
+    _len: int
+    _ids: np.ndarray[int, np.dtype[NPEntityId]] | None = None
+    _numbers: MTArray[np.int32] | None = None
+    _vocab: Vocabulary[EntityId] | None = None
+    _fields: dict[str, MTGenericArray]
+
+    def __init__(
+        self,
+        *,
+        item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None,
+        item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None,
+        vocabulary: Vocabulary[EntityId] | None = None,
+    ):
+        self._vocab = vocabulary
+        self._fields = {}
+
+        if item_ids is None and item_nums is None:
+            self._ids = np.ndarray(0, dtype=np.int32)
+            self._numbers = MTArray(np.ndarray(0, dtype=np.int32))
+            self._len = 0
+
+        if item_ids is not None:
+            self._ids = np.asarray(item_ids)
+            if len(self._ids.shape) > 1:
+                raise TypeError("item lists must be 1-dimensional")
+            self._len = len(item_ids)
+        if item_nums is not None:
+            self._numbers = MTArray(item_nums)
+            if hasattr(self, "_len"):
+                if self._numbers.shape != (self._len,):
+                    nl = self._numbers.shape[0]
+                    raise ValueError(
+                        f"item ID and number lists have different lengths ({self._len} != {nl})"
+                    )
+            else:
+                self._len = self._numbers.shape[0]
+
+    def clone(self) -> ItemList:
+        """
+        Make a shallow copy of the item list.
+        """
+        return ItemList(item_ids=self._ids, item_nums=self._numbers, vocabulary=self._vocab)
+
+    def ids(self) -> NDArray[NPEntityId]:
+        """
+        Get the item IDs.
+
+        Returns:
+            An array of item identifiers.
+
+        Raises:
+            RuntimeError: if the item list was not created with IDs or a :class:`Vocabulary`.
+        """
+        if self._ids is None:
+            if self._vocab is None:
+                raise RuntimeError("item IDs not available (no IDs or vocabulary provided)")
+            assert self._numbers is not None
+            self._ids = self._vocab.ids(self._numbers.numpy())
+
+        return self._ids
+
+    @overload
+    def numbers(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32]: ...
+    @overload
+    def numbers(self, format: Literal["torch"]) -> torch.Tensor: ...
+    def numbers(self, format: LiteralString = "numpy") -> ArrayLike:
+        """
+        Get the item numbers.
+
+        Args:
+            format:
+                The array format to use.
+
+        Returns:
+            An array of item numbers.
+
+        Raises:
+            RuntimeError: if the item list was not created with numbers or a :class:`Vocabulary`.
+        """
+        if self._numbers is None:
+            if self._vocab is None:
+                raise RuntimeError("item numbers not available (no IDs or vocabulary provided)")
+            assert self._ids is not None
+            self._numbers = MTArray(self._vocab.numbers(self._ids))
+
+        return self._numbers.to(format)
+
+    def __len__(self):
+        return self._len
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
new file mode 100644
index 000000000..2872c0b73
--- /dev/null
+++ b/lenskit/tests/test_itemlist.py
@@ -0,0 +1,57 @@
+import numpy as np
+
+from pytest import raises
+
+from lenskit.data import ItemList
+from lenskit.data.vocab import Vocabulary
+
+
+def test_empty():
+    il = ItemList()
+
+    assert len(il) == 0
+    assert il.numbers().shape == (0,)
+    assert il.ids().shape == (0,)
+
+
+def test_item_list():
+    il = ItemList(item_ids=["one", "two"])
+
+    assert len(il) == 2
+    assert il.ids().shape == (2,)
+
+    with raises(RuntimeError, match="item numbers not available"):
+        il.numbers()
+
+
+def test_item_num_list():
+    il = ItemList(item_nums=np.arange(5))
+
+    assert len(il) == 5
+    assert il.numbers().shape == (5,)
+
+    with raises(RuntimeError, match="item IDs not available"):
+        il.ids()
+
+
+def test_item_num_list_vocab():
+    il = ItemList(item_nums=np.arange(5), vocabulary=Vocabulary(["a", "b", "c", "d", "e"]))
+
+    assert len(il) == 5
+    assert il.numbers().shape == (5,)
+    assert il.ids().shape == (5,)
+
+    assert all(il.numbers() == np.arange(5))
+    assert all(il.ids() == ["a", "b", "c", "d", "e"])
+
+
+def test_item_id_list_vocab():
+    idl = ["a", "b", "c", "d", "e"]
+    il = ItemList(item_ids=idl, vocabulary=Vocabulary(idl))
+
+    assert len(il) == 5
+    assert il.numbers().shape == (5,)
+    assert il.ids().shape == (5,)
+
+    assert all(il.numbers() == np.arange(5))
+    assert all(il.ids() == ["a", "b", "c", "d", "e"])

From 6c6e22616f0c03496b48651865583ff432e46a4f Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Tue, 30 Jul 2024 19:42:30 -0400
Subject: [PATCH 03/16] make vocab covariant

---
 lenskit/lenskit/data/vocab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py
index f4bdf7ca3..48a90dc5e 100644
--- a/lenskit/lenskit/data/vocab.py
+++ b/lenskit/lenskit/data/vocab.py
@@ -22,7 +22,7 @@
 NPEntityId: TypeAlias = np.integer | np.str_ | np.bytes_
 "Allowable entity identifier types (NumPy version)"
 
-VT = TypeVar("VT", bound=Hashable)
+VT = TypeVar("VT", bound=Hashable, covariant=True)
 "Term type in a vocabulary."
 
 

From db35ec15552ee5cc790f4b3efb469eeed9c24d21 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Tue, 30 Jul 2024 19:44:00 -0400
Subject: [PATCH 04/16] remove vocab covariance

---
 lenskit/lenskit/data/vocab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py
index 48a90dc5e..f4bdf7ca3 100644
--- a/lenskit/lenskit/data/vocab.py
+++ b/lenskit/lenskit/data/vocab.py
@@ -22,7 +22,7 @@
 NPEntityId: TypeAlias = np.integer | np.str_ | np.bytes_
 "Allowable entity identifier types (NumPy version)"
 
-VT = TypeVar("VT", bound=Hashable, covariant=True)
+VT = TypeVar("VT", bound=Hashable)
 "Term type in a vocabulary."
 
 

From af51cac8a9a8f7c0d20fa9050d4e1e0261565b08 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Tue, 30 Jul 2024 19:44:52 -0400
Subject: [PATCH 05/16] fix vocabulary type variance

---
 lenskit/lenskit/data/items.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index b77be4a3b..2a2590956 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -10,7 +10,7 @@
 
 from __future__ import annotations
 
-from typing import Literal, LiteralString, Sequence, TypeAlias, overload
+from typing import Literal, LiteralString, Sequence, TypeAlias, TypeVar, overload
 
 import numpy as np
 import pandas as pd
@@ -21,6 +21,7 @@
 from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary
 
 Backend: TypeAlias = Literal["numpy", "torch"]
+EID = TypeVar("EID", bound=EntityId)
 
 
 class ItemList:
@@ -40,7 +41,7 @@ def __init__(
         *,
         item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None,
         item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None,
-        vocabulary: Vocabulary[EntityId] | None = None,
+        vocabulary: Vocabulary[EID] | None = None,
     ):
         self._vocab = vocabulary
         self._fields = {}

From 71a9a517b21c24bae3bcb9f13af9ece59e6e8d4e Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Tue, 30 Jul 2024 19:49:56 -0400
Subject: [PATCH 06/16] initial working scores

---
 lenskit/lenskit/data/items.py  | 44 +++++++++++++++++++++++++++++++++-
 lenskit/tests/test_itemlist.py | 29 ++++++++++++++++++----
 2 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 2a2590956..8175cd5ec 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -28,6 +28,16 @@ class ItemList:
     """
     Representation of a (usually ordered) list of items, possibly with scores
     and other associated data.
+
+    Args:
+        item_ids:
+            A list or array of item identifiers.
+        item_nums:
+            A list or array of item numbers.
+        vocabulary:
+            A vocabulary to translate between item IDs and numbers.
+        fields:
+            Additional fields, such as ``score`` or ``rating``.
     """
 
     _len: int
@@ -42,9 +52,9 @@ def __init__(
         item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None,
         item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None,
         vocabulary: Vocabulary[EID] | None = None,
+        **fields: NDArray[np.generic] | torch.Tensor | ArrayLike,
     ):
         self._vocab = vocabulary
-        self._fields = {}
 
         if item_ids is None and item_nums is None:
             self._ids = np.ndarray(0, dtype=np.int32)
@@ -56,6 +66,7 @@ def __init__(
             if len(self._ids.shape) > 1:
                 raise TypeError("item lists must be 1-dimensional")
             self._len = len(item_ids)
+
         if item_nums is not None:
             self._numbers = MTArray(item_nums)
             if hasattr(self, "_len"):
@@ -67,6 +78,8 @@ def __init__(
             else:
                 self._len = self._numbers.shape[0]
 
+        self._fields = {name: MTArray(data) for (name, data) in fields.items()}
+
     def clone(self) -> ItemList:
         """
         Make a shallow copy of the item list.
@@ -95,6 +108,8 @@ def ids(self) -> NDArray[NPEntityId]:
     def numbers(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32]: ...
     @overload
     def numbers(self, format: Literal["torch"]) -> torch.Tensor: ...
+    @overload
+    def numbers(self, format: LiteralString = "numpy") -> ArrayLike: ...
     def numbers(self, format: LiteralString = "numpy") -> ArrayLike:
         """
         Get the item numbers.
@@ -117,5 +132,32 @@ def numbers(self, format: LiteralString = "numpy") -> ArrayLike:
 
         return self._numbers.to(format)
 
+    @overload
+    def scores(self, format: Literal["numpy"] = "numpy") -> NDArray[np.floating] | None: ...
+    @overload
+    def scores(self, format: Literal["torch"]) -> torch.Tensor | None: ...
+    @overload
+    def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: ...
+    def scores(self, format: LiteralString = "numpy") -> ArrayLike | None:
+        """
+        Get the item scores (if available).
+        """
+        return self.field("scores", format)
+
+    @overload
+    def field(
+        self, name: str, format: Literal["numpy"] = "numpy"
+    ) -> NDArray[np.floating] | None: ...
+    @overload
+    def field(self, name: str, format: Literal["torch"]) -> torch.Tensor | None: ...
+    @overload
+    def field(self, name: str, format: LiteralString) -> ArrayLike | None: ...
+    def field(self, name: str, format: LiteralString = "numpy") -> ArrayLike | None:
+        val = self._fields.get(name, None)
+        if val is None:
+            return None
+        else:
+            return val.to(format)
+
     def __len__(self):
         return self._len
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
index 2872c0b73..6668dc2fd 100644
--- a/lenskit/tests/test_itemlist.py
+++ b/lenskit/tests/test_itemlist.py
@@ -1,10 +1,14 @@
 import numpy as np
+import torch
 
 from pytest import raises
 
 from lenskit.data import ItemList
 from lenskit.data.vocab import Vocabulary
 
+ITEMS = ["a", "b", "c", "d", "e"]
+VOCAB = Vocabulary(ITEMS)
+
 
 def test_empty():
     il = ItemList()
@@ -12,6 +16,7 @@ def test_empty():
     assert len(il) == 0
     assert il.numbers().shape == (0,)
     assert il.ids().shape == (0,)
+    assert il.scores() is None
 
 
 def test_item_list():
@@ -35,23 +40,37 @@ def test_item_num_list():
 
 
 def test_item_num_list_vocab():
-    il = ItemList(item_nums=np.arange(5), vocabulary=Vocabulary(["a", "b", "c", "d", "e"]))
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB)
 
     assert len(il) == 5
     assert il.numbers().shape == (5,)
     assert il.ids().shape == (5,)
 
     assert all(il.numbers() == np.arange(5))
-    assert all(il.ids() == ["a", "b", "c", "d", "e"])
+    assert all(il.ids() == ITEMS)
 
 
 def test_item_id_list_vocab():
-    idl = ["a", "b", "c", "d", "e"]
-    il = ItemList(item_ids=idl, vocabulary=Vocabulary(idl))
+    il = ItemList(item_ids=ITEMS, vocabulary=VOCAB)
 
     assert len(il) == 5
     assert il.numbers().shape == (5,)
     assert il.ids().shape == (5,)
 
     assert all(il.numbers() == np.arange(5))
-    assert all(il.ids() == ["a", "b", "c", "d", "e"])
+    assert all(il.ids() == ITEMS)
+
+
+def test_scores():
+    data = np.random.randn(5)
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data)
+
+    scores = il.scores()
+    assert scores is not None
+    assert scores.shape == (5,)
+    assert np.all(scores == data)
+
+    st = il.scores("torch")
+    assert isinstance(st, torch.Tensor)
+    assert st.shape == (5,)
+    assert np.all(st.numpy() == data)

From 72b9ff7509d9cf9258a9c7a9e7916bdad60816a8 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 10:48:04 -0400
Subject: [PATCH 07/16] add data check routines

---
 lenskit/lenskit/data/checks.py    | 150 ++++++++++++++++++++++++++++++
 lenskit/tests/test_data_checks.py |  98 +++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 lenskit/lenskit/data/checks.py
 create mode 100644 lenskit/tests/test_data_checks.py

diff --git a/lenskit/lenskit/data/checks.py b/lenskit/lenskit/data/checks.py
new file mode 100644
index 000000000..2cde04cc6
--- /dev/null
+++ b/lenskit/lenskit/data/checks.py
@@ -0,0 +1,150 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+"Data check functions for LensKit."
+
+# pyright: strict
+from __future__ import annotations
+
+from typing import Any, Literal, Protocol, TypeVar, overload
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+class HasShape(Protocol):
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+
+
+A = TypeVar("A", bound=HasShape)
+NPT = TypeVar("NPT", bound=np.generic)
+
+
+@overload
+def check_1d(
+    arr: A,
+    size: int | None = None,
+    *,
+    label: str = "array",
+    error: Literal["raise"] = "raise",
+) -> A: ...
+@overload
+def check_1d(
+    arr: HasShape,
+    size: int | None = None,
+    *,
+    error: Literal["return"],
+) -> bool: ...
+def check_1d(
+    arr: A,
+    size: int | None = None,
+    *,
+    label: str = "array",
+    error: Literal["raise", "return"] = "raise",
+) -> bool | A:
+    """
+    Check that an array is one-dimensional, optionally checking that it has the
+    expected length.
+
+    This check function has 2 modes:
+
+    *   If ``error="raise"`` (the default), it will raise a :class:`TypeError`
+        if the array shape is incorrect, and return the array otherwise.
+    *   If ``error="return"``, it will return ``True`` or ``False`` depending on
+        whether the shape is correct.
+
+    Args:
+        arr:
+            The array to check (any object with a ``shape`` tuple).
+        size:
+            The expected size of the array. If unspecified, this function simply
+            checks that the array has exactly one dimension (0-dimensional and
+            multi-dimensional arrays both fail), without checking its length.
+        label:
+            A label to use in the exception message.
+        error:
+            The behavior when an array fails the test.
+
+    Returns:
+        The array, if ``error="raise"`` and the array passes the check, or a
+        boolean indicating whether it passes the check.
+
+    Raises:
+        TypeError: if ``error="raise"`` and the array fails the check.
+    """
+    if size is None and len(arr.shape) != 1:  # != 1 so 0-d (scalar) arrays are rejected too
+        if error == "raise":
+            raise TypeError(f"{label} must be 1D (has shape {arr.shape})")
+        else:
+            return False
+    elif size is not None and arr.shape != (size,):
+        if error == "raise":
+            raise TypeError(f"{label} has incorrect shape (found {arr.shape}, expected {size})")
+        else:
+            return False
+
+    if error == "raise":
+        return arr
+    else:
+        return True
+
+
+@overload
+def check_type(
+    arr: NDArray[Any],
+    *types: type[NPT],
+    label: str = "array",
+    error: Literal["raise"] = "raise",
+) -> NDArray[NPT]: ...
+@overload
+def check_type(
+    arr: NDArray[Any],
+    *types: type[NPT],
+    error: Literal["return"],
+) -> bool: ...
+def check_type(
+    arr: NDArray[Any],
+    *types: type[NPT],
+    label: str = "array",
+    error: Literal["raise", "return"] = "raise",
+) -> bool | NDArray[Any]:
+    """
+    Check that an array's element type is one of the acceptable types.
+
+    This check function has 2 modes:
+
+    *   If ``error="raise"`` (the default), it will raise a :class:`TypeError`
+        if the array type is incorrect, and return the array otherwise.
+    *   If ``error="return"``, it will return ``True`` or ``False`` depending on
+        whether the type is acceptable.
+
+    Args:
+        arr:
+            The array to check.
+        types:
+            The acceptable scalar types; subclasses of these types also pass.
+        label:
+            A label to use in the exception message.
+        error:
+            The behavior when an array fails the test.
+
+    Returns:
+        The array, if ``error="raise"`` and the array passes the check, or a
+        boolean indicating whether it passes the check.
+
+    Raises:
+        TypeError: if ``error="raise"`` and the array fails the check.
+    """
+    if issubclass(arr.dtype.type, types):
+        if error == "raise":
+            return arr
+        else:
+            return True
+    elif error == "raise":
+        raise TypeError(f"{label} has incorrect type {arr.dtype} (allowed: {types})")
+    else:
+        return False
diff --git a/lenskit/tests/test_data_checks.py b/lenskit/tests/test_data_checks.py
new file mode 100644
index 000000000..5f07c97ae
--- /dev/null
+++ b/lenskit/tests/test_data_checks.py
@@ -0,0 +1,98 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+
+import hypothesis.extra.numpy as nph
+import hypothesis.strategies as st
+from hypothesis import given
+from pytest import raises
+
+from lenskit.data.checks import check_1d, check_type
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=1, max_dims=1)))
+def test_check_1d_ok(arr):
+    check_1d(arr)
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=1, max_dims=1)))
+def test_check_1d_ok_return(arr):
+    assert check_1d(arr, error="return")
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=2)))
+def test_check_1d_bad(arr):
+    with raises(TypeError, match="must be 1D"):
+        check_1d(arr)
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes(min_dims=2)))
+def test_check_1d_bad_return(arr):
+    assert not check_1d(arr, error="return")
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes()), st.integers(min_value=0))
+def test_check_expected_size(arr, exp):
+    if arr.shape == (exp,):
+        check_1d(arr, exp)
+    else:
+        with raises(TypeError):
+            check_1d(arr, exp)
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes()), st.integers(min_value=0))
+def test_check_expected_size_return(arr, exp):
+    if arr.shape == (exp,):
+        assert check_1d(arr, exp, error="return")
+    else:
+        assert not check_1d(arr, exp, error="return")
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes()))
+def test_check_type_ok(arr):
+    check_type(arr, arr.dtype.type)
+
+
+@given(nph.arrays(nph.floating_dtypes(), nph.array_shapes()))
+def test_check_type_ok_subclass(arr):
+    check_type(arr, np.floating)
+
+
+@given(nph.arrays(st.one_of(nph.integer_dtypes(), nph.floating_dtypes()), nph.array_shapes()))
+def test_check_type_ok_multi(arr):
+    check_type(arr, np.integer, np.floating)
+
+
+@given(nph.arrays(nph.scalar_dtypes(), nph.array_shapes()))
+def test_check_type_ok_return(arr):
+    assert check_type(arr, arr.dtype.type, error="return")
+
+
+@given(nph.arrays(nph.floating_dtypes(), nph.array_shapes()))
+def test_check_type_bad_float(arr):
+    with raises(TypeError):
+        check_type(arr, np.integer)
+
+
+@given(nph.arrays(nph.floating_dtypes(), nph.array_shapes()))
+def test_check_type_bad_float_return(arr):
+    assert not check_type(arr, np.integer, error="return")
+
+
+@given(nph.arrays(nph.integer_dtypes(), nph.array_shapes()))
+def test_check_type_bad_int(arr):
+    with raises(TypeError):
+        check_type(arr, np.floating)
+
+
+@given(nph.arrays(nph.unicode_string_dtypes(), nph.array_shapes()))
+def test_check_type_bad_str(arr):
+    with raises(TypeError):
+        check_type(arr, np.number)
+
+    with raises(TypeError):
+        check_type(arr, np.integer, np.floating)

From a84f8d80752ddb757a10ab01486d9b94c033da44 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 10:48:17 -0400
Subject: [PATCH 08/16] document item list fields and get them working

---
 lenskit/tests/test_itemlist.py | 53 +++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
index 6668dc2fd..a43d592e1 100644
--- a/lenskit/tests/test_itemlist.py
+++ b/lenskit/tests/test_itemlist.py
@@ -1,3 +1,9 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University
+# Copyright (C) 2023-2024 Drexel University
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
 import numpy as np
 import torch
 
@@ -29,7 +35,27 @@ def test_item_list():
         il.numbers()
 
 
-def test_item_num_list():
+def test_item_list_alias():
+    il = ItemList(item_id=["one", "two"])
+
+    assert len(il) == 2
+    assert il.ids().shape == (2,)
+
+    with raises(RuntimeError, match="item numbers not available"):
+        il.numbers()
+
+
+def test_item_list_bad_type():
+    with raises(TypeError):
+        ItemList(item_id=[3.4, 7.2])
+
+
+def test_item_list_bad_dimension():
+    with raises(TypeError):
+        ItemList(item_id=[["one", "two"]])
+
+
+def test_item_num_array():
     il = ItemList(item_nums=np.arange(5))
 
     assert len(il) == 5
@@ -39,6 +65,31 @@ def test_item_num_list():
         il.ids()
 
 
+def test_item_num_alias():
+    il = ItemList(item_num=np.arange(5))
+
+    assert len(il) == 5
+    assert il.numbers().shape == (5,)
+
+    with raises(RuntimeError, match="item IDs not available"):
+        il.ids()
+
+
+def test_item_num_bad_type():
+    with raises(TypeError):
+        ItemList(item_num=np.random.randn(5))
+
+
+def test_item_num_bad_dims():
+    with raises(TypeError):
+        ItemList(item_num=[[1, 3, 8, 4]])
+
+
+def test_item_ids_num_mismatch_sizes():
+    with raises(TypeError, match="has incorrect shape"):
+        ItemList(item_ids=ITEMS, item_num=np.arange(4))
+
+
 def test_item_num_list_vocab():
     il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB)
 

From 7b74ed06f4692a5ed9d9017e972e1388cd0d4720 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 10:57:45 -0400
Subject: [PATCH 09/16] implement ranks for the item list

---
 lenskit/lenskit/data/items.py  | 124 ++++++++++++++++++++++++++++-----
 lenskit/tests/test_itemlist.py |  12 ++++
 2 files changed, 118 insertions(+), 18 deletions(-)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 8175cd5ec..4ee7c666f 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -10,13 +10,14 @@
 
 from __future__ import annotations
 
-from typing import Literal, LiteralString, Sequence, TypeAlias, TypeVar, overload
+from typing import Any, Literal, LiteralString, Sequence, TypeAlias, TypeVar, cast, overload
 
 import numpy as np
 import pandas as pd
 import torch
 from numpy.typing import ArrayLike, NDArray
 
+from lenskit.data.checks import check_1d
 from lenskit.data.mtarray import MTArray, MTGenericArray
 from lenskit.data.vocab import EntityId, NPEntityId, Vocabulary
 
@@ -27,23 +28,61 @@
 class ItemList:
     """
     Representation of a (usually ordered) list of items, possibly with scores
-    and other associated data.
+    and other associated data.  Item lists are to be treated as **immutable** —
+    create a new list with modified data, do not do in-place modifications of
+    the list itself or the arrays or data frame it returns.
+
+    An item list logically a list of rows, each of which is an item, like a
+    :class:`~pandas.DataFrame` but supporting multiple array backends.
+
+    .. note::
+
+        Naming for fields and accessor methods is tricky, because the usual
+        convention for a data frame is to use singular column names (e.g.
+        “item_id”, “score”) instead of plural (“item_ids”, “scores”) — the data
+        frame, like a database table, is a list of instances, and the column
+        names are best interpreted as naming attributes of individual instances.
+
+        However, when working with a list of e.g. item IDs, it is more natural —
+        at least to this author — to use plural names: ``item_ids``.  Since this
+        class is doing somewhat double-duty, representing a list of items along
+        with associated data, as well as a data frame of columns representing
+        items, the appropriate naming is not entirely clear.  The naming
+        convention in this class is therefore as follows:
+
+        * Field names are singular (``item_id``, ``score``).
+        * Named accessor methods are plural (:meth:`item_ids`, :meth:`scores`).
+        * Both singular and plural forms are accepted for item IDs, numbers, and
+          scores in the keyword arguments.  Other field names should be
+          singular.
 
     Args:
         item_ids:
-            A list or array of item identifiers.
+            A list or array of item identifiers. ``item_id`` is accepted as an
+            alternate name.
         item_nums:
-            A list or array of item numbers.
+            A list or array of item numbers. ``item_num`` is accepted as an
+            alternate name.
         vocabulary:
             A vocabulary to translate between item IDs and numbers.
+        ordered:
+            Whether the list has a meaningful order.
+        scores:
+            An array of scores for the items.
         fields:
-            Additional fields, such as ``score`` or ``rating``.
+            Additional fields, such as ``score`` or ``rating``.  Field names
+            should generally be singular; the named keyword arguments and
+            accessor methods are plural for readability (“get the list of item
+            IDs”)
     """
 
+    ordered: bool
+    "Whether this list has a meaningful order."
     _len: int
     _ids: np.ndarray[int, np.dtype[NPEntityId]] | None = None
     _numbers: MTArray[np.int32] | None = None
     _vocab: Vocabulary[EntityId] | None = None
+    _ranks: MTArray[np.int32] | None = None
     _fields: dict[str, MTGenericArray]
 
     def __init__(
@@ -52,10 +91,21 @@ def __init__(
         item_ids: NDArray[NPEntityId] | pd.Series[EntityId] | Sequence[EntityId] | None = None,
         item_nums: NDArray[np.int32] | pd.Series[int] | Sequence[int] | ArrayLike | None = None,
         vocabulary: Vocabulary[EID] | None = None,
+        ordered: bool = False,
+        scores: NDArray[np.generic] | torch.Tensor | ArrayLike | None = None,
         **fields: NDArray[np.generic] | torch.Tensor | ArrayLike,
     ):
+        self.ordered = ordered
         self._vocab = vocabulary
 
+        if item_ids is None and "item_id" in fields:
+            item_ids = np.asarray(cast(Any, fields["item_id"]))
+
+        if item_nums is None and "item_num" in fields:
+            item_nums = np.asarray(cast(Any, fields["item_num"]))
+            if not issubclass(item_nums.dtype.type, np.integer):
+                raise TypeError("item numbers not integers")
+
         if item_ids is None and item_nums is None:
             self._ids = np.ndarray(0, dtype=np.int32)
             self._numbers = MTArray(np.ndarray(0, dtype=np.int32))
@@ -63,28 +113,40 @@ def __init__(
 
         if item_ids is not None:
             self._ids = np.asarray(item_ids)
-            if len(self._ids.shape) > 1:
-                raise TypeError("item lists must be 1-dimensional")
+            if not issubclass(self._ids.dtype.type, (np.integer, np.str_, np.bytes_)):
+                raise TypeError(f"item IDs not integers or bytes (type: {self._ids.dtype})")
+
+            check_1d(self._ids, label="item_ids")
             self._len = len(item_ids)
 
         if item_nums is not None:
             self._numbers = MTArray(item_nums)
-            if hasattr(self, "_len"):
-                if self._numbers.shape != (self._len,):
-                    nl = self._numbers.shape[0]
-                    raise ValueError(
-                        f"item ID and number lists have different lengths ({self._len} != {nl})"
-                    )
-            else:
-                self._len = self._numbers.shape[0]
+            check_1d(self._numbers, getattr(self, "_len", None), label="item_nums")
+            self._len = self._numbers.shape[0]
+
+        # convert fields and drop singular ID/number aliases
+        self._fields = {
+            name: check_1d(MTArray(data), self._len, label=name)
+            for (name, data) in fields.items()
+            if name not in ("item_id", "item_num")
+        }
 
-        self._fields = {name: MTArray(data) for (name, data) in fields.items()}
+        if scores is not None:
+            if "score" in fields:  # pragma: nocover
+                raise ValueError("cannot specify both scores= and score=")
+            self._fields["score"] = MTArray(scores)
 
     def clone(self) -> ItemList:
         """
         Make a shallow copy of the item list.
         """
-        return ItemList(item_ids=self._ids, item_nums=self._numbers, vocabulary=self._vocab)
+        return ItemList(
+            item_ids=self._ids,
+            item_nums=self._numbers,
+            vocabulary=self._vocab,
+            ordered=self.ordered,
+            **self._fields,
+        )
 
     def ids(self) -> NDArray[NPEntityId]:
         """
@@ -142,7 +204,33 @@ def scores(self, format: LiteralString = "numpy") -> ArrayLike | None:
         """
         Get the item scores (if available).
         """
-        return self.field("scores", format)
+        return self.field("score", format)
+
+    @overload
+    def ranks(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32] | None: ...
+    @overload
+    def ranks(self, format: Literal["torch"]) -> torch.Tensor | None: ...
+    @overload
+    def ranks(self, format: LiteralString = "numpy") -> ArrayLike | None: ...
+    def ranks(self, format: LiteralString = "numpy") -> ArrayLike | None:
+        """
+        Get an array of ranks for the items in this list, if it is ordered.
+        Unordered lists have no ranks.  The ranks are based on the order in the
+        list, **not** on the score.
+
+        Item ranks start with **1**, for compatibility with common practice in
+        mathematically defining information retrieval metrics and operations.
+
+        Returns:
+            An array of item ranks, or ``None`` if the list is unordered.
+        """
+        if not self.ordered:
+            return None
+
+        if self._ranks is None:
+            self._ranks = MTArray(np.arange(1, self._len + 1, dtype=np.int32))
+
+        return self._ranks.to(format)
 
     @overload
     def field(
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
index a43d592e1..ee87ca99b 100644
--- a/lenskit/tests/test_itemlist.py
+++ b/lenskit/tests/test_itemlist.py
@@ -125,3 +125,15 @@ def test_scores():
     assert isinstance(st, torch.Tensor)
     assert st.shape == (5,)
     assert np.all(st.numpy() == data)
+
+    assert il.ranks() is None
+
+
+def test_ranks():
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, ordered=True)
+    assert il.ordered
+
+    ranks = il.ranks()
+    assert ranks is not None
+    assert ranks.shape == (5,)
+    assert np.all(ranks == np.arange(1, 6))

From 406a6789379feef5c85c1671bb6940c6b85dbde0 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 11:11:31 -0400
Subject: [PATCH 10/16] improve dataset documentation

---
 docs/data.rst                   | 37 ++++++++++++++++++++++-----------
 lenskit/lenskit/data/dataset.py | 24 ++++++++++++++-------
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/docs/data.rst b/docs/data.rst
index 76aea5c72..3cd22835c 100644
--- a/docs/data.rst
+++ b/docs/data.rst
@@ -73,9 +73,9 @@ abstract class with implementations covering various scenarios.
 Creating Datasets
 ~~~~~~~~~~~~~~~~~
 
-Several functions create :class:`Dataset`s from different input data sources.
+Several functions can create a :class:`Dataset` from different input data sources.
 
-.. autofunction:: from_interaction_df
+.. autofunction:: from_interactions_df
 
 Loading Common Datasets
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -89,13 +89,32 @@ LensKit uses *vocabularies* to record user/item IDs, tags, terms, etc. in a way
 that facilitates easy mapping to 0-based contiguous indexes for use in matrix
 and tensor data structures.
 
-.. module:: lenskit.data.vocab
+.. module:: lenskit.data
 
 .. autoclass:: Vocabulary
 
-Dataset implementations
+
+Item Lists
+~~~~~~~~~~
+
+LensKit uses *item lists* to represent collections of items that may be scored,
+ranked, etc.
+
+.. autoclass:: ItemList
+
+User-Item Data Tables
+~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: lenskit.data.tables
+
+.. autoclass:: NumpyUserItemTable
+.. autoclass:: TorchUserItemTable
+
+Dataset Implementations
 ~~~~~~~~~~~~~~~~~~~~~~~
 
+.. module:: lenskit.data.dataset
+
 Matrix Dataset
 --------------
 
@@ -103,6 +122,7 @@ The :class:`MatrixDataset` provides an in-memory dataset implementation backed
 by a ratings matrix or implicit-feedback matrix.
 
 .. autoclass:: MatrixDataset
+    :no-members:
 
 Lazy Dataset
 ------------
@@ -111,11 +131,4 @@ The lazy data set takes a function that loads a data set (of any type), and
 lazily uses that function to load an underlying data set when needed.
 
 .. autoclass:: LazyDataset
-
-User-Item Data Tables
-~~~~~~~~~~~~~~~~~~~~~
-
-.. module:: lenskit.data.tables
-
-.. autoclass:: NumpyUserItemTable
-.. autoclass:: TorchUserItemTable
+    :members: delegate
diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py
index 8b5a06be8..51f4987f0 100644
--- a/lenskit/lenskit/data/dataset.py
+++ b/lenskit/lenskit/data/dataset.py
@@ -469,7 +469,13 @@ def user_stats(self) -> pd.DataFrame:
 
 class MatrixDataset(Dataset):
     """
-    Dataset implementaiton using an in-memory rating or implicit-feedback matrix.
+    Dataset implementation using an in-memory rating or implicit-feedback
+    matrix.
+
+    .. note::
+        Client code generally should not construct this class directly.  Instead
+        use the various ``from_`` and ``load_`` functions in
+        :mod:`lenskit.data`.
     """
 
     _users: Vocabulary[EntityId]
@@ -713,6 +719,10 @@ class LazyDataset(Dataset):
     """
     A data set with an underlying load function, that doesn't call the function
     until data is actually needed.
+
+    Args:
+        loader:
+            The function that will load the dataset when needed.
     """
 
     _delegate: Dataset | None = None
@@ -720,11 +730,7 @@ class LazyDataset(Dataset):
 
     def __init__(self, loader: Callable[[], Dataset]):
         """
-        Construct a dataset.
-
-        .. note::
-            Client code generally should not call this constructor.  Instead use the
-            various ``from_`` and ``load_`` functions in :mod:`lenskit.data`.
+        Construct a lazy dataset.
         """
         self._loader = loader
 
@@ -779,9 +785,11 @@ def from_interactions_df(
             The user-item interactions (e.g. ratings).  The dataset code takes
             ownership of this data frame and may modify it.
         user_col:
-            The name of the user ID column.
+            The name of the user ID column.  By default, looks for columns named
+            ``user``, ``user_id``, or ``userId``, with several case variants.
         item_col:
-            The name of the item ID column.
+            The name of the item ID column.  By default, looks for columns named
+            ``item``, ``item_id``, or ``itemId``, with several case variants.
         rating_col:
             The name of the rating column.
         timestamp_col:

From 3d22b74de75694bad92a39254b66425a9aa1e528 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 11:31:25 -0400
Subject: [PATCH 11/16] improve some item list docs

---
 docs/releases/2024.rst        | 23 ++++++++++++++++-------
 lenskit/lenskit/data/items.py |  7 ++++---
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst
index c99c8626b..23e92cb05 100644
--- a/docs/releases/2024.rst
+++ b/docs/releases/2024.rst
@@ -24,13 +24,6 @@ Significant Changes
 
 2024.1 brings substantial changes to LensKit.
 
-*   **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms,
-    instead of Numba-accelerated NumPy code.  Algorithms using PyTorch are:
-
-    * :py:class:`~lenskit.algorithms.knn.ItemItem`
-    * :py:class:`~lenskit.algorithms.als.ImplicitMF`
-    * :py:class:`~lenskit.algorithms.als.BiasedMF`
-
 *   :class:`~lenskit.data.Dataset`.  LensKit now provides an abstraction for
     training data instead of working with Pandas data frames directly, that
     allows components to reduce code duplication and recomputation, access data
@@ -39,6 +32,22 @@ Significant Changes
     supersedes the old bespoke dataset loading support, with functions like
     :func:`~lenskit.data.load_movielens` to load standard datasets.
 
+*   New classes like :class:`~lenskit.data.ItemList` for routing item data
+    instead of using Pandas data frames and series.  This makes component return
+    types more self-documenting (rather than requiring developers to remember
+    what is on the index, what the column names are, etc.), and facilitates more
+    efficient data transfer between components that do not use Pandas (e.g. data
+    passed between components using PyTorch can leave the data in tensors
+    without round-tripping through Pandas and NumPy, and keep this transparent
+    to client code).
+
+*   **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms,
+    instead of Numba-accelerated NumPy code.  Algorithms using PyTorch are:
+
+    * :py:class:`~lenskit.algorithms.knn.ItemItem`
+    * :py:class:`~lenskit.algorithms.als.ImplicitMF`
+    * :py:class:`~lenskit.algorithms.als.BiasedMF`
+
 *   Many LensKit components (batch running, model training, etc.) now report progress with
     :py:mod:`progress_api`, and can be connected to TQDM or Enlighten.
 
diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 4ee7c666f..86186edb3 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -28,9 +28,10 @@
 class ItemList:
     """
     Representation of a (usually ordered) list of items, possibly with scores
-    and other associated data.  Item lists are to be treated as **immutable** —
-    create a new list with modified data, do not do in-place modifications of
-    the list itself or the arrays or data frame it returns.
+    and other associated data; many components take and return item lists.  Item
+    lists are to be treated as **immutable** — create a new list with modified
+    data, do not do in-place modifications of the list itself or the arrays or
+    data frame it returns.
 
     An item list logically a list of rows, each of which is an item, like a
     :class:`~pandas.DataFrame` but supporting multiple array backends.

From ace2e4145f5cb4905703f45319aa4fdeed4c0e68 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 11:56:27 -0400
Subject: [PATCH 12/16] Add user_row method to get a single user's data.

---
 lenskit/lenskit/data/dataset.py      | 58 +++++++++++++++++++++++++---
 lenskit/tests/test_dataset_matrix.py | 41 +++++++++++++++++++-
 2 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py
index 51f4987f0..abff11283 100644
--- a/lenskit/lenskit/data/dataset.py
+++ b/lenskit/lenskit/data/dataset.py
@@ -32,6 +32,7 @@
     override,
 )
 
+from lenskit.data.items import ItemList
 from lenskit.data.matrix import CSRStructure, InteractionMatrix
 from lenskit.data.vocab import Vocabulary
 
@@ -87,7 +88,7 @@ def items(self) -> Vocabulary[EntityId]:
         """
         The items known by this dataset.
         """
-        ...
+        raise NotImplementedError()
 
     @property
     @abstractmethod
@@ -95,7 +96,7 @@ def users(self) -> Vocabulary[EntityId]:
         """
         The users known by this dataset.
         """
-        ...
+        raise NotImplementedError()
 
     @abstractmethod
     def count(self, what: str) -> int:
@@ -118,7 +119,7 @@ def count(self, what: str) -> int:
                 * interactions
                 * ratings
         """
-        ...
+        raise NotImplementedError()
 
     @property
     def item_count(self) -> int:
@@ -212,7 +213,7 @@ def interaction_log(
         Returns:
             The user-item interaction log in the specified format.
         """
-        ...
+        raise NotImplementedError()
 
     @overload
     @abstractmethod
@@ -359,7 +360,25 @@ def interaction_matrix(
                 ``True`` to return user and item IDs instead of numbers in
                 ``pandas``-format matrix.
         """
-        ...
+        raise NotImplementedError()
+
+    @abstractmethod
+    @overload
+    def user_row(self, user_id: EntityId) -> ItemList | None: ...
+    @abstractmethod
+    @overload
+    def user_row(self, *, user_num: int) -> ItemList: ...
+    @abstractmethod
+    def user_row(
+        self, user_id: EntityId | None = None, *, user_num: int | None = None
+    ) -> ItemList | None:
+        """
+        Get a user's row from the interaction matrix.  Available fields are
+        returned as fields. If the dataset has ratings, these are provided as a
+        ``rating`` field, **not** as the item scores.  The item list is unordered,
+        but items are returned in order by item number.
+        """
+        raise NotImplementedError()
 
     def item_stats(self) -> pd.DataFrame:
         """
@@ -714,6 +733,31 @@ def _int_log_torch(self, fields: list[str]) -> TorchUserItemTable:
             tbl.timestamps = torch.from_numpy(self._matrix.timestamps)
         return tbl
 
+    @override
+    def user_row(
+        self, user_id: EntityId | None = None, *, user_num: int | None = None
+    ) -> ItemList | None:
+        if user_num is None:
+            if user_id is None:  # pragma: nocover
+                raise ValueError("most provide one of user_id and user_num")
+
+            user_num = self.users.number(user_id, "none")
+            if user_num is None:
+                return None
+
+        elif user_id is not None:  # pragma: nocover
+            raise ValueError("most provide one of user_id and user_num")
+
+        sp = self._matrix.user_ptrs[user_num]
+        ep = self._matrix.user_ptrs[user_num + 1]
+        inums = self._matrix.item_nums[sp:ep]
+        fields = {}
+        if self._matrix.ratings is not None:
+            fields["rating"] = self._matrix.ratings[sp:ep]
+        if self._matrix.timestamps is not None:
+            fields["timestamp"] = self._matrix.timestamps[sp:ep]
+        return ItemList(item_nums=inums, vocabulary=self.items, **fields)
+
 
 class LazyDataset(Dataset):
     """
@@ -764,6 +808,10 @@ def interaction_matrix(self, *args, **kwargs) -> Any:
     def interaction_log(self, *args, **kwargs) -> Any:
         return self.delegate().interaction_log(*args, **kwargs)
 
+    @override
+    def user_row(self, *args, **kwargs) -> ItemList | None:
+        return self.delegate().user_row(*args, **kwargs)
+
 
 def from_interactions_df(
     df: pd.DataFrame,
diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py
index f753c6c5e..fa940b70f 100644
--- a/lenskit/tests/test_dataset_matrix.py
+++ b/lenskit/tests/test_dataset_matrix.py
@@ -19,7 +19,7 @@
 from pytest import mark, raises
 
 from lenskit.data import Dataset
-from lenskit.data.dataset import FieldError, from_interactions_df
+from lenskit.data.dataset import FieldError, MatrixDataset, from_interactions_df
 from lenskit.data.matrix import CSRStructure
 from lenskit.util.test import ml_ds, ml_ratings  # noqa: F401
 
@@ -64,6 +64,7 @@ def _check_timestamp(ml_ds: Dataset, ml_ratings: pd.DataFrame, ts: ArrayLike):
 
 def test_internals(ml_ds: Dataset):
     "Test internal matrix structures"
+    assert isinstance(ml_ds, MatrixDataset)
     assert ml_ds._matrix.user_nums.dtype == np.int32
     assert ml_ds._matrix.user_ptrs.dtype == np.int32
     assert ml_ds._matrix.item_nums.dtype == np.int32
@@ -342,3 +343,41 @@ def test_matrix_torch_timestamp(ml_ratings: pd.DataFrame, ml_ds: Dataset):
     _check_item_number_counts(ml_ds, ml_ratings, log.col_indices())
     _check_item_ids(ml_ds, ml_ratings, log.col_indices())
     _check_timestamp(ml_ds, ml_ratings, log.values().numpy())
+
+
+def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset):
+    users = rng.choice(ml_ds.users.ids(), 50)
+
+    for user in users:
+        row = ml_ds.user_row(user)
+        assert row is not None
+        urows = ml_ratings[ml_ratings["user"] == user].sort_values("item")
+        assert set(row.ids()) == set(urows["item"])
+        assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"]))
+
+        ratings = row.field("rating")
+        assert ratings is not None
+        assert np.all(ratings == urows["rating"])
+
+        timestamps = row.field("timestamp")
+        assert timestamps is not None
+        assert np.all(timestamps == urows["timestamp"])
+
+
+def test_matrix_rows_by_num(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset):
+    users = rng.choice(ml_ds.user_count, 50)
+
+    for user in users:
+        row = ml_ds.user_row(user_num=user)
+        assert row is not None
+        urows = ml_ratings[ml_ratings["user"] == ml_ds.users.id(user)].sort_values("item")
+        assert set(row.ids()) == set(urows["item"])
+        assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"]))
+
+        ratings = row.field("rating")
+        assert ratings is not None
+        assert np.all(ratings == urows["rating"])
+
+        timestamps = row.field("timestamp")
+        assert timestamps is not None
+        assert np.all(timestamps == urows["timestamp"])

From 3cd5111da31af266e22bb610a646cd9c5ce922a5 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 12:04:59 -0400
Subject: [PATCH 13/16] support getting fields and scores as Pandas series

---
 lenskit/lenskit/data/items.py  | 30 +++++++++++++++++++++++++++---
 lenskit/tests/test_itemlist.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 86186edb3..6e089cce4 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -200,12 +200,16 @@ def scores(self, format: Literal["numpy"] = "numpy") -> NDArray[np.floating] | N
     @overload
     def scores(self, format: Literal["torch"]) -> torch.Tensor | None: ...
     @overload
+    def scores(
+        self, format: Literal["pandas"], *, index: Literal["ids", "numbers"] | None = None
+    ) -> pd.Series | None: ...
+    @overload
     def scores(self, format: LiteralString = "numpy") -> ArrayLike | None: ...
-    def scores(self, format: LiteralString = "numpy") -> ArrayLike | None:
+    def scores(self, format: LiteralString = "numpy", **kwargs) -> ArrayLike | None:
         """
         Get the item scores (if available).
         """
-        return self.field("score", format)
+        return self.field("score", format, **kwargs)
 
     @overload
     def ranks(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32] | None: ...
@@ -240,11 +244,31 @@ def field(
     @overload
     def field(self, name: str, format: Literal["torch"]) -> torch.Tensor | None: ...
     @overload
+    def field(
+        self,
+        name: str,
+        format: Literal["pandas"],
+        *,
+        index: Literal["ids", "numbers"] | None = None,
+    ) -> pd.Series | None: ...
+    @overload
     def field(self, name: str, format: LiteralString) -> ArrayLike | None: ...
-    def field(self, name: str, format: LiteralString = "numpy") -> ArrayLike | None:
+    def field(
+        self, name: str, format: LiteralString = "numpy", *, index: LiteralString | None = None
+    ) -> ArrayLike | None:
         val = self._fields.get(name, None)
         if val is None:
             return None
+        elif format == "pandas":
+            idx = None
+            vs = val.to("numpy")
+            if index == "ids":
+                idx = pd.Index(self.ids(), name="item_id")
+            elif index == "numbers":
+                idx = pd.Index(self.numbers(), name="item_num")
+            elif index:  # pragma: nocover
+                raise ValueError(f"unsupported Pandas index {index}")
+            return pd.Series(vs, index=idx)
         else:
             return val.to(format)
 
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
index ee87ca99b..292ccc0ba 100644
--- a/lenskit/tests/test_itemlist.py
+++ b/lenskit/tests/test_itemlist.py
@@ -129,6 +129,38 @@ def test_scores():
     assert il.ranks() is None
 
 
+def test_scores_pandas_no_index():
+    data = np.random.randn(5)
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data)
+
+    scores = il.scores("pandas")
+    assert scores is not None
+    assert scores.shape == (5,)
+    assert np.all(scores == data)
+
+
+def test_scores_pandas_id_index():
+    data = np.random.randn(5)
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data)
+    scores = il.scores("pandas", index="ids")
+    assert scores is not None
+    assert scores.shape == (5,)
+    assert np.all(scores == data)
+    assert scores.index.name == "item_id"
+    assert np.all(scores.index.values == ITEMS)
+
+
+def test_scores_pandas_num_index():
+    data = np.random.randn(5)
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data)
+    scores = il.scores("pandas", index="numbers")
+    assert scores is not None
+    assert scores.shape == (5,)
+    assert np.all(scores == data)
+    assert scores.index.name == "item_num"
+    assert np.all(scores.index.values == np.arange(5))
+
+
 def test_ranks():
     il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, ordered=True)
     assert il.ordered

From 4c97e810ff990ca3310d5d3d625a814762904032 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 12:12:02 -0400
Subject: [PATCH 14/16] support alternative vocabularies when retrieving item
 numbers

---
 lenskit/lenskit/data/items.py  | 28 +++++++++++++++++++++++-----
 lenskit/tests/test_itemlist.py |  8 ++++++++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 6e089cce4..6666a1fe4 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -168,25 +168,43 @@ def ids(self) -> NDArray[NPEntityId]:
         return self._ids
 
     @overload
-    def numbers(self, format: Literal["numpy"] = "numpy") -> NDArray[np.int32]: ...
+    def numbers(
+        self, format: Literal["numpy"] = "numpy", *, vocabulary: Vocabulary[EID] | None = None
+    ) -> NDArray[np.int32]: ...
     @overload
-    def numbers(self, format: Literal["torch"]) -> torch.Tensor: ...
+    def numbers(
+        self, format: Literal["torch"], *, vocabulary: Vocabulary[EID] | None = None
+    ) -> torch.Tensor: ...
     @overload
-    def numbers(self, format: LiteralString = "numpy") -> ArrayLike: ...
-    def numbers(self, format: LiteralString = "numpy") -> ArrayLike:
+    def numbers(
+        self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None
+    ) -> ArrayLike: ...
+    def numbers(
+        self, format: LiteralString = "numpy", *, vocabulary: Vocabulary[EID] | None = None
+    ) -> ArrayLike:
         """
         Get the item numbers.
 
         Args:
             format:
                 The array format to use.
+            vocabulary:
+                An alternate vocabulary for mapping IDs to numbers.  If provided,
+                then the item list must have IDs (either stored, or through a
+                vocabulary).
 
         Returns:
             An array of item numbers.
 
         Raises:
-            RuntimeError: if the item list was not created with numbers or a :class:`Vocabulary`.
+            RuntimeError: if the item list was not created with numbers or a
+            :class:`Vocabulary`.
         """
+        if vocabulary is not None and vocabulary is not self._vocab:
+            # we need to translate vocabulary
+            ids = self.ids()
+            return vocabulary.numbers(ids)
+
         if self._numbers is None:
             if self._vocab is None:
                 raise RuntimeError("item numbers not available (no IDs or vocabulary provided)")
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
index 292ccc0ba..a37d58aa6 100644
--- a/lenskit/tests/test_itemlist.py
+++ b/lenskit/tests/test_itemlist.py
@@ -169,3 +169,11 @@ def test_ranks():
     assert ranks is not None
     assert ranks.shape == (5,)
     assert np.all(ranks == np.arange(1, 6))
+
+
+def test_numbers_alt_vocab():
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB)
+
+    av = Vocabulary(["A", "B"] + ITEMS)
+    nums = il.numbers(vocabulary=av)
+    assert np.all(nums == np.arange(2, 7))

From e5db4b8ee1bb93e5e39b2b79bb2feb25b96427e0 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 12:42:47 -0400
Subject: [PATCH 15/16] support converting item lists to data frames

---
 lenskit/lenskit/data/items.py        | 23 +++++++++++++++++++++++
 lenskit/tests/test_dataset_matrix.py |  5 +++++
 lenskit/tests/test_itemlist.py       | 22 ++++++++++++++++++++++
 3 files changed, 50 insertions(+)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 6666a1fe4..96059ffa5 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -290,5 +290,28 @@ def field(
         else:
             return val.to(format)
 
+    def to_df(self) -> pd.DataFrame:
+        """
+        Convert this item list to a Pandas data frame.  It has the following columns:
+
+        * ``item_id`` — the item IDs (if available)
+        * ``item_num`` — the item numbers (if available)
+        * ``score`` — the item scores (if available)
+        * ``rank`` — the item ranks (if the list is ordered)
+        * all other defined fields, using their field names
+        """
+        cols = {}
+        if self._ids is not None or self._vocab is not None:
+            cols["item_id"] = self.ids()
+        if self._numbers is not None or self._vocab is not None:
+            cols["item_num"] = self.numbers()
+        if "score" in self._fields:
+            cols["score"] = self.scores()
+        if self.ordered:
+            cols["rank"] = self.ranks()
+        # add remaining fields
+        cols.update((k, v.numpy()) for (k, v) in self._fields.items() if k != "score")
+        return pd.DataFrame(cols)
+
     def __len__(self):
         return self._len
diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py
index fa940b70f..eb994d763 100644
--- a/lenskit/tests/test_dataset_matrix.py
+++ b/lenskit/tests/test_dataset_matrix.py
@@ -352,6 +352,7 @@ def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, m
         row = ml_ds.user_row(user)
         assert row is not None
         urows = ml_ratings[ml_ratings["user"] == user].sort_values("item")
+        urows = urows.reset_index(drop=True)
         assert set(row.ids()) == set(urows["item"])
         assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"]))
 
@@ -363,6 +364,10 @@ def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, m
         assert timestamps is not None
         assert np.all(timestamps == urows["timestamp"])
 
+        # we'll quickly check additional fields on the item list here
+        df = row.to_df()
+        assert np.all(df["timestamp"] == urows["timestamp"])
+
 
 def test_matrix_rows_by_num(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset):
     users = rng.choice(ml_ds.user_count, 50)
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py
index a37d58aa6..a5bbbee8f 100644
--- a/lenskit/tests/test_itemlist.py
+++ b/lenskit/tests/test_itemlist.py
@@ -177,3 +177,25 @@ def test_numbers_alt_vocab():
     av = Vocabulary(["A", "B"] + ITEMS)
     nums = il.numbers(vocabulary=av)
     assert np.all(nums == np.arange(2, 7))
+
+
+def test_pandas_df():
+    data = np.random.randn(5)
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data)
+
+    df = il.to_df()
+    assert np.all(df["item_id"] == ITEMS)
+    assert np.all(df["item_num"] == np.arange(5))
+    assert np.all(df["score"] == data)
+    assert "rank" not in df.columns
+
+
+def test_pandas_df_ordered():
+    data = np.random.randn(5)
+    il = ItemList(item_nums=np.arange(5), vocabulary=VOCAB, scores=data, ordered=True)
+
+    df = il.to_df()
+    assert np.all(df["item_id"] == ITEMS)
+    assert np.all(df["item_num"] == np.arange(5))
+    assert np.all(df["score"] == data)
+    assert np.all(df["rank"] == np.arange(1, 6))

From f2a3ea6240ba48a3dd8c3ecdc37104734da49914 Mon Sep 17 00:00:00 2001
From: Michael Ekstrand <mdekstrand@drexel.edu>
Date: Wed, 31 Jul 2024 13:17:54 -0400
Subject: [PATCH 16/16] fix LiteralString import

---
 lenskit/lenskit/data/items.py   | 12 ++++++++++--
 lenskit/lenskit/data/mtarray.py |  3 +--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py
index 96059ffa5..595ddac07 100644
--- a/lenskit/lenskit/data/items.py
+++ b/lenskit/lenskit/data/items.py
@@ -10,12 +10,20 @@
 
 from __future__ import annotations
 
-from typing import Any, Literal, LiteralString, Sequence, TypeAlias, TypeVar, cast, overload
-
 import numpy as np
 import pandas as pd
 import torch
 from numpy.typing import ArrayLike, NDArray
+from typing_extensions import (
+    Any,
+    Literal,
+    LiteralString,
+    Sequence,
+    TypeAlias,
+    TypeVar,
+    cast,
+    overload,
+)
 
 from lenskit.data.checks import check_1d
 from lenskit.data.mtarray import MTArray, MTGenericArray
diff --git a/lenskit/lenskit/data/mtarray.py b/lenskit/lenskit/data/mtarray.py
index d9e97fe58..de1f73713 100644
--- a/lenskit/lenskit/data/mtarray.py
+++ b/lenskit/lenskit/data/mtarray.py
@@ -7,11 +7,10 @@
 # pyright: basic
 from __future__ import annotations
 
-from typing import Generic, Literal, LiteralString, Sequence, TypeVar, overload
-
 import numpy as np
 import torch
 from numpy.typing import ArrayLike, NDArray
+from typing_extensions import Generic, Literal, LiteralString, Sequence, TypeVar, overload
 
 NPT = TypeVar("NPT", bound=np.generic)