From 4ce186de59551d17d60bc06cc915f4b5e1c9474e Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 16:56:51 -0400 Subject: [PATCH 01/22] move/copy split tests --- lenskit/tests/test_split_ratings.py | 121 ++++++++++++++ lenskit/tests/test_split_user_holdout.py | 98 ++++++++++++ lenskit/tests/test_split_users.py | 193 +++++++++++++++++++++++ 3 files changed, 412 insertions(+) create mode 100644 lenskit/tests/test_split_ratings.py create mode 100644 lenskit/tests/test_split_user_holdout.py create mode 100644 lenskit/tests/test_split_users.py diff --git a/lenskit/tests/test_split_ratings.py b/lenskit/tests/test_split_ratings.py new file mode 100644 index 000000000..5e061b87f --- /dev/null +++ b/lenskit/tests/test_split_ratings.py @@ -0,0 +1,121 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +import lenskit.crossfold as xf + + +def test_partition_rows(ml_ratings: pd.DataFrame): + splits = xf.partition_rows(ml_ratings, 5) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + # we should partition! + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + i1 = s1.test.set_index(["user", "item"]).index + i2 = s2.test.set_index(["user", "item"]).index + inter = i1.intersection(i2) + assert len(inter) == 0 + + union = ft.reduce(lambda i1, i2: i1.union(i2), (s.test.index for s in splits)) + assert len(union.unique()) == len(ml_ratings) + + +def test_sample_rows(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=5, size=1000) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + assert len(s.test) == 1000 + assert len(s.test) + len(s.train) == len(ml_ratings) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + i1 = s1.test.set_index(["user", "item"]).index + i2 = s2.test.set_index(["user", "item"]).index + inter = i1.intersection(i2) + assert len(inter) == 0 + + +def test_sample_rows_more_smaller_parts(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=10, size=500) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + assert len(s.test) == 500 + assert len(s.test) + len(s.train) == len(ml_ratings) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + i1 = s1.test.set_index(["user", "item"]).index + i2 = s2.test.set_index(["user", "item"]).index + inter = i1.intersection(i2) + assert len(inter) == 0 + + +def test_sample_non_disjoint(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=10, size=1000, disjoint=False) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + assert 
len(s.test) == 1000 + assert len(s.test) + len(s.train) == len(ml_ratings) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + # There are enough splits & items we should pick at least one duplicate + ipairs = ( + (s1.test.set_index(["user", "item"]).index, s2.test.set_index(["user", "item"]).index) + for (s1, s2) in it.product(splits, splits) + ) + isizes = [len(i1.intersection(i2)) for (i1, i2) in ipairs] + assert any(n > 0 for n in isizes) + + +@pytest.mark.slow +def test_sample_oversize(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, 50, 10000) + splits = list(splits) + assert len(splits) == 50 + + for s in splits: + assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_user_holdout.py new file mode 100644 index 000000000..504c9ee06 --- /dev/null +++ b/lenskit/tests/test_split_user_holdout.py @@ -0,0 +1,98 @@ +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +import lenskit.crossfold as xf + + +def test_sample_n(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + s5 = xf.SampleN(5) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = s5(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 5 + assert len(tst) + len(trn) == len(udf) + + s10 = xf.SampleN(10) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = s10(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 10 + assert len(tst) + len(trn) == len(udf) + + +def test_sample_frac(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + samp = xf.SampleFrac(0.2) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.2) + assert len(tst) <= math.ceil(len(udf) * 0.2) + + samp = xf.SampleFrac(0.5) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.5) + assert len(tst) <= math.ceil(len(udf) * 0.5) + + +def test_last_n(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + samp = xf.LastN(5) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 5 + assert len(tst) + len(trn) == len(udf) + assert tst.timestamp.min() >= trn.timestamp.max() + + samp = xf.LastN(7) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 7 + assert len(tst) + len(trn) == len(udf) + assert tst.timestamp.min() >= trn.timestamp.max() + + +def test_last_frac(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + samp = xf.LastFrac(0.2, "timestamp") + for u in users: + udf = 
ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.2) + assert len(tst) <= math.ceil(len(udf) * 0.2) + assert tst.timestamp.min() >= trn.timestamp.max() + + samp = xf.LastFrac(0.5, "timestamp") + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.5) + assert len(tst) <= math.ceil(len(udf) * 0.5) + assert tst.timestamp.min() >= trn.timestamp.max() diff --git a/lenskit/tests/test_split_users.py b/lenskit/tests/test_split_users.py new file mode 100644 index 000000000..2a6671b1c --- /dev/null +++ b/lenskit/tests/test_split_users.py @@ -0,0 +1,193 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +import lenskit.crossfold as xf + + +def test_partition_users(ml_ratings: pd.DataFrame): + splits = xf.partition_users(ml_ratings, 5, xf.SampleN(5)) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert all(s.train["user"].isin(s.train["user"].unique())) + assert len(s.test) + len(s.train) == len(ml_ratings) + + users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) + + +def test_partition_may_skip_train(ml_ratings: pd.DataFrame): + "Partitioning when users may not have enough ratings to be in the train and test sets." + # make a data set where some users only have 1 rating + ml_ratings = ml_ratings.sample(frac=0.1) + users = ml_ratings.groupby("user")["rating"].count() + assert users.min() == 1.0 # we should have some small users! + users.name = "ur_count" + + splits = xf.partition_users(ml_ratings, 5, xf.SampleN(1)) + splits = list(splits) + assert len(splits) == 5 + + # now we go make sure we're missing some users! 
And don't have any NaN ml_ratings + for train, test in splits: + # no null ml_ratings + assert all(train["rating"].notna()) + # see if test users with 1 rating are missing from train + test = test.join(users, on="user") + assert all(~(test.loc[test["ur_count"] == 1, "user"].isin(train["user"].unique()))) + # and users with more than one rating are in train + assert all(test.loc[test["ur_count"] > 1, "user"].isin(train["user"].unique())) + + +def test_partition_users_frac(ml_ratings: pd.DataFrame): + splits = xf.partition_users(ml_ratings, 5, xf.SampleFrac(0.2)) + splits = list(splits) + assert len(splits) == 5 + ucounts = ml_ratings.groupby("user").item.count() + uss = ucounts * 0.2 + + for s in splits: + tucs = s.test.groupby("user").item.count() + assert all(tucs >= uss.loc[tucs.index] - 1) + assert all(tucs <= uss.loc[tucs.index] + 1) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + # we have all users + users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) + + +def test_sample_users(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert len(s.test) == 5 * 100 + assert len(ucounts) == 100 + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + # no overlapping users + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + us1 = s1.test.user.unique() + us2 = s2.test.user.unique() + assert len(np.intersect1d(us1, us2)) == 0 + + +def test_sample_users_frac(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleFrac(0.2)) + splits = list(splits) + assert len(splits) == 5 + ucounts = ml_ratings.groupby("user").item.count() + uss = ucounts * 0.2 + + for s in splits: + tucs = s.test.groupby("user").item.count() + assert len(tucs) == 100 + assert all(tucs >= uss.loc[tucs.index] - 1) + assert all(tucs <= uss.loc[tucs.index] + 1) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + # no overlapping users + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + us1 = s1.test.user.unique() + us2 = s2.test.user.unique() + assert len(np.intersect1d(us1, us2)) == 0 + + +@pytest.mark.slow +def test_sample_users_frac_oversize(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5)) + splits = list(splits) + assert len(splits) == 20 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert len(ucounts) < 100 + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + us1 = s1.test.user.unique() + us2 = s2.test.user.unique() + assert len(np.intersect1d(us1, us2)) == 0 + + +def test_sample_users_frac_oversize_ndj(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5), disjoint=False) + splits = list(splits) + 
assert len(splits) == 20 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert len(ucounts) == 100 + assert len(s.test) == 5 * 100 + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + +def test_non_unique_index_partition_users(ml_ratings: pd.DataFrame): + """Partitioning users when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): + pass + + +def test_sample_users_dup_index(ml_ratings: pd.DataFrame): + """Sampling users when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)): + pass + + +def test_sample_rows_dup_index(ml_ratings: pd.DataFrame): + """Sampling ml_ratings when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.sample_rows(ml_ratings, partitions=5, size=1000): + pass + + +def test_partition_users_dup_index(ml_ratings: pd.DataFrame): + """Partitioning ml_ratings when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): + pass From e9dd86fe3e93a29d623de3cd232a3ddef77cd60c Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 17:02:18 -0400 Subject: [PATCH 02/22] start creating the new splitting package --- lenskit/lenskit/splitting/__init__.py | 9 ++ lenskit/lenskit/splitting/holdout.py | 101 +++++++++++++++++++++++ lenskit/lenskit/splitting/types.py | 27 ++++++ lenskit/tests/test_split_user_holdout.py | 6 ++ 4 files changed, 143 insertions(+) create mode 100644 lenskit/lenskit/splitting/__init__.py create mode 100644 lenskit/lenskit/splitting/holdout.py create mode 100644 lenskit/lenskit/splitting/types.py diff --git a/lenskit/lenskit/splitting/__init__.py b/lenskit/lenskit/splitting/__init__.py new file mode 100644 index 000000000..ec4765a1f --- /dev/null +++ b/lenskit/lenskit/splitting/__init__.py @@ -0,0 +1,9 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +""" +Splitting data for train-test evaluation. +""" diff --git a/lenskit/lenskit/splitting/holdout.py b/lenskit/lenskit/splitting/holdout.py new file mode 100644 index 000000000..770ec6550 --- /dev/null +++ b/lenskit/lenskit/splitting/holdout.py @@ -0,0 +1,101 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +""" +Per-user rating holdout methods for user-based data splitting. +""" + +from typing import Protocol + +from seedbank import numpy_rng + + +class HoldoutMethod(Protocol): + """ + Holdout methods select test rows for a user (or item). Partition methods + are callable; when called with a data frame, they return the test entries. + """ + + def __call__(self, udf): + """ + Subset a data frame. 
+ + Args: + udf(pandas.DataFrame): + The input data frame of rows for a user or item. + + Returns: + pandas.DataFrame: + The data frame of test rows, a subset of ``udf``. + """ + pass + + +class SampleN(HoldoutMethod): + """ + Randomly select a fixed number of test rows per user/item. + + Args: + n(int): the number of test items to select + rng: the random number generator or seed + """ + + def __init__(self, n, rng_spec=None): + self.n = n + self.rng = numpy_rng(rng_spec) + + def __call__(self, udf): + return udf.sample(n=self.n, random_state=self.rng) + + +class SampleFrac(HoldoutMethod): + """ + Randomly select a fraction of test rows per user/item. + + Args: + frac(float): the fraction items to select for testing. + """ + + def __init__(self, frac, rng_spec=None): + self.fraction = frac + self.rng = numpy_rng(rng_spec) + + def __call__(self, udf): + return udf.sample(frac=self.fraction, random_state=self.rng) + + +class LastN(HoldoutMethod): + """ + Select a fixed number of test rows per user/item, based on ordering by a + column. + + Args: + n(int): The number of test items to select. + """ + + def __init__(self, n, col="timestamp"): + self.n = n + self.column = col + + def __call__(self, udf): + return udf.sort_values(self.column).iloc[-self.n :] + + +class LastFrac(HoldoutMethod): + """ + Select a fraction of test rows per user/item. + + Args: + frac(double): the fraction of items to select for testing. + """ + + def __init__(self, frac, col="timestamp"): + self.fraction = frac + self.column = col + + def __call__(self, udf): + n = round(len(udf) * self.fraction) + return udf.sort_values(self.column).iloc[-n:] diff --git a/lenskit/lenskit/splitting/types.py b/lenskit/lenskit/splitting/types.py new file mode 100644 index 000000000..8190feba5 --- /dev/null +++ b/lenskit/lenskit/splitting/types.py @@ -0,0 +1,27 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +from typing import NamedTuple + +import pandas as pd + +from lenskit.data.dataset import Dataset + + +class TTPair(NamedTuple): + """ + A train-test pair from splitting. + """ + + train: Dataset + """ + The training data. + """ + + test: pd.DataFrame + """ + The test data. + """ diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_user_holdout.py index 504c9ee06..a569aa002 100644 --- a/lenskit/tests/test_split_user_holdout.py +++ b/lenskit/tests/test_split_user_holdout.py @@ -1,3 +1,9 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. 
+# SPDX-License-Identifier: MIT + import functools as ft import itertools as it import math From 7f13fe03555c8d1885cbc7de6ccffabab25498c0 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:30:18 -0400 Subject: [PATCH 03/22] add support for subsetting item lists --- lenskit/lenskit/data/items.py | 38 ++++++++++++++++++++++++++++++ lenskit/tests/test_itemlist.py | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 3fd5ce93c..1cf8ac6d1 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -44,6 +44,10 @@ class ItemList: An item list logically a list of rows, each of which is an item, like a :class:`~pandas.DataFrame` but supporting multiple array backends. + Item lists can be subset as an array (e.g. ``items[selector]``), where + integer indices (or arrays thereof), boolean arrays, and slices are allowed + as selectors. + When an item list is pickled, it is pickled compactly but only for CPUs: the vocabulary is dropped (after ensuring both IDs and numbers are computed), and all arrays are pickled as NumPy arrays. This makes item lists compact @@ -74,6 +78,13 @@ class is doing somewhat double-duty, representing a list of items along scores in the keyword arguments. Other field names should be singular. + .. todo:: + + Right now, selection / subsetting only happens on the CPU, and will move + data to the CPU for the subsetting operation. There is no reason, in + principle, why we cannot subset on GPU. Future revisions may add + support for this. + Args: item_ids: A list or array of item identifiers. ``item_id`` is accepted as an @@ -333,6 +344,33 @@ def to_df(self) -> pd.DataFrame: def __len__(self): return self._len + def __getitem__( + self, + sel: NDArray[np.bool_] | NDArray[np.integer] | Sequence[int] | torch.Tensor | int | slice, + ) -> ItemList: + """ + Subset the item list. + + Args: + sel: + The items to select. Can be either a Boolean array of the same + length as the list that is ``True`` to indicate selected items, + or an array of indices of the items to retain (in order in the + list, starting from 0). + """ + if np.isscalar(sel): + sel = np.array([sel]) + elif not isinstance(sel, slice): + sel = np.asarray(sel) + + # sel is now a selection array, or it is a slice. numpy supports both. 
+ iids = self._ids[sel] if self._ids is not None else None + nums = self._numbers.numpy()[sel] if self._numbers is not None else None + flds = {n: f.numpy()[sel] for (n, f) in self._fields.items()} + return ItemList( + item_ids=iids, item_nums=nums, vocabulary=self._vocab, ordered=self.ordered, **flds + ) + def __getstate__(self) -> dict[str, object]: state: dict[str, object] = {"ordered": self.ordered, "len": self._len} if self._ids is not None: diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 5b9957ec1..41abda1f9 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -231,3 +231,45 @@ def test_item_list_pickle_fields(ml_ds): assert np.all(r2.field("rating") == row.field("rating")) assert r2.field("timestamp") is not None assert np.all(r2.field("timestamp") == row.field("timestamp")) + + +def test_subset_mask(ml_ds): + row = ml_ds.user_row(user_num=400) + ratings = row.field("rating") + assert ratings is not None + + mask = ratings > 3.0 + pos = row[mask] + + assert len(pos) == np.sum(mask) + assert np.all(pos.ids() == row.ids()[mask]) + assert np.all(pos.numbers() == row.numbers()[mask]) + assert np.all(pos.field("rating") == row.field("rating")[mask]) + assert np.all(pos.field("rating") > 3.0) + + +def test_subset_idx(ml_ds): + row = ml_ds.user_row(user_num=400) + ratings = row.field("rating") + assert ratings is not None + + ks = [0, 5, 15] + pos = row[ks] + + assert len(pos) == 3 + assert np.all(pos.ids() == row.ids()[ks]) + assert np.all(pos.numbers() == row.numbers()[ks]) + assert np.all(pos.field("rating") == row.field("rating")[ks]) + + +def test_subset_slice(ml_ds): + row = ml_ds.user_row(user_num=400) + ratings = row.field("rating") + assert ratings is not None + + pos = row[5:10] + + assert len(pos) == 5 + assert np.all(pos.ids() == row.ids()[5:10]) + assert np.all(pos.numbers() == row.numbers()[5:10]) + assert np.all(pos.field("rating") == row.field("rating")[5:10]) From f993ceb1faaea37bf3bf49cb465a1141764bf527 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:44:57 -0400 Subject: [PATCH 04/22] add __str__ to ItemList --- lenskit/lenskit/data/items.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 1cf8ac6d1..6b534f049 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -394,3 +394,6 @@ def __setstate__(self, state: dict[str, Any]): if "numbers" in state: self._numbers = MTArray(state["numbers"]) self._fields = {k[6:]: MTArray(v) for (k, v) in state.items() if k.startswith("field_")} + + def __str__(self) -> str: + return f"" From 1c9b5f93bb9d1b7f70ddcde73ec236231ab8eeb1 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:45:08 -0400 Subject: [PATCH 05/22] reimplement holdouts to use item lists --- lenskit/lenskit/splitting/holdout.py | 92 ++++++++++----- lenskit/tests/test_split_user_holdout.py | 143 +++++++++++++---------- 2 files changed, 144 insertions(+), 91 deletions(-) diff --git a/lenskit/lenskit/splitting/holdout.py b/lenskit/lenskit/splitting/holdout.py index 770ec6550..070fc18ea 100644 --- a/lenskit/lenskit/splitting/holdout.py +++ b/lenskit/lenskit/splitting/holdout.py @@ -10,28 +10,33 @@ from typing import Protocol +import numpy as np from seedbank import numpy_rng +from seedbank.numpy import NPRNGSource + +from lenskit.data.items import ItemList class HoldoutMethod(Protocol): """ - Holdout methods select test rows for a user (or item). 
Partition methods
-    are callable; when called with a data frame, they return the test entries.
+    Holdout methods select test rows for a user (or occasionally an item).
+    They are callable; when called with an item list, they return the test
+    items.
     """

-    def __call__(self, udf):
+    def __call__(self, items: ItemList) -> ItemList:
         """
-        Subset a data frame.
+        Subset an item list (in the uncommon case of item-based holdouts, the
+        item list actually holds user IDs).

         Args:
-            udf(pandas.DataFrame):
-                The input data frame of rows for a user or item.
+            items:
+                The item list from which holdout items should be selected.

         Returns:
-            pandas.DataFrame:
-                The data frame of test rows, a subset of ``udf``.
+            The list of test items.
         """
-        pass
+        raise NotImplementedError()


 class SampleN(HoldoutMethod):
     """
     Randomly select a fixed number of test rows per user/item.

     Args:
-        n(int): the number of test items to select
+        n: the number of test items to select
         rng: the random number generator or seed
     """

+    n: int
+    rng: np.random.Generator
+
-    def __init__(self, n, rng_spec=None):
+    def __init__(self, n: int, rng_spec: NPRNGSource | None = None):
         self.n = n
         self.rng = numpy_rng(rng_spec)

-    def __call__(self, udf):
-        return udf.sample(n=self.n, random_state=self.rng)
+    def __call__(self, items: ItemList) -> ItemList:
+        if len(items) <= self.n:
+            return items
+
+        sel = self.rng.choice(len(items), self.n, replace=False)
+        return items[sel]


 class SampleFrac(HoldoutMethod):
     """
     Randomly select a fraction of test rows per user/item.

     Args:
-        frac(float): the fraction items to select for testing.
+        frac: the fraction of items to select for testing.
     """

+    fraction: float
+    rng: np.random.Generator
+
-    def __init__(self, frac, rng_spec=None):
+    def __init__(self, frac: float, rng_spec: NPRNGSource | None = None):
         self.fraction = frac
         self.rng = numpy_rng(rng_spec)

-    def __call__(self, udf):
-        return udf.sample(frac=self.fraction, random_state=self.rng)
+    def __call__(self, items: ItemList) -> ItemList:
+        n = round(len(items) * self.fraction)
+        sel = self.rng.choice(len(items), n, replace=False)
+        return items[sel]


 class LastN(HoldoutMethod):
     """
     Select a fixed number of test rows per user/item, based on ordering by a
-    column.
+    field.

     Args:
-        n(int): The number of test items to select.
+        n: The number of test items to select.
+        field: The field to order by.
     """

-    def __init__(self, n, col="timestamp"):
+    n: int
+    field: str
+
+    def __init__(self, n: int, field: str = "timestamp"):
         self.n = n
-        self.column = col
+        self.field = field
+
+    def __call__(self, items: ItemList) -> ItemList:
+        if len(items) <= self.n:
+            return items

-    def __call__(self, udf):
-        return udf.sort_values(self.column).iloc[-self.n :]
+        col = items.field(self.field)
+        if col is None:
+            raise TypeError(f"item list does not have ordering field {self.field}")
+        ordered = np.argsort(col)
+        return items[ordered[-self.n :]]


 class LastFrac(HoldoutMethod):
     """
     Select a fraction of test rows per user/item.

     Args:
         frac(double): the fraction of items to select for testing.
""" - def __init__(self, frac, col="timestamp"): + fraction: float + field: str + + def __init__(self, frac: float, field: str = "timestamp"): self.fraction = frac - self.column = col + self.field = field + + def __call__(self, items: ItemList) -> ItemList: + n = round(len(items) * self.fraction) - def __call__(self, udf): - n = round(len(udf) * self.fraction) - return udf.sort_values(self.column).iloc[-n:] + col = items.field(self.field) + if col is None: + raise TypeError(f"item list does not have ordering field {self.field}") + ordered = np.argsort(col) + return items[ordered[-n:]] diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_user_holdout.py index a569aa002..931526864 100644 --- a/lenskit/tests/test_split_user_holdout.py +++ b/lenskit/tests/test_split_user_holdout.py @@ -13,92 +13,109 @@ import pytest -import lenskit.crossfold as xf +from lenskit.data.dataset import Dataset +from lenskit.splitting.holdout import LastFrac, LastN, SampleFrac, SampleN -def test_sample_n(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_sample_n(ml_ds: Dataset): + users = np.random.choice(ml_ds.users.ids(), 5, replace=False) - s5 = xf.SampleN(5) + s5 = SampleN(5) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = s5(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = s5(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 5 - assert len(tst) + len(trn) == len(udf) + assert len(tst) + len(trn) == len(row) - s10 = xf.SampleN(10) + s10 = SampleN(10) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = s10(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = s10(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 10 - assert len(tst) + len(trn) == len(udf) + assert len(tst) + len(trn) == len(row) -def test_sample_frac(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_sample_frac(ml_ds: Dataset): + users = np.random.choice(ml_ds.users.ids(), 5, replace=False) - samp = xf.SampleFrac(0.2) + samp = SampleFrac(0.2) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.2) - assert len(tst) <= math.ceil(len(udf) * 0.2) - - samp = xf.SampleFrac(0.5) + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.2) + assert len(tst) <= math.ceil(len(row) * 0.2) + + samp = SampleFrac(0.5) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.5) - assert len(tst) <= math.ceil(len(udf) * 0.5) + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.5) + assert len(tst) <= math.ceil(len(row) * 0.5) -def test_last_n(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_last_n(ml_ds: Dataset): + users = 
np.random.choice(ml_ds.users.ids(), 5, replace=False) - samp = xf.LastN(5) + samp = LastN(5) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 5 - assert len(tst) + len(trn) == len(udf) - assert tst.timestamp.min() >= trn.timestamp.max() + assert len(tst) + len(trn) == len(row) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() - samp = xf.LastN(7) + samp = LastN(7) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 7 - assert len(tst) + len(trn) == len(udf) - assert tst.timestamp.min() >= trn.timestamp.max() + assert len(tst) + len(trn) == len(row) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() -def test_last_frac(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_last_frac(ml_ds: Dataset): + users = np.random.choice(ml_ds.users.ids(), 5, replace=False) - samp = xf.LastFrac(0.2, "timestamp") + samp = LastFrac(0.2, "timestamp") for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.2) - assert len(tst) <= math.ceil(len(udf) * 0.2) - assert tst.timestamp.min() >= trn.timestamp.max() - - samp = xf.LastFrac(0.5, "timestamp") + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.2) + assert len(tst) <= math.ceil(len(row) * 0.2) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() + + samp = LastFrac(0.5, "timestamp") for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.5) - assert len(tst) <= math.ceil(len(udf) * 0.5) - assert tst.timestamp.min() >= trn.timestamp.max() + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.5) + assert len(tst) <= math.ceil(len(row) * 0.5) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() From f326273647338acda5489ca9c6e84f262f91a50e Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:51:53 -0400 Subject: [PATCH 06/22] rename test files for split tests --- .../tests/{test_split_user_holdout.py => test_split_holdout.py} | 0 lenskit/tests/{test_split_ratings.py => test_split_rows.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename lenskit/tests/{test_split_user_holdout.py => test_split_holdout.py} (100%) rename lenskit/tests/{test_split_ratings.py => test_split_rows.py} (100%) diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_holdout.py similarity index 100% rename from lenskit/tests/test_split_user_holdout.py rename to lenskit/tests/test_split_holdout.py diff --git a/lenskit/tests/test_split_ratings.py 
b/lenskit/tests/test_split_rows.py similarity index 100% rename from lenskit/tests/test_split_ratings.py rename to lenskit/tests/test_split_rows.py From 4cbc27ba532ba5a962115a49339488a8cc3391c9 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:52:34 -0400 Subject: [PATCH 07/22] add type ignores for holdout test --- lenskit/tests/test_split_holdout.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lenskit/tests/test_split_holdout.py b/lenskit/tests/test_split_holdout.py index 931526864..f9a0666c3 100644 --- a/lenskit/tests/test_split_holdout.py +++ b/lenskit/tests/test_split_holdout.py @@ -79,7 +79,7 @@ def test_last_n(ml_ds: Dataset): trn = row[~mask] assert len(tst) == 5 assert len(tst) + len(trn) == len(row) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore samp = LastN(7) for u in users: @@ -90,7 +90,7 @@ def test_last_n(ml_ds: Dataset): trn = row[~mask] assert len(tst) == 7 assert len(tst) + len(trn) == len(row) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore def test_last_frac(ml_ds: Dataset): @@ -106,7 +106,7 @@ def test_last_frac(ml_ds: Dataset): assert len(tst) + len(trn) == len(row) assert len(tst) >= math.floor(len(row) * 0.2) assert len(tst) <= math.ceil(len(row) * 0.2) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore samp = LastFrac(0.5, "timestamp") for u in users: @@ -118,4 +118,4 @@ def test_last_frac(ml_ds: Dataset): assert len(tst) + len(trn) == len(row) assert len(tst) >= math.floor(len(row) * 0.5) assert len(tst) <= math.ceil(len(row) * 0.5) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore From 151ebc40520bf1cd5f448f9ddce2b6c3a8876342 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 16:20:33 -0400 Subject: [PATCH 08/22] update docs --- docs/releases/2024.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst index 23e92cb05..dfbaa2358 100644 --- a/docs/releases/2024.rst +++ b/docs/releases/2024.rst @@ -41,6 +41,11 @@ Significant Changes without round-tripping through Pandas and NumPy, and keep this transparent to client code). +* Where Pandas data frames are still used, the standard user and item columns + have been renamed to ``user_id`` and ``item_id`` respectively, with + ``user_num`` and ``item_num`` for 0-based user and item numbers. This is to + remove ambiguity about how users and items are being referenced. + * **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms, instead of Numba-accelerated NumPy code. 
Algorithms using PyTorch are: From b0303e4a6b8b35759f5bd18889fa8777694678a7 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 16:21:54 -0400 Subject: [PATCH 09/22] make ids & numbers optional converting ItemList to a data frame --- lenskit/lenskit/data/items.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 6b534f049..6b0245da2 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -318,21 +318,30 @@ def field( else: return val.to(format) - def to_df(self) -> pd.DataFrame: + def to_df(self, *, ids: bool = True, numbers: bool = True) -> pd.DataFrame: """ Convert this item list to a Pandas data frame. It has the following columns: - * ``item_id`` — the item IDs (if available) - * ``item_id`` — the item numbers (if available) + * ``item_id`` — the item IDs (if available and ``ids=True``) + * ``item_num`` — the item numbers (if available and ``numbers=True``) * ``score`` — the item scores * ``rank`` — the item ranks (if the list is ordered) * all other defined fields, using their field names """ cols = {} - if self._ids is not None or self._vocab is not None: + if ids and self._ids is not None or self._vocab is not None: cols["item_id"] = self.ids() - if self._numbers is not None or self._vocab is not None: + if numbers and self._numbers is not None or self._vocab is not None: cols["item_num"] = self.numbers() + # we need to have numbers or ids, or it makes no sense + if "item_id" not in cols and "item_num" not in cols: + if ids and not numbers: + raise RuntimeError("item list has no vocabulary, cannot compute IDs") + elif numbers and not ids: + raise RuntimeError("item list has no vocabulary, cannot compute numbers") + else: + raise RuntimeError("cannot create item data frame without identifiers or numbers") + if "score" in self._fields: cols["score"] = self.scores() if self.ordered: From 544f1e151dc1e7187917461776f979762d9bf2e3 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:28:23 -0400 Subject: [PATCH 10/22] add data splitting move to release notes --- docs/releases/2024.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst index dfbaa2358..0e3baa0e8 100644 --- a/docs/releases/2024.rst +++ b/docs/releases/2024.rst @@ -41,6 +41,12 @@ Significant Changes without round-tripping through Pandas and NumPy, and keep this transparent to client code). +* Data splitting for offline evaluation has been moved into + :mod:`lenskit.splitting`, updated to work with data sets and item lists + instead of raw data frames, and splitting functions have been renamed (e.g. + ``rows`` to ``records``) and had parameters updated for clarity and + consistency. + * Where Pandas data frames are still used, the standard user and item columns have been renamed to ``user_id`` and ``item_id`` respectively, with ``user_num`` and ``item_num`` for 0-based user and item numbers. 
This is to From 86225113fefb2ad60db46d670c6400078a57f139 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:28:36 -0400 Subject: [PATCH 11/22] add support for counting observed pairs --- lenskit/lenskit/data/dataset.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py index abff11283..ddb6a4f57 100644 --- a/lenskit/lenskit/data/dataset.py +++ b/lenskit/lenskit/data/dataset.py @@ -116,6 +116,7 @@ def count(self, what: str) -> int: * users * items + * pairs (observed user-item pairs) * interactions * ratings """ @@ -338,6 +339,11 @@ def interaction_matrix( underlying data, then this is equivalent to ``"indicator"``, except that the ``"pandas"`` format will include a ``"rating"`` column of all 1s. + + The ``"pandas"`` format also supports the special field name + ``"all"`` to return a data frame with all available fields. When + ``field="all"``, a field named ``count`` (if defined) is + combined with the ``sum`` method, and other fields use ``last``. combine: How to combine multiple observations for a single user-item pair. Available methods are: @@ -348,7 +354,8 @@ def interaction_matrix( field. * ``"sum"`` — sum the field values. * ``"first"``, ``"last"`` — take the first or last value seen - (in timestamp order, if timestamps are defined). + (in timestamp order, if timestamps are defined; otherwise, + their order in the original input). layout: The layout for a sparse matrix. Can be either ``csr`` or ``coo``, or ``None`` to use the default for the specified @@ -488,8 +495,8 @@ def user_stats(self) -> pd.DataFrame: class MatrixDataset(Dataset): """ - Dataset implementation using an in-memory rating or implicit-feedback - matrix. + Dataset implementation using an in-memory rating or implicit-feedback matrix + (with no duplicate interactions). .. note:: Client code generally should not construct this class directly. 
Instead @@ -554,7 +561,7 @@ def count(self, what: str) -> int: return self._users.size case "items": return self._items.size - case "interactions" | "ratings": + case "pairs" | "interactions" | "ratings": return self._matrix.n_obs case _: raise KeyError(f"unknown entity type {what}") @@ -603,16 +610,16 @@ def _int_mat_pandas(self, field: str | None, original_ids: bool) -> pd.DataFrame "user_num": self._matrix.user_nums, "item_num": self._matrix.item_nums, } - if field == "rating": + if field == "all" or field == "rating": if self._matrix.ratings is not None: cols["rating"] = self._matrix.ratings else: cols["rating"] = np.ones(self._matrix.n_obs) - elif field == "timestamp": + elif field == "all" or field == "timestamp": if self._matrix.timestamps is None: raise FieldError("interaction", field) cols["timestamp"] = self._matrix.timestamps - elif field: + elif field and field != "all": raise FieldError("interaction", field) return pd.DataFrame(cols) From c6b7d0eb84c4be7e2e2a1c784944a74add84530f Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:29:05 -0400 Subject: [PATCH 12/22] add support for making item lists from pandas dataframes --- lenskit/lenskit/data/items.py | 34 +++++++++++++++++++++++++++++++++- lenskit/tests/test_itemlist.py | 22 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 6b0245da2..a046c44af 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -142,7 +142,7 @@ def __init__( if item_ids is not None: self._ids = np.asarray(item_ids) - if not issubclass(self._ids.dtype.type, (np.integer, np.str_, np.bytes_)): + if not issubclass(self._ids.dtype.type, (np.integer, np.str_, np.bytes_, np.object_)): raise TypeError(f"item IDs not integers or bytes (type: {self._ids.dtype})") check_1d(self._ids, label="item_ids") @@ -165,6 +165,38 @@ def __init__( raise ValueError("cannot specify both scores= and score=") self._fields["score"] = MTArray(scores) + @classmethod + def from_df( + cls, df: pd.DataFrame, *, vocabulary=Vocabulary[EntityId], keep_user: bool = False + ) -> ItemList: + """ + Create a item list from a Pandas data frame. The frame should have + ``item_num`` and/or ``item_id`` columns to identify the items; other + columns (e.g. ``score`` or ``rating``) are added as fields. If the data + frame has user columns (``user_id`` or ``user_num``), those are dropped + by default. + + Args: + df: + The data frame to turn into an item list. + vocabulary: + The item vocabulary. + keep_user: + If ``True``, keeps user ID/number columns instead of dropping them. + """ + ids = df["item_id"].values if "item_id" in df.columns else None + nums = df["item_num"].values if "item_num" in df.columns else None + if ids is None and nums is None: + raise TypeError("data frame must have at least one of item_id, item_num columns") + + to_drop = ["item_id", "item_num"] + if not keep_user: + to_drop += ["user_id", "user_num"] + df = df.drop(columns=to_drop, errors="ignore") + + fields = {f: df[f].values for f in df.columns} + return cls(item_ids=ids, item_nums=nums, vocabulary=vocabulary, **fields) # type: ignore + def clone(self) -> ItemList: """ Make a shallow copy of the item list. 
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 41abda1f9..416735baa 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -6,6 +6,7 @@ import pickle import numpy as np +import pandas as pd import torch from pytest import raises @@ -273,3 +274,24 @@ def test_subset_slice(ml_ds): assert np.all(pos.ids() == row.ids()[5:10]) assert np.all(pos.numbers() == row.numbers()[5:10]) assert np.all(pos.field("rating") == row.field("rating")[5:10]) + + +def test_from_df(): + df = pd.DataFrame({"item_id": ITEMS, "item_num": np.arange(5), "score": np.random.randn(5)}) + il = ItemList.from_df(df, vocabulary=VOCAB) # type: ignore + assert len(il) == 5 + assert np.all(il.ids() == ITEMS) + assert np.all(il.numbers() == np.arange(5)) + assert np.all(il.scores() == df["score"].values) + + +def test_from_df_user(): + df = pd.DataFrame( + {"user_id": 50, "item_id": ITEMS, "item_num": np.arange(5), "score": np.random.randn(5)} + ) + il = ItemList.from_df(df, vocabulary=VOCAB) # type: ignore + assert len(il) == 5 + assert np.all(il.ids() == ITEMS) + assert np.all(il.numbers() == np.arange(5)) + assert np.all(il.scores() == df["score"].values) + assert il.field("user_id") is None From 36a73cfaa785733430d83f9a4c65f61e6e6af088 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:29:30 -0400 Subject: [PATCH 13/22] add utilities for split test data & test it --- lenskit/lenskit/splitting/__init__.py | 2 ++ lenskit/lenskit/splitting/split.py | 52 +++++++++++++++++++++++++++ lenskit/lenskit/splitting/types.py | 27 -------------- lenskit/tests/test_split_types.py | 24 +++++++++++++ 4 files changed, 78 insertions(+), 27 deletions(-) create mode 100644 lenskit/lenskit/splitting/split.py delete mode 100644 lenskit/lenskit/splitting/types.py create mode 100644 lenskit/tests/test_split_types.py diff --git a/lenskit/lenskit/splitting/__init__.py b/lenskit/lenskit/splitting/__init__.py index ec4765a1f..080e9a8c2 100644 --- a/lenskit/lenskit/splitting/__init__.py +++ b/lenskit/lenskit/splitting/__init__.py @@ -7,3 +7,5 @@ """ Splitting data for train-test evaluation. """ + +from .split import TTSplit # noqa: F401 diff --git a/lenskit/lenskit/splitting/split.py b/lenskit/lenskit/splitting/split.py new file mode 100644 index 000000000..579feaffd --- /dev/null +++ b/lenskit/lenskit/splitting/split.py @@ -0,0 +1,52 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +from typing import Literal, NamedTuple, TypeAlias + +import pandas as pd + +from lenskit.data.dataset import Dataset +from lenskit.data.items import ItemList +from lenskit.data.vocab import EntityId + +SplitTable: TypeAlias = Literal["matrix"] + + +class TTSplit(NamedTuple): + """ + A train-test pair from splitting. + """ + + train: Dataset + """ + The training data. + """ + + test: dict[EntityId, ItemList] + """ + The test data. + """ + + +def dict_to_df(data: dict[EntityId, ItemList]) -> pd.DataFrame: + """ + Convert a dictionary mapping user IDs to item lists into a data frame. + """ + + df = pd.concat( + {u: il.to_df(numbers=False) for (u, il) in data.items()}, + names=["user_id"], + ) + df = df.reset_index("user_id") + df = df.reset_index(drop=True) + return df + + +def dict_from_df(df: pd.DataFrame) -> dict[EntityId, ItemList]: + """ + Convert a dictionary mapping user IDs to item lists into a data frame. 
+ """ + return {u: ItemList.from_df(udf) for (u, udf) in df.groupby("user_id")} # type: ignore diff --git a/lenskit/lenskit/splitting/types.py b/lenskit/lenskit/splitting/types.py deleted file mode 100644 index 8190feba5..000000000 --- a/lenskit/lenskit/splitting/types.py +++ /dev/null @@ -1,27 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University -# Copyright (C) 2023-2024 Drexel University -# Licensed under the MIT license, see LICENSE.md for details. -# SPDX-License-Identifier: MIT - -from typing import NamedTuple - -import pandas as pd - -from lenskit.data.dataset import Dataset - - -class TTPair(NamedTuple): - """ - A train-test pair from splitting. - """ - - train: Dataset - """ - The training data. - """ - - test: pd.DataFrame - """ - The test data. - """ diff --git a/lenskit/tests/test_split_types.py b/lenskit/tests/test_split_types.py new file mode 100644 index 000000000..45c70f82b --- /dev/null +++ b/lenskit/tests/test_split_types.py @@ -0,0 +1,24 @@ +""" +Test the data type utilities in splits. +""" + +import numpy as np +import pandas as pd + +from lenskit.splitting.split import dict_from_df + + +def test_dict_from_df(rng, ml_ratings: pd.DataFrame): + ml_ratings = ml_ratings.rename(columns={"user": "user_id", "item": "item_id"}) + users = dict_from_df(ml_ratings) + assert len(users) == ml_ratings["user_id"].nunique() + assert set(users.keys()) == set(ml_ratings["user_id"]) + + for uid in rng.choice(ml_ratings["user_id"].unique(), 25): + items = users[uid] + udf = ml_ratings[ml_ratings["user_id"] == uid] + assert len(items) == len(udf) + assert np.all(np.unique(items.ids()) == np.unique(udf["item_id"])) + + tot = sum(len(il) for il in users.values()) + assert tot == len(ml_ratings) From 27fb76c5c622742d275144c48ac196e3626169d4 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:29:43 -0400 Subject: [PATCH 14/22] make record-based splitting work --- lenskit/lenskit/splitting/records.py | 170 +++++++++++++++++++++++++++ lenskit/tests/test_split_records.py | 164 ++++++++++++++++++++++++++ lenskit/tests/test_split_rows.py | 121 ------------------- 3 files changed, 334 insertions(+), 121 deletions(-) create mode 100644 lenskit/lenskit/splitting/records.py create mode 100644 lenskit/tests/test_split_records.py delete mode 100644 lenskit/tests/test_split_rows.py diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py new file mode 100644 index 000000000..a722247fc --- /dev/null +++ b/lenskit/lenskit/splitting/records.py @@ -0,0 +1,170 @@ +import logging +from typing import Iterator, overload + +import numpy as np +import pandas as pd +from seedbank import numpy_rng + +from lenskit.data.dataset import Dataset, MatrixDataset + +from .split import TTSplit, dict_from_df + +_log = logging.getLogger(__name__) + + +def crossfold_records(data: Dataset, partitions: int, *, rng_spec=None) -> Iterator[TTSplit]: + """ + Partition a dataset by **records** into cross-fold partitions. This + partitions the records (ratings, play counts, clicks, etc.) into *k* + partitions without regard to users or items. + + Since record-based random cross-validation doesn't make much sense with + repeated interactions, this splitter only supports operating on the + dataset's interaction matrix. + + Args: + data: + Ratings or other data you wish to partition. + partitions: + The number of partitions to produce. + rng_spec: + The random number generator or seed (see + :func:`seedbank.numpy_rng`). 
+
+    Returns:
+        iterator: an iterator of train-test pairs
+    """
+
+    _log.info("partitioning %d ratings into %d partitions", data.count("pairs"), partitions)
+    rng = numpy_rng(rng_spec)
+
+    # get the full data list to split
+    df = data.interaction_matrix(format="pandas", field="all", original_ids=True)
+    n = len(df)
+    rows = np.arange(n)
+
+    # shuffle the indices & split into partitions
+    rng.shuffle(rows)
+    test_sets = np.array_split(rows, partitions)
+
+    # convert each partition into a split
+    for ts in test_sets:
+        yield _make_pair(data, df, ts)
+
+
+@overload
+def sample_records(
+    data: Dataset, size: int, *, disjoint=True, rng_spec=None, repeats: None = None
+) -> TTSplit: ...
+@overload
+def sample_records(
+    data: Dataset, size: int, *, repeats: int, disjoint=True, rng_spec=None
+) -> Iterator[TTSplit]: ...
+def sample_records(
+    data: Dataset, size: int, *, repeats: int | None = None, disjoint=True, rng_spec=None
+) -> TTSplit | Iterator[TTSplit]:
+    """
+    Sample train-test splits of interaction records from a dataset.  Records
+    are sampled from the dataset's interaction matrix, without regard to users
+    or items.
+
+    We can loop over a sequence of train-test pairs::
+
+        >>> from lenskit.data.movielens import load_movielens_df
+        >>> ratings = load_movielens_df('data/ml-latest-small')
+        >>> for train, test in sample_records(ratings, 1000, repeats=5):
+        ...     print(sum(len(il) for il in test.values()))
+        1000
+        1000
+        1000
+        1000
+        1000
+
+    Sometimes for testing, it is useful to just get a single pair::
+
+        >>> train, test = sample_records(ratings, 1000)
+        >>> sum(len(il) for il in test.values())
+        1000
+
+    Args:
+        data:
+            The data set to split.
+        size:
+            The size of each test sample.
+        repeats:
+            The number of data splits to produce. If ``None``, produce a
+            _single_ train-test pair instead of an iterator or list.
+        disjoint:
+            If ``True``, force test samples to be disjoint.
+        rng_spec:
+            The random number generator or seed (see
+            :py:func:`seedbank.numpy_rng`).
+
+    Returns:
+        A train-test pair or iterator of such pairs (depending on ``repeats``).
+ """ + + rng = numpy_rng(rng_spec) + + # get the full data list to split + df = data.interaction_matrix(format="pandas", field="all", original_ids=True) + n = len(df) + + if repeats is None: + test_pos = rng.choice(np.int32(n), size, replace=False) + return _make_pair(data, df, test_pos) + + if disjoint and repeats * size >= n: + _log.warning( + "wanted %d disjoint splits of %d each, but only have %d rows; cross-folding", + repeats, + size, + n, + ) + return crossfold_records(data, repeats, rng_spec=rng) + + # get iterators over index arrays for producing the data + if disjoint: + _log.info("creating %d disjoint samples of size %d", repeats, size) + ips = _disjoint_samples(n, size, repeats, rng) + + else: + _log.info("taking %d samples of size %d", repeats, size) + ips = _n_samples(n, size, repeats, rng) + + # since this func is both generator and return depending on args, + # we can't use yield — need to return a generator expression + return (_make_pair(data, df, test_is) for test_is in ips) + + +def _make_pair( + data: Dataset, df: pd.DataFrame, test_is: np.ndarray[int, np.dtype[np.int32]] +) -> TTSplit: + mask = np.zeros(len(df), np.bool_) + mask[test_is] = True + + test = dict_from_df(df[mask]) + train = MatrixDataset(data.users, data.items, df[~mask]) + + return TTSplit(train, test) + + +def _disjoint_samples( + n: int, size: int, reps: int, rng: np.random.Generator +) -> Iterator[np.ndarray[int, np.dtype[np.int32]]]: + # shuffle the indices & split into partitions + xs = np.arange(n, dtype=np.int32) + rng.shuffle(xs) + + # convert each partition into a split + for i in range(reps): + start = i * size + end = start + size + yield xs[start:end] + + +def _n_samples( + n: int, size: int, reps: int, rng: np.random.Generator +) -> Iterator[np.ndarray[int, np.dtype[np.int32]]]: + for i in range(reps): + yield rng.choice(np.int32(n), size, replace=False) diff --git a/lenskit/tests/test_split_records.py b/lenskit/tests/test_split_records.py new file mode 100644 index 000000000..8baade56c --- /dev/null +++ b/lenskit/tests/test_split_records.py @@ -0,0 +1,164 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +from lenskit.data.dataset import Dataset +from lenskit.splitting.records import crossfold_records, sample_records + + +def test_crossfold_records(ml_ds: Dataset): + splits = crossfold_records(ml_ds, 5) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + # do we have all the data? 
+ test_count = sum(len(il) for il in s.test.values()) + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + # the test sets are pairwise disjoint + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + p1 = set((u, i) for (u, il) in s1.test.items() for i in il.ids()) + p2 = set((u, i) for (u, il) in s2.test.items() for i in il.ids()) + assert not (p1 & p2) + + +def test_sample_records_once(ml_ds): + train, test = sample_records(ml_ds, size=1000) + + test_count = sum(len(il) for il in test.values()) + assert test_count == 1000 + assert test_count + train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in test.items() for i in il.ids()) + tdf = train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + +def test_sample_records(ml_ds): + splits = sample_records(ml_ds, size=1000, repeats=5) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count == 1000 + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + p1 = set((u, i) for (u, il) in s1.test.items() for i in il.ids()) + p2 = set((u, i) for (u, il) in s2.test.items() for i in il.ids()) + assert not (p1 & p2) + + +def test_sample_rows_more_smaller_parts(ml_ds: Dataset): + splits = sample_records(ml_ds, 500, repeats=10) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count == 500 + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + p1 = set((u, i) for (u, il) in s1.test.items() for i in il.ids()) + p2 = set((u, i) for (u, il) in s2.test.items() for i in il.ids()) + assert not (p1 & p2) + + +def test_sample_non_disjoint(ml_ds: Dataset): + splits = sample_records(ml_ds, 1000, repeats=10, disjoint=False) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count == 1000 + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, 
il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + # There are enough splits & items we should pick at least one duplicate + ipairs = ( + ( + set((u, i) for (u, il) in s1.test.items() for i in il.ids()), + set((u, i) for (u, il) in s2.test.items() for i in il.ids()), + ) + for (s1, s2) in it.product(splits, splits) + ) + isizes = [len(i1.intersection(i2)) for (i1, i2) in ipairs] + assert any(n > 0 for n in isizes) + + +@pytest.mark.slow +def test_sample_oversize(ml_ds: Dataset): + splits = sample_records(ml_ds, 10000, repeats=50) + splits = list(splits) + assert len(splits) == 50 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") diff --git a/lenskit/tests/test_split_rows.py b/lenskit/tests/test_split_rows.py deleted file mode 100644 index 5e061b87f..000000000 --- a/lenskit/tests/test_split_rows.py +++ /dev/null @@ -1,121 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University -# Copyright (C) 2023-2024 Drexel University -# Licensed under the MIT license, see LICENSE.md for details. -# SPDX-License-Identifier: MIT - -import functools as ft -import itertools as it -import math - -import numpy as np -import pandas as pd - -import pytest - -import lenskit.crossfold as xf - - -def test_partition_rows(ml_ratings: pd.DataFrame): - splits = xf.partition_rows(ml_ratings, 5) - splits = list(splits) - assert len(splits) == 5 - - for s in splits: - assert len(s.test) + len(s.train) == len(ml_ratings) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - # we should partition! 
- for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - i1 = s1.test.set_index(["user", "item"]).index - i2 = s2.test.set_index(["user", "item"]).index - inter = i1.intersection(i2) - assert len(inter) == 0 - - union = ft.reduce(lambda i1, i2: i1.union(i2), (s.test.index for s in splits)) - assert len(union.unique()) == len(ml_ratings) - - -def test_sample_rows(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, partitions=5, size=1000) - splits = list(splits) - assert len(splits) == 5 - - for s in splits: - assert len(s.test) == 1000 - assert len(s.test) + len(s.train) == len(ml_ratings) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - i1 = s1.test.set_index(["user", "item"]).index - i2 = s2.test.set_index(["user", "item"]).index - inter = i1.intersection(i2) - assert len(inter) == 0 - - -def test_sample_rows_more_smaller_parts(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, partitions=10, size=500) - splits = list(splits) - assert len(splits) == 10 - - for s in splits: - assert len(s.test) == 500 - assert len(s.test) + len(s.train) == len(ml_ratings) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - i1 = s1.test.set_index(["user", "item"]).index - i2 = s2.test.set_index(["user", "item"]).index - inter = i1.intersection(i2) - assert len(inter) == 0 - - -def test_sample_non_disjoint(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, partitions=10, size=1000, disjoint=False) - splits = list(splits) - assert len(splits) == 10 - - for s in splits: - assert len(s.test) == 1000 - assert len(s.test) + len(s.train) == len(ml_ratings) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - # There are enough splits & items we should pick at least one duplicate - ipairs = ( - (s1.test.set_index(["user", "item"]).index, s2.test.set_index(["user", "item"]).index) - for (s1, s2) in it.product(splits, splits) - ) - isizes = [len(i1.intersection(i2)) for (i1, i2) in ipairs] - assert any(n > 0 for n in isizes) - - -@pytest.mark.slow -def test_sample_oversize(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, 50, 10000) - splits = list(splits) - assert len(splits) == 50 - - for s in splits: - assert len(s.test) + len(s.train) == len(ml_ratings) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 From bc35baa1fe0844fd32426e2f7a5a69ddf57a4808 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:36:48 -0400 Subject: [PATCH 15/22] add test-size utlity --- lenskit/lenskit/splitting/split.py | 7 +++++++ lenskit/tests/test_split_records.py | 15 ++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lenskit/lenskit/splitting/split.py b/lenskit/lenskit/splitting/split.py index 579feaffd..393424c78 100644 --- a/lenskit/lenskit/splitting/split.py +++ b/lenskit/lenskit/splitting/split.py @@ -30,6 +30,13 @@ class TTSplit(NamedTuple): The test 
data. """ + @property + def test_size(self) -> int: + """ + Get the number of test pairs. + """ + return sum(len(il) for il in self.test.values()) + def dict_to_df(data: dict[EntityId, ItemList]) -> pd.DataFrame: """ diff --git a/lenskit/tests/test_split_records.py b/lenskit/tests/test_split_records.py index 8baade56c..a4e565dfd 100644 --- a/lenskit/tests/test_split_records.py +++ b/lenskit/tests/test_split_records.py @@ -24,7 +24,7 @@ def test_crossfold_records(ml_ds: Dataset): for s in splits: # do we have all the data? - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) @@ -46,9 +46,10 @@ def test_crossfold_records(ml_ds: Dataset): def test_sample_records_once(ml_ds): - train, test = sample_records(ml_ds, size=1000) + split = sample_records(ml_ds, size=1000) + train, test = split - test_count = sum(len(il) for il in test.values()) + test_count = split.test_size assert test_count == 1000 assert test_count + train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in test.items() for i in il.ids()) @@ -67,7 +68,7 @@ def test_sample_records(ml_ds): assert len(splits) == 5 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count == 1000 assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) @@ -94,7 +95,7 @@ def test_sample_rows_more_smaller_parts(ml_ds: Dataset): assert len(splits) == 10 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count == 500 assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) @@ -121,7 +122,7 @@ def test_sample_non_disjoint(ml_ds: Dataset): assert len(splits) == 10 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count == 1000 assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) @@ -152,7 +153,7 @@ def test_sample_oversize(ml_ds: Dataset): assert len(splits) == 50 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) From cbfd00cb15dfb2bcddcd4b45ac2fd3a25a4f8a65 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:25:34 -0400 Subject: [PATCH 16/22] make vocabularies iterable --- lenskit/lenskit/data/vocab.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py index f4bdf7ca3..cf11be40c 100644 --- a/lenskit/lenskit/data/vocab.py +++ b/lenskit/lenskit/data/vocab.py @@ -11,7 +11,18 @@ # pyright: basic from __future__ import annotations -from typing import Any, Generic, Hashable, Iterable, Literal, Sequence, TypeAlias, TypeVar, overload +from typing import ( + Any, + Generic, + Hashable, + Iterable, + Iterator, + Literal, + Sequence, + TypeAlias, + TypeVar, + overload, +) import numpy 
as np import pandas as pd @@ -159,6 +170,9 @@ def __eq__(self, other: Vocabulary[Any]) -> bool: # noqa: F821 def __contains__(self, key: VT) -> bool: return key in self._index + def __iter__(self) -> Iterator[EntityId]: + return iter(self._index.values) + def __len__(self) -> int: return self.size From 6bfc75a81206e1d6a4057529637e1a2fca530943 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:25:55 -0400 Subject: [PATCH 17/22] strengthen user row tests --- lenskit/tests/test_dataset_matrix.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py index eb994d763..3a88c9041 100644 --- a/lenskit/tests/test_dataset_matrix.py +++ b/lenskit/tests/test_dataset_matrix.py @@ -372,12 +372,25 @@ def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, m def test_matrix_rows_by_num(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset): users = rng.choice(ml_ds.user_count, 50) + rated = set(zip(ml_ratings["user"], ml_ratings["item"])) + rdf = ml_ds.interaction_matrix("pandas") + rnums = set(zip(rdf["user_num"], rdf["item_num"])) + + dfi = ml_ratings.set_index(["user", "item"]) + for user in users: + uid = ml_ds.users.id(user) row = ml_ds.user_row(user_num=user) assert row is not None urows = ml_ratings[ml_ratings["user"] == ml_ds.users.id(user)].sort_values("item") assert set(row.ids()) == set(urows["item"]) + assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"])) + assert all((user, ino) in rnums for ino in row.numbers()) + + assert np.all(row.ids() == ml_ds.items.ids(row.numbers())) + assert all((uid, item) in rated for item in row.ids()) + assert all((uid, item) in dfi.index for item in row.ids()) ratings = row.field("rating") assert ratings is not None From 0f8277fae6d55daef95a8417ffc2c7de4517c7df Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:26:06 -0400 Subject: [PATCH 18/22] working user-based splitting --- lenskit/lenskit/splitting/records.py | 26 ++- lenskit/lenskit/splitting/users.py | 171 ++++++++++++++++++++ lenskit/tests/test_split_users.py | 227 +++++++++++---------------- 3 files changed, 288 insertions(+), 136 deletions(-) create mode 100644 lenskit/lenskit/splitting/users.py diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py index a722247fc..82bde8d45 100644 --- a/lenskit/lenskit/splitting/records.py +++ b/lenskit/lenskit/splitting/records.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from seedbank import numpy_rng +from seedbank.numpy import NPRNGSource from lenskit.data.dataset import Dataset, MatrixDataset @@ -12,7 +13,9 @@ _log = logging.getLogger(__name__) -def crossfold_records(data: Dataset, partitions: int, *, rng_spec=None) -> Iterator[TTSplit]: +def crossfold_records( + data: Dataset, partitions: int, *, rng_spec: NPRNGSource | None = None +) -> Iterator[TTSplit]: """ Partition a dataset by **records** into cross-fold partitions. This partitions the records (ratings, play counts, clicks, etc.) into *k* @@ -54,14 +57,29 @@ def crossfold_records(data: Dataset, partitions: int, *, rng_spec=None) -> Itera @overload def sample_records( - data: Dataset, size: int, *, disjoint=True, rng_spec=None, repeats: None = None + data: Dataset, + size: int, + *, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, + repeats: None = None, ) -> TTSplit: ... 
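For readers less familiar with ``typing.overload``, the stub signature just above and
the one just below encode how the return type follows ``repeats``; a small sketch of
the two call forms, assuming ``ml_ds`` is an already-loaded dataset (illustrative name)::

    from lenskit.splitting.records import sample_records

    split = sample_records(ml_ds, 1000)              # repeats=None, returns one TTSplit
    train, test = split                              # TTSplit is a NamedTuple, so it unpacks
    splits = sample_records(ml_ds, 1000, repeats=5)  # integer repeats, returns Iterator[TTSplit]
    first = next(splits)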
@overload def sample_records( - data: Dataset, size: int, *, repeats: int, disjoint=True, rng_spec=None + data: Dataset, + size: int, + *, + repeats: int, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, ) -> Iterator[TTSplit]: ... def sample_records( - data: Dataset, size: int, *, repeats: int | None = None, disjoint=True, rng_spec=None + data: Dataset, + size: int, + *, + repeats: int | None = None, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, ) -> TTSplit | Iterator[TTSplit]: """ Sample train-test a frame of ratings into train-test partitions. This diff --git a/lenskit/lenskit/splitting/users.py b/lenskit/lenskit/splitting/users.py new file mode 100644 index 000000000..1fb59acbb --- /dev/null +++ b/lenskit/lenskit/splitting/users.py @@ -0,0 +1,171 @@ +import logging +from typing import Iterable, Iterator, overload + +import numpy as np +import pandas as pd +from seedbank import numpy_rng +from seedbank.numpy import NPRNGSource + +from lenskit.data.dataset import Dataset, MatrixDataset +from lenskit.data.vocab import EntityId + +from .holdout import HoldoutMethod +from .split import TTSplit + +_log = logging.getLogger(__name__) + + +def crossfold_users( + data: Dataset, partitions: int, method: HoldoutMethod, *, rng_spec: NPRNGSource | None = None +) -> Iterator[TTSplit]: + """ + Partition a frame of ratings or other data into train-test partitions + user-by-user. This function does not care what kind of data is in `data`, so + long as it is a Pandas DataFrame (or equivalent) and has a `user` column. + + Args: + data: + a data frame containing ratings or other data you wish to partition. + partitions: + the number of partitions to produce + method: + The method for selecting test rows for each user. + rng_spec: + The RNG or seed (see :func:`seedbank.numpy_rng`). + + Returns + The train-test pairs. + """ + rng = numpy_rng(rng_spec) + + users = data.users.ids() + _log.info( + "partitioning %d rows for %d users into %d partitions", + data.count("pairs"), + len(users), + partitions, + ) + + # create an array of indexes into user row + rows = np.arange(len(users)) + # shuffle the indices & split into partitions + rng.shuffle(rows) + test_sets = np.array_split(rows, partitions) + + # get the whole test DF + df = data.interaction_matrix("pandas", field="all", original_ids=True).set_index( + ["user_id", "item_id"] + ) + + # convert each partition into a split + for i, ts in enumerate(test_sets): + # get our users! + test_us = users[ts] + _log.info("fold %d: selecting test ratings", i) + + yield _make_split(data, df, test_us, method) + + +@overload +def sample_users( + data: Dataset, + size: int, + method: HoldoutMethod, + *, + repeats: int, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, +) -> Iterator[TTSplit]: ... +@overload +def sample_users( + data: Dataset, + size: int, + method: HoldoutMethod, + *, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, + repeats: None = None, +) -> TTSplit: ... +def sample_users( + data: Dataset, + size: int, + method: HoldoutMethod, + *, + repeats: int | None = None, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, +) -> Iterator[TTSplit] | TTSplit: + """ + Create train-test splits by sampling users. When ``repeats`` is None, + returns a single train-test split; otherwise, it returns an iterator over + multiple splits. If ``repeats=1``, this function returns an iterator that + yields a single train-test pair. 
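A rough illustration of looping over disjoint user samples; a sketch only, assuming
``ml_ds`` is a loaded dataset with at least 500 users (names and sizes are placeholders)::

    from lenskit.splitting import SampleN, sample_users

    seen = set()
    # ml_ds: an already-loaded Dataset (assumed for illustration)
    for split in sample_users(ml_ds, 100, SampleN(5), repeats=5):
        assert len(split.test) == 100                    # 100 sampled test users
        assert not any(u in seen for u in split.test)    # disjoint across samples by default
        seen |= split.test.keys()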
+ + Args: + data: + Data frame containing ratings or other data you wish to partition. + size: + The sample size. + method: + The method for obtaining user test ratings. + repeats: + The number of samples to produce. + rng_spec: + The RNG or seed (see :func:`seedbank.numpy_rng`). + + Returns: + The train-test pair(s). + """ + + rng = numpy_rng(rng_spec) + + users = data.users.ids() + unums = np.arange(len(users)) + if disjoint and repeats is not None and repeats * size >= len(users): + _log.warning( + "cannot take %d disjoint samples of size %d from %d users", repeats, size, len(users) + ) + return crossfold_users(data, repeats, method) + + _log.info("sampling %d users (n=%d)", len(users), size) + + # get the whole test DF + rate_df = data.interaction_matrix("pandas", field="all", original_ids=True).set_index( + ["user_id", "item_id"] + ) + + if repeats is None: + test_us = rng.choice(users, size, replace=False) + return _make_split(data, rate_df, test_us, method) + + if disjoint: + rng.shuffle(unums) + test_usets = [unums[i * size : (i + 1) * size] for i in range(repeats)] + else: + test_usets = [rng.choice(len(users), size, replace=False) for _i in range(repeats)] + + return (_make_split(data, rate_df, users[us], method) for us in test_usets) + + +def _make_split( + data: Dataset, df: pd.DataFrame, test_us: Iterable[EntityId], method: HoldoutMethod +) -> TTSplit: + # create the test sets for these users + mask = pd.Series(True, index=df.index) + test = {} + + for u in test_us: + row = data.user_row(u) + assert row is not None + u_test = method(row) + test[u] = u_test + assert all((u, i) in mask.index for i in u_test.ids()) + mask.loc[[(u, i) for i in u_test.ids()]] = False # type: ignore + assert np.sum(mask.loc[u]) == len(row) - len(u_test) + + train_df = df[mask] + train = MatrixDataset(data.users, data.items, train_df.reset_index()) + + split = TTSplit(train, test) + assert len(train_df) + split.test_size == len(df) + return split diff --git a/lenskit/tests/test_split_users.py b/lenskit/tests/test_split_users.py index 2a6671b1c..ed6ac7b44 100644 --- a/lenskit/tests/test_split_users.py +++ b/lenskit/tests/test_split_users.py @@ -13,181 +13,144 @@ import pytest -import lenskit.crossfold as xf +from lenskit.data.dataset import Dataset, from_interactions_df +from lenskit.splitting.holdout import SampleFrac, SampleN +from lenskit.splitting.users import crossfold_users, sample_users -def test_partition_users(ml_ratings: pd.DataFrame): - splits = xf.partition_users(ml_ratings, 5, xf.SampleN(5)) +def test_crossfold_users(ml_ds: Dataset): + splits = crossfold_users(ml_ds, 5, SampleN(5)) splits = list(splits) assert len(splits) == 5 + users = set() for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert all(s.train["user"].isin(s.train["user"].unique())) - assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(len(il) for il in s.test.values()) + assert not any(u in users for u in s.test.keys()) + users |= s.test.keys() - users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ml_ratings.user.nunique() - assert users == set(ml_ratings.user) + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert not test_pairs & train_pairs + assert s.test_size + 
s.train.count("pairs") == ml_ds.count("pairs") + assert users == set(ml_ds.users) -def test_partition_may_skip_train(ml_ratings: pd.DataFrame): + +def test_crossfold_may_skip_train(ml_ratings: pd.DataFrame): "Partitioning when users may not have enough ratings to be in the train and test sets." # make a data set where some users only have 1 rating ml_ratings = ml_ratings.sample(frac=0.1) - users = ml_ratings.groupby("user")["rating"].count() - assert users.min() == 1.0 # we should have some small users! - users.name = "ur_count" + ucounts = ml_ratings.groupby("user")["rating"].count() + assert ucounts.min() == 1 # we should have some small users! + ucounts.name = "ur_count" + ml_ds = from_interactions_df(ml_ratings) - splits = xf.partition_users(ml_ratings, 5, xf.SampleN(1)) + splits = crossfold_users(ml_ds, 5, SampleN(1)) splits = list(splits) assert len(splits) == 5 # now we go make sure we're missing some users! And don't have any NaN ml_ratings for train, test in splits: - # no null ml_ratings - assert all(train["rating"].notna()) - # see if test users with 1 rating are missing from train - test = test.join(users, on="user") - assert all(~(test.loc[test["ur_count"] == 1, "user"].isin(train["user"].unique()))) - # and users with more than one rating are in train - assert all(test.loc[test["ur_count"] > 1, "user"].isin(train["user"].unique())) + for u in ucounts[ucounts == 1].index: + if u in test: + row = train.user_row(u) + assert row is not None + assert len(row) == 0 -def test_partition_users_frac(ml_ratings: pd.DataFrame): - splits = xf.partition_users(ml_ratings, 5, xf.SampleFrac(0.2)) +def test_crossfold_users_frac(ml_ds: Dataset): + splits = crossfold_users(ml_ds, 5, SampleFrac(0.2)) splits = list(splits) assert len(splits) == 5 - ucounts = ml_ratings.groupby("user").item.count() - uss = ucounts * 0.2 + ustats = ml_ds.user_stats() + uss = ustats["count"] * 0.2 for s in splits: - tucs = s.test.groupby("user").item.count() - assert all(tucs >= uss.loc[tucs.index] - 1) - assert all(tucs <= uss.loc[tucs.index] + 1) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(len(il) >= uss.loc[u] - 1 for (u, il) in s.test.items()) + assert all(len(il) <= uss.loc[u] + 1 for (u, il) in s.test.items()) + assert s.test_size + s.train.count("pairs") == ml_ds.count("pairs") + + +def test_sample_users_single(ml_ds: Dataset): + split = sample_users(ml_ds, 100, SampleN(5)) - # we have all users - users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ml_ratings.user.nunique() - assert users == set(ml_ratings.user) + assert len(split.test) == 100 + assert split.test_size == 500 + test_pairs = set((u, i) for (u, il) in split.test.items() for i in il.ids()) + assert len(test_pairs) == split.test_size + tdf = split.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert len(train_pairs) == split.train.count("pairs") + assert len(test_pairs & train_pairs) == 0 + assert split.test_size + split.train.count("pairs") == ml_ds.count("pairs") -def test_sample_users(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)) + +def test_sample_users(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=5) splits = list(splits) assert len(splits) == 5 + aus = set() for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert len(s.test) == 5 * 
100 - assert len(ucounts) == 100 - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - # no overlapping users - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - us1 = s1.test.user.unique() - us2 = s2.test.user.unique() - assert len(np.intersect1d(us1, us2)) == 0 - - -def test_sample_users_frac(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleFrac(0.2)) + assert len(s.test) == 100 + assert s.test_size == 500 + # users are disjoint + assert not any(u in aus for u in s.test.keys()) + aus |= s.test.keys() + + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + assert len(test_pairs) == s.test_size + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert len(train_pairs) == s.train.count("pairs") + assert len(test_pairs & train_pairs) == 0 + assert s.test_size + s.train.count("pairs") == ml_ds.count("pairs") + + +def test_sample_users_non_disjoint(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=5, disjoint=False) splits = list(splits) assert len(splits) == 5 - ucounts = ml_ratings.groupby("user").item.count() - uss = ucounts * 0.2 + + aus = set() for s in splits: - tucs = s.test.groupby("user").item.count() - assert len(tucs) == 100 - assert all(tucs >= uss.loc[tucs.index] - 1) - assert all(tucs <= uss.loc[tucs.index] + 1) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - # no overlapping users - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - us1 = s1.test.user.unique() - us2 = s2.test.user.unique() - assert len(np.intersect1d(us1, us2)) == 0 + assert len(s.test) == 100 + assert s.test_size == 500 + aus |= s.test.keys() + + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + assert len(test_pairs) == s.test_size + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert len(train_pairs) == s.train.count("pairs") + assert len(test_pairs & train_pairs) == 0 + assert s.test_size + s.train.count("pairs") == ml_ds.count("pairs") + + # some user appears at least once + assert len(aus) < 500 @pytest.mark.slow -def test_sample_users_frac_oversize(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5)) +def test_sample_users_frac_oversize(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=20) splits = list(splits) assert len(splits) == 20 for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert len(ucounts) < 100 - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ml_ratings.user.nunique() - assert users == set(ml_ratings.user) - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - us1 = s1.test.user.unique() - us2 = s2.test.user.unique() - assert len(np.intersect1d(us1, us2)) == 0 - - -def test_sample_users_frac_oversize_ndj(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5), disjoint=False) + assert len(s.test) < 100 + assert all(len(il) == 5 for il in s.test.values()) + + +def 
test_sample_users_frac_oversize_ndj(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=20, disjoint=False) splits = list(splits) assert len(splits) == 20 for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert len(ucounts) == 100 - assert len(s.test) == 5 * 100 - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - -def test_non_unique_index_partition_users(ml_ratings: pd.DataFrame): - """Partitioning users when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): - pass - - -def test_sample_users_dup_index(ml_ratings: pd.DataFrame): - """Sampling users when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)): - pass - - -def test_sample_rows_dup_index(ml_ratings: pd.DataFrame): - """Sampling ml_ratings when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.sample_rows(ml_ratings, partitions=5, size=1000): - pass - - -def test_partition_users_dup_index(ml_ratings: pd.DataFrame): - """Partitioning ml_ratings when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): - pass + assert len(s.test) == 100 + assert s.test_size == 5 * 100 + assert all([len(il) for il in s.test.values()]) From b07f2c7c9041a71ec01c65ec6a602c4e51652273 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:37:57 -0400 Subject: [PATCH 19/22] update documentation --- docs/crossfold.rst | 85 ------------------------ docs/index.rst | 2 +- docs/splitting.rst | 93 +++++++++++++++++++++++++++ lenskit/lenskit/splitting/__init__.py | 5 +- 4 files changed, 98 insertions(+), 87 deletions(-) delete mode 100644 docs/crossfold.rst create mode 100644 docs/splitting.rst diff --git a/docs/crossfold.rst b/docs/crossfold.rst deleted file mode 100644 index 3707344e3..000000000 --- a/docs/crossfold.rst +++ /dev/null @@ -1,85 +0,0 @@ -Splitting Data -============== - -.. module:: lenskit.crossfold - -The LKPY `crossfold` module provides support for preparing data sets for -cross-validation. Crossfold methods are implemented as functions that operate -on data frames and return generators of `(train, test)` pairs -(:py:class:`lenskit.crossfold.TTPair` objects). The train and test objects -in each pair are also data frames, suitable for evaluation or writing out to -a file. - -Crossfold methods make minimal assumptions about their input data frames, so the -frames can be ratings, purchases, or whatever. They do assume that each row -represents a single data point for the purpose of splitting and sampling. - -Experiment code should generally use these functions to prepare train-test files -for training and evaluating algorithms. 
For example, the following will perform -a user-based 5-fold cross-validation as was the default in the old LensKit:: - - import pandas as pd - import lenskit.crossfold as xf - ratings = pd.read_csv('ml-20m/ratings.csv') - ratings = ratings.rename(columns={'userId': 'user', 'movieId': 'item'}) - for i, tp in enumerate(xf.partition_users(ratings, 5, xf.SampleN(5))): - tp.train.to_csv('ml-20m.exp/train-%d.csv' % (i,)) - tp.train.to_parquet('ml-20m.exp/train-%d.parquet % (i,)) - tp.test.to_csv('ml-20m.exp/test-%d.csv' % (i,)) - tp.test.to_parquet('ml-20m.exp/test-%d.parquet % (i,)) - -Row-based splitting -------------------- - -The simplest preparation methods sample or partition the rows in the input frame. -A 5-fold :py:func:`partition_rows` split will result in 5 -splits, each of which extracts 20% of the rows for testing and leaves 80% for -training. - -.. autofunction:: partition_rows - -.. autofunction:: sample_rows - -User-based splitting --------------------- - -It's often desirable to use users, instead of raw rows, as the basis for splitting -data. This allows you to control the experimental conditions on a user-by-user basis, -e.g. by making sure each user is tested with the same number of ratings. These methods -require that the input data frame have a `user` column with the user names or identifiers. - -The algorithm used by each is as follows: - -1. Sample or partition the set of user IDs into *n* sets of test users. -2. For each set of test users, select a set of that user's rows to be test rows. -3. Create a training set for each test set consisting of the non-selected rows from each - of that set's test users, along with all rows from each non-test user. - -.. autofunction:: partition_users - -.. autofunction:: sample_users - -Selecting user test rows -~~~~~~~~~~~~~~~~~~~~~~~~ - -These functions each take a `method` to decide how select each user's test rows. The method -is a function that takes a data frame (containing just the user's rows) and returns the -test rows. This function is expected to preserve the index of the input data frame (which -happens by default with common means of implementing samples). - -We provide several partition method factories: - -.. autofunction:: SampleN -.. autofunction:: SampleFrac -.. autofunction:: LastN -.. autofunction:: LastFrac - -Utility Classes ---------------- - -.. autoclass:: PartitionMethod - :members: - :special-members: - -.. autoclass:: TTPair - :members: diff --git a/docs/index.rst b/docs/index.rst index cc85cb3ce..9e8dda62a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,7 +39,7 @@ Resources :caption: Running Experiments data - crossfold + splitting batch evaluation/index documenting diff --git a/docs/splitting.rst b/docs/splitting.rst new file mode 100644 index 000000000..23276e6d9 --- /dev/null +++ b/docs/splitting.rst @@ -0,0 +1,93 @@ +Splitting Data +============== + +.. module:: lenskit.splitting + +The LKPY `splitting` module splits data sets for offline evaluation using +cross-validation and other strategies. The various splitters are implemented as +functions that operate on a :class:`~lenskit.data.Dataset` and return one or +more train-test splits (as :class:`TTSplit` objects). + +.. versionchanged:: 2024.1 + Data splitting was moved from ``lenskit.crossfold`` to the ``lenskit.splitting`` + module and functions were renamed and had their interfaces revised. + +Experiment code should generally use these functions to prepare train-test files +for training and evaluating algorithms. 
For example, the following will perform +a user-based 5-fold cross-validation as was the default in the old LensKit: + +.. code:: python + + import pandas as pd + from lenskit.data import load_movielens + from lenskit.splitting import crossfold_users, SampleN, dict_to_df + dataset = load_movielens('data/ml-20m.zip') + for i, tp in enumerate(crossfold_users(ratings, 5, SampleN(5))): + train_df = tp.train.interaction_log('pandas', field='all', original_ids=True) + train_df.to_parquet(f'ml-20m.exp/train-{i}.parquet') + dict_to_df(tp.test).to_parquet(f'ml-20m.exp/test-{i}.parquet') + +Record-based Random Splitting +----------------------------- + +The simplest preparation methods sample or partition the records in the input +data. A 5-fold :func:`crossfold_records` split will result in 5 splits, each of +which extracts 20% of the user-item interaction records for testing and leaves +80% for training. + +.. note:: + + When a dataset has repeated interactions, these functions operate only on + the *matrix* view of the data (user-item observations are deduplicated). + Specifically, they operate on the results of calling + :meth:`~lenskit.data.Dataset.interaction_matrix` with ``format="pandas"`` + and ``field="all"``. + +.. autofunction:: crossfold_records + +.. autofunction:: sample_records + +User-based Splitting +-------------------- + +It's often desirable to use users, instead of raw rows, as the basis for +splitting data. This allows you to control the experimental conditions on a +user-by-user basis, e.g. by making sure each user is tested with the same number +of ratings. These methods require that the input data frame have a `user` +column with the user names or identifiers. + +The algorithm used by each is as follows: + +1. Sample or partition the set of user IDs into *n* sets of test users. +2. For each set of test users, select a set of that user's rows to be test rows. +3. Create a training set for each test set consisting of the non-selected rows + from each of that set's test users, along with all rows from each non-test + user. + +.. autofunction:: crossfold_users + +.. autofunction:: sample_users + +Selecting user holdout rows +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions each take a `method` to decide how select each user's test rows. The method +is a function that takes an item list (containing just the user's rows) and returns the +test rows. + +We provide several holdout method factories: + +.. autofunction:: SampleN +.. autofunction:: SampleFrac +.. autofunction:: LastN +.. autofunction:: LastFrac + +Utility Classes +--------------- + +.. autoclass:: lenskit.splitting.holdout.HoldoutMethod + :members: + :special-members: __call__ + +.. autoclass:: TTSplit + :members: diff --git a/lenskit/lenskit/splitting/__init__.py b/lenskit/lenskit/splitting/__init__.py index 080e9a8c2..90346a089 100644 --- a/lenskit/lenskit/splitting/__init__.py +++ b/lenskit/lenskit/splitting/__init__.py @@ -8,4 +8,7 @@ Splitting data for train-test evaluation. 
""" -from .split import TTSplit # noqa: F401 +from .holdout import LastFrac, LastN, SampleFrac, SampleN # noqa: F401 +from .records import crossfold_records, sample_records # noqa: F401 +from .split import TTSplit, dict_from_df, dict_to_df # noqa: F401 +from .users import crossfold_users, sample_users # noqa: F401 From 9b50ca2032cf2586cd1edb270656dd6b45a8afa7 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:41:18 -0400 Subject: [PATCH 20/22] add df accessors to TTSplit --- docs/splitting.rst | 5 ++--- lenskit/lenskit/splitting/split.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/splitting.rst b/docs/splitting.rst index 23276e6d9..d0ea78171 100644 --- a/docs/splitting.rst +++ b/docs/splitting.rst @@ -23,9 +23,8 @@ a user-based 5-fold cross-validation as was the default in the old LensKit: from lenskit.splitting import crossfold_users, SampleN, dict_to_df dataset = load_movielens('data/ml-20m.zip') for i, tp in enumerate(crossfold_users(ratings, 5, SampleN(5))): - train_df = tp.train.interaction_log('pandas', field='all', original_ids=True) - train_df.to_parquet(f'ml-20m.exp/train-{i}.parquet') - dict_to_df(tp.test).to_parquet(f'ml-20m.exp/test-{i}.parquet') + tp.train_df.to_parquet(f'ml-20m.exp/train-{i}.parquet') + tp.test_df.to_parquet(f'ml-20m.exp/test-{i}.parquet') Record-based Random Splitting ----------------------------- diff --git a/lenskit/lenskit/splitting/split.py b/lenskit/lenskit/splitting/split.py index 393424c78..5d0ea09ae 100644 --- a/lenskit/lenskit/splitting/split.py +++ b/lenskit/lenskit/splitting/split.py @@ -37,6 +37,20 @@ def test_size(self) -> int: """ return sum(len(il) for il in self.test.values()) + @property + def test_df(self) -> pd.DataFrame: + """ + Get the test data as a data frame. + """ + return dict_to_df(self.test) + + @property + def train_df(self) -> pd.DataFrame: + """ + Get the training data as a data frame. + """ + return self.train.interaction_matrix("pandas", field="all") + def dict_to_df(data: dict[EntityId, ItemList]) -> pd.DataFrame: """ From e79fdaedbb5376ee48f832317e5ce015e5fed8d3 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:48:44 -0400 Subject: [PATCH 21/22] fix doctests --- lenskit/lenskit/splitting/records.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py index 82bde8d45..50f18d4c2 100644 --- a/lenskit/lenskit/splitting/records.py +++ b/lenskit/lenskit/splitting/records.py @@ -89,8 +89,8 @@ def sample_records( We can loop over a sequence of train-test pairs:: >>> from lenskit.data.movielens import load_movielens_df - >>> ratings = load_movielens_df('data/ml-latest-small') - >>> for train, test in sample_records(ratings, 1000, repeats=5): + >>> movielens = load_movielens('data/ml-latest-small') + >>> for train, test in sample_records(movielens, 1000, repeats=5): ... 
print(sum(len(il) for il in test.values())) 1000 1000 @@ -100,7 +100,7 @@ def sample_records( Sometimes for testing, it is useful to just get a single pair:: - >>> train, test = sample_records(ratings, 1000) + >>> train, test = sample_records(movielens, 1000) >>> sum(len(il) for il in test.values()) 1000 From d84a1f08e6deaf53535817244f13183056500881 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:55:45 -0400 Subject: [PATCH 22/22] fix imports for records doctest --- lenskit/lenskit/splitting/records.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py index 50f18d4c2..a3dafc09b 100644 --- a/lenskit/lenskit/splitting/records.py +++ b/lenskit/lenskit/splitting/records.py @@ -88,7 +88,7 @@ def sample_records( We can loop over a sequence of train-test pairs:: - >>> from lenskit.data.movielens import load_movielens_df + >>> from lenskit.data import load_movielens >>> movielens = load_movielens('data/ml-latest-small') >>> for train, test in sample_records(movielens, 1000, repeats=5): ... print(sum(len(il) for il in test.values()))
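Putting the new pieces together, an end-to-end split-and-save loop might look roughly
like the sketch below; the MovieLens path and output file names are placeholders, and
``train_df`` / ``test_df`` are the data-frame accessors added to ``TTSplit`` above::

    from lenskit.data import load_movielens
    from lenskit.splitting import SampleN, crossfold_users

    ml = load_movielens("data/ml-latest-small")   # path is illustrative
    for i, split in enumerate(crossfold_users(ml, 5, SampleN(5))):
        print(f"fold {i}: {len(split.test)} test users, {split.test_size} test pairs")
        split.train_df.to_parquet(f"train-{i}.parquet")
        split.test_df.to_parquet(f"test-{i}.parquet")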