From 4ce186de59551d17d60bc06cc915f4b5e1c9474e Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 16:56:51 -0400 Subject: [PATCH 01/22] move/copy split tests --- lenskit/tests/test_split_ratings.py | 121 ++++++++++++++ lenskit/tests/test_split_user_holdout.py | 98 ++++++++++++ lenskit/tests/test_split_users.py | 193 +++++++++++++++++++++++ 3 files changed, 412 insertions(+) create mode 100644 lenskit/tests/test_split_ratings.py create mode 100644 lenskit/tests/test_split_user_holdout.py create mode 100644 lenskit/tests/test_split_users.py diff --git a/lenskit/tests/test_split_ratings.py b/lenskit/tests/test_split_ratings.py new file mode 100644 index 000000000..5e061b87f --- /dev/null +++ b/lenskit/tests/test_split_ratings.py @@ -0,0 +1,121 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +import lenskit.crossfold as xf + + +def test_partition_rows(ml_ratings: pd.DataFrame): + splits = xf.partition_rows(ml_ratings, 5) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + # we should partition! + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + i1 = s1.test.set_index(["user", "item"]).index + i2 = s2.test.set_index(["user", "item"]).index + inter = i1.intersection(i2) + assert len(inter) == 0 + + union = ft.reduce(lambda i1, i2: i1.union(i2), (s.test.index for s in splits)) + assert len(union.unique()) == len(ml_ratings) + + +def test_sample_rows(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=5, size=1000) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + assert len(s.test) == 1000 + assert len(s.test) + len(s.train) == len(ml_ratings) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + i1 = s1.test.set_index(["user", "item"]).index + i2 = s2.test.set_index(["user", "item"]).index + inter = i1.intersection(i2) + assert len(inter) == 0 + + +def test_sample_rows_more_smaller_parts(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=10, size=500) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + assert len(s.test) == 500 + assert len(s.test) + len(s.train) == len(ml_ratings) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + i1 = s1.test.set_index(["user", "item"]).index + i2 = s2.test.set_index(["user", "item"]).index + inter = i1.intersection(i2) + assert len(inter) == 0 + + +def test_sample_non_disjoint(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, partitions=10, size=1000, disjoint=False) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + assert 
len(s.test) == 1000 + assert len(s.test) + len(s.train) == len(ml_ratings) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 + + # There are enough splits & items we should pick at least one duplicate + ipairs = ( + (s1.test.set_index(["user", "item"]).index, s2.test.set_index(["user", "item"]).index) + for (s1, s2) in it.product(splits, splits) + ) + isizes = [len(i1.intersection(i2)) for (i1, i2) in ipairs] + assert any(n > 0 for n in isizes) + + +@pytest.mark.slow +def test_sample_oversize(ml_ratings: pd.DataFrame): + splits = xf.sample_rows(ml_ratings, 50, 10000) + splits = list(splits) + assert len(splits) == 50 + + for s in splits: + assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + test_idx = s.test.set_index(["user", "item"]).index + train_idx = s.train.set_index(["user", "item"]).index + assert len(test_idx.intersection(train_idx)) == 0 diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_user_holdout.py new file mode 100644 index 000000000..504c9ee06 --- /dev/null +++ b/lenskit/tests/test_split_user_holdout.py @@ -0,0 +1,98 @@ +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +import lenskit.crossfold as xf + + +def test_sample_n(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + s5 = xf.SampleN(5) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = s5(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 5 + assert len(tst) + len(trn) == len(udf) + + s10 = xf.SampleN(10) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = s10(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 10 + assert len(tst) + len(trn) == len(udf) + + +def test_sample_frac(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + samp = xf.SampleFrac(0.2) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.2) + assert len(tst) <= math.ceil(len(udf) * 0.2) + + samp = xf.SampleFrac(0.5) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.5) + assert len(tst) <= math.ceil(len(udf) * 0.5) + + +def test_last_n(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + samp = xf.LastN(5) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 5 + assert len(tst) + len(trn) == len(udf) + assert tst.timestamp.min() >= trn.timestamp.max() + + samp = xf.LastN(7) + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) == 7 + assert len(tst) + len(trn) == len(udf) + assert tst.timestamp.min() >= trn.timestamp.max() + + +def test_last_frac(ml_ratings: pd.DataFrame): + users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) + + samp = xf.LastFrac(0.2, "timestamp") + for u in users: + udf = 
ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.2) + assert len(tst) <= math.ceil(len(udf) * 0.2) + assert tst.timestamp.min() >= trn.timestamp.max() + + samp = xf.LastFrac(0.5, "timestamp") + for u in users: + udf = ml_ratings[ml_ratings.user == u] + tst = samp(udf) + trn = udf.loc[udf.index.difference(tst.index), :] + assert len(tst) + len(trn) == len(udf) + assert len(tst) >= math.floor(len(udf) * 0.5) + assert len(tst) <= math.ceil(len(udf) * 0.5) + assert tst.timestamp.min() >= trn.timestamp.max() diff --git a/lenskit/tests/test_split_users.py b/lenskit/tests/test_split_users.py new file mode 100644 index 000000000..2a6671b1c --- /dev/null +++ b/lenskit/tests/test_split_users.py @@ -0,0 +1,193 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +import lenskit.crossfold as xf + + +def test_partition_users(ml_ratings: pd.DataFrame): + splits = xf.partition_users(ml_ratings, 5, xf.SampleN(5)) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert all(s.train["user"].isin(s.train["user"].unique())) + assert len(s.test) + len(s.train) == len(ml_ratings) + + users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) + + +def test_partition_may_skip_train(ml_ratings: pd.DataFrame): + "Partitioning when users may not have enough ratings to be in the train and test sets." + # make a data set where some users only have 1 rating + ml_ratings = ml_ratings.sample(frac=0.1) + users = ml_ratings.groupby("user")["rating"].count() + assert users.min() == 1.0 # we should have some small users! + users.name = "ur_count" + + splits = xf.partition_users(ml_ratings, 5, xf.SampleN(1)) + splits = list(splits) + assert len(splits) == 5 + + # now we go make sure we're missing some users! 
And don't have any NaN ml_ratings + for train, test in splits: + # no null ml_ratings + assert all(train["rating"].notna()) + # see if test users with 1 rating are missing from train + test = test.join(users, on="user") + assert all(~(test.loc[test["ur_count"] == 1, "user"].isin(train["user"].unique()))) + # and users with more than one rating are in train + assert all(test.loc[test["ur_count"] > 1, "user"].isin(train["user"].unique())) + + +def test_partition_users_frac(ml_ratings: pd.DataFrame): + splits = xf.partition_users(ml_ratings, 5, xf.SampleFrac(0.2)) + splits = list(splits) + assert len(splits) == 5 + ucounts = ml_ratings.groupby("user").item.count() + uss = ucounts * 0.2 + + for s in splits: + tucs = s.test.groupby("user").item.count() + assert all(tucs >= uss.loc[tucs.index] - 1) + assert all(tucs <= uss.loc[tucs.index] + 1) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + # we have all users + users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) + + +def test_sample_users(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert len(s.test) == 5 * 100 + assert len(ucounts) == 100 + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + # no overlapping users + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + us1 = s1.test.user.unique() + us2 = s2.test.user.unique() + assert len(np.intersect1d(us1, us2)) == 0 + + +def test_sample_users_frac(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleFrac(0.2)) + splits = list(splits) + assert len(splits) == 5 + ucounts = ml_ratings.groupby("user").item.count() + uss = ucounts * 0.2 + + for s in splits: + tucs = s.test.groupby("user").item.count() + assert len(tucs) == 100 + assert all(tucs >= uss.loc[tucs.index] - 1) + assert all(tucs <= uss.loc[tucs.index] + 1) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + # no overlapping users + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + us1 = s1.test.user.unique() + us2 = s2.test.user.unique() + assert len(np.intersect1d(us1, us2)) == 0 + + +@pytest.mark.slow +def test_sample_users_frac_oversize(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5)) + splits = list(splits) + assert len(splits) == 20 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert len(ucounts) < 100 + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) + assert len(users) == ml_ratings.user.nunique() + assert users == set(ml_ratings.user) + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + us1 = s1.test.user.unique() + us2 = s2.test.user.unique() + assert len(np.intersect1d(us1, us2)) == 0 + + +def test_sample_users_frac_oversize_ndj(ml_ratings: pd.DataFrame): + splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5), disjoint=False) + splits = list(splits) + 
assert len(splits) == 20 + + for s in splits: + ucounts = s.test.groupby("user").agg("count") + assert len(ucounts) == 100 + assert len(s.test) == 5 * 100 + assert all(ucounts == 5) + assert all(s.test.index.union(s.train.index) == ml_ratings.index) + assert len(s.test) + len(s.train) == len(ml_ratings) + + +def test_non_unique_index_partition_users(ml_ratings: pd.DataFrame): + """Partitioning users when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): + pass + + +def test_sample_users_dup_index(ml_ratings: pd.DataFrame): + """Sampling users when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)): + pass + + +def test_sample_rows_dup_index(ml_ratings: pd.DataFrame): + """Sampling ml_ratings when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.sample_rows(ml_ratings, partitions=5, size=1000): + pass + + +def test_partition_users_dup_index(ml_ratings: pd.DataFrame): + """Partitioning ml_ratings when dataframe has non-unique indices""" + ml_ratings = ml_ratings.set_index("user") ##forces non-unique index + with pytest.raises(ValueError): + for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): + pass From e9dd86fe3e93a29d623de3cd232a3ddef77cd60c Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Tue, 30 Jul 2024 17:02:18 -0400 Subject: [PATCH 02/22] start creating the new splitting package --- lenskit/lenskit/splitting/__init__.py | 9 ++ lenskit/lenskit/splitting/holdout.py | 101 +++++++++++++++++++++++ lenskit/lenskit/splitting/types.py | 27 ++++++ lenskit/tests/test_split_user_holdout.py | 6 ++ 4 files changed, 143 insertions(+) create mode 100644 lenskit/lenskit/splitting/__init__.py create mode 100644 lenskit/lenskit/splitting/holdout.py create mode 100644 lenskit/lenskit/splitting/types.py diff --git a/lenskit/lenskit/splitting/__init__.py b/lenskit/lenskit/splitting/__init__.py new file mode 100644 index 000000000..ec4765a1f --- /dev/null +++ b/lenskit/lenskit/splitting/__init__.py @@ -0,0 +1,9 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +""" +Splitting data for train-test evaluation. +""" diff --git a/lenskit/lenskit/splitting/holdout.py b/lenskit/lenskit/splitting/holdout.py new file mode 100644 index 000000000..770ec6550 --- /dev/null +++ b/lenskit/lenskit/splitting/holdout.py @@ -0,0 +1,101 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +""" +Per-user rating holdout methods for user-based data splitting. +""" + +from typing import Protocol + +from seedbank import numpy_rng + + +class HoldoutMethod(Protocol): + """ + Holdout methods select test rows for a user (or item). Partition methods + are callable; when called with a data frame, they return the test entries. + """ + + def __call__(self, udf): + """ + Subset a data frame. 
+ + Args: + udf(pandas.DataFrame): + The input data frame of rows for a user or item. + + Returns: + pandas.DataFrame: + The data frame of test rows, a subset of ``udf``. + """ + pass + + +class SampleN(HoldoutMethod): + """ + Randomly select a fixed number of test rows per user/item. + + Args: + n(int): the number of test items to select + rng: the random number generator or seed + """ + + def __init__(self, n, rng_spec=None): + self.n = n + self.rng = numpy_rng(rng_spec) + + def __call__(self, udf): + return udf.sample(n=self.n, random_state=self.rng) + + +class SampleFrac(HoldoutMethod): + """ + Randomly select a fraction of test rows per user/item. + + Args: + frac(float): the fraction items to select for testing. + """ + + def __init__(self, frac, rng_spec=None): + self.fraction = frac + self.rng = numpy_rng(rng_spec) + + def __call__(self, udf): + return udf.sample(frac=self.fraction, random_state=self.rng) + + +class LastN(HoldoutMethod): + """ + Select a fixed number of test rows per user/item, based on ordering by a + column. + + Args: + n(int): The number of test items to select. + """ + + def __init__(self, n, col="timestamp"): + self.n = n + self.column = col + + def __call__(self, udf): + return udf.sort_values(self.column).iloc[-self.n :] + + +class LastFrac(HoldoutMethod): + """ + Select a fraction of test rows per user/item. + + Args: + frac(double): the fraction of items to select for testing. + """ + + def __init__(self, frac, col="timestamp"): + self.fraction = frac + self.column = col + + def __call__(self, udf): + n = round(len(udf) * self.fraction) + return udf.sort_values(self.column).iloc[-n:] diff --git a/lenskit/lenskit/splitting/types.py b/lenskit/lenskit/splitting/types.py new file mode 100644 index 000000000..8190feba5 --- /dev/null +++ b/lenskit/lenskit/splitting/types.py @@ -0,0 +1,27 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +from typing import NamedTuple + +import pandas as pd + +from lenskit.data.dataset import Dataset + + +class TTPair(NamedTuple): + """ + A train-test pair from splitting. + """ + + train: Dataset + """ + The training data. + """ + + test: pd.DataFrame + """ + The test data. + """ diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_user_holdout.py index 504c9ee06..a569aa002 100644 --- a/lenskit/tests/test_split_user_holdout.py +++ b/lenskit/tests/test_split_user_holdout.py @@ -1,3 +1,9 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. 
+# SPDX-License-Identifier: MIT + import functools as ft import itertools as it import math From 7f13fe03555c8d1885cbc7de6ccffabab25498c0 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:30:18 -0400 Subject: [PATCH 03/22] add support for subsetting item lists --- lenskit/lenskit/data/items.py | 38 ++++++++++++++++++++++++++++++ lenskit/tests/test_itemlist.py | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 3fd5ce93c..1cf8ac6d1 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -44,6 +44,10 @@ class ItemList: An item list logically a list of rows, each of which is an item, like a :class:`~pandas.DataFrame` but supporting multiple array backends. + Item lists can be subset as an array (e.g. ``items[selector]``), where + integer indices (or arrays thereof), boolean arrays, and slices are allowed + as selectors. + When an item list is pickled, it is pickled compactly but only for CPUs: the vocabulary is dropped (after ensuring both IDs and numbers are computed), and all arrays are pickled as NumPy arrays. This makes item lists compact @@ -74,6 +78,13 @@ class is doing somewhat double-duty, representing a list of items along scores in the keyword arguments. Other field names should be singular. + .. todo:: + + Right now, selection / subsetting only happens on the CPU, and will move + data to the CPU for the subsetting operation. There is no reason, in + principle, why we cannot subset on GPU. Future revisions may add + support for this. + Args: item_ids: A list or array of item identifiers. ``item_id`` is accepted as an @@ -333,6 +344,33 @@ def to_df(self) -> pd.DataFrame: def __len__(self): return self._len + def __getitem__( + self, + sel: NDArray[np.bool_] | NDArray[np.integer] | Sequence[int] | torch.Tensor | int | slice, + ) -> ItemList: + """ + Subset the item list. + + Args: + sel: + The items to select. Can be either a Boolean array of the same + length as the list that is ``True`` to indicate selected items, + or an array of indices of the items to retain (in order in the + list, starting from 0). + """ + if np.isscalar(sel): + sel = np.array([sel]) + elif not isinstance(sel, slice): + sel = np.asarray(sel) + + # sel is now a selection array, or it is a slice. numpy supports both. 
+ iids = self._ids[sel] if self._ids is not None else None + nums = self._numbers.numpy()[sel] if self._numbers is not None else None + flds = {n: f.numpy()[sel] for (n, f) in self._fields.items()} + return ItemList( + item_ids=iids, item_nums=nums, vocabulary=self._vocab, ordered=self.ordered, **flds + ) + def __getstate__(self) -> dict[str, object]: state: dict[str, object] = {"ordered": self.ordered, "len": self._len} if self._ids is not None: diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 5b9957ec1..41abda1f9 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -231,3 +231,45 @@ def test_item_list_pickle_fields(ml_ds): assert np.all(r2.field("rating") == row.field("rating")) assert r2.field("timestamp") is not None assert np.all(r2.field("timestamp") == row.field("timestamp")) + + +def test_subset_mask(ml_ds): + row = ml_ds.user_row(user_num=400) + ratings = row.field("rating") + assert ratings is not None + + mask = ratings > 3.0 + pos = row[mask] + + assert len(pos) == np.sum(mask) + assert np.all(pos.ids() == row.ids()[mask]) + assert np.all(pos.numbers() == row.numbers()[mask]) + assert np.all(pos.field("rating") == row.field("rating")[mask]) + assert np.all(pos.field("rating") > 3.0) + + +def test_subset_idx(ml_ds): + row = ml_ds.user_row(user_num=400) + ratings = row.field("rating") + assert ratings is not None + + ks = [0, 5, 15] + pos = row[ks] + + assert len(pos) == 3 + assert np.all(pos.ids() == row.ids()[ks]) + assert np.all(pos.numbers() == row.numbers()[ks]) + assert np.all(pos.field("rating") == row.field("rating")[ks]) + + +def test_subset_slice(ml_ds): + row = ml_ds.user_row(user_num=400) + ratings = row.field("rating") + assert ratings is not None + + pos = row[5:10] + + assert len(pos) == 5 + assert np.all(pos.ids() == row.ids()[5:10]) + assert np.all(pos.numbers() == row.numbers()[5:10]) + assert np.all(pos.field("rating") == row.field("rating")[5:10]) From f993ceb1faaea37bf3bf49cb465a1141764bf527 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:44:57 -0400 Subject: [PATCH 04/22] add __str__ to ItemList --- lenskit/lenskit/data/items.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 1cf8ac6d1..6b534f049 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -394,3 +394,6 @@ def __setstate__(self, state: dict[str, Any]): if "numbers" in state: self._numbers = MTArray(state["numbers"]) self._fields = {k[6:]: MTArray(v) for (k, v) in state.items() if k.startswith("field_")} + + def __str__(self) -> str: + return f"" From 1c9b5f93bb9d1b7f70ddcde73ec236231ab8eeb1 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:45:08 -0400 Subject: [PATCH 05/22] reimplement holdouts to use item lists --- lenskit/lenskit/splitting/holdout.py | 92 ++++++++++----- lenskit/tests/test_split_user_holdout.py | 143 +++++++++++++---------- 2 files changed, 144 insertions(+), 91 deletions(-) diff --git a/lenskit/lenskit/splitting/holdout.py b/lenskit/lenskit/splitting/holdout.py index 770ec6550..070fc18ea 100644 --- a/lenskit/lenskit/splitting/holdout.py +++ b/lenskit/lenskit/splitting/holdout.py @@ -10,28 +10,33 @@ from typing import Protocol +import numpy as np from seedbank import numpy_rng +from seedbank.numpy import NPRNGSource + +from lenskit.data.items import ItemList class HoldoutMethod(Protocol): """ - Holdout methods select test rows for a user (or item). 
Partition methods
-    are callable; when called with a data frame, they return the test entries.
+    Holdout methods select test rows for a user (or occasionally an item).
+    They are callable; when called with an item list, they return the test
+    items.
     """

-    def __call__(self, udf):
+    def __call__(self, items: ItemList) -> ItemList:
         """
-        Subset a data frame.
+        Subset an item list (in the uncommon case of item-based holdouts, the
+        item list actually holds user IDs).

         Args:
-            udf(pandas.DataFrame):
-                The input data frame of rows for a user or item.
+            items:
+                The item list from which holdout items should be selected.

         Returns:
-            pandas.DataFrame:
-                The data frame of test rows, a subset of ``udf``.
+            The list of test items.
         """
-        pass
+        raise NotImplementedError()


 class SampleN(HoldoutMethod):
     """
     Randomly select a fixed number of test rows per user/item.

     Args:
-        n(int): the number of test items to select
+        n: the number of test items to select
         rng: the random number generator or seed
     """

+    n: int
+    rng: np.random.Generator
+
-    def __init__(self, n, rng_spec=None):
+    def __init__(self, n: int, rng_spec: NPRNGSource | None = None):
         self.n = n
         self.rng = numpy_rng(rng_spec)

-    def __call__(self, udf):
-        return udf.sample(n=self.n, random_state=self.rng)
+    def __call__(self, items: ItemList) -> ItemList:
+        if len(items) <= self.n:
+            return items
+
+        sel = self.rng.choice(len(items), self.n, replace=False)
+        return items[sel]


 class SampleFrac(HoldoutMethod):
     """
     Randomly select a fraction of test rows per user/item.

     Args:
-        frac(float): the fraction items to select for testing.
+        frac: the fraction of items to select for testing.
     """

+    fraction: float
+    rng: np.random.Generator
+
-    def __init__(self, frac, rng_spec=None):
+    def __init__(self, frac: float, rng_spec: NPRNGSource | None = None):
         self.fraction = frac
         self.rng = numpy_rng(rng_spec)

-    def __call__(self, udf):
-        return udf.sample(frac=self.fraction, random_state=self.rng)
+    def __call__(self, items: ItemList) -> ItemList:
+        n = round(len(items) * self.fraction)
+        sel = self.rng.choice(len(items), n, replace=False)
+        return items[sel]


 class LastN(HoldoutMethod):
     """
     Select a fixed number of test rows per user/item, based on ordering by a
-    column.
+    field.

     Args:
-        n(int): The number of test items to select.
+        n: The number of test items to select.
+        field: The field to order by.
     """

-    def __init__(self, n, col="timestamp"):
+    n: int
+    field: str
+
+    def __init__(self, n: int, field: str = "timestamp"):
         self.n = n
-        self.column = col
+        self.field = field
+
+    def __call__(self, items: ItemList) -> ItemList:
+        if len(items) <= self.n:
+            return items

-    def __call__(self, udf):
-        return udf.sort_values(self.column).iloc[-self.n :]
+        col = items.field(self.field)
+        if col is None:
+            raise TypeError(f"item list does not have ordering field {self.field}")
+        ordered = np.argsort(col)
+        return items[ordered[-self.n :]]


 class LastFrac(HoldoutMethod):
     """
     Select a fraction of test rows per user/item.

     Args:
         frac(double): the fraction of items to select for testing.
""" - def __init__(self, frac, col="timestamp"): + fraction: float + field: str + + def __init__(self, frac: float, field: str = "timestamp"): self.fraction = frac - self.column = col + self.field = field + + def __call__(self, items: ItemList) -> ItemList: + n = round(len(items) * self.fraction) - def __call__(self, udf): - n = round(len(udf) * self.fraction) - return udf.sort_values(self.column).iloc[-n:] + col = items.field(self.field) + if col is None: + raise TypeError(f"item list does not have ordering field {self.field}") + ordered = np.argsort(col) + return items[ordered[-n:]] diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_user_holdout.py index a569aa002..931526864 100644 --- a/lenskit/tests/test_split_user_holdout.py +++ b/lenskit/tests/test_split_user_holdout.py @@ -13,92 +13,109 @@ import pytest -import lenskit.crossfold as xf +from lenskit.data.dataset import Dataset +from lenskit.splitting.holdout import LastFrac, LastN, SampleFrac, SampleN -def test_sample_n(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_sample_n(ml_ds: Dataset): + users = np.random.choice(ml_ds.users.ids(), 5, replace=False) - s5 = xf.SampleN(5) + s5 = SampleN(5) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = s5(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = s5(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 5 - assert len(tst) + len(trn) == len(udf) + assert len(tst) + len(trn) == len(row) - s10 = xf.SampleN(10) + s10 = SampleN(10) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = s10(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = s10(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 10 - assert len(tst) + len(trn) == len(udf) + assert len(tst) + len(trn) == len(row) -def test_sample_frac(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_sample_frac(ml_ds: Dataset): + users = np.random.choice(ml_ds.users.ids(), 5, replace=False) - samp = xf.SampleFrac(0.2) + samp = SampleFrac(0.2) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.2) - assert len(tst) <= math.ceil(len(udf) * 0.2) - - samp = xf.SampleFrac(0.5) + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.2) + assert len(tst) <= math.ceil(len(row) * 0.2) + + samp = SampleFrac(0.5) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.5) - assert len(tst) <= math.ceil(len(udf) * 0.5) + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.5) + assert len(tst) <= math.ceil(len(row) * 0.5) -def test_last_n(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_last_n(ml_ds: Dataset): + users = 
np.random.choice(ml_ds.users.ids(), 5, replace=False) - samp = xf.LastN(5) + samp = LastN(5) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 5 - assert len(tst) + len(trn) == len(udf) - assert tst.timestamp.min() >= trn.timestamp.max() + assert len(tst) + len(trn) == len(row) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() - samp = xf.LastN(7) + samp = LastN(7) for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] assert len(tst) == 7 - assert len(tst) + len(trn) == len(udf) - assert tst.timestamp.min() >= trn.timestamp.max() + assert len(tst) + len(trn) == len(row) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() -def test_last_frac(ml_ratings: pd.DataFrame): - users = np.random.choice(ml_ratings.user.unique(), 5, replace=False) +def test_last_frac(ml_ds: Dataset): + users = np.random.choice(ml_ds.users.ids(), 5, replace=False) - samp = xf.LastFrac(0.2, "timestamp") + samp = LastFrac(0.2, "timestamp") for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.2) - assert len(tst) <= math.ceil(len(udf) * 0.2) - assert tst.timestamp.min() >= trn.timestamp.max() - - samp = xf.LastFrac(0.5, "timestamp") + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.2) + assert len(tst) <= math.ceil(len(row) * 0.2) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() + + samp = LastFrac(0.5, "timestamp") for u in users: - udf = ml_ratings[ml_ratings.user == u] - tst = samp(udf) - trn = udf.loc[udf.index.difference(tst.index), :] - assert len(tst) + len(trn) == len(udf) - assert len(tst) >= math.floor(len(udf) * 0.5) - assert len(tst) <= math.ceil(len(udf) * 0.5) - assert tst.timestamp.min() >= trn.timestamp.max() + row = ml_ds.user_row(u) + assert row is not None + tst = samp(row) + mask = np.isin(row.ids(), tst.ids()) + trn = row[~mask] + assert len(tst) + len(trn) == len(row) + assert len(tst) >= math.floor(len(row) * 0.5) + assert len(tst) <= math.ceil(len(row) * 0.5) + assert tst.field("timestamp").min() >= trn.field("timestamp").max() From f326273647338acda5489ca9c6e84f262f91a50e Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:51:53 -0400 Subject: [PATCH 06/22] rename test files for split tests --- .../tests/{test_split_user_holdout.py => test_split_holdout.py} | 0 lenskit/tests/{test_split_ratings.py => test_split_rows.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename lenskit/tests/{test_split_user_holdout.py => test_split_holdout.py} (100%) rename lenskit/tests/{test_split_ratings.py => test_split_rows.py} (100%) diff --git a/lenskit/tests/test_split_user_holdout.py b/lenskit/tests/test_split_holdout.py similarity index 100% rename from lenskit/tests/test_split_user_holdout.py rename to lenskit/tests/test_split_holdout.py diff --git a/lenskit/tests/test_split_ratings.py 
b/lenskit/tests/test_split_rows.py similarity index 100% rename from lenskit/tests/test_split_ratings.py rename to lenskit/tests/test_split_rows.py From 4cbc27ba532ba5a962115a49339488a8cc3391c9 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 14:52:34 -0400 Subject: [PATCH 07/22] add type ignores for holdout test --- lenskit/tests/test_split_holdout.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lenskit/tests/test_split_holdout.py b/lenskit/tests/test_split_holdout.py index 931526864..f9a0666c3 100644 --- a/lenskit/tests/test_split_holdout.py +++ b/lenskit/tests/test_split_holdout.py @@ -79,7 +79,7 @@ def test_last_n(ml_ds: Dataset): trn = row[~mask] assert len(tst) == 5 assert len(tst) + len(trn) == len(row) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore samp = LastN(7) for u in users: @@ -90,7 +90,7 @@ def test_last_n(ml_ds: Dataset): trn = row[~mask] assert len(tst) == 7 assert len(tst) + len(trn) == len(row) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore def test_last_frac(ml_ds: Dataset): @@ -106,7 +106,7 @@ def test_last_frac(ml_ds: Dataset): assert len(tst) + len(trn) == len(row) assert len(tst) >= math.floor(len(row) * 0.2) assert len(tst) <= math.ceil(len(row) * 0.2) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore samp = LastFrac(0.5, "timestamp") for u in users: @@ -118,4 +118,4 @@ def test_last_frac(ml_ds: Dataset): assert len(tst) + len(trn) == len(row) assert len(tst) >= math.floor(len(row) * 0.5) assert len(tst) <= math.ceil(len(row) * 0.5) - assert tst.field("timestamp").min() >= trn.field("timestamp").max() + assert tst.field("timestamp").min() >= trn.field("timestamp").max() # type: ignore From 151ebc40520bf1cd5f448f9ddce2b6c3a8876342 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 16:20:33 -0400 Subject: [PATCH 08/22] update docs --- docs/releases/2024.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst index 23e92cb05..dfbaa2358 100644 --- a/docs/releases/2024.rst +++ b/docs/releases/2024.rst @@ -41,6 +41,11 @@ Significant Changes without round-tripping through Pandas and NumPy, and keep this transparent to client code). +* Where Pandas data frames are still used, the standard user and item columns + have been renamed to ``user_id`` and ``item_id`` respectively, with + ``user_num`` and ``item_num`` for 0-based user and item numbers. This is to + remove ambiguity about how users and items are being referenced. + * **PyTorch**. LensKit now uses PyTorch to implement most of its algorithms, instead of Numba-accelerated NumPy code. 
Algorithms using PyTorch are: From b0303e4a6b8b35759f5bd18889fa8777694678a7 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 16:21:54 -0400 Subject: [PATCH 09/22] make ids & numbers optional converting ItemList to a data frame --- lenskit/lenskit/data/items.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 6b534f049..6b0245da2 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -318,21 +318,30 @@ def field( else: return val.to(format) - def to_df(self) -> pd.DataFrame: + def to_df(self, *, ids: bool = True, numbers: bool = True) -> pd.DataFrame: """ Convert this item list to a Pandas data frame. It has the following columns: - * ``item_id`` — the item IDs (if available) - * ``item_id`` — the item numbers (if available) + * ``item_id`` — the item IDs (if available and ``ids=True``) + * ``item_num`` — the item numbers (if available and ``numbers=True``) * ``score`` — the item scores * ``rank`` — the item ranks (if the list is ordered) * all other defined fields, using their field names """ cols = {} - if self._ids is not None or self._vocab is not None: + if ids and self._ids is not None or self._vocab is not None: cols["item_id"] = self.ids() - if self._numbers is not None or self._vocab is not None: + if numbers and self._numbers is not None or self._vocab is not None: cols["item_num"] = self.numbers() + # we need to have numbers or ids, or it makes no sense + if "item_id" not in cols and "item_num" not in cols: + if ids and not numbers: + raise RuntimeError("item list has no vocabulary, cannot compute IDs") + elif numbers and not ids: + raise RuntimeError("item list has no vocabulary, cannot compute numbers") + else: + raise RuntimeError("cannot create item data frame without identifiers or numbers") + if "score" in self._fields: cols["score"] = self.scores() if self.ordered: From 544f1e151dc1e7187917461776f979762d9bf2e3 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:28:23 -0400 Subject: [PATCH 10/22] add data splitting move to release notes --- docs/releases/2024.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst index dfbaa2358..0e3baa0e8 100644 --- a/docs/releases/2024.rst +++ b/docs/releases/2024.rst @@ -41,6 +41,12 @@ Significant Changes without round-tripping through Pandas and NumPy, and keep this transparent to client code). +* Data splitting for offline evaluation has been moved into + :mod:`lenskit.splitting`, updated to work with data sets and item lists + instead of raw data frames, and splitting functions have been renamed (e.g. + ``rows`` to ``records``) and had parameters updated for clarity and + consistency. + * Where Pandas data frames are still used, the standard user and item columns have been renamed to ``user_id`` and ``item_id`` respectively, with ``user_num`` and ``item_num`` for 0-based user and item numbers. 
This is to From 86225113fefb2ad60db46d670c6400078a57f139 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:28:36 -0400 Subject: [PATCH 11/22] add support for counting observed pairs --- lenskit/lenskit/data/dataset.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/lenskit/lenskit/data/dataset.py b/lenskit/lenskit/data/dataset.py index abff11283..ddb6a4f57 100644 --- a/lenskit/lenskit/data/dataset.py +++ b/lenskit/lenskit/data/dataset.py @@ -116,6 +116,7 @@ def count(self, what: str) -> int: * users * items + * pairs (observed user-item pairs) * interactions * ratings """ @@ -338,6 +339,11 @@ def interaction_matrix( underlying data, then this is equivalent to ``"indicator"``, except that the ``"pandas"`` format will include a ``"rating"`` column of all 1s. + + The ``"pandas"`` format also supports the special field name + ``"all"`` to return a data frame with all available fields. When + ``field="all"``, a field named ``count`` (if defined) is + combined with the ``sum`` method, and other fields use ``last``. combine: How to combine multiple observations for a single user-item pair. Available methods are: @@ -348,7 +354,8 @@ def interaction_matrix( field. * ``"sum"`` — sum the field values. * ``"first"``, ``"last"`` — take the first or last value seen - (in timestamp order, if timestamps are defined). + (in timestamp order, if timestamps are defined; otherwise, + their order in the original input). layout: The layout for a sparse matrix. Can be either ``csr`` or ``coo``, or ``None`` to use the default for the specified @@ -488,8 +495,8 @@ def user_stats(self) -> pd.DataFrame: class MatrixDataset(Dataset): """ - Dataset implementation using an in-memory rating or implicit-feedback - matrix. + Dataset implementation using an in-memory rating or implicit-feedback matrix + (with no duplicate interactions). .. note:: Client code generally should not construct this class directly. 
Instead @@ -554,7 +561,7 @@ def count(self, what: str) -> int: return self._users.size case "items": return self._items.size - case "interactions" | "ratings": + case "pairs" | "interactions" | "ratings": return self._matrix.n_obs case _: raise KeyError(f"unknown entity type {what}") @@ -603,16 +610,16 @@ def _int_mat_pandas(self, field: str | None, original_ids: bool) -> pd.DataFrame "user_num": self._matrix.user_nums, "item_num": self._matrix.item_nums, } - if field == "rating": + if field == "all" or field == "rating": if self._matrix.ratings is not None: cols["rating"] = self._matrix.ratings else: cols["rating"] = np.ones(self._matrix.n_obs) - elif field == "timestamp": + elif field == "all" or field == "timestamp": if self._matrix.timestamps is None: raise FieldError("interaction", field) cols["timestamp"] = self._matrix.timestamps - elif field: + elif field and field != "all": raise FieldError("interaction", field) return pd.DataFrame(cols) From c6b7d0eb84c4be7e2e2a1c784944a74add84530f Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:29:05 -0400 Subject: [PATCH 12/22] add support for making item lists from pandas dataframes --- lenskit/lenskit/data/items.py | 34 +++++++++++++++++++++++++++++++++- lenskit/tests/test_itemlist.py | 22 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/lenskit/lenskit/data/items.py b/lenskit/lenskit/data/items.py index 6b0245da2..a046c44af 100644 --- a/lenskit/lenskit/data/items.py +++ b/lenskit/lenskit/data/items.py @@ -142,7 +142,7 @@ def __init__( if item_ids is not None: self._ids = np.asarray(item_ids) - if not issubclass(self._ids.dtype.type, (np.integer, np.str_, np.bytes_)): + if not issubclass(self._ids.dtype.type, (np.integer, np.str_, np.bytes_, np.object_)): raise TypeError(f"item IDs not integers or bytes (type: {self._ids.dtype})") check_1d(self._ids, label="item_ids") @@ -165,6 +165,38 @@ def __init__( raise ValueError("cannot specify both scores= and score=") self._fields["score"] = MTArray(scores) + @classmethod + def from_df( + cls, df: pd.DataFrame, *, vocabulary=Vocabulary[EntityId], keep_user: bool = False + ) -> ItemList: + """ + Create a item list from a Pandas data frame. The frame should have + ``item_num`` and/or ``item_id`` columns to identify the items; other + columns (e.g. ``score`` or ``rating``) are added as fields. If the data + frame has user columns (``user_id`` or ``user_num``), those are dropped + by default. + + Args: + df: + The data frame to turn into an item list. + vocabulary: + The item vocabulary. + keep_user: + If ``True``, keeps user ID/number columns instead of dropping them. + """ + ids = df["item_id"].values if "item_id" in df.columns else None + nums = df["item_num"].values if "item_num" in df.columns else None + if ids is None and nums is None: + raise TypeError("data frame must have at least one of item_id, item_num columns") + + to_drop = ["item_id", "item_num"] + if not keep_user: + to_drop += ["user_id", "user_num"] + df = df.drop(columns=to_drop, errors="ignore") + + fields = {f: df[f].values for f in df.columns} + return cls(item_ids=ids, item_nums=nums, vocabulary=vocabulary, **fields) # type: ignore + def clone(self) -> ItemList: """ Make a shallow copy of the item list. 
diff --git a/lenskit/tests/test_itemlist.py b/lenskit/tests/test_itemlist.py index 41abda1f9..416735baa 100644 --- a/lenskit/tests/test_itemlist.py +++ b/lenskit/tests/test_itemlist.py @@ -6,6 +6,7 @@ import pickle import numpy as np +import pandas as pd import torch from pytest import raises @@ -273,3 +274,24 @@ def test_subset_slice(ml_ds): assert np.all(pos.ids() == row.ids()[5:10]) assert np.all(pos.numbers() == row.numbers()[5:10]) assert np.all(pos.field("rating") == row.field("rating")[5:10]) + + +def test_from_df(): + df = pd.DataFrame({"item_id": ITEMS, "item_num": np.arange(5), "score": np.random.randn(5)}) + il = ItemList.from_df(df, vocabulary=VOCAB) # type: ignore + assert len(il) == 5 + assert np.all(il.ids() == ITEMS) + assert np.all(il.numbers() == np.arange(5)) + assert np.all(il.scores() == df["score"].values) + + +def test_from_df_user(): + df = pd.DataFrame( + {"user_id": 50, "item_id": ITEMS, "item_num": np.arange(5), "score": np.random.randn(5)} + ) + il = ItemList.from_df(df, vocabulary=VOCAB) # type: ignore + assert len(il) == 5 + assert np.all(il.ids() == ITEMS) + assert np.all(il.numbers() == np.arange(5)) + assert np.all(il.scores() == df["score"].values) + assert il.field("user_id") is None From 36a73cfaa785733430d83f9a4c65f61e6e6af088 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:29:30 -0400 Subject: [PATCH 13/22] add utilities for split test data & test it --- lenskit/lenskit/splitting/__init__.py | 2 ++ lenskit/lenskit/splitting/split.py | 52 +++++++++++++++++++++++++++ lenskit/lenskit/splitting/types.py | 27 -------------- lenskit/tests/test_split_types.py | 24 +++++++++++++ 4 files changed, 78 insertions(+), 27 deletions(-) create mode 100644 lenskit/lenskit/splitting/split.py delete mode 100644 lenskit/lenskit/splitting/types.py create mode 100644 lenskit/tests/test_split_types.py diff --git a/lenskit/lenskit/splitting/__init__.py b/lenskit/lenskit/splitting/__init__.py index ec4765a1f..080e9a8c2 100644 --- a/lenskit/lenskit/splitting/__init__.py +++ b/lenskit/lenskit/splitting/__init__.py @@ -7,3 +7,5 @@ """ Splitting data for train-test evaluation. """ + +from .split import TTSplit # noqa: F401 diff --git a/lenskit/lenskit/splitting/split.py b/lenskit/lenskit/splitting/split.py new file mode 100644 index 000000000..579feaffd --- /dev/null +++ b/lenskit/lenskit/splitting/split.py @@ -0,0 +1,52 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +from typing import Literal, NamedTuple, TypeAlias + +import pandas as pd + +from lenskit.data.dataset import Dataset +from lenskit.data.items import ItemList +from lenskit.data.vocab import EntityId + +SplitTable: TypeAlias = Literal["matrix"] + + +class TTSplit(NamedTuple): + """ + A train-test pair from splitting. + """ + + train: Dataset + """ + The training data. + """ + + test: dict[EntityId, ItemList] + """ + The test data. + """ + + +def dict_to_df(data: dict[EntityId, ItemList]) -> pd.DataFrame: + """ + Convert a dictionary mapping user IDs to item lists into a data frame. + """ + + df = pd.concat( + {u: il.to_df(numbers=False) for (u, il) in data.items()}, + names=["user_id"], + ) + df = df.reset_index("user_id") + df = df.reset_index(drop=True) + return df + + +def dict_from_df(df: pd.DataFrame) -> dict[EntityId, ItemList]: + """ + Convert a dictionary mapping user IDs to item lists into a data frame. 
+ """ + return {u: ItemList.from_df(udf) for (u, udf) in df.groupby("user_id")} # type: ignore diff --git a/lenskit/lenskit/splitting/types.py b/lenskit/lenskit/splitting/types.py deleted file mode 100644 index 8190feba5..000000000 --- a/lenskit/lenskit/splitting/types.py +++ /dev/null @@ -1,27 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University -# Copyright (C) 2023-2024 Drexel University -# Licensed under the MIT license, see LICENSE.md for details. -# SPDX-License-Identifier: MIT - -from typing import NamedTuple - -import pandas as pd - -from lenskit.data.dataset import Dataset - - -class TTPair(NamedTuple): - """ - A train-test pair from splitting. - """ - - train: Dataset - """ - The training data. - """ - - test: pd.DataFrame - """ - The test data. - """ diff --git a/lenskit/tests/test_split_types.py b/lenskit/tests/test_split_types.py new file mode 100644 index 000000000..45c70f82b --- /dev/null +++ b/lenskit/tests/test_split_types.py @@ -0,0 +1,24 @@ +""" +Test the data type utilities in splits. +""" + +import numpy as np +import pandas as pd + +from lenskit.splitting.split import dict_from_df + + +def test_dict_from_df(rng, ml_ratings: pd.DataFrame): + ml_ratings = ml_ratings.rename(columns={"user": "user_id", "item": "item_id"}) + users = dict_from_df(ml_ratings) + assert len(users) == ml_ratings["user_id"].nunique() + assert set(users.keys()) == set(ml_ratings["user_id"]) + + for uid in rng.choice(ml_ratings["user_id"].unique(), 25): + items = users[uid] + udf = ml_ratings[ml_ratings["user_id"] == uid] + assert len(items) == len(udf) + assert np.all(np.unique(items.ids()) == np.unique(udf["item_id"])) + + tot = sum(len(il) for il in users.values()) + assert tot == len(ml_ratings) From 27fb76c5c622742d275144c48ac196e3626169d4 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:29:43 -0400 Subject: [PATCH 14/22] make record-based splitting work --- lenskit/lenskit/splitting/records.py | 170 +++++++++++++++++++++++++++ lenskit/tests/test_split_records.py | 164 ++++++++++++++++++++++++++ lenskit/tests/test_split_rows.py | 121 ------------------- 3 files changed, 334 insertions(+), 121 deletions(-) create mode 100644 lenskit/lenskit/splitting/records.py create mode 100644 lenskit/tests/test_split_records.py delete mode 100644 lenskit/tests/test_split_rows.py diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py new file mode 100644 index 000000000..a722247fc --- /dev/null +++ b/lenskit/lenskit/splitting/records.py @@ -0,0 +1,170 @@ +import logging +from typing import Iterator, overload + +import numpy as np +import pandas as pd +from seedbank import numpy_rng + +from lenskit.data.dataset import Dataset, MatrixDataset + +from .split import TTSplit, dict_from_df + +_log = logging.getLogger(__name__) + + +def crossfold_records(data: Dataset, partitions: int, *, rng_spec=None) -> Iterator[TTSplit]: + """ + Partition a dataset by **records** into cross-fold partitions. This + partitions the records (ratings, play counts, clicks, etc.) into *k* + partitions without regard to users or items. + + Since record-based random cross-validation doesn't make much sense with + repeated interactions, this splitter only supports operating on the + dataset's interaction matrix. + + Args: + data: + Ratings or other data you wish to partition. + partitions: + The number of partitions to produce. + rng_spec: + The random number generator or seed (see + :func:`seedbank.numpy_rng`). 
+
+    Returns:
+        iterator: an iterator of train-test pairs
+    """
+
+    _log.info("partitioning %d ratings into %d partitions", data.count("pairs"), partitions)
+    rng = numpy_rng(rng_spec)
+
+    # get the full data list to split
+    df = data.interaction_matrix(format="pandas", field="all", original_ids=True)
+    n = len(df)
+    rows = np.arange(n)
+
+    # shuffle the indices & split into partitions
+    rng.shuffle(rows)
+    test_sets = np.array_split(rows, partitions)
+
+    # convert each partition into a split
+    for ts in test_sets:
+        yield _make_pair(data, df, ts)
+
+
+@overload
+def sample_records(
+    data: Dataset, size: int, *, disjoint=True, rng_spec=None, repeats: None = None
+) -> TTSplit: ...
+@overload
+def sample_records(
+    data: Dataset, size: int, *, repeats: int, disjoint=True, rng_spec=None
+) -> Iterator[TTSplit]: ...
+def sample_records(
+    data: Dataset, size: int, *, repeats: int | None = None, disjoint=True, rng_spec=None
+) -> TTSplit | Iterator[TTSplit]:
+    """
+    Sample train-test splits of interaction records from a dataset.  Records
+    are sampled from the dataset's interaction matrix, without regard to users
+    or items.
+
+    We can loop over a sequence of train-test pairs::
+
+        >>> from lenskit.data.movielens import load_movielens_df
+        >>> ratings = load_movielens_df('data/ml-latest-small')
+        >>> for train, test in sample_records(ratings, 1000, repeats=5):
+        ...     print(sum(len(il) for il in test.values()))
+        1000
+        1000
+        1000
+        1000
+        1000
+
+    Sometimes for testing, it is useful to just get a single pair::
+
+        >>> train, test = sample_records(ratings, 1000)
+        >>> sum(len(il) for il in test.values())
+        1000
+
+    Args:
+        data:
+            The data set to split.
+        size:
+            The size of each test sample.
+        repeats:
+            The number of data splits to produce. If ``None``, produce a
+            _single_ train-test pair instead of an iterator or list.
+        disjoint:
+            If ``True``, force test samples to be disjoint.
+        rng_spec:
+            The random number generator or seed (see
+            :py:func:`seedbank.numpy_rng`).
+
+    Returns:
+        A train-test pair or iterator of such pairs (depending on ``repeats``).
+ """ + + rng = numpy_rng(rng_spec) + + # get the full data list to split + df = data.interaction_matrix(format="pandas", field="all", original_ids=True) + n = len(df) + + if repeats is None: + test_pos = rng.choice(np.int32(n), size, replace=False) + return _make_pair(data, df, test_pos) + + if disjoint and repeats * size >= n: + _log.warning( + "wanted %d disjoint splits of %d each, but only have %d rows; cross-folding", + repeats, + size, + n, + ) + return crossfold_records(data, repeats, rng_spec=rng) + + # get iterators over index arrays for producing the data + if disjoint: + _log.info("creating %d disjoint samples of size %d", repeats, size) + ips = _disjoint_samples(n, size, repeats, rng) + + else: + _log.info("taking %d samples of size %d", repeats, size) + ips = _n_samples(n, size, repeats, rng) + + # since this func is both generator and return depending on args, + # we can't use yield — need to return a generator expression + return (_make_pair(data, df, test_is) for test_is in ips) + + +def _make_pair( + data: Dataset, df: pd.DataFrame, test_is: np.ndarray[int, np.dtype[np.int32]] +) -> TTSplit: + mask = np.zeros(len(df), np.bool_) + mask[test_is] = True + + test = dict_from_df(df[mask]) + train = MatrixDataset(data.users, data.items, df[~mask]) + + return TTSplit(train, test) + + +def _disjoint_samples( + n: int, size: int, reps: int, rng: np.random.Generator +) -> Iterator[np.ndarray[int, np.dtype[np.int32]]]: + # shuffle the indices & split into partitions + xs = np.arange(n, dtype=np.int32) + rng.shuffle(xs) + + # convert each partition into a split + for i in range(reps): + start = i * size + end = start + size + yield xs[start:end] + + +def _n_samples( + n: int, size: int, reps: int, rng: np.random.Generator +) -> Iterator[np.ndarray[int, np.dtype[np.int32]]]: + for i in range(reps): + yield rng.choice(np.int32(n), size, replace=False) diff --git a/lenskit/tests/test_split_records.py b/lenskit/tests/test_split_records.py new file mode 100644 index 000000000..8baade56c --- /dev/null +++ b/lenskit/tests/test_split_records.py @@ -0,0 +1,164 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University +# Copyright (C) 2023-2024 Drexel University +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import functools as ft +import itertools as it +import math + +import numpy as np +import pandas as pd + +import pytest + +from lenskit.data.dataset import Dataset +from lenskit.splitting.records import crossfold_records, sample_records + + +def test_crossfold_records(ml_ds: Dataset): + splits = crossfold_records(ml_ds, 5) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + # do we have all the data? 
+ test_count = sum(len(il) for il in s.test.values()) + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + # the test sets are pairwise disjoint + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + p1 = set((u, i) for (u, il) in s1.test.items() for i in il.ids()) + p2 = set((u, i) for (u, il) in s2.test.items() for i in il.ids()) + assert not (p1 & p2) + + +def test_sample_records_once(ml_ds): + train, test = sample_records(ml_ds, size=1000) + + test_count = sum(len(il) for il in test.values()) + assert test_count == 1000 + assert test_count + train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in test.items() for i in il.ids()) + tdf = train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + +def test_sample_records(ml_ds): + splits = sample_records(ml_ds, size=1000, repeats=5) + splits = list(splits) + assert len(splits) == 5 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count == 1000 + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + p1 = set((u, i) for (u, il) in s1.test.items() for i in il.ids()) + p2 = set((u, i) for (u, il) in s2.test.items() for i in il.ids()) + assert not (p1 & p2) + + +def test_sample_rows_more_smaller_parts(ml_ds: Dataset): + splits = sample_records(ml_ds, 500, repeats=10) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count == 500 + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + for s1, s2 in it.product(splits, splits): + if s1 is s2: + continue + + p1 = set((u, i) for (u, il) in s1.test.items() for i in il.ids()) + p2 = set((u, i) for (u, il) in s2.test.items() for i in il.ids()) + assert not (p1 & p2) + + +def test_sample_non_disjoint(ml_ds: Dataset): + splits = sample_records(ml_ds, 1000, repeats=10, disjoint=False) + splits = list(splits) + assert len(splits) == 10 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count == 1000 + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, 
il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") + + # There are enough splits & items we should pick at least one duplicate + ipairs = ( + ( + set((u, i) for (u, il) in s1.test.items() for i in il.ids()), + set((u, i) for (u, il) in s2.test.items() for i in il.ids()), + ) + for (s1, s2) in it.product(splits, splits) + ) + isizes = [len(i1.intersection(i2)) for (i1, i2) in ipairs] + assert any(n > 0 for n in isizes) + + +@pytest.mark.slow +def test_sample_oversize(ml_ds: Dataset): + splits = sample_records(ml_ds, 10000, repeats=50) + splits = list(splits) + assert len(splits) == 50 + + for s in splits: + test_count = sum(len(il) for il in s.test.values()) + assert test_count + s.train.interaction_count == ml_ds.count("pairs") + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + + # no overlap + assert not (test_pairs & train_pairs) + # union is complete + assert len(test_pairs | train_pairs) == ml_ds.count("pairs") diff --git a/lenskit/tests/test_split_rows.py b/lenskit/tests/test_split_rows.py deleted file mode 100644 index 5e061b87f..000000000 --- a/lenskit/tests/test_split_rows.py +++ /dev/null @@ -1,121 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University -# Copyright (C) 2023-2024 Drexel University -# Licensed under the MIT license, see LICENSE.md for details. -# SPDX-License-Identifier: MIT - -import functools as ft -import itertools as it -import math - -import numpy as np -import pandas as pd - -import pytest - -import lenskit.crossfold as xf - - -def test_partition_rows(ml_ratings: pd.DataFrame): - splits = xf.partition_rows(ml_ratings, 5) - splits = list(splits) - assert len(splits) == 5 - - for s in splits: - assert len(s.test) + len(s.train) == len(ml_ratings) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - # we should partition! 
- for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - i1 = s1.test.set_index(["user", "item"]).index - i2 = s2.test.set_index(["user", "item"]).index - inter = i1.intersection(i2) - assert len(inter) == 0 - - union = ft.reduce(lambda i1, i2: i1.union(i2), (s.test.index for s in splits)) - assert len(union.unique()) == len(ml_ratings) - - -def test_sample_rows(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, partitions=5, size=1000) - splits = list(splits) - assert len(splits) == 5 - - for s in splits: - assert len(s.test) == 1000 - assert len(s.test) + len(s.train) == len(ml_ratings) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - i1 = s1.test.set_index(["user", "item"]).index - i2 = s2.test.set_index(["user", "item"]).index - inter = i1.intersection(i2) - assert len(inter) == 0 - - -def test_sample_rows_more_smaller_parts(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, partitions=10, size=500) - splits = list(splits) - assert len(splits) == 10 - - for s in splits: - assert len(s.test) == 500 - assert len(s.test) + len(s.train) == len(ml_ratings) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - i1 = s1.test.set_index(["user", "item"]).index - i2 = s2.test.set_index(["user", "item"]).index - inter = i1.intersection(i2) - assert len(inter) == 0 - - -def test_sample_non_disjoint(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, partitions=10, size=1000, disjoint=False) - splits = list(splits) - assert len(splits) == 10 - - for s in splits: - assert len(s.test) == 1000 - assert len(s.test) + len(s.train) == len(ml_ratings) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 - - # There are enough splits & items we should pick at least one duplicate - ipairs = ( - (s1.test.set_index(["user", "item"]).index, s2.test.set_index(["user", "item"]).index) - for (s1, s2) in it.product(splits, splits) - ) - isizes = [len(i1.intersection(i2)) for (i1, i2) in ipairs] - assert any(n > 0 for n in isizes) - - -@pytest.mark.slow -def test_sample_oversize(ml_ratings: pd.DataFrame): - splits = xf.sample_rows(ml_ratings, 50, 10000) - splits = list(splits) - assert len(splits) == 50 - - for s in splits: - assert len(s.test) + len(s.train) == len(ml_ratings) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - test_idx = s.test.set_index(["user", "item"]).index - train_idx = s.train.set_index(["user", "item"]).index - assert len(test_idx.intersection(train_idx)) == 0 From bc35baa1fe0844fd32426e2f7a5a69ddf57a4808 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 17:36:48 -0400 Subject: [PATCH 15/22] add test-size utlity --- lenskit/lenskit/splitting/split.py | 7 +++++++ lenskit/tests/test_split_records.py | 15 ++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lenskit/lenskit/splitting/split.py b/lenskit/lenskit/splitting/split.py index 579feaffd..393424c78 100644 --- a/lenskit/lenskit/splitting/split.py +++ b/lenskit/lenskit/splitting/split.py @@ -30,6 +30,13 @@ class TTSplit(NamedTuple): The test 
data. """ + @property + def test_size(self) -> int: + """ + Get the number of test pairs. + """ + return sum(len(il) for il in self.test.values()) + def dict_to_df(data: dict[EntityId, ItemList]) -> pd.DataFrame: """ diff --git a/lenskit/tests/test_split_records.py b/lenskit/tests/test_split_records.py index 8baade56c..a4e565dfd 100644 --- a/lenskit/tests/test_split_records.py +++ b/lenskit/tests/test_split_records.py @@ -24,7 +24,7 @@ def test_crossfold_records(ml_ds: Dataset): for s in splits: # do we have all the data? - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) @@ -46,9 +46,10 @@ def test_crossfold_records(ml_ds: Dataset): def test_sample_records_once(ml_ds): - train, test = sample_records(ml_ds, size=1000) + split = sample_records(ml_ds, size=1000) + train, test = split - test_count = sum(len(il) for il in test.values()) + test_count = split.test_size assert test_count == 1000 assert test_count + train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in test.items() for i in il.ids()) @@ -67,7 +68,7 @@ def test_sample_records(ml_ds): assert len(splits) == 5 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count == 1000 assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) @@ -94,7 +95,7 @@ def test_sample_rows_more_smaller_parts(ml_ds: Dataset): assert len(splits) == 10 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count == 500 assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) @@ -121,7 +122,7 @@ def test_sample_non_disjoint(ml_ds: Dataset): assert len(splits) == 10 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count == 1000 assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) @@ -152,7 +153,7 @@ def test_sample_oversize(ml_ds: Dataset): assert len(splits) == 50 for s in splits: - test_count = sum(len(il) for il in s.test.values()) + test_count = s.test_size assert test_count + s.train.interaction_count == ml_ds.count("pairs") test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) From cbfd00cb15dfb2bcddcd4b45ac2fd3a25a4f8a65 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:25:34 -0400 Subject: [PATCH 16/22] make vocabularies iterable --- lenskit/lenskit/data/vocab.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/lenskit/lenskit/data/vocab.py b/lenskit/lenskit/data/vocab.py index f4bdf7ca3..cf11be40c 100644 --- a/lenskit/lenskit/data/vocab.py +++ b/lenskit/lenskit/data/vocab.py @@ -11,7 +11,18 @@ # pyright: basic from __future__ import annotations -from typing import Any, Generic, Hashable, Iterable, Literal, Sequence, TypeAlias, TypeVar, overload +from typing import ( + Any, + Generic, + Hashable, + Iterable, + Iterator, + Literal, + Sequence, + TypeAlias, + TypeVar, + overload, +) import numpy 
as np import pandas as pd @@ -159,6 +170,9 @@ def __eq__(self, other: Vocabulary[Any]) -> bool: # noqa: F821 def __contains__(self, key: VT) -> bool: return key in self._index + def __iter__(self) -> Iterator[EntityId]: + return iter(self._index.values) + def __len__(self) -> int: return self.size From 6bfc75a81206e1d6a4057529637e1a2fca530943 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:25:55 -0400 Subject: [PATCH 17/22] strengthen user row tests --- lenskit/tests/test_dataset_matrix.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lenskit/tests/test_dataset_matrix.py b/lenskit/tests/test_dataset_matrix.py index eb994d763..3a88c9041 100644 --- a/lenskit/tests/test_dataset_matrix.py +++ b/lenskit/tests/test_dataset_matrix.py @@ -372,12 +372,25 @@ def test_matrix_rows_by_id(rng: np.random.Generator, ml_ratings: pd.DataFrame, m def test_matrix_rows_by_num(rng: np.random.Generator, ml_ratings: pd.DataFrame, ml_ds: Dataset): users = rng.choice(ml_ds.user_count, 50) + rated = set(zip(ml_ratings["user"], ml_ratings["item"])) + rdf = ml_ds.interaction_matrix("pandas") + rnums = set(zip(rdf["user_num"], rdf["item_num"])) + + dfi = ml_ratings.set_index(["user", "item"]) + for user in users: + uid = ml_ds.users.id(user) row = ml_ds.user_row(user_num=user) assert row is not None urows = ml_ratings[ml_ratings["user"] == ml_ds.users.id(user)].sort_values("item") assert set(row.ids()) == set(urows["item"]) + assert np.all(row.numbers() == ml_ds.items.numbers(urows["item"])) + assert all((user, ino) in rnums for ino in row.numbers()) + + assert np.all(row.ids() == ml_ds.items.ids(row.numbers())) + assert all((uid, item) in rated for item in row.ids()) + assert all((uid, item) in dfi.index for item in row.ids()) ratings = row.field("rating") assert ratings is not None From 0f8277fae6d55daef95a8417ffc2c7de4517c7df Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:26:06 -0400 Subject: [PATCH 18/22] working user-based splitting --- lenskit/lenskit/splitting/records.py | 26 ++- lenskit/lenskit/splitting/users.py | 171 ++++++++++++++++++++ lenskit/tests/test_split_users.py | 227 +++++++++++---------------- 3 files changed, 288 insertions(+), 136 deletions(-) create mode 100644 lenskit/lenskit/splitting/users.py diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py index a722247fc..82bde8d45 100644 --- a/lenskit/lenskit/splitting/records.py +++ b/lenskit/lenskit/splitting/records.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from seedbank import numpy_rng +from seedbank.numpy import NPRNGSource from lenskit.data.dataset import Dataset, MatrixDataset @@ -12,7 +13,9 @@ _log = logging.getLogger(__name__) -def crossfold_records(data: Dataset, partitions: int, *, rng_spec=None) -> Iterator[TTSplit]: +def crossfold_records( + data: Dataset, partitions: int, *, rng_spec: NPRNGSource | None = None +) -> Iterator[TTSplit]: """ Partition a dataset by **records** into cross-fold partitions. This partitions the records (ratings, play counts, clicks, etc.) into *k* @@ -54,14 +57,29 @@ def crossfold_records(data: Dataset, partitions: int, *, rng_spec=None) -> Itera @overload def sample_records( - data: Dataset, size: int, *, disjoint=True, rng_spec=None, repeats: None = None + data: Dataset, + size: int, + *, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, + repeats: None = None, ) -> TTSplit: ... 
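For readers less familiar with ``typing.overload``, the stub signature just above and
the one just below encode how the return type follows ``repeats``; a small sketch of
the two call forms, assuming ``ml_ds`` is an already-loaded dataset (illustrative name)::

    from lenskit.splitting.records import sample_records

    split = sample_records(ml_ds, 1000)              # repeats=None, returns one TTSplit
    train, test = split                              # TTSplit is a NamedTuple, so it unpacks
    splits = sample_records(ml_ds, 1000, repeats=5)  # integer repeats, returns Iterator[TTSplit]
    first = next(splits)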
@overload def sample_records( - data: Dataset, size: int, *, repeats: int, disjoint=True, rng_spec=None + data: Dataset, + size: int, + *, + repeats: int, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, ) -> Iterator[TTSplit]: ... def sample_records( - data: Dataset, size: int, *, repeats: int | None = None, disjoint=True, rng_spec=None + data: Dataset, + size: int, + *, + repeats: int | None = None, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, ) -> TTSplit | Iterator[TTSplit]: """ Sample train-test a frame of ratings into train-test partitions. This diff --git a/lenskit/lenskit/splitting/users.py b/lenskit/lenskit/splitting/users.py new file mode 100644 index 000000000..1fb59acbb --- /dev/null +++ b/lenskit/lenskit/splitting/users.py @@ -0,0 +1,171 @@ +import logging +from typing import Iterable, Iterator, overload + +import numpy as np +import pandas as pd +from seedbank import numpy_rng +from seedbank.numpy import NPRNGSource + +from lenskit.data.dataset import Dataset, MatrixDataset +from lenskit.data.vocab import EntityId + +from .holdout import HoldoutMethod +from .split import TTSplit + +_log = logging.getLogger(__name__) + + +def crossfold_users( + data: Dataset, partitions: int, method: HoldoutMethod, *, rng_spec: NPRNGSource | None = None +) -> Iterator[TTSplit]: + """ + Partition a frame of ratings or other data into train-test partitions + user-by-user. This function does not care what kind of data is in `data`, so + long as it is a Pandas DataFrame (or equivalent) and has a `user` column. + + Args: + data: + a data frame containing ratings or other data you wish to partition. + partitions: + the number of partitions to produce + method: + The method for selecting test rows for each user. + rng_spec: + The RNG or seed (see :func:`seedbank.numpy_rng`). + + Returns + The train-test pairs. + """ + rng = numpy_rng(rng_spec) + + users = data.users.ids() + _log.info( + "partitioning %d rows for %d users into %d partitions", + data.count("pairs"), + len(users), + partitions, + ) + + # create an array of indexes into user row + rows = np.arange(len(users)) + # shuffle the indices & split into partitions + rng.shuffle(rows) + test_sets = np.array_split(rows, partitions) + + # get the whole test DF + df = data.interaction_matrix("pandas", field="all", original_ids=True).set_index( + ["user_id", "item_id"] + ) + + # convert each partition into a split + for i, ts in enumerate(test_sets): + # get our users! + test_us = users[ts] + _log.info("fold %d: selecting test ratings", i) + + yield _make_split(data, df, test_us, method) + + +@overload +def sample_users( + data: Dataset, + size: int, + method: HoldoutMethod, + *, + repeats: int, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, +) -> Iterator[TTSplit]: ... +@overload +def sample_users( + data: Dataset, + size: int, + method: HoldoutMethod, + *, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, + repeats: None = None, +) -> TTSplit: ... +def sample_users( + data: Dataset, + size: int, + method: HoldoutMethod, + *, + repeats: int | None = None, + disjoint: bool = True, + rng_spec: NPRNGSource | None = None, +) -> Iterator[TTSplit] | TTSplit: + """ + Create train-test splits by sampling users. When ``repeats`` is None, + returns a single train-test split; otherwise, it returns an iterator over + multiple splits. If ``repeats=1``, this function returns an iterator that + yields a single train-test pair. 
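A rough illustration of looping over disjoint user samples; a sketch only, assuming
``ml_ds`` is a loaded dataset with at least 500 users (names and sizes are placeholders)::

    from lenskit.splitting import SampleN, sample_users

    seen = set()
    # ml_ds: an already-loaded Dataset (assumed for illustration)
    for split in sample_users(ml_ds, 100, SampleN(5), repeats=5):
        assert len(split.test) == 100                    # 100 sampled test users
        assert not any(u in seen for u in split.test)    # disjoint across samples by default
        seen |= split.test.keys()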
+ + Args: + data: + Data frame containing ratings or other data you wish to partition. + size: + The sample size. + method: + The method for obtaining user test ratings. + repeats: + The number of samples to produce. + rng_spec: + The RNG or seed (see :func:`seedbank.numpy_rng`). + + Returns: + The train-test pair(s). + """ + + rng = numpy_rng(rng_spec) + + users = data.users.ids() + unums = np.arange(len(users)) + if disjoint and repeats is not None and repeats * size >= len(users): + _log.warning( + "cannot take %d disjoint samples of size %d from %d users", repeats, size, len(users) + ) + return crossfold_users(data, repeats, method) + + _log.info("sampling %d users (n=%d)", len(users), size) + + # get the whole test DF + rate_df = data.interaction_matrix("pandas", field="all", original_ids=True).set_index( + ["user_id", "item_id"] + ) + + if repeats is None: + test_us = rng.choice(users, size, replace=False) + return _make_split(data, rate_df, test_us, method) + + if disjoint: + rng.shuffle(unums) + test_usets = [unums[i * size : (i + 1) * size] for i in range(repeats)] + else: + test_usets = [rng.choice(len(users), size, replace=False) for _i in range(repeats)] + + return (_make_split(data, rate_df, users[us], method) for us in test_usets) + + +def _make_split( + data: Dataset, df: pd.DataFrame, test_us: Iterable[EntityId], method: HoldoutMethod +) -> TTSplit: + # create the test sets for these users + mask = pd.Series(True, index=df.index) + test = {} + + for u in test_us: + row = data.user_row(u) + assert row is not None + u_test = method(row) + test[u] = u_test + assert all((u, i) in mask.index for i in u_test.ids()) + mask.loc[[(u, i) for i in u_test.ids()]] = False # type: ignore + assert np.sum(mask.loc[u]) == len(row) - len(u_test) + + train_df = df[mask] + train = MatrixDataset(data.users, data.items, train_df.reset_index()) + + split = TTSplit(train, test) + assert len(train_df) + split.test_size == len(df) + return split diff --git a/lenskit/tests/test_split_users.py b/lenskit/tests/test_split_users.py index 2a6671b1c..ed6ac7b44 100644 --- a/lenskit/tests/test_split_users.py +++ b/lenskit/tests/test_split_users.py @@ -13,181 +13,144 @@ import pytest -import lenskit.crossfold as xf +from lenskit.data.dataset import Dataset, from_interactions_df +from lenskit.splitting.holdout import SampleFrac, SampleN +from lenskit.splitting.users import crossfold_users, sample_users -def test_partition_users(ml_ratings: pd.DataFrame): - splits = xf.partition_users(ml_ratings, 5, xf.SampleN(5)) +def test_crossfold_users(ml_ds: Dataset): + splits = crossfold_users(ml_ds, 5, SampleN(5)) splits = list(splits) assert len(splits) == 5 + users = set() for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert all(s.train["user"].isin(s.train["user"].unique())) - assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(len(il) for il in s.test.values()) + assert not any(u in users for u in s.test.keys()) + users |= s.test.keys() - users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ml_ratings.user.nunique() - assert users == set(ml_ratings.user) + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert not test_pairs & train_pairs + assert s.test_size + 
s.train.count("pairs") == ml_ds.count("pairs") + assert users == set(ml_ds.users) -def test_partition_may_skip_train(ml_ratings: pd.DataFrame): + +def test_crossfold_may_skip_train(ml_ratings: pd.DataFrame): "Partitioning when users may not have enough ratings to be in the train and test sets." # make a data set where some users only have 1 rating ml_ratings = ml_ratings.sample(frac=0.1) - users = ml_ratings.groupby("user")["rating"].count() - assert users.min() == 1.0 # we should have some small users! - users.name = "ur_count" + ucounts = ml_ratings.groupby("user")["rating"].count() + assert ucounts.min() == 1 # we should have some small users! + ucounts.name = "ur_count" + ml_ds = from_interactions_df(ml_ratings) - splits = xf.partition_users(ml_ratings, 5, xf.SampleN(1)) + splits = crossfold_users(ml_ds, 5, SampleN(1)) splits = list(splits) assert len(splits) == 5 # now we go make sure we're missing some users! And don't have any NaN ml_ratings for train, test in splits: - # no null ml_ratings - assert all(train["rating"].notna()) - # see if test users with 1 rating are missing from train - test = test.join(users, on="user") - assert all(~(test.loc[test["ur_count"] == 1, "user"].isin(train["user"].unique()))) - # and users with more than one rating are in train - assert all(test.loc[test["ur_count"] > 1, "user"].isin(train["user"].unique())) + for u in ucounts[ucounts == 1].index: + if u in test: + row = train.user_row(u) + assert row is not None + assert len(row) == 0 -def test_partition_users_frac(ml_ratings: pd.DataFrame): - splits = xf.partition_users(ml_ratings, 5, xf.SampleFrac(0.2)) +def test_crossfold_users_frac(ml_ds: Dataset): + splits = crossfold_users(ml_ds, 5, SampleFrac(0.2)) splits = list(splits) assert len(splits) == 5 - ucounts = ml_ratings.groupby("user").item.count() - uss = ucounts * 0.2 + ustats = ml_ds.user_stats() + uss = ustats["count"] * 0.2 for s in splits: - tucs = s.test.groupby("user").item.count() - assert all(tucs >= uss.loc[tucs.index] - 1) - assert all(tucs <= uss.loc[tucs.index] + 1) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) + assert all(len(il) >= uss.loc[u] - 1 for (u, il) in s.test.items()) + assert all(len(il) <= uss.loc[u] + 1 for (u, il) in s.test.items()) + assert s.test_size + s.train.count("pairs") == ml_ds.count("pairs") + + +def test_sample_users_single(ml_ds: Dataset): + split = sample_users(ml_ds, 100, SampleN(5)) - # we have all users - users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ml_ratings.user.nunique() - assert users == set(ml_ratings.user) + assert len(split.test) == 100 + assert split.test_size == 500 + test_pairs = set((u, i) for (u, il) in split.test.items() for i in il.ids()) + assert len(test_pairs) == split.test_size + tdf = split.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert len(train_pairs) == split.train.count("pairs") + assert len(test_pairs & train_pairs) == 0 + assert split.test_size + split.train.count("pairs") == ml_ds.count("pairs") -def test_sample_users(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)) + +def test_sample_users(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=5) splits = list(splits) assert len(splits) == 5 + aus = set() for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert len(s.test) == 5 * 
100 - assert len(ucounts) == 100 - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - # no overlapping users - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - us1 = s1.test.user.unique() - us2 = s2.test.user.unique() - assert len(np.intersect1d(us1, us2)) == 0 - - -def test_sample_users_frac(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 5, 100, xf.SampleFrac(0.2)) + assert len(s.test) == 100 + assert s.test_size == 500 + # users are disjoint + assert not any(u in aus for u in s.test.keys()) + aus |= s.test.keys() + + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + assert len(test_pairs) == s.test_size + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert len(train_pairs) == s.train.count("pairs") + assert len(test_pairs & train_pairs) == 0 + assert s.test_size + s.train.count("pairs") == ml_ds.count("pairs") + + +def test_sample_users_non_disjoint(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=5, disjoint=False) splits = list(splits) assert len(splits) == 5 - ucounts = ml_ratings.groupby("user").item.count() - uss = ucounts * 0.2 + + aus = set() for s in splits: - tucs = s.test.groupby("user").item.count() - assert len(tucs) == 100 - assert all(tucs >= uss.loc[tucs.index] - 1) - assert all(tucs <= uss.loc[tucs.index] + 1) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - # no overlapping users - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - us1 = s1.test.user.unique() - us2 = s2.test.user.unique() - assert len(np.intersect1d(us1, us2)) == 0 + assert len(s.test) == 100 + assert s.test_size == 500 + aus |= s.test.keys() + + test_pairs = set((u, i) for (u, il) in s.test.items() for i in il.ids()) + assert len(test_pairs) == s.test_size + tdf = s.train.interaction_matrix("pandas", field="rating", original_ids=True) + train_pairs = set(zip(tdf["user_id"], tdf["item_id"])) + assert len(train_pairs) == s.train.count("pairs") + assert len(test_pairs & train_pairs) == 0 + assert s.test_size + s.train.count("pairs") == ml_ds.count("pairs") + + # some user appears at least once + assert len(aus) < 500 @pytest.mark.slow -def test_sample_users_frac_oversize(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5)) +def test_sample_users_frac_oversize(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=20) splits = list(splits) assert len(splits) == 20 for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert len(ucounts) < 100 - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits)) - assert len(users) == ml_ratings.user.nunique() - assert users == set(ml_ratings.user) - for s1, s2 in it.product(splits, splits): - if s1 is s2: - continue - - us1 = s1.test.user.unique() - us2 = s2.test.user.unique() - assert len(np.intersect1d(us1, us2)) == 0 - - -def test_sample_users_frac_oversize_ndj(ml_ratings: pd.DataFrame): - splits = xf.sample_users(ml_ratings, 20, 100, xf.SampleN(5), disjoint=False) + assert len(s.test) < 100 + assert all(len(il) == 5 for il in s.test.values()) + + +def 
test_sample_users_frac_oversize_ndj(ml_ds: Dataset): + splits = sample_users(ml_ds, 100, SampleN(5), repeats=20, disjoint=False) splits = list(splits) assert len(splits) == 20 for s in splits: - ucounts = s.test.groupby("user").agg("count") - assert len(ucounts) == 100 - assert len(s.test) == 5 * 100 - assert all(ucounts == 5) - assert all(s.test.index.union(s.train.index) == ml_ratings.index) - assert len(s.test) + len(s.train) == len(ml_ratings) - - -def test_non_unique_index_partition_users(ml_ratings: pd.DataFrame): - """Partitioning users when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): - pass - - -def test_sample_users_dup_index(ml_ratings: pd.DataFrame): - """Sampling users when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.sample_users(ml_ratings, 5, 100, xf.SampleN(5)): - pass - - -def test_sample_rows_dup_index(ml_ratings: pd.DataFrame): - """Sampling ml_ratings when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.sample_rows(ml_ratings, partitions=5, size=1000): - pass - - -def test_partition_users_dup_index(ml_ratings: pd.DataFrame): - """Partitioning ml_ratings when dataframe has non-unique indices""" - ml_ratings = ml_ratings.set_index("user") ##forces non-unique index - with pytest.raises(ValueError): - for split in xf.partition_users(ml_ratings, 5, xf.SampleN(5)): - pass + assert len(s.test) == 100 + assert s.test_size == 5 * 100 + assert all([len(il) for il in s.test.values()]) From b07f2c7c9041a71ec01c65ec6a602c4e51652273 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:37:57 -0400 Subject: [PATCH 19/22] update documentation --- docs/crossfold.rst | 85 ------------------------ docs/index.rst | 2 +- docs/splitting.rst | 93 +++++++++++++++++++++++++++ lenskit/lenskit/splitting/__init__.py | 5 +- 4 files changed, 98 insertions(+), 87 deletions(-) delete mode 100644 docs/crossfold.rst create mode 100644 docs/splitting.rst diff --git a/docs/crossfold.rst b/docs/crossfold.rst deleted file mode 100644 index 3707344e3..000000000 --- a/docs/crossfold.rst +++ /dev/null @@ -1,85 +0,0 @@ -Splitting Data -============== - -.. module:: lenskit.crossfold - -The LKPY `crossfold` module provides support for preparing data sets for -cross-validation. Crossfold methods are implemented as functions that operate -on data frames and return generators of `(train, test)` pairs -(:py:class:`lenskit.crossfold.TTPair` objects). The train and test objects -in each pair are also data frames, suitable for evaluation or writing out to -a file. - -Crossfold methods make minimal assumptions about their input data frames, so the -frames can be ratings, purchases, or whatever. They do assume that each row -represents a single data point for the purpose of splitting and sampling. - -Experiment code should generally use these functions to prepare train-test files -for training and evaluating algorithms. 
For example, the following will perform -a user-based 5-fold cross-validation as was the default in the old LensKit:: - - import pandas as pd - import lenskit.crossfold as xf - ratings = pd.read_csv('ml-20m/ratings.csv') - ratings = ratings.rename(columns={'userId': 'user', 'movieId': 'item'}) - for i, tp in enumerate(xf.partition_users(ratings, 5, xf.SampleN(5))): - tp.train.to_csv('ml-20m.exp/train-%d.csv' % (i,)) - tp.train.to_parquet('ml-20m.exp/train-%d.parquet % (i,)) - tp.test.to_csv('ml-20m.exp/test-%d.csv' % (i,)) - tp.test.to_parquet('ml-20m.exp/test-%d.parquet % (i,)) - -Row-based splitting -------------------- - -The simplest preparation methods sample or partition the rows in the input frame. -A 5-fold :py:func:`partition_rows` split will result in 5 -splits, each of which extracts 20% of the rows for testing and leaves 80% for -training. - -.. autofunction:: partition_rows - -.. autofunction:: sample_rows - -User-based splitting --------------------- - -It's often desirable to use users, instead of raw rows, as the basis for splitting -data. This allows you to control the experimental conditions on a user-by-user basis, -e.g. by making sure each user is tested with the same number of ratings. These methods -require that the input data frame have a `user` column with the user names or identifiers. - -The algorithm used by each is as follows: - -1. Sample or partition the set of user IDs into *n* sets of test users. -2. For each set of test users, select a set of that user's rows to be test rows. -3. Create a training set for each test set consisting of the non-selected rows from each - of that set's test users, along with all rows from each non-test user. - -.. autofunction:: partition_users - -.. autofunction:: sample_users - -Selecting user test rows -~~~~~~~~~~~~~~~~~~~~~~~~ - -These functions each take a `method` to decide how select each user's test rows. The method -is a function that takes a data frame (containing just the user's rows) and returns the -test rows. This function is expected to preserve the index of the input data frame (which -happens by default with common means of implementing samples). - -We provide several partition method factories: - -.. autofunction:: SampleN -.. autofunction:: SampleFrac -.. autofunction:: LastN -.. autofunction:: LastFrac - -Utility Classes ---------------- - -.. autoclass:: PartitionMethod - :members: - :special-members: - -.. autoclass:: TTPair - :members: diff --git a/docs/index.rst b/docs/index.rst index cc85cb3ce..9e8dda62a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,7 +39,7 @@ Resources :caption: Running Experiments data - crossfold + splitting batch evaluation/index documenting diff --git a/docs/splitting.rst b/docs/splitting.rst new file mode 100644 index 000000000..23276e6d9 --- /dev/null +++ b/docs/splitting.rst @@ -0,0 +1,93 @@ +Splitting Data +============== + +.. module:: lenskit.splitting + +The LKPY `splitting` module splits data sets for offline evaluation using +cross-validation and other strategies. The various splitters are implemented as +functions that operate on a :class:`~lenskit.data.Dataset` and return one or +more train-test splits (as :class:`TTSplit` objects). + +.. versionchanged:: 2024.1 + Data splitting was moved from ``lenskit.crossfold`` to the ``lenskit.splitting`` + module and functions were renamed and had their interfaces revised. + +Experiment code should generally use these functions to prepare train-test files +for training and evaluating algorithms. 
For example, the following will perform +a user-based 5-fold cross-validation as was the default in the old LensKit: + +.. code:: python + + import pandas as pd + from lenskit.data import load_movielens + from lenskit.splitting import crossfold_users, SampleN, dict_to_df + dataset = load_movielens('data/ml-20m.zip') + for i, tp in enumerate(crossfold_users(ratings, 5, SampleN(5))): + train_df = tp.train.interaction_log('pandas', field='all', original_ids=True) + train_df.to_parquet(f'ml-20m.exp/train-{i}.parquet') + dict_to_df(tp.test).to_parquet(f'ml-20m.exp/test-{i}.parquet') + +Record-based Random Splitting +----------------------------- + +The simplest preparation methods sample or partition the records in the input +data. A 5-fold :func:`crossfold_records` split will result in 5 splits, each of +which extracts 20% of the user-item interaction records for testing and leaves +80% for training. + +.. note:: + + When a dataset has repeated interactions, these functions operate only on + the *matrix* view of the data (user-item observations are deduplicated). + Specifically, they operate on the results of calling + :meth:`~lenskit.data.Dataset.interaction_matrix` with ``format="pandas"`` + and ``field="all"``. + +.. autofunction:: crossfold_records + +.. autofunction:: sample_records + +User-based Splitting +-------------------- + +It's often desirable to use users, instead of raw rows, as the basis for +splitting data. This allows you to control the experimental conditions on a +user-by-user basis, e.g. by making sure each user is tested with the same number +of ratings. These methods require that the input data frame have a `user` +column with the user names or identifiers. + +The algorithm used by each is as follows: + +1. Sample or partition the set of user IDs into *n* sets of test users. +2. For each set of test users, select a set of that user's rows to be test rows. +3. Create a training set for each test set consisting of the non-selected rows + from each of that set's test users, along with all rows from each non-test + user. + +.. autofunction:: crossfold_users + +.. autofunction:: sample_users + +Selecting user holdout rows +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions each take a `method` to decide how select each user's test rows. The method +is a function that takes an item list (containing just the user's rows) and returns the +test rows. + +We provide several holdout method factories: + +.. autofunction:: SampleN +.. autofunction:: SampleFrac +.. autofunction:: LastN +.. autofunction:: LastFrac + +Utility Classes +--------------- + +.. autoclass:: lenskit.splitting.holdout.HoldoutMethod + :members: + :special-members: __call__ + +.. autoclass:: TTSplit + :members: diff --git a/lenskit/lenskit/splitting/__init__.py b/lenskit/lenskit/splitting/__init__.py index 080e9a8c2..90346a089 100644 --- a/lenskit/lenskit/splitting/__init__.py +++ b/lenskit/lenskit/splitting/__init__.py @@ -8,4 +8,7 @@ Splitting data for train-test evaluation. 
""" -from .split import TTSplit # noqa: F401 +from .holdout import LastFrac, LastN, SampleFrac, SampleN # noqa: F401 +from .records import crossfold_records, sample_records # noqa: F401 +from .split import TTSplit, dict_from_df, dict_to_df # noqa: F401 +from .users import crossfold_users, sample_users # noqa: F401 From 9b50ca2032cf2586cd1edb270656dd6b45a8afa7 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:41:18 -0400 Subject: [PATCH 20/22] add df accessors to TTSplit --- docs/splitting.rst | 5 ++--- lenskit/lenskit/splitting/split.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/splitting.rst b/docs/splitting.rst index 23276e6d9..d0ea78171 100644 --- a/docs/splitting.rst +++ b/docs/splitting.rst @@ -23,9 +23,8 @@ a user-based 5-fold cross-validation as was the default in the old LensKit: from lenskit.splitting import crossfold_users, SampleN, dict_to_df dataset = load_movielens('data/ml-20m.zip') for i, tp in enumerate(crossfold_users(ratings, 5, SampleN(5))): - train_df = tp.train.interaction_log('pandas', field='all', original_ids=True) - train_df.to_parquet(f'ml-20m.exp/train-{i}.parquet') - dict_to_df(tp.test).to_parquet(f'ml-20m.exp/test-{i}.parquet') + tp.train_df.to_parquet(f'ml-20m.exp/train-{i}.parquet') + tp.test_df.to_parquet(f'ml-20m.exp/test-{i}.parquet') Record-based Random Splitting ----------------------------- diff --git a/lenskit/lenskit/splitting/split.py b/lenskit/lenskit/splitting/split.py index 393424c78..5d0ea09ae 100644 --- a/lenskit/lenskit/splitting/split.py +++ b/lenskit/lenskit/splitting/split.py @@ -37,6 +37,20 @@ def test_size(self) -> int: """ return sum(len(il) for il in self.test.values()) + @property + def test_df(self) -> pd.DataFrame: + """ + Get the test data as a data frame. + """ + return dict_to_df(self.test) + + @property + def train_df(self) -> pd.DataFrame: + """ + Get the training data as a data frame. + """ + return self.train.interaction_matrix("pandas", field="all") + def dict_to_df(data: dict[EntityId, ItemList]) -> pd.DataFrame: """ From e79fdaedbb5376ee48f832317e5ce015e5fed8d3 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:48:44 -0400 Subject: [PATCH 21/22] fix doctests --- lenskit/lenskit/splitting/records.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py index 82bde8d45..50f18d4c2 100644 --- a/lenskit/lenskit/splitting/records.py +++ b/lenskit/lenskit/splitting/records.py @@ -89,8 +89,8 @@ def sample_records( We can loop over a sequence of train-test pairs:: >>> from lenskit.data.movielens import load_movielens_df - >>> ratings = load_movielens_df('data/ml-latest-small') - >>> for train, test in sample_records(ratings, 1000, repeats=5): + >>> movielens = load_movielens('data/ml-latest-small') + >>> for train, test in sample_records(movielens, 1000, repeats=5): ... 
print(sum(len(il) for il in test.values())) 1000 1000 @@ -100,7 +100,7 @@ def sample_records( Sometimes for testing, it is useful to just get a single pair:: - >>> train, test = sample_records(ratings, 1000) + >>> train, test = sample_records(movielens, 1000) >>> sum(len(il) for il in test.values()) 1000 From d84a1f08e6deaf53535817244f13183056500881 Mon Sep 17 00:00:00 2001 From: Michael Ekstrand Date: Wed, 31 Jul 2024 19:55:45 -0400 Subject: [PATCH 22/22] fix imports for records doctest --- lenskit/lenskit/splitting/records.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lenskit/lenskit/splitting/records.py b/lenskit/lenskit/splitting/records.py index 50f18d4c2..a3dafc09b 100644 --- a/lenskit/lenskit/splitting/records.py +++ b/lenskit/lenskit/splitting/records.py @@ -88,7 +88,7 @@ def sample_records( We can loop over a sequence of train-test pairs:: - >>> from lenskit.data.movielens import load_movielens_df + >>> from lenskit.data import load_movielens >>> movielens = load_movielens('data/ml-latest-small') >>> for train, test in sample_records(movielens, 1000, repeats=5): ... print(sum(len(il) for il in test.values()))
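Putting the new pieces together, an end-to-end split-and-save loop might look roughly
like the sketch below; the MovieLens path and output file names are placeholders, and
``train_df`` / ``test_df`` are the data-frame accessors added to ``TTSplit`` above::

    from lenskit.data import load_movielens
    from lenskit.splitting import SampleN, crossfold_users

    ml = load_movielens("data/ml-latest-small")   # path is illustrative
    for i, split in enumerate(crossfold_users(ml, 5, SampleN(5))):
        print(f"fold {i}: {len(split.test)} test users, {split.test_size} test pairs")
        split.train_df.to_parquet(f"train-{i}.parquet")
        split.test_df.to_parquet(f"test-{i}.parquet")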