Merge pull request #441 from mdekstrand/feature/use-dataset

Use dataset (#352) in algorithms and tests
lenskit · Jul 24, 2024 · ba40360 · ba40360
2 parents 47c0aff + 8d523c9
commit ba40360
Show file tree

Hide file tree

Showing 48 changed files with 1,264 additions and 735 deletions.
diff --git a/docs/GettingStarted.ipynb b/docs/GettingStarted.ipynb
@@ -27,6 +27,7 @@
    "outputs": [],
    "source": [
     "from lenskit.datasets import ML100K\n",
+    "from lenskit.data import from_interactions_df\n",
     "from lenskit import batch, topn, util\n",
     "from lenskit import crossfold as xf\n",
     "from lenskit.algorithms import Recommender, als, knn\n",
@@ -216,7 +217,7 @@
     "def eval(aname, algo, train, test):\n",
     "    fittable = util.clone(algo)\n",
     "    fittable = Recommender.adapt(fittable)\n",
-    "    fittable.fit(train)\n",
+    "    fittable.fit(from_interactions_df(train))\n",
     "    users = test.user.unique()\n",
     "    # now we run the recommender\n",
     "    recs = batch.recommend(fittable, users, 100)\n",

diff --git a/docs/data.rst b/docs/data.rst
@@ -69,6 +69,13 @@ instance to be passed to :meth:`~lenskit.algorithms.Recommender.fit`.
 
 .. autoclass:: Dataset
 
+Creating Datasets
+~~~~~~~~~~~~~~~~~
+
+Several functions create :class:`Dataset` objects from different input data sources.
+
+.. autofunction:: from_interactions_df
+
 Vocabularies
 ~~~~~~~~~~~~
 

diff --git a/docs/releases/2024.rst b/docs/releases/2024.rst
@@ -64,6 +64,7 @@ however, documented here.
   is now :math:`10^{-6}`.
 * k-NN algorithms no longer support negative similarities; ``min_sim`` is clamped
   to be at least the smallest normal in 32-bit floating point (:math:`1.75 \times 10^{-38}`).
+* The :mod:`implicit` bridge algorithms no longer look at rating values when they are present.
 
 Bug Fixes
 ~~~~~~~~~

diff --git a/lenskit-funksvd/lenskit/funksvd.py b/lenskit-funksvd/lenskit/funksvd.py
@@ -13,13 +13,13 @@
 
 import numba as n
 import numpy as np
-import pandas as pd
 from numba.experimental import jitclass
 from seedbank import numpy_rng
 
 from lenskit import util
 from lenskit.algorithms.bias import Bias
 from lenskit.algorithms.mf_common import MFPredictor
+from lenskit.data.dataset import Dataset
 
 _logger = logging.getLogger(__name__)
 
@@ -33,7 +33,7 @@
         ("item_count", n.int32),
         ("initial_value", n.double),
     ]
-)
+)  # type: ignore
 class Model:
     "Internal model class for training SGD MF."
 
@@ -66,7 +66,7 @@ def _fresh_model(nfeatures, nusers, nitems, init=0.1):
         ("rmin", n.double),
         ("rmax", n.double),
     ]
-)
+)  # type: ignore
 class _Params:
     def __init__(self, niters, lrate, reg, rmin, rmax):
         self.iter_count = niters
@@ -86,7 +86,7 @@ def make_params(niters, lrate, reg, range):
     return _Params(niters, lrate, reg, rmin, rmax)
 
 
-@jitclass([("est", n.double[:]), ("feature", n.int32), ("trail", n.double)])
+@jitclass([("est", n.double[:]), ("feature", n.int32), ("trail", n.double)])  # type: ignore
 class _FeatContext:
     def __init__(self, est, feature, trail):
         self.est = est
@@ -102,7 +102,7 @@ def __init__(self, est, feature, trail):
         ("bias", n.double[:]),
         ("n_samples", n.uint64),
     ]
-)
+)  # type: ignore
 class Context:
     def __init__(self, users, items, ratings, bias):
         self.users = users
@@ -206,30 +206,39 @@ class FunkSVD(MFPredictor[np.ndarray]):
     you can extract from a trained model.
 
     Args:
-        features(int): the number of features to train
-        iterations(int): the number of iterations to train each feature
-        lrate(double): the learning rate
-        reg(double): the regularization factor
-        damping(double): damping factor for the underlying mean
-        bias(Predictor): the underlying bias model to fit.  If ``True``, then a
+        features: the number of features to train
+        iterations: the number of iterations to train each feature
+        lrate: the learning rate
+        reg: the regularization factor
+        damping: damping factor for the underlying mean
+        bias: the underlying bias model to fit.  If ``True``, then a
             :py:class:`.bias.Bias` model is fit with ``damping``.
-        range(tuple):
+        range:
             the ``(min, max)`` rating values to clamp ratings, or ``None`` to leave
             predictions unclamped.
         random_state:
             The random state for shuffling the data prior to training.
     """
 
+    features: int
+    iterations: int
+    lrate: float
+    reg: float
+    damping: float | tuple[float, float]
+    range: tuple[float, float] | None
+    bias: Bias | None
+    random: np.random.Generator
+
     def __init__(
         self,
-        features,
-        iterations=100,
+        features: int,
+        iterations: int = 100,
         *,
-        lrate=0.001,
-        reg=0.015,
-        damping=5,
-        range=None,
-        bias=True,
+        lrate: float = 0.001,
+        reg: float = 0.015,
+        damping: float | tuple[float, float] = 5,
+        range: tuple[float, float] | None = None,
+        bias: bool | Bias | None = True,
         random_state=None,
     ):
         self.features = features
@@ -246,60 +255,55 @@ def __init__(
             self.bias = bias
         self.random = numpy_rng(random_state)
 
-    def fit(self, ratings, **kwargs):
+    def fit(self, data: Dataset, **kwargs):
         """
         Train a FunkSVD model.
 
         Args:
-            ratings: the ratings data frame.
+            data: the dataset whose rating interactions are used for training.
         """
         timer = util.Stopwatch()
-        if "rating" not in ratings:
-            _logger.warning("no rating column found, assuming rating values of 1.0")
-            ratings = ratings.assign(rating=1.0)
+        rate_df = data.interaction_matrix(format="pandas", layout="coo", field="rating")
 
         if self.bias:
             _logger.info("[%s] fitting bias model", timer)
-            self.bias.fit(ratings)
+            self.bias.fit(data)
 
-        _logger.info("[%s] preparing rating data for %d samples", timer, len(ratings))
+        _logger.info("[%s] preparing rating data for %d samples", timer, len(rate_df))
         _logger.debug("shuffling rating data")
-        shuf = np.arange(len(ratings), dtype=np.int_)
+        shuf = np.arange(len(rate_df), dtype=np.int_)
         self.random.shuffle(shuf)
-        ratings = ratings.iloc[shuf, :]
-
-        _logger.debug("[%s] indexing users and items", timer)
-        uidx = pd.Index(ratings.user.unique())
-        iidx = pd.Index(ratings.item.unique())
+        rate_df = rate_df.iloc[shuf, :]
 
-        users = uidx.get_indexer(ratings.user).astype(np.int32)
-        assert np.all(users >= 0)
-        items = iidx.get_indexer(ratings.item).astype(np.int32)
-        assert np.all(items >= 0)
+        users = np.array(rate_df["user_num"])
+        items = np.array(rate_df["item_num"])
+        ratings = np.array(rate_df["rating"], dtype=np.float_)
 
         _logger.debug("[%s] computing initial estimates", timer)
         if self.bias:
-            initial = pd.Series(self.bias.mean_, index=ratings.index, dtype=np.float_)
-            ibias, initial = _align_add_bias(self.bias.item_offsets_, iidx, ratings.item, initial)
-            ubias, initial = _align_add_bias(self.bias.user_offsets_, uidx, ratings.user, initial)
+            initial = np.full(len(users), self.bias.mean_, dtype=np.float_)
+            if self.bias.item_offsets_ is not None:
+                initial += self.bias.item_offsets_.values[items]
+            if self.bias.user_offsets_ is not None:
+                initial += self.bias.user_offsets_.values[users]
         else:
-            initial = pd.Series(0.0, index=ratings.index)
+            initial = np.zeros(len(users))
 
-        _logger.debug("have %d estimates for %d ratings", len(initial), len(ratings))
-        assert len(initial) == len(ratings)
+        _logger.debug("have %d estimates for %d ratings", len(initial), len(rate_df))
+        assert len(initial) == len(rate_df)
 
         _logger.debug("[%s] initializing data structures", timer)
-        context = Context(users, items, ratings.rating.astype(np.float_).values, initial.values)
+        context = Context(users, items, ratings, initial)
         params = make_params(self.iterations, self.lrate, self.reg, self.range)
 
-        model = _fresh_model(self.features, len(uidx), len(iidx))
+        model = _fresh_model(self.features, data.users.size, data.items.size)
 
         _logger.info("[%s] training biased MF model with %d features", timer, self.features)
         train(context, params, model, timer)
         _logger.info("finished model training in %s", timer)
 
-        self.user_index_ = uidx
-        self.item_index_ = iidx
+        self.users_ = data.users.copy()
+        self.items_ = data.items.copy()
         self.user_features_ = model.user_features
         self.item_features_ = model.item_features
 

diff --git a/lenskit-funksvd/tests/test_funksvd.py b/lenskit-funksvd/tests/test_funksvd.py
@@ -13,6 +13,7 @@
 
 from pytest import approx, mark
 
+from lenskit.data.dataset import from_interactions_df
 import lenskit.funksvd as svd
 import lenskit.util.test as lktu
 
@@ -21,30 +22,34 @@
 simple_df = pd.DataFrame(
     {"item": [1, 1, 2, 3], "user": [10, 12, 10, 13], "rating": [4.0, 3.0, 5.0, 2.0]}
 )
+simple_ds = from_interactions_df(simple_df)
 
 
 def test_fsvd_basic_build():
     algo = svd.FunkSVD(20, iterations=20)
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
 
 
 def test_fsvd_clamp_build():
     algo = svd.FunkSVD(20, iterations=20, range=(1, 5))
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
 
 
 def test_fsvd_predict_basic():
     algo = svd.FunkSVD(20, iterations=20)
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
@@ -58,8 +63,9 @@ def test_fsvd_predict_basic():
 
 def test_fsvd_predict_clamp():
     algo = svd.FunkSVD(20, iterations=20, range=(1, 5))
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
@@ -74,7 +80,7 @@ def test_fsvd_predict_clamp():
 
 def test_fsvd_no_bias():
     algo = svd.FunkSVD(20, iterations=20, bias=None)
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
     assert algo.bias is None
     assert algo.item_features_.shape == (3, 20)
@@ -88,8 +94,9 @@ def test_fsvd_no_bias():
 
 def test_fsvd_predict_bad_item():
     algo = svd.FunkSVD(20, iterations=20)
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
@@ -102,8 +109,9 @@ def test_fsvd_predict_bad_item():
 
 def test_fsvd_predict_bad_item_clamp():
     algo = svd.FunkSVD(20, iterations=20, range=(1, 5))
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
@@ -116,8 +124,9 @@ def test_fsvd_predict_bad_item_clamp():
 
 def test_fsvd_predict_bad_user():
     algo = svd.FunkSVD(20, iterations=20)
-    algo.fit(simple_df)
+    algo.fit(simple_ds)
 
+    assert algo.bias is not None
     assert algo.bias.mean_ == approx(simple_df.rating.mean())
     assert algo.item_features_.shape == (3, 20)
     assert algo.user_features_.shape == (3, 20)
@@ -134,8 +143,9 @@ def test_fsvd_save_load():
     ratings = lktu.ml_test.ratings
 
     original = svd.FunkSVD(20, iterations=20)
-    original.fit(ratings)
+    original.fit(from_interactions_df(ratings))
 
+    assert original.bias is not None
     assert original.bias.mean_ == approx(ratings.rating.mean())
     assert original.item_features_.shape == (ratings.item.nunique(), 20)
     assert original.user_features_.shape == (ratings.user.nunique(), 20)
@@ -149,8 +159,8 @@ def test_fsvd_save_load():
     assert np.all(algo.bias.item_offsets_ == original.bias.item_offsets_)
     assert np.all(algo.user_features_ == original.user_features_)
     assert np.all(algo.item_features_ == original.item_features_)
-    assert np.all(algo.item_index_ == original.item_index_)
-    assert np.all(algo.user_index_ == original.user_index_)
+    assert np.all(algo.items_.index == original.items_.index)
+    assert np.all(algo.users_.index == original.users_.index)
 
 
 @lktu.wantjit
@@ -159,7 +169,7 @@ def test_fsvd_train_binary():
     ratings = lktu.ml_test.ratings.drop(columns=["rating", "timestamp"])
 
     original = svd.FunkSVD(20, iterations=20, bias=False)
-    original.fit(ratings)
+    original.fit(from_interactions_df(ratings))
 
     assert original.bias is None
     assert original.item_features_.shape == (ratings.item.nunique(), 20)
@@ -171,7 +181,7 @@ def test_fsvd_train_binary():
 def test_fsvd_known_preds():
     algo = svd.FunkSVD(15, iterations=125, lrate=0.001)
     _log.info("training %s on ml data", algo)
-    algo.fit(lktu.ml_test.ratings)
+    algo.fit(from_interactions_df(lktu.ml_test.ratings))
 
     dir = Path(__file__).parent
     pred_file = dir / "funksvd-preds.csv"
@@ -211,7 +221,7 @@ def test_fsvd_batch_accuracy():
 
     def eval(train, test):
         _log.info("running training")
-        algo.fit(train)
+        algo.fit(from_interactions_df(train))
         _log.info("testing %d users", test.user.nunique())
         return batch.predict(algo, test)