Skip to content

Commit

Permalink
Merge pull request #441 from mdekstrand/feature/use-dataset
Browse files Browse the repository at this point in the history
Use dataset (#352) in algorithms and tests
  • Loading branch information
mdekstrand authored Jul 24, 2024
2 parents 47c0aff + 8d523c9 commit ba40360
Show file tree
Hide file tree
Showing 48 changed files with 1,264 additions and 735 deletions.
3 changes: 2 additions & 1 deletion docs/GettingStarted.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"outputs": [],
"source": [
"from lenskit.datasets import ML100K\n",
"from lenskit.data import from_interactions_df\n",
"from lenskit import batch, topn, util\n",
"from lenskit import crossfold as xf\n",
"from lenskit.algorithms import Recommender, als, knn\n",
Expand Down Expand Up @@ -216,7 +217,7 @@
"def eval(aname, algo, train, test):\n",
" fittable = util.clone(algo)\n",
" fittable = Recommender.adapt(fittable)\n",
" fittable.fit(train)\n",
" fittable.fit(from_interactions_df(train))\n",
" users = test.user.unique()\n",
" # now we run the recommender\n",
" recs = batch.recommend(fittable, users, 100)\n",
Expand Down
7 changes: 7 additions & 0 deletions docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ instance to be passed to :meth:`~lenskit.algorithms.Recommender.fit`.

.. autoclass:: Dataset

Creating Datasets
~~~~~~~~~~~~~~~~~

Several functions create :class:`Dataset` objects from different input data sources.
.. autofunction:: from_interactions_df
Vocabularies
~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions docs/releases/2024.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ however, documented here.
is now :math:`10^{-6}`.
* k-NN algorithms no longer support negative similarities; ``min_sim`` is clamped
to be at least the smallest normal in 32-bit floating point (:math:`1.175 \times 10^{-38}`).
* The :mod:`implicit` bridge algorithms no longer look at rating values when they are present.

Bug Fixes
~~~~~~~~~
Expand Down
94 changes: 49 additions & 45 deletions lenskit-funksvd/lenskit/funksvd.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@

import numba as n
import numpy as np
import pandas as pd
from numba.experimental import jitclass
from seedbank import numpy_rng

from lenskit import util
from lenskit.algorithms.bias import Bias
from lenskit.algorithms.mf_common import MFPredictor
from lenskit.data.dataset import Dataset

_logger = logging.getLogger(__name__)

Expand All @@ -33,7 +33,7 @@
("item_count", n.int32),
("initial_value", n.double),
]
)
) # type: ignore
class Model:
"Internal model class for training SGD MF."

Expand Down Expand Up @@ -66,7 +66,7 @@ def _fresh_model(nfeatures, nusers, nitems, init=0.1):
("rmin", n.double),
("rmax", n.double),
]
)
) # type: ignore
class _Params:
def __init__(self, niters, lrate, reg, rmin, rmax):
self.iter_count = niters
Expand All @@ -86,7 +86,7 @@ def make_params(niters, lrate, reg, range):
return _Params(niters, lrate, reg, rmin, rmax)


@jitclass([("est", n.double[:]), ("feature", n.int32), ("trail", n.double)])
@jitclass([("est", n.double[:]), ("feature", n.int32), ("trail", n.double)]) # type: ignore
class _FeatContext:
def __init__(self, est, feature, trail):
self.est = est
Expand All @@ -102,7 +102,7 @@ def __init__(self, est, feature, trail):
("bias", n.double[:]),
("n_samples", n.uint64),
]
)
) # type: ignore
class Context:
def __init__(self, users, items, ratings, bias):
self.users = users
Expand Down Expand Up @@ -206,30 +206,39 @@ class FunkSVD(MFPredictor[np.ndarray]):
you can extract from a trained model.
Args:
features(int): the number of features to train
iterations(int): the number of iterations to train each feature
lrate(double): the learning rate
reg(double): the regularization factor
damping(double): damping factor for the underlying mean
bias(Predictor): the underlying bias model to fit. If ``True``, then a
features: the number of features to train
iterations: the number of iterations to train each feature
lrate: the learning rate
reg: the regularization factor
damping: damping factor for the underlying mean
bias: the underlying bias model to fit. If ``True``, then a
:py:class:`.bias.Bias` model is fit with ``damping``.
range(tuple):
range:
the ``(min, max)`` rating values to clamp ratings, or ``None`` to leave
predictions unclamped.
random_state:
The random state for shuffling the data prior to training.
"""

features: int
iterations: int
lrate: float
reg: float
damping: float | tuple[float, float]
range: tuple[float, float] | None
bias: Bias | None
random: np.random.Generator

def __init__(
self,
features,
iterations=100,
features: int,
iterations: int = 100,
*,
lrate=0.001,
reg=0.015,
damping=5,
range=None,
bias=True,
lrate: float = 0.001,
reg: float = 0.015,
damping: float | tuple[float, float] = 5,
range: tuple[float, float] | None = None,
bias: bool | Bias | None = True,
random_state=None,
):
self.features = features
Expand All @@ -246,60 +255,55 @@ def __init__(
self.bias = bias
self.random = numpy_rng(random_state)

def fit(self, ratings, **kwargs):
def fit(self, data: Dataset, **kwargs):
"""
Train a FunkSVD model.
Args:
data: the dataset of rating interactions to train on.
"""
timer = util.Stopwatch()
if "rating" not in ratings:
_logger.warning("no rating column found, assuming rating values of 1.0")
ratings = ratings.assign(rating=1.0)
rate_df = data.interaction_matrix(format="pandas", layout="coo", field="rating")

if self.bias:
_logger.info("[%s] fitting bias model", timer)
self.bias.fit(ratings)
self.bias.fit(data)

_logger.info("[%s] preparing rating data for %d samples", timer, len(ratings))
_logger.info("[%s] preparing rating data for %d samples", timer, len(rate_df))
_logger.debug("shuffling rating data")
shuf = np.arange(len(ratings), dtype=np.int_)
shuf = np.arange(len(rate_df), dtype=np.int_)
self.random.shuffle(shuf)
ratings = ratings.iloc[shuf, :]

_logger.debug("[%s] indexing users and items", timer)
uidx = pd.Index(ratings.user.unique())
iidx = pd.Index(ratings.item.unique())
rate_df = rate_df.iloc[shuf, :]

users = uidx.get_indexer(ratings.user).astype(np.int32)
assert np.all(users >= 0)
items = iidx.get_indexer(ratings.item).astype(np.int32)
assert np.all(items >= 0)
users = np.array(rate_df["user_num"])
items = np.array(rate_df["item_num"])
ratings = np.array(rate_df["rating"], dtype=np.float_)

_logger.debug("[%s] computing initial estimates", timer)
if self.bias:
initial = pd.Series(self.bias.mean_, index=ratings.index, dtype=np.float_)
ibias, initial = _align_add_bias(self.bias.item_offsets_, iidx, ratings.item, initial)
ubias, initial = _align_add_bias(self.bias.user_offsets_, uidx, ratings.user, initial)
initial = np.full(len(users), self.bias.mean_, dtype=np.float_)
if self.bias.item_offsets_ is not None:
initial += self.bias.item_offsets_.values[items]
if self.bias.user_offsets_ is not None:
initial += self.bias.user_offsets_.values[users]
else:
initial = pd.Series(0.0, index=ratings.index)
initial = np.zeros(len(users))

_logger.debug("have %d estimates for %d ratings", len(initial), len(ratings))
assert len(initial) == len(ratings)
_logger.debug("have %d estimates for %d ratings", len(initial), len(rate_df))
assert len(initial) == len(rate_df)

_logger.debug("[%s] initializing data structures", timer)
context = Context(users, items, ratings.rating.astype(np.float_).values, initial.values)
context = Context(users, items, ratings, initial)
params = make_params(self.iterations, self.lrate, self.reg, self.range)

model = _fresh_model(self.features, len(uidx), len(iidx))
model = _fresh_model(self.features, data.users.size, data.items.size)

_logger.info("[%s] training biased MF model with %d features", timer, self.features)
train(context, params, model, timer)
_logger.info("finished model training in %s", timer)

self.user_index_ = uidx
self.item_index_ = iidx
self.users_ = data.users.copy()
self.items_ = data.items.copy()
self.user_features_ = model.user_features
self.item_features_ = model.item_features

Expand Down
38 changes: 24 additions & 14 deletions lenskit-funksvd/tests/test_funksvd.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from pytest import approx, mark

from lenskit.data.dataset import from_interactions_df
import lenskit.funksvd as svd
import lenskit.util.test as lktu

Expand All @@ -21,30 +22,34 @@
simple_df = pd.DataFrame(
{"item": [1, 1, 2, 3], "user": [10, 12, 10, 13], "rating": [4.0, 3.0, 5.0, 2.0]}
)
simple_ds = from_interactions_df(simple_df)


def test_fsvd_basic_build():
algo = svd.FunkSVD(20, iterations=20)
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)


def test_fsvd_clamp_build():
algo = svd.FunkSVD(20, iterations=20, range=(1, 5))
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)


def test_fsvd_predict_basic():
algo = svd.FunkSVD(20, iterations=20)
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)
Expand All @@ -58,8 +63,9 @@ def test_fsvd_predict_basic():

def test_fsvd_predict_clamp():
algo = svd.FunkSVD(20, iterations=20, range=(1, 5))
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)
Expand All @@ -74,7 +80,7 @@ def test_fsvd_predict_clamp():

def test_fsvd_no_bias():
algo = svd.FunkSVD(20, iterations=20, bias=None)
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is None
assert algo.item_features_.shape == (3, 20)
Expand All @@ -88,8 +94,9 @@ def test_fsvd_no_bias():

def test_fsvd_predict_bad_item():
algo = svd.FunkSVD(20, iterations=20)
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)
Expand All @@ -102,8 +109,9 @@ def test_fsvd_predict_bad_item():

def test_fsvd_predict_bad_item_clamp():
algo = svd.FunkSVD(20, iterations=20, range=(1, 5))
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)
Expand All @@ -116,8 +124,9 @@ def test_fsvd_predict_bad_item_clamp():

def test_fsvd_predict_bad_user():
algo = svd.FunkSVD(20, iterations=20)
algo.fit(simple_df)
algo.fit(simple_ds)

assert algo.bias is not None
assert algo.bias.mean_ == approx(simple_df.rating.mean())
assert algo.item_features_.shape == (3, 20)
assert algo.user_features_.shape == (3, 20)
Expand All @@ -134,8 +143,9 @@ def test_fsvd_save_load():
ratings = lktu.ml_test.ratings

original = svd.FunkSVD(20, iterations=20)
original.fit(ratings)
original.fit(from_interactions_df(ratings))

assert original.bias is not None
assert original.bias.mean_ == approx(ratings.rating.mean())
assert original.item_features_.shape == (ratings.item.nunique(), 20)
assert original.user_features_.shape == (ratings.user.nunique(), 20)
Expand All @@ -149,8 +159,8 @@ def test_fsvd_save_load():
assert np.all(algo.bias.item_offsets_ == original.bias.item_offsets_)
assert np.all(algo.user_features_ == original.user_features_)
assert np.all(algo.item_features_ == original.item_features_)
assert np.all(algo.item_index_ == original.item_index_)
assert np.all(algo.user_index_ == original.user_index_)
assert np.all(algo.items_.index == original.items_.index)
assert np.all(algo.users_.index == original.users_.index)


@lktu.wantjit
Expand All @@ -159,7 +169,7 @@ def test_fsvd_train_binary():
ratings = lktu.ml_test.ratings.drop(columns=["rating", "timestamp"])

original = svd.FunkSVD(20, iterations=20, bias=False)
original.fit(ratings)
original.fit(from_interactions_df(ratings))

assert original.bias is None
assert original.item_features_.shape == (ratings.item.nunique(), 20)
Expand All @@ -171,7 +181,7 @@ def test_fsvd_train_binary():
def test_fsvd_known_preds():
algo = svd.FunkSVD(15, iterations=125, lrate=0.001)
_log.info("training %s on ml data", algo)
algo.fit(lktu.ml_test.ratings)
algo.fit(from_interactions_df(lktu.ml_test.ratings))

dir = Path(__file__).parent
pred_file = dir / "funksvd-preds.csv"
Expand Down Expand Up @@ -211,7 +221,7 @@ def test_fsvd_batch_accuracy():

def eval(train, test):
_log.info("running training")
algo.fit(train)
algo.fit(from_interactions_df(train))
_log.info("testing %d users", test.user.nunique())
return batch.predict(algo, test)

Expand Down
Loading

0 comments on commit ba40360

Please sign in to comment.