Update implicit-feedback run

lenskit · May 6, 2024 · da9c4d5 · da9c4d5
1 parent b9df58e
commit da9c4d5
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 21 deletions.
diff --git a/lenskit/algorithms/item_knn.py b/lenskit/algorithms/item_knn.py
@@ -169,9 +169,10 @@ def _predict_weighted_average(
         # save the fast-path items
         if torch.any(fast):
             ris_fast = row_is[fast]
+            vs_fast = row_vs[fast]
             avs_fast = row_avs[fast]
-            vals_fast = row_vs[fast] * rate_v[i]
-            nbr_sims[ris_fast, counts[ris_fast]] = avs_fast
+            vals_fast = vs_fast * rate_v[i]
+            nbr_sims[ris_fast, counts[ris_fast]] = vs_fast
             nbr_vals[ris_fast, counts[ris_fast]] = vals_fast
             counts[ris_fast] += 1
             t_sims[ris_fast] += avs_fast
@@ -198,7 +199,7 @@ def _predict_weighted_average(
         ris_exc = ris_slow[exc]
         ravs_exc = row_avs[slow][exc]
         rvs_exc = row_vs[slow][exc]
-        t_sims[ris_exc] += ravs_exc - min_sims[exc]
+        t_sims[ris_exc] += ravs_exc - min_sims[exc].abs()
         scores[ris_exc] += rvs_exc * rate_v[i] - min_vals[exc]
         # and save
         nbr_sims[ris_exc, mins[exc]] = ravs_exc
@@ -212,6 +213,7 @@ def _predict_weighted_average(
     return scores
 
 
+@torch.jit.script
 def _predict_sum(
     model: torch.Tensor,
     nrange: tuple[int, int],
@@ -222,36 +224,60 @@ def _predict_sum(
     nitems, _ni = model.shape
     assert nitems == _ni
     min_nbrs, max_nbrs = nrange
-    t_sims = np.full(nitems, np.nan, dtype=np.float_)
-
-    _logger.debug("rated: %s", rated)
-    _logger.debug("ratev: %s", rate_v)
+    _msg(logging.DEBUG, f"sum-scoring with {len(rated)} items")
 
     # we proceed rating-by-rating, and accumulate results
     t_sims = torch.zeros(nitems)
     counts = torch.zeros(nitems, dtype=torch.int32)
+    nbr_sims = torch.zeros((nitems, max_nbrs))
 
-    # fast path: compute everything that we can
     for i, iidx in enumerate(rated):
+        iidx = int(iidx)
         row = model[iidx]
-        row_is = row.indices()
+        row_is = row.indices()[0]
         row_vs = row.values()
-        _logger.debug("item %d: %d neighbors", iidx, len(row_is))
+        assert row_is.shape == row_vs.shape
 
-        counts[row_is] += 1
-        t_sims[row_is] += torch.abs(row_vs)
+        fast = counts[row_is] < max_nbrs
 
-    # slow-path items that have too many sims
-    if torch.any(counts > max_nbrs):
-        n = torch.sum(counts > max_nbrs)
-        _msg(logging.WARNING, f"{n} items have too many neighbors")
+        # save the fast-path items
+        if torch.any(fast):
+            ris_fast = row_is[fast]
+            vs_fast = row_vs[fast]
+            nbr_sims[ris_fast, counts[ris_fast]] = vs_fast
+            counts[ris_fast] += 1
+            t_sims[ris_fast] += vs_fast
 
-    # compute averages for items that don't match the threshold
-    _logger.debug("sims: %s", t_sims)
-    _logger.debug("counts: %s", counts)
-    mask = counts >= min_nbrs
+        # skip early if we're done
+        if torch.all(fast):
+            continue
+
+        # now we have the slow-path items
+        slow = torch.logical_not(fast)
+        ris_slow = row_is[slow]
+        rvs_slow = row_vs[slow]
+        # this is brute-force linear search for simplicity right now
+        # for each, find the neighbor that's the smallest:
+        mins = torch.argmin(nbr_sims[ris_slow], dim=1)
+        # find the items where this neighbor exceeds the smallest so far:
+        min_sims = nbr_sims[ris_slow, mins]
+        exc = min_sims < rvs_slow
+        if not torch.any(exc):
+            continue
+
+        # now we need to update values: add in new and remove old
+        # anywhere our new neighbor is greater than the smallest, replace the smallest
+        ris_exc = ris_slow[exc]
+        rvs_exc = rvs_slow[exc]
+        t_sims[ris_exc] -= min_sims[exc]
+        t_sims[ris_exc] += rvs_exc
+        # and save
+        nbr_sims[ris_exc, mins[exc]] = rvs_exc
+
+    # set scores to NaN for items with fewer than min_nbrs neighbors
+    t_sims[counts < min_nbrs] = torch.nan
 
-    return torch.where(mask, t_sims, torch.nan)
+    return t_sims
 
 
 _predictors: dict[str, AggFun] = {

diff --git a/tests/test_knn_item_item.py b/tests/test_knn_item_item.py
@@ -408,6 +408,47 @@ def test_ii_large_models(rng):
                 raise AssertionError(f"missing {np.sum(missing)} unbounded values")
 
 
+@lktu.wantjit
+@mark.slow
+def test_ii_implicit_large(rng):
+    "Test that implicit-feedback mode works on full test data."
+    _log.info("training model")
+    NBRS = 5
+    NUSERS = 25
+    NRECS = 50
+    algo = knn.ItemItem(NBRS, feedback="implicit")
+    _log.info("agg: %s", algo.aggregate)
+    algo = Recommender.adapt(algo)
+    algo.fit(ml_ratings[["user", "item"]])
+
+    users = rng.choice(ml_ratings["user"].unique(), NUSERS)
+
+    items: pd.Index = algo.predictor.item_index_
+    mat: torch.Tensor = algo.predictor.sim_matrix_.to_dense()
+
+    for user in users:
+        recs = algo.recommend(user, NRECS)
+        _log.info("user %s recs\n%s", user, recs)
+        assert len(recs) == NRECS
+        urates = ml_ratings[ml_ratings["user"] == user]
+
+        smat = mat[torch.from_numpy(items.get_indexer_for(urates["item"].values)), :]
+        for row in recs.itertuples():
+            col = smat[:, items.get_loc(row.item)]
+            top, _is = torch.topk(col, NBRS)
+            score = top.sum()
+            try:
+                assert row.score == approx(score)
+            except AssertionError as e:
+                _log.error("test failed for user %s item %s", user, row.item)
+                _log.info("score: %.6f", row.score)
+                _log.info("sims:\n%s", col)
+                _log.info("total: %.3f", col.sum())
+                _log.info("filtered: %s", top)
+                _log.info("filtered sum: %.3f", top.sum())
+                raise e
+
+
 @lktu.wantjit
 def test_ii_save_load(tmp_path, ml_subset):
     "Save and load a model"