Remove drop_duplicates() from SAR method fix #1464 (#1588)
* Remove drop_duplicates() from SAR method fix #1464

* Fix flake8 warnings

* Fix typos

* Define self.unity_user_affinity inside __init__()

* Remove drop_duplicates() from SAR method

* Remove duplicates in testing data

* Remove duplicates in test data for recommend_k_items

* Allow duplicates in score data

Co-authored-by: miguelgfierro <[email protected]>
Co-authored-by: Andreas Argyriou <[email protected]>
Co-authored-by: Simon Zhao <[email protected]>
4 people authored Feb 28, 2022
1 parent de4210f commit 96b5053
Showing 2 changed files with 17 additions and 9 deletions.
17 changes: 8 additions & 9 deletions recommenders/models/sar/sar_singlenode.py
@@ -90,6 +90,7 @@ def __init__(
         # set flag to capture unity-rating user-affinity matrix for scaling scores
         self.normalize = normalize
         self.col_unity_rating = "_unity_rating"
+        self.unity_user_affinity = None

         # column for mapping user / item ids to internal indices
         self.col_item_id = "_indexed_items"
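For context (not part of this commit): declaring `self.unity_user_affinity = None` up front follows the usual pattern of defining every attribute in `__init__()` and only filling it later in `fit()` when `normalize=True`, which keeps linters from complaining about attributes defined outside the constructor. A minimal sketch of the pattern; the class and the placeholder computation are hypothetical, only the attribute name comes from the diff:

# Illustrative sketch only, not the SAR implementation.
class NormalizingModel:
    def __init__(self, normalize=False):
        self.normalize = normalize
        self.unity_user_affinity = None  # declared up front, populated in fit()

    def fit(self, df):
        if self.normalize:
            # placeholder standing in for the real unity-rating affinity computation
            self.unity_user_affinity = {user: 1.0 for user in df}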
@@ -156,7 +157,7 @@ def compute_time_decay(self, df, decay_column):
         # group time decayed ratings by user-item and take the sum as the user-item affinity
         return df.groupby([self.col_user, self.col_item]).sum().reset_index()

-    def compute_coocurrence_matrix(self, df):
+    def compute_cooccurrence_matrix(self, df):
         """Co-occurrence matrix.

         The co-occurrence matrix is defined as :math:`C = U^T * U`
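As an aside (not part of the commit), the docstring's formula :math:`C = U^T * U` can be reproduced directly from a sparse user-by-item interaction matrix; a minimal sketch using scipy with made-up data:

# Illustrative sketch, not part of this commit: C = U^T * U on a toy 0/1 matrix.
import numpy as np
from scipy import sparse

user_item = sparse.csr_matrix(
    np.array(
        [
            [1, 1, 0],
            [0, 1, 1],
            [1, 1, 1],
        ]
    )
)
cooccurrence = user_item.transpose().dot(user_item)  # item-by-item co-occurrence counts
print(cooccurrence.toarray())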
@@ -205,8 +206,12 @@ def set_index(self, df):
     def fit(self, df):
         """Main fit method for SAR.

+        .. note::
+
+        Please make sure that `df` has no duplicates.
+
         Args:
-            df (pandas.DataFrame): User item rating dataframe
+            df (pandas.DataFrame): User item rating dataframe (without duplicates).
         """

         # generate continuous indices if this hasn't been done
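Because `fit()` no longer de-duplicates internally, callers should collapse repeated user-item pairs themselves, for instance by keeping the most recent rating. A hedged sketch, not part of the commit; the column names `userID`, `itemID`, `rating`, and `timestamp` are assumed defaults:

# Illustrative sketch, not part of this commit: keep only the latest rating per (user, item).
import pandas as pd

ratings = pd.DataFrame(
    {
        "userID": [1, 1, 2],
        "itemID": [10, 10, 20],
        "rating": [3.0, 5.0, 4.0],
        "timestamp": [100, 200, 150],
    }
)
deduped = ratings.sort_values("timestamp").drop_duplicates(
    ["userID", "itemID"], keep="last"
)
print(deduped)  # the (1, 10) pair keeps the rating with timestamp 200
# deduped can then be passed to the model's fit()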
@@ -226,12 +231,6 @@
         if self.time_decay_flag:
             logger.info("Calculating time-decayed affinities")
             temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
-        else:
-            # without time decay use the latest user-item rating in the dataset as the affinity score
-            logger.info("De-duplicating the user-item counts")
-            temp_df = temp_df.drop_duplicates(
-                [self.col_user, self.col_item], keep="last"
-            )

         logger.info("Creating index columns")
         # add mapping of user and item ids to indices
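For intuition (not from the commit): with this `else` branch gone, duplicate user-item rows are no longer collapsed to the latest rating inside `fit()`; they end up aggregated together instead, e.g. by the `groupby(...).sum()` used for affinities above, which is why the docstring now asks for a de-duplicated input. A small pandas sketch of that effect:

# Illustrative sketch, not part of this commit: duplicates are summed, not collapsed.
import pandas as pd

df = pd.DataFrame({"userID": [1, 1], "itemID": [10, 10], "rating": [3.0, 5.0]})
print(df.groupby(["userID", "itemID"]).sum().reset_index())
# rating comes out as 8.0 here, rather than the most recent value 5.0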
@@ -263,7 +262,7 @@ def fit(self, df):

         # calculate item co-occurrence
         logger.info("Calculating item co-occurrence")
-        item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)
+        item_cooccurrence = self.compute_cooccurrence_matrix(df=temp_df)

         # free up some space
         del temp_df
9 changes: 9 additions & 0 deletions tests/unit/recommenders/models/test_sar_singlenode.py
@@ -158,6 +158,15 @@ def test_sar_item_similarity(
        **header
    )

+    # Remove duplicates
+    demo_usage_data = demo_usage_data.sort_values(
+        header["col_timestamp"], ascending=False
+    )
+    demo_usage_data = demo_usage_data.drop_duplicates(
+        [header["col_user"], header["col_item"]],
+        keep="first"
+    )
+
     model.fit(demo_usage_data)

     true_item_similarity, row_ids, col_ids = read_matrix(
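Side note (not part of the commit): sorting descending by timestamp and keeping the first occurrence, as the test does above, keeps the latest rating per user-item pair, matching the intent of the `keep="last"` logic removed from `fit()`. An ascending sort works just as well; a sketch using the same test fixtures:

# Illustrative sketch, not part of this commit: equivalent de-duplication with keep="last".
demo_usage_data = demo_usage_data.sort_values(header["col_timestamp"]).drop_duplicates(
    [header["col_user"], header["col_item"]], keep="last"
)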
