Remove drop_duplicates() from SAR method fix #1464 (#1588)
* Remove drop_duplicates() from SAR method fix #1464

* Fix flake8 warnings

* Fix typos

* Define self.unity_user_affinity inside __init__()

* Remove drop_duplicates() from SAR method

* Remove duplicates in testing data

* Remove duplicates in test data for recommend_k_items

* Allow duplicates in score data

Co-authored-by: miguelgfierro <[email protected]>
Co-authored-by: Andreas Argyriou <[email protected]>
Co-authored-by: Simon Zhao <[email protected]>
4 people authored Feb 28, 2022
1 parent de4210f commit 96b5053
Showing 2 changed files with 17 additions and 9 deletions.
17 changes: 8 additions & 9 deletions recommenders/models/sar/sar_singlenode.py
@@ -90,6 +90,7 @@ def __init__(
         # set flag to capture unity-rating user-affinity matrix for scaling scores
         self.normalize = normalize
         self.col_unity_rating = "_unity_rating"
+        self.unity_user_affinity = None

         # column for mapping user / item ids to internal indices
         self.col_item_id = "_indexed_items"
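For context (not part of this commit): declaring `self.unity_user_affinity = None` up front follows the usual pattern of defining every attribute in `__init__()` and only filling it later in `fit()` when `normalize=True`, which keeps linters from complaining about attributes defined outside the constructor. A minimal sketch of the pattern; the class and the placeholder computation are hypothetical, only the attribute name comes from the diff:

# Illustrative sketch only, not the SAR implementation.
class NormalizingModel:
    def __init__(self, normalize=False):
        self.normalize = normalize
        self.unity_user_affinity = None  # declared up front, populated in fit()

    def fit(self, df):
        if self.normalize:
            # placeholder standing in for the real unity-rating affinity computation
            self.unity_user_affinity = {user: 1.0 for user in df}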
@@ -156,7 +157,7 @@ def compute_time_decay(self, df, decay_column):
         # group time decayed ratings by user-item and take the sum as the user-item affinity
         return df.groupby([self.col_user, self.col_item]).sum().reset_index()

-    def compute_coocurrence_matrix(self, df):
+    def compute_cooccurrence_matrix(self, df):
         """Co-occurrence matrix.

         The co-occurrence matrix is defined as :math:`C = U^T * U`
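As an aside (not part of the commit), the docstring's formula :math:`C = U^T * U` can be reproduced directly from a sparse user-by-item interaction matrix; a minimal sketch using scipy with made-up data:

# Illustrative sketch, not part of this commit: C = U^T * U on a toy 0/1 matrix.
import numpy as np
from scipy import sparse

user_item = sparse.csr_matrix(
    np.array(
        [
            [1, 1, 0],
            [0, 1, 1],
            [1, 1, 1],
        ]
    )
)
cooccurrence = user_item.transpose().dot(user_item)  # item-by-item co-occurrence counts
print(cooccurrence.toarray())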
@@ -205,8 +206,12 @@ def set_index(self, df):
     def fit(self, df):
         """Main fit method for SAR.

+        .. note::
+
+        Please make sure that `df` has no duplicates.
+
         Args:
-            df (pandas.DataFrame): User item rating dataframe
+            df (pandas.DataFrame): User item rating dataframe (without duplicates).
         """

         # generate continuous indices if this hasn't been done
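Because `fit()` no longer de-duplicates internally, callers should collapse repeated user-item pairs themselves, for instance by keeping the most recent rating. A hedged sketch, not part of the commit; the column names `userID`, `itemID`, `rating`, and `timestamp` are assumed defaults:

# Illustrative sketch, not part of this commit: keep only the latest rating per (user, item).
import pandas as pd

ratings = pd.DataFrame(
    {
        "userID": [1, 1, 2],
        "itemID": [10, 10, 20],
        "rating": [3.0, 5.0, 4.0],
        "timestamp": [100, 200, 150],
    }
)
deduped = ratings.sort_values("timestamp").drop_duplicates(
    ["userID", "itemID"], keep="last"
)
print(deduped)  # the (1, 10) pair keeps the rating with timestamp 200
# deduped can then be passed to the model's fit()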
@@ -226,12 +231,6 @@
         if self.time_decay_flag:
             logger.info("Calculating time-decayed affinities")
             temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
-        else:
-            # without time decay use the latest user-item rating in the dataset as the affinity score
-            logger.info("De-duplicating the user-item counts")
-            temp_df = temp_df.drop_duplicates(
-                [self.col_user, self.col_item], keep="last"
-            )

         logger.info("Creating index columns")
         # add mapping of user and item ids to indices
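For intuition (not from the commit): with this `else` branch gone, duplicate user-item rows are no longer collapsed to the latest rating inside `fit()`; they end up aggregated together instead, e.g. by the `groupby(...).sum()` used for affinities above, which is why the docstring now asks for a de-duplicated input. A small pandas sketch of that effect:

# Illustrative sketch, not part of this commit: duplicates are summed, not collapsed.
import pandas as pd

df = pd.DataFrame({"userID": [1, 1], "itemID": [10, 10], "rating": [3.0, 5.0]})
print(df.groupby(["userID", "itemID"]).sum().reset_index())
# rating comes out as 8.0 here, rather than the most recent value 5.0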
@@ -263,7 +262,7 @@ def fit(self, df):

         # calculate item co-occurrence
         logger.info("Calculating item co-occurrence")
-        item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)
+        item_cooccurrence = self.compute_cooccurrence_matrix(df=temp_df)

         # free up some space
         del temp_df
9 changes: 9 additions & 0 deletions tests/unit/recommenders/models/test_sar_singlenode.py
@@ -158,6 +158,15 @@ def test_sar_item_similarity(
        **header
    )

+    # Remove duplicates
+    demo_usage_data = demo_usage_data.sort_values(
+        header["col_timestamp"], ascending=False
+    )
+    demo_usage_data = demo_usage_data.drop_duplicates(
+        [header["col_user"], header["col_item"]],
+        keep="first"
+    )
+
     model.fit(demo_usage_data)

     true_item_similarity, row_ids, col_ids = read_matrix(
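Side note (not part of the commit): sorting descending by timestamp and keeping the first occurrence, as the test does above, keeps the latest rating per user-item pair, matching the intent of the `keep="last"` logic removed from `fit()`. An ascending sort works just as well; a sketch using the same test fixtures:

# Illustrative sketch, not part of this commit: equivalent de-duplication with keep="last".
demo_usage_data = demo_usage_data.sort_values(header["col_timestamp"]).drop_duplicates(
    [header["col_user"], header["col_item"]], keep="last"
)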
