From 7cbc66dcb0bbc67947767267e5abd265c62bec67 Mon Sep 17 00:00:00 2001
From: YanZhangADS
Date: Fri, 28 May 2021 17:50:05 +0000
Subject: [PATCH 01/33] add diversity evaluation metrics

---
 .../als_movielens_diversity_metrics.ipynb | 680 ++++++++++++++++++
 reco_utils/evaluation/diversity_evaluator.py | 262 +++++++
 tests/conftest.py | 2 +-
 ...test_spark_evaluation_diversity_metrics.py | 134 ++++
 4 files changed, 1077 insertions(+), 1 deletion(-)
 create mode 100644 examples/03_evaluate/als_movielens_diversity_metrics.ipynb
 create mode 100644 reco_utils/evaluation/diversity_evaluator.py
 create mode 100644 tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py

diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb
new file mode 100644
index 0000000000..de7f9cd232
--- /dev/null
+++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb
@@ -0,0 +1,680 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copyright (c) Microsoft Corporation. All rights reserved.\n",
+ "\n",
+ "Licensed under the MIT License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Apply Diversity Metrics \n",
+ "## -- Compare ALS and Random Recommenders on MovieLens (PySpark)\n",
+ "\n",
+ "We demonstrate how to evaluate a recommender using diversity metrics in addition to the commonly used rating/ranking metrics.\n",
+ "\n",
+ "We compare the performance of two algorithms: an ALS recommender and a random recommender. \n",
+ " - Matrix factorization by [ALS](https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/recommendation.html#ALS) (Alternating Least Squares) is a well-known collaborative filtering algorithm.\n",
+ " - We also define a random recommender, which recommends unseen items to each user at random. \n",
+ " \n",
+ "The comparison shows that the ALS recommender outperforms the random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and Mean average precision), while the random recommender outperforms the ALS recommender on diversity metrics (catalog_coverage, distributional_coverage, novelty, diversity, and serendipity)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment."
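+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick, informal reference, the diversity metrics used below are computed roughly as follows (paraphrased from `reco_utils/evaluation/diversity_evaluator.py`; see that file for the exact definitions):\n",
+ "\n",
+ "- **catalog_coverage**: the share of training items that get recommended at all, $\\\\frac{|I_{reco}|}{|I_{train}|}$.\n",
+ "- **distributional_coverage**: an entropy-based measure, $-\\\\frac{2}{|I_{train}|}\\\\sum_i p(i)\\\\log_2 p(i)$, where $p(i)$ is the fraction of all recommendations that go to item $i$.\n",
+ "- **novelty**: based on $-\\\\log_2$ of how often an item is recommended relative to the number of users who have not interacted with it; higher values indicate less popular (more novel) items.\n",
+ "- **diversity**: $1 -$ the average intra-list cosine similarity of each user's recommended items, where item-item similarity is derived from co-occurrence in the training data.\n",
+ "- **serendipity**: per recommended item, $(1 - \\\\text{average similarity to the user's interacted items}) \\\\times \\\\text{relevance}$, averaged over all user-item recommendations."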
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.11 | packaged by conda-forge | (default, Nov 27 2020, 18:57:37) \n", + "[GCC 9.3.0]\n", + "Spark version: 2.4.5\n" + ] + } + ], + "source": [ + "# set the environment path to find Recommenders\n", + "import sys\n", + "sys.path.append(\"../../\")\n", + "import pyspark\n", + "from pyspark.ml.recommendation import ALS\n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.types import StructType, StructField\n", + "from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n", + "\n", + "from reco_utils.common.timer import Timer\n", + "from reco_utils.dataset import movielens\n", + "from reco_utils.common.notebook_utils import is_jupyter\n", + "from reco_utils.dataset.spark_splitters import spark_random_split\n", + "from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n", + "from reco_utils.common.spark_utils import start_or_get_spark\n", + "\n", + "from reco_utils.evaluation.diversity_evaluator import DiversityEvaluator\n", + "from pyspark.sql.window import Window\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Spark version: {}\".format(pyspark.__version__))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# top k items to recommend\n", + "TOP_K = 10\n", + "\n", + "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", + "MOVIELENS_DATA_SIZE = '100k'\n", + "\n", + "# user, item column names\n", + "COL_USER=\"UserId\"\n", + "COL_ITEM=\"MovieId\"\n", + "COL_RATING=\"Rating\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up Spark context\n", + "\n", + "The following settings work well for debugging locally on VM - change when running on a cluster. We set up a giant single executor with many threads and specify memory cap. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# the following settings work well for debugging locally on VM - change when running on a cluster\n", + "# set up a giant single executor with many threads and specify memory cap\n", + "\n", + "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")\n", + "\n", + "spark.conf.set(\"spark.sql.crossJoin.enabled\", \"true\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download the MovieLens dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4.81k/4.81k [00:00<00:00, 18.8kKB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+-------+------+---------+\n", + "|UserId|MovieId|Rating|Timestamp|\n", + "+------+-------+------+---------+\n", + "| 196| 242| 3.0|881250949|\n", + "| 186| 302| 3.0|891717742|\n", + "| 22| 377| 1.0|878887116|\n", + "| 244| 51| 2.0|880606923|\n", + "| 166| 346| 1.0|886397596|\n", + "| 298| 474| 4.0|884182806|\n", + "| 115| 265| 2.0|881171488|\n", + "| 253| 465| 5.0|891628467|\n", + "| 305| 451| 3.0|886324817|\n", + "| 6| 86| 3.0|883603013|\n", + "| 62| 257| 2.0|879372434|\n", + "| 286| 1014| 5.0|879781125|\n", + "| 200| 222| 5.0|876042340|\n", + "| 210| 40| 3.0|891035994|\n", + "| 224| 29| 3.0|888104457|\n", + "| 303| 785| 3.0|879485318|\n", + "| 122| 387| 5.0|879270459|\n", + "| 194| 274| 2.0|879539794|\n", + "| 291| 1042| 4.0|874834944|\n", + "| 234| 1184| 2.0|892079237|\n", + "+------+-------+------+---------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", + "schema = StructType(\n", + " (\n", + " StructField(COL_USER, IntegerType()),\n", + " StructField(COL_ITEM, IntegerType()),\n", + " StructField(COL_RATING, FloatType()),\n", + " StructField(\"Timestamp\", LongType()),\n", + " )\n", + ")\n", + "\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", + "data.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split the data using the Spark random splitter provided in utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N train 75193\n", + "N test 24807\n" + ] + } + ], + "source": [ + "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", + "print (\"N train\", train.cache().count())\n", + "print (\"N test\", test.cache().count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train the ALS model on the training data, and get the top-k recommendations for our testing data\n", + "\n", + "To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used in building the model are referenced from [here](http://mymedialite.net/examples/datasets.html). We do not constrain the latent factors (`nonnegative = False`) in order to allow for both positive and negative preferences towards movies.\n", + "Timing will vary depending on the machine being used to train." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "header = {\n",
+ "    \"userCol\": COL_USER,\n",
+ "    \"itemCol\": COL_ITEM,\n",
+ "    \"ratingCol\": COL_RATING,\n",
+ "}\n",
+ "\n",
+ "\n",
+ "als = ALS(\n",
+ "    rank=10,\n",
+ "    maxIter=15,\n",
+ "    implicitPrefs=False,\n",
+ "    regParam=0.05,\n",
+ "    coldStartStrategy='drop',\n",
+ "    nonnegative=False,\n",
+ "    seed=42,\n",
+ "    **header\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Took 3.6827992300004553 seconds for training.\n"
+ ]
+ }
+ ],
+ "source": [
+ "with Timer() as train_time:\n",
+ "    model = als.fit(train)\n",
+ "\n",
+ "print(\"Took {} seconds for training.\".format(train_time.interval))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the movie recommendation use case, it does not make sense to recommend movies a user has already rated, so rated movies are removed from the recommended items.\n",
+ "\n",
+ "To achieve this, we recommend all movies to all users and then remove the user-movie pairs that exist in the training dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1477928\n",
+ "9448\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get the cross join of all user-item pairs and score them.\n",
+ "users = train.select(COL_USER).distinct()\n",
+ "items = train.select(COL_ITEM).distinct()\n",
+ "user_item = users.crossJoin(items)\n",
+ "dfs_pred = model.transform(user_item)\n",
+ "\n",
+ "# Remove seen items.\n",
+ "dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n",
+ "    train.alias(\"train\"),\n",
+ "    (dfs_pred[COL_USER] == train[COL_USER]) & (dfs_pred[COL_ITEM] == train[COL_ITEM]),\n",
+ "    how='outer'\n",
+ ")\n",
+ "\n",
+ "top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n",
+ "    .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + \"prediction\")\n",
+ "\n",
+ "print(top_all.count())\n",
+ " \n",
+ "window = Window.partitionBy(COL_USER).orderBy(F.col(\"prediction\").desc())\n",
+ "# Note: F.rank() keeps tied predictions, so a user can receive slightly more than\n",
+ "# TOP_K items (hence 9448 rows here rather than 943 users x 10).\n",
+ "top_k_reco = top_all.select(\"*\", F.rank().over(window).alias(\"rank\")).filter(F.col(\"rank\") <= TOP_K).drop(\"rank\")\n",
+ " \n",
+ "print(top_k_reco.count())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Random Recommender\n",
+ "\n",
+ "We define a random recommender which randomly recommends unseen items to each user. Note that it orders candidate items with an unseeded `F.rand()`, so its recommendations (and the diversity metrics reported for it) can vary from run to run; passing a seed, e.g. `F.rand(seed=42)`, would make the comparison reproducible. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = train.select(COL_USER, COL_ITEM, COL_RATING)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# random recommender\n", + "window = Window.partitionBy(COL_USER).orderBy(F.rand())\n", + "\n", + "# randomly generated recommendations for each user\n", + "pred_df = (\n", + " train_df\n", + " # join training data with all possible user-item pairs (seen in training)\n", + " .join(train_df\n", + " .select(COL_USER)\n", + " .distinct()\n", + " .join(train_df\n", + " .select(COL_ITEM)\n", + " .distinct()),\n", + " on=[COL_USER, COL_ITEM],\n", + " how=\"right\"\n", + " )\n", + " # get user-item pairs that were not seen in the training data\n", + " .filter(F.col(COL_RATING).isNull())\n", + " # count items for each user (randomly sorting them)\n", + " .withColumn(\"score\", F.row_number().over(window))\n", + " # get the top k items per user\n", + " .filter(F.col(\"score\") <= TOP_K)\n", + " .drop(COL_RATING)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. ALS vs Random Recommenders Performance Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def get_ranking_results(ranking_eval):\n", + " metrics = {\n", + " \"Precision@k\": ranking_eval.precision_at_k(),\n", + " \"Recall@k\": ranking_eval.recall_at_k(),\n", + " \"NDCG@k\": ranking_eval.ndcg_at_k(),\n", + " \"Mean average precision\": ranking_eval.map_at_k()\n", + " \n", + " }\n", + " return metrics \n", + "\n", + "def get_diversity_results(diversity_eval):\n", + " metrics = {\n", + " \"catalog_coverage\":diversity_eval.catalog_coverage(),\n", + " \"distributional_coverage\":diversity_eval.distributional_coverage(), \n", + " \"novelty\": diversity_eval.novelty().first()[0], \n", + " \"diversity\": diversity_eval.diversity().first()[0], \n", + " \"serendipity\": diversity_eval.serendipity().first()[0]\n", + " }\n", + " return metrics " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):\n", + " summary = {\"Data\": data, \"Algo\": algo, \"K\": k}\n", + "\n", + " if ranking_metrics is None:\n", + " ranking_metrics = { \n", + " \"Precision@k\": np.nan,\n", + " \"Recall@k\": np.nan, \n", + " \"nDCG@k\": np.nan,\n", + " \"MAP\": np.nan,\n", + " }\n", + " summary.update(ranking_metrics)\n", + " summary.update(diversity_metrics)\n", + " return summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### ALS Recommender Performance Results" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "als_ranking_eval = SparkRankingEvaluation(\n", + " test, \n", + " top_all, \n", + " k = TOP_K, \n", + " col_user=\"UserId\", \n", + " col_item=\"MovieId\",\n", + " col_rating=\"Rating\", \n", + " col_prediction=\"prediction\",\n", + " relevancy_method=\"top_k\"\n", + ")\n", + "\n", + "als_diversity_eval = DiversityEvaluator(\n", + " train_df = train_df, \n", + " reco_df = top_k_reco,\n", + " user_col=\"UserId\", \n", + " item_col=\"MovieId\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "als_ranking_metrics = get_ranking_results(als_ranking_eval)\n", + "als_diversity_metrics = 
get_diversity_results(als_diversity_eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "als_results = generate_summary(MOVIELENS_DATA_SIZE, \"als\", TOP_K, als_ranking_metrics, als_diversity_metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Random Recommender Performance Results" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "random_ranking_eval = SparkRankingEvaluation(\n", + " test,\n", + " pred_df,\n", + " col_user=COL_USER,\n", + " col_item=COL_ITEM,\n", + " col_rating=COL_RATING,\n", + " col_prediction=\"score\",\n", + " k=TOP_K,\n", + ")\n", + "\n", + "random_diversity_eval = DiversityEvaluator(\n", + " train_df = train_df, \n", + " reco_df = pred_df, \n", + " user_col=COL_USER, \n", + " item_col=COL_ITEM\n", + ")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "random_ranking_metrics = get_ranking_results(random_ranking_eval)\n", + "random_diversity_metrics = get_diversity_results(random_diversity_eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "random_results = generate_summary(MOVIELENS_DATA_SIZE, \"random\", TOP_K, random_ranking_metrics, random_diversity_metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "cols = [\"Data\", \"Algo\", \"K\", \"Precision@k\", \"Recall@k\", \"NDCG@k\", \"Mean average precision\",\"catalog_coverage\", \"distributional_coverage\",\"novelty\", \"diversity\", \"serendipity\" ]\n", + "df_results = pd.DataFrame(columns=cols)\n", + "\n", + "df_results.loc[1] = als_results \n", + "df_results.loc[2] = random_results " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DataAlgoKPrecision@kRecall@kNDCG@kMean average precisioncatalog_coveragedistributional_coveragenoveltydiversityserendipity
1100kals100.0519110.0175140.0474600.0057340.3819060.0097534.6045870.8920360.880369
2100krandom100.0168790.0053430.0172680.0015440.9957500.0128187.1668530.9242710.894810
\n", + "
" + ], + "text/plain": [ + " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", + "1 100k als 10 0.051911 0.017514 0.047460 0.005734 \n", + "2 100k random 10 0.016879 0.005343 0.017268 0.001544 \n", + "\n", + " catalog_coverage distributional_coverage novelty diversity serendipity \n", + "1 0.381906 0.009753 4.604587 0.892036 0.880369 \n", + "2 0.995750 0.012818 7.166853 0.924271 0.894810 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_results" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# cleanup spark instance\n", + "spark.stop()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (reco_pyspark)", + "language": "python", + "name": "reco_pyspark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.11" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py new file mode 100644 index 0000000000..32b51b07f5 --- /dev/null +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -0,0 +1,262 @@ +""" +Enterprise Customer License: + +Copyright (C) Microsoft Corporation. All rights reserved. + +Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, royalty-free right to use, copy, and modify the software code provided by us ("Software Code"). You may not sublicense the Software Code or any use of it (except to your affiliates and to vendors to perform work on your behalf) through distribution, network access, service agreement, lease, rental, or otherwise. This license does not purport to express any claim of ownership over data you may have shared with Microsoft in the creation of the Software Code. Unless applicable law gives you more rights, Microsoft reserves all other rights not expressly granted herein, whether by implication, estoppel or otherwise. + +THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +from pyspark.sql.types import * +from pyspark.sql import functions as F + +class DiversityEvaluator: + def __init__(self, train_df, reco_df, + user_col='UserId', item_col='ItemId', relevence_col=None): + """Diversity evaluator. + train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. + + Metrics includes: + Coverage - The proportion of items that can be recommended. It includes two metrics: catalog_coverage and distributional_coverage. + Novelty - A more novel item indicates it is less popular. + Diversity - The dissimilarity of items being recommended. 
+ Serendipity - The “unusualness” or “surprise” of recommendations to a user. + + + Args: + train_df (pySpark DataFrame): Training set used for the recommender, + containing user_col, item_col. + reco_df (pySpark DataFrame): Recommender's prediction output, + containing user_col, item_col, relevence_col (optional). + user_col (str): User id column name. + item_col (str): Item id column name. + relevence_col (str): this column indicates whether the recommended item is actually relevent to the user or not. + """ + + self.train_df = train_df.select(user_col,item_col) + self.user_col = user_col + self.item_col = item_col + self.sim_col = "sim" + self.df_user_item_serendipity = None + self.df_item_novelty = None + self.df_intralist_similarity = None + + if relevence_col == None: + self.relevence_col = 'relevence' + # relevence term, default is 1 (relevent) for all + self.reco_df = reco_df.select(user_col, item_col, F.lit(1.0).alias(self.relevence_col)) + else: + self.relevence_col = relevence_col + self.reco_df = reco_df.select(user_col, item_col, F.col(self.relevence_col).cast(DoubleType())) + + # check if reco_df contain any user_item pairs that are already shown train_df + count_intersection = ( + self.train_df + .select(self.user_col, self.item_col) + .intersect(self.reco_df.select(self.user_col, self.item_col)) + .count() + ) + + if (count_intersection != 0): + raise Exception("reco_df should not contain any user_item pairs that are already shown train_df") + + def get_all_user_item_pairs(self, df): + return ( + df.select(self.user_col).distinct() + .join(df.select(self.item_col).distinct()) + ) + + + def get_pairwise_items(self, df, full_matrix=False): + if full_matrix==False: + return ( + df + .select(self.user_col, F.col(self.item_col).alias("i1")) + # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) + .join(df.select(F.col(self.user_col).alias("_user"), F.col(self.item_col).alias("i2")), + (F.col(self.user_col) == F.col("_user")) & (F.col("i1") <= F.col("i2"))) + .select(self.user_col, "i1", "i2") + ) + else: + return ( + df + .select(self.user_col, F.col(self.item_col).alias("i1")) + # get pairwise combinations of items per user (including both pairs [1,2] and [2,1]) + .join(df.select(F.col(self.user_col).alias("_user"), F.col(self.item_col).alias("i2")), + (F.col(self.user_col) == F.col("_user"))) + .select(self.user_col, "i1", "i2") + ) + + + def get_cosine_similarity(self, full_matrix=False, n_partitions=200): + # TODO: make sure there are no null values in user or item columns + # TODO: make sure temporary column names don't match existing user or item column names + + pairs = self.get_pairwise_items(df=self.train_df, full_matrix=full_matrix) + item_count = self.train_df.groupBy(self.item_col).count() + + return ( + pairs + .groupBy("i1", "i2").count() + .join(item_count.select(F.col(self.item_col).alias("i1"), F.pow(F.col("count"), 0.5).alias("i1_sqrt_count")), on="i1") + .join(item_count.select(F.col(self.item_col).alias("i2"), F.pow(F.col("count"), 0.5).alias("i2_sqrt_count")), on="i2") + .select("i1", "i2", (F.col("count") / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count"))).alias("sim")) + .repartition(n_partitions, "i1", "i2") + .sortWithinPartitions("i1", "i2") + ) + + + + # diversity metrics + def get_intralist_similarity(self, df, similarity_df): + pairs = self.get_pairwise_items(df=df) + return ( + pairs + .join(similarity_df, on=["i1", "i2"], how="left").fillna(0) # Fillna(0) is needed in the cases where similarity_df does not have 
an entry for a pair of items. e.g. i1 and i2 have never occurred together. + .filter(F.col("i1") != F.col("i2")) + .groupBy(self.user_col).agg(F.mean(self.sim_col).alias("avg_il_sim")) + .select(self.user_col, "avg_il_sim") + ) + + def user_diversity(self): + if (self.df_intralist_similarity == None): + cossim = self.get_cosine_similarity().orderBy("i1", "i2") + self.df_intralist_similarity = self.get_intralist_similarity(df=self.reco_df, similarity_df = cossim) + return ( + self.df_intralist_similarity + .withColumn("diversity", 1-F.col("avg_il_sim")) + .select(self.user_col, "diversity") + .orderBy(self.user_col) + ) + + def diversity(self): + # TODO: add error handling logic for conditions where user_id is not valid + if (self.df_intralist_similarity == None): + cossim = self.get_cosine_similarity().orderBy("i1", "i2") + self.df_intralist_similarity = self.get_intralist_similarity(df=self.reco_df, similarity_df = cossim) + return (self.df_intralist_similarity + .withColumn("diversity", 1-F.col("avg_il_sim")) + .select(F.mean("diversity").alias("diversity")) + ) + + + # novelty metrics + def get_item_novelty(self): + train_pairs = self.get_all_user_item_pairs(df=self.train_df) + return ( + train_pairs + .join(self.train_df.withColumn("seen", F.lit(1)), on=[self.user_col, self.item_col], how="left") + .filter(F.col("seen").isNull()) + .groupBy(self.item_col).count() + .join(self.reco_df.groupBy(self.item_col).agg(F.count(self.user_col).alias("reco_count")), on=self.item_col) + .withColumn("item_novelty", -F.log2(F.col("reco_count") / F.col("count"))) + ) + + def item_novelty(self): + if (self.df_item_novelty == None): + self.df_item_novelty = self.get_item_novelty() + return self.df_item_novelty.select(self.item_col, "item_novelty").orderBy(self.item_col) + + def user_novelty(self): + if (self.df_item_novelty == None): + self.df_item_novelty = self.get_item_novelty() + return ( + self.reco_df + .join(self.df_item_novelty, on=self.item_col) + .groupBy(self.user_col) + .agg(F.mean("item_novelty").alias("user_novelty")) + .orderBy(self.user_col) + ) + + def novelty(self): + # TODO: add error handling logic for any other conditions + if (self.df_item_novelty == None): + self.df_item_novelty = self.get_item_novelty() + return ( + self.reco_df + .join(self.df_item_novelty, on=self.item_col) + .agg(F.mean("item_novelty").alias("novelty")) + ) + + # serendipity metrics + def get_user_item_serendipity(self): + # TODO: add relevence term as input parameter + + # for every user_col, item_col in reco_df, join all interacted items from train_df. + # These interacted items are reapeated for each item in reco_df for a specific user. 
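+        # For each recommended item, serendipity is then computed as
+        # (1 - mean cosine similarity to the user's interacted items) * relevance.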
+ reco_item_interacted_history = (self.reco_df.withColumn("i1",F.col(self.item_col)) + .join(self.train_df.withColumn("i2",F.col(self.item_col)), on=[self.user_col]) + .select(self.user_col,"i1","i2") + ) + cossim_full = self.get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") + join_sim = (reco_item_interacted_history + .join(cossim_full, on=["i1", "i2"], how="left").fillna(0) + .groupBy(self.user_col, "i1") + .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) + .withColumn(self.item_col, F.col("i1")) + .drop("i1") + ) + return ( + join_sim.join(self.reco_df, on=[self.user_col, self.item_col]) + .withColumn("user_item_serendipity",(1-F.col("avg_item2interactedHistory_sim"))*F.col(self.relevence_col)) + ) + + + def user_item_serendipity(self): + if (self.df_user_item_serendipity == None): + self.df_user_item_serendipity = self.get_user_item_serendipity() + + return ( + self.df_user_item_serendipity + .select(self.user_col, self.item_col, "user_item_serendipity") + .orderBy(self.user_col, self.item_col) + ) + + def user_serendipity(self): + if (self.df_user_item_serendipity == None): + self.df_user_item_serendipity = self.get_user_item_serendipity() + + return ( + self.df_user_item_serendipity + .groupBy(self.user_col) + .agg(F.mean("user_item_serendipity").alias("user_serendipity")) + .orderBy(self.user_col) + ) + + def serendipity(self): + # TODO: add error handling logic for any other conditions + if (self.df_user_item_serendipity == None): + self.df_user_item_serendipity = self.get_user_item_serendipity() + + return ( + self.df_user_item_serendipity + .agg(F.mean("user_item_serendipity").alias("serendipity")) + ) + + + # coverage metrics + def catalog_coverage(self): + # distinct item count in reco_df + count_distinct_item_reco = self.reco_df.select(F.countDistinct(self.item_col)).collect()[0][0] + # distinct item count in train_df + count_distinct_item_train = self.train_df.select(F.countDistinct(self.item_col)).collect()[0][0] + + # cagalog coverage + c_coverage= count_distinct_item_reco/count_distinct_item_train + return c_coverage + + def distributional_coverage(self): + # In reco_df, how many times each item_col is being recommended + df_itemcnt_reco = self.reco_df.groupBy(self.item_col).count() + # distinct item count in train_df + count_distinct_item_train = self.train_df.select(F.countDistinct(self.item_col)).collect()[0][0] + # the number of total recommendations + count_row_reco = self.reco_df.count() + df_entropy=(df_itemcnt_reco + .withColumn("p(i)",F.col("count")/count_row_reco) + .withColumn("entropy(i)", F.col("p(i)")*F.log2(F.col("p(i)"))) + ) + # distributional coverage + d_coverage=(-2/count_distinct_item_train)*df_entropy.agg(F.sum("entropy(i)")).collect()[0][0] + + return d_coverage \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 9f7d8ee56c..0c09d6a289 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,7 +77,7 @@ def spark(tmp_path_factory, app_name="Sample", url="local[*]"): """ with TemporaryDirectory(dir=tmp_path_factory.getbasetemp()) as td: - config = {"spark.local.dir": td, "spark.sql.shuffle.partitions": 1} + config = {"spark.local.dir": td, "spark.sql.shuffle.partitions": 1, "spark.sql.crossJoin.enabled": "true"} spark = start_or_get_spark(app_name=app_name, url=url, config=config) yield spark spark.stop() diff --git a/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py b/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py new file mode 
100644 index 0000000000..a0a0e25165 --- /dev/null +++ b/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import itertools +import pytest +import numpy as np +import pandas as pd +from pandas.util.testing import assert_frame_equal +from pyspark.sql import Row +from reco_utils.evaluation.diversity_evaluator import DiversityEvaluator + + +TOL = 0.0001 + +@pytest.fixture(scope="module") +def target_metrics(): + return { + "c_coverage": pytest.approx(0.8, TOL), + "d_coverage": pytest.approx(0.76732, TOL), + "item_novelty": pd.DataFrame( + dict(ItemId=[1, 2, 3, 5], item_novelty=[1.0, 0.0, 0.0, 0.0]) + ), + "user_novelty": pd.DataFrame( + dict(UserId=[1, 2, 3], user_novelty=[0.0, 0.0, 0.5]) + ), + "novelty": pd.DataFrame( + dict(novelty=[0.16667]) + ), + "diversity": pd.DataFrame( + dict(diversity=[0.43096]) + ), + "user_diversity": pd.DataFrame( + dict(UserId=[1, 2, 3], diversity=[0.29289, 1.0, 0.0]) + ), + "user_item_serendipity": pd.DataFrame( + dict(UserId=[1, 1, 2, 2, 3, 3], reco_item= [3, 5, 2, 5, 1, 2], user_item_serendipity=[0.72783, 0.80755, 0.71132, 0.35777, 0.80755, 0.80755]) + ), + "user_serendipity": pd.DataFrame( + dict(UserId=[1, 2, 3], user_serendipity=[0.76770, 0.53455, 0.80755]) + ), + "serendipity": pd.DataFrame( + dict(serendipity=[0.70326]) + ), + } + + + +@pytest.fixture(scope="module") +def data(spark): + train_df = spark.createDataFrame([ + Row(UserId=1, ItemId=1), + Row(UserId=1, ItemId=2), + Row(UserId=1, ItemId=4), + Row(UserId=2, ItemId=3), + Row(UserId=2, ItemId=4), + Row(UserId=3, ItemId=3), + Row(UserId=3, ItemId=4), + Row(UserId=3, ItemId=5), + ]) + reco_df = spark.createDataFrame([ + Row(UserId=1, ItemId=3, Rating=1), + Row(UserId=1, ItemId=5, Rating=1), + Row(UserId=2, ItemId=2, Rating=1), + Row(UserId=2, ItemId=5, Rating=1), + Row(UserId=3, ItemId=1, Rating=1), + Row(UserId=3, ItemId=2, Rating=1), + ]) + return train_df, reco_df + +@pytest.mark.spark +@pytest.fixture() +def evaluator(data): + train_df, reco_df = data + div = DiversityEvaluator(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') + print("init evaluator") + return div + + +@pytest.mark.spark +def test_init_spark(spark): + assert spark is not None + +@pytest.mark.spark +def test_catalog_coverage(evaluator, target_metrics): + + c_coverage = evaluator.catalog_coverage() + assert c_coverage == target_metrics["c_coverage"] + +@pytest.mark.spark +def test_distributional_coverage(evaluator, target_metrics): + + d_coverage = evaluator.distributional_coverage() + assert d_coverage == target_metrics["d_coverage"] + +@pytest.mark.spark +def test_item_novelty(evaluator, target_metrics): + actual = evaluator.item_novelty().toPandas() + assert_frame_equal(target_metrics["item_novelty"], actual, check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_user_novelty(evaluator, target_metrics): + actual = evaluator.user_novelty().toPandas() + assert_frame_equal(target_metrics["user_novelty"], actual, check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_novelty(evaluator, target_metrics): + actual = evaluator.novelty().toPandas() + assert_frame_equal(target_metrics["novelty"], actual, check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_user_diversity(evaluator, target_metrics): + actual = evaluator.user_diversity().toPandas() + assert_frame_equal(target_metrics["user_diversity"], actual, 
check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_diversity(evaluator, target_metrics): + actual = evaluator.diversity().toPandas() + assert_frame_equal(target_metrics["diversity"], actual,check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_user_item_serendipity(evaluator, target_metrics): + actual = evaluator.user_item_serendipity().toPandas() + assert_frame_equal(target_metrics["user_item_serendipity"], actual, check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_user_serendipity(evaluator, target_metrics): + actual = evaluator.user_serendipity().toPandas() + assert_frame_equal(target_metrics["user_serendipity"], actual, check_exact=False, check_less_precise=4) + +@pytest.mark.spark +def test_serendipity(evaluator, target_metrics): + actual = evaluator.serendipity().toPandas() + assert_frame_equal(target_metrics["serendipity"], actual, check_exact=False, check_less_precise=4) From 406e8e02a7cb55a06a132b1ff76f33d3d793c566 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 28 May 2021 18:01:19 +0000 Subject: [PATCH 02/33] fix typo --- reco_utils/evaluation/diversity_evaluator.py | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index 32b51b07f5..6958f82d15 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -12,7 +12,7 @@ class DiversityEvaluator: def __init__(self, train_df, reco_df, - user_col='UserId', item_col='ItemId', relevence_col=None): + user_col='UserId', item_col='ItemId', relevance_col=None): """Diversity evaluator. train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. @@ -27,10 +27,10 @@ def __init__(self, train_df, reco_df, train_df (pySpark DataFrame): Training set used for the recommender, containing user_col, item_col. reco_df (pySpark DataFrame): Recommender's prediction output, - containing user_col, item_col, relevence_col (optional). + containing user_col, item_col, relevance_col (optional). user_col (str): User id column name. item_col (str): Item id column name. - relevence_col (str): this column indicates whether the recommended item is actually relevent to the user or not. + relevance_col (str): this column indicates whether the recommended item is actually relevent to the user or not. 
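+            If relevance_col is None, a constant relevance of 1.0 is assumed for every recommended item.
+
+        Example (illustrative sketch on a toy dataset; assumes an active SparkSession named ``spark``):
+            >>> train_df = spark.createDataFrame([(1, 1), (1, 2), (2, 3)], ["UserId", "ItemId"])
+            >>> reco_df = spark.createDataFrame([(1, 3), (2, 1), (2, 2)], ["UserId", "ItemId"])
+            >>> evaluator = DiversityEvaluator(train_df, reco_df, user_col="UserId", item_col="ItemId")
+            >>> evaluator.catalog_coverage()  # 3 distinct recommended items / 3 distinct training items
+            1.0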
""" self.train_df = train_df.select(user_col,item_col) @@ -41,13 +41,13 @@ def __init__(self, train_df, reco_df, self.df_item_novelty = None self.df_intralist_similarity = None - if relevence_col == None: - self.relevence_col = 'relevence' - # relevence term, default is 1 (relevent) for all - self.reco_df = reco_df.select(user_col, item_col, F.lit(1.0).alias(self.relevence_col)) + if relevance_col == None: + self.relevance_col = 'relevance' + # relevance term, default is 1 (relevent) for all + self.reco_df = reco_df.select(user_col, item_col, F.lit(1.0).alias(self.relevance_col)) else: - self.relevence_col = relevence_col - self.reco_df = reco_df.select(user_col, item_col, F.col(self.relevence_col).cast(DoubleType())) + self.relevance_col = relevance_col + self.reco_df = reco_df.select(user_col, item_col, F.col(self.relevance_col).cast(DoubleType())) # check if reco_df contain any user_item pairs that are already shown train_df count_intersection = ( @@ -180,7 +180,7 @@ def novelty(self): # serendipity metrics def get_user_item_serendipity(self): - # TODO: add relevence term as input parameter + # TODO: add relevance term as input parameter # for every user_col, item_col in reco_df, join all interacted items from train_df. # These interacted items are reapeated for each item in reco_df for a specific user. @@ -198,7 +198,7 @@ def get_user_item_serendipity(self): ) return ( join_sim.join(self.reco_df, on=[self.user_col, self.item_col]) - .withColumn("user_item_serendipity",(1-F.col("avg_item2interactedHistory_sim"))*F.col(self.relevence_col)) + .withColumn("user_item_serendipity",(1-F.col("avg_item2interactedHistory_sim"))*F.col(self.relevance_col)) ) From d28167e4f9eb2f6518c9ea2a23c59e63fce4497b Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Tue, 1 Jun 2021 15:16:23 +0000 Subject: [PATCH 03/33] formating using black --- reco_utils/evaluation/diversity_evaluator.py | 341 +++++++++++-------- 1 file changed, 198 insertions(+), 143 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index 6958f82d15..9beb1141e7 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -10,15 +10,22 @@ from pyspark.sql.types import * from pyspark.sql import functions as F -class DiversityEvaluator: - def __init__(self, train_df, reco_df, - user_col='UserId', item_col='ItemId', relevance_col=None): + +class DiversityEvaluator: + def __init__( + self, + train_df, + reco_df, + user_col="UserId", + item_col="ItemId", + relevance_col=None, + ): """Diversity evaluator. train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. - + Metrics includes: Coverage - The proportion of items that can be recommended. It includes two metrics: catalog_coverage and distributional_coverage. - Novelty - A more novel item indicates it is less popular. + Novelty - A more novel item indicates it is less popular. Diversity - The dissimilarity of items being recommended. Serendipity - The “unusualness” or “surprise” of recommendations to a user. @@ -29,65 +36,76 @@ def __init__(self, train_df, reco_df, reco_df (pySpark DataFrame): Recommender's prediction output, containing user_col, item_col, relevance_col (optional). user_col (str): User id column name. - item_col (str): Item id column name. - relevance_col (str): this column indicates whether the recommended item is actually relevent to the user or not. + item_col (str): Item id column name. 
+ relevance_col (str): this column indicates whether the recommended item is actually relevent to the user or not. """ - self.train_df = train_df.select(user_col,item_col) + self.train_df = train_df.select(user_col, item_col) self.user_col = user_col self.item_col = item_col self.sim_col = "sim" self.df_user_item_serendipity = None self.df_item_novelty = None self.df_intralist_similarity = None - + if relevance_col == None: - self.relevance_col = 'relevance' + self.relevance_col = "relevance" # relevance term, default is 1 (relevent) for all - self.reco_df = reco_df.select(user_col, item_col, F.lit(1.0).alias(self.relevance_col)) + self.reco_df = reco_df.select( + user_col, item_col, F.lit(1.0).alias(self.relevance_col) + ) else: self.relevance_col = relevance_col - self.reco_df = reco_df.select(user_col, item_col, F.col(self.relevance_col).cast(DoubleType())) - + self.reco_df = reco_df.select( + user_col, item_col, F.col(self.relevance_col).cast(DoubleType()) + ) + # check if reco_df contain any user_item pairs that are already shown train_df count_intersection = ( - self.train_df - .select(self.user_col, self.item_col) - .intersect(self.reco_df.select(self.user_col, self.item_col)) - .count() - ) - - if (count_intersection != 0): - raise Exception("reco_df should not contain any user_item pairs that are already shown train_df") + self.train_df.select(self.user_col, self.item_col) + .intersect(self.reco_df.select(self.user_col, self.item_col)) + .count() + ) + + if count_intersection != 0: + raise Exception( + "reco_df should not contain any user_item pairs that are already shown train_df" + ) def get_all_user_item_pairs(self, df): return ( - df.select(self.user_col).distinct() + df.select(self.user_col) + .distinct() .join(df.select(self.item_col).distinct()) ) - def get_pairwise_items(self, df, full_matrix=False): - if full_matrix==False: + if full_matrix == False: return ( - df - .select(self.user_col, F.col(self.item_col).alias("i1")) + df.select(self.user_col, F.col(self.item_col).alias("i1")) # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) - .join(df.select(F.col(self.user_col).alias("_user"), F.col(self.item_col).alias("i2")), - (F.col(self.user_col) == F.col("_user")) & (F.col("i1") <= F.col("i2"))) - .select(self.user_col, "i1", "i2") + .join( + df.select( + F.col(self.user_col).alias("_user"), + F.col(self.item_col).alias("i2"), + ), + (F.col(self.user_col) == F.col("_user")) + & (F.col("i1") <= F.col("i2")), + ).select(self.user_col, "i1", "i2") ) else: return ( - df - .select(self.user_col, F.col(self.item_col).alias("i1")) - # get pairwise combinations of items per user (including both pairs [1,2] and [2,1]) - .join(df.select(F.col(self.user_col).alias("_user"), F.col(self.item_col).alias("i2")), - (F.col(self.user_col) == F.col("_user"))) - .select(self.user_col, "i1", "i2") - ) - - + df.select(self.user_col, F.col(self.item_col).alias("i1")) + # get pairwise combinations of items per user (including both pairs [1,2] and [2,1]) + .join( + df.select( + F.col(self.user_col).alias("_user"), + F.col(self.item_col).alias("i2"), + ), + (F.col(self.user_col) == F.col("_user")), + ).select(self.user_col, "i1", "i2") + ) + def get_cosine_similarity(self, full_matrix=False, n_partitions=200): # TODO: make sure there are no null values in user or item columns # TODO: make sure temporary column names don't match existing user or item column names @@ -96,167 +114,204 @@ def get_cosine_similarity(self, full_matrix=False, n_partitions=200): 
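+        # sim(i1, i2) = count(i1, i2) / sqrt(count(i1) * count(i2)): the cosine
+        # similarity of the two items' binary user-occurrence vectors in train_df.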
item_count = self.train_df.groupBy(self.item_col).count() return ( - pairs - .groupBy("i1", "i2").count() - .join(item_count.select(F.col(self.item_col).alias("i1"), F.pow(F.col("count"), 0.5).alias("i1_sqrt_count")), on="i1") - .join(item_count.select(F.col(self.item_col).alias("i2"), F.pow(F.col("count"), 0.5).alias("i2_sqrt_count")), on="i2") - .select("i1", "i2", (F.col("count") / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count"))).alias("sim")) + pairs.groupBy("i1", "i2") + .count() + .join( + item_count.select( + F.col(self.item_col).alias("i1"), + F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"), + ), + on="i1", + ) + .join( + item_count.select( + F.col(self.item_col).alias("i2"), + F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"), + ), + on="i2", + ) + .select( + "i1", + "i2", + ( + F.col("count") / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count")) + ).alias("sim"), + ) .repartition(n_partitions, "i1", "i2") .sortWithinPartitions("i1", "i2") ) - - # diversity metrics def get_intralist_similarity(self, df, similarity_df): pairs = self.get_pairwise_items(df=df) return ( - pairs - .join(similarity_df, on=["i1", "i2"], how="left").fillna(0) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. + pairs.join(similarity_df, on=["i1", "i2"], how="left") + .fillna( + 0 + ) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. .filter(F.col("i1") != F.col("i2")) - .groupBy(self.user_col).agg(F.mean(self.sim_col).alias("avg_il_sim")) + .groupBy(self.user_col) + .agg(F.mean(self.sim_col).alias("avg_il_sim")) .select(self.user_col, "avg_il_sim") ) - + def user_diversity(self): - if (self.df_intralist_similarity == None): + if self.df_intralist_similarity == None: cossim = self.get_cosine_similarity().orderBy("i1", "i2") - self.df_intralist_similarity = self.get_intralist_similarity(df=self.reco_df, similarity_df = cossim) + self.df_intralist_similarity = self.get_intralist_similarity( + df=self.reco_df, similarity_df=cossim + ) return ( - self.df_intralist_similarity - .withColumn("diversity", 1-F.col("avg_il_sim")) + self.df_intralist_similarity.withColumn( + "diversity", 1 - F.col("avg_il_sim") + ) .select(self.user_col, "diversity") .orderBy(self.user_col) - ) - + ) + def diversity(self): - # TODO: add error handling logic for conditions where user_id is not valid - if (self.df_intralist_similarity == None): + # TODO: add error handling logic for conditions where user_id is not valid + if self.df_intralist_similarity == None: cossim = self.get_cosine_similarity().orderBy("i1", "i2") - self.df_intralist_similarity = self.get_intralist_similarity(df=self.reco_df, similarity_df = cossim) - return (self.df_intralist_similarity - .withColumn("diversity", 1-F.col("avg_il_sim")) - .select(F.mean("diversity").alias("diversity")) - ) - - + self.df_intralist_similarity = self.get_intralist_similarity( + df=self.reco_df, similarity_df=cossim + ) + return self.df_intralist_similarity.withColumn( + "diversity", 1 - F.col("avg_il_sim") + ).select(F.mean("diversity").alias("diversity")) + # novelty metrics def get_item_novelty(self): train_pairs = self.get_all_user_item_pairs(df=self.train_df) return ( - train_pairs - .join(self.train_df.withColumn("seen", F.lit(1)), on=[self.user_col, self.item_col], how="left") - .filter(F.col("seen").isNull()) - .groupBy(self.item_col).count() - 
.join(self.reco_df.groupBy(self.item_col).agg(F.count(self.user_col).alias("reco_count")), on=self.item_col) - .withColumn("item_novelty", -F.log2(F.col("reco_count") / F.col("count"))) + train_pairs.join( + self.train_df.withColumn("seen", F.lit(1)), + on=[self.user_col, self.item_col], + how="left", + ) + .filter(F.col("seen").isNull()) + .groupBy(self.item_col) + .count() + .join( + self.reco_df.groupBy(self.item_col).agg( + F.count(self.user_col).alias("reco_count") + ), + on=self.item_col, + ) + .withColumn("item_novelty", -F.log2(F.col("reco_count") / F.col("count"))) ) - + def item_novelty(self): - if (self.df_item_novelty == None): - self.df_item_novelty = self.get_item_novelty() - return self.df_item_novelty.select(self.item_col, "item_novelty").orderBy(self.item_col) - + if self.df_item_novelty == None: + self.df_item_novelty = self.get_item_novelty() + return self.df_item_novelty.select(self.item_col, "item_novelty").orderBy( + self.item_col + ) + def user_novelty(self): - if (self.df_item_novelty == None): - self.df_item_novelty = self.get_item_novelty() + if self.df_item_novelty == None: + self.df_item_novelty = self.get_item_novelty() return ( - self.reco_df - .join(self.df_item_novelty, on=self.item_col) + self.reco_df.join(self.df_item_novelty, on=self.item_col) .groupBy(self.user_col) .agg(F.mean("item_novelty").alias("user_novelty")) .orderBy(self.user_col) ) - + def novelty(self): # TODO: add error handling logic for any other conditions - if (self.df_item_novelty == None): - self.df_item_novelty = self.get_item_novelty() - return ( - self.reco_df - .join(self.df_item_novelty, on=self.item_col) - .agg(F.mean("item_novelty").alias("novelty")) + if self.df_item_novelty == None: + self.df_item_novelty = self.get_item_novelty() + return self.reco_df.join(self.df_item_novelty, on=self.item_col).agg( + F.mean("item_novelty").alias("novelty") ) - - # serendipity metrics + + # serendipity metrics def get_user_item_serendipity(self): # TODO: add relevance term as input parameter - - # for every user_col, item_col in reco_df, join all interacted items from train_df. + + # for every user_col, item_col in reco_df, join all interacted items from train_df. # These interacted items are reapeated for each item in reco_df for a specific user. 
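        # For each recommended item, serendipity is then computed as
        # (1 - mean cosine similarity to the user's interacted items) * relevance.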
- reco_item_interacted_history = (self.reco_df.withColumn("i1",F.col(self.item_col)) - .join(self.train_df.withColumn("i2",F.col(self.item_col)), on=[self.user_col]) - .select(self.user_col,"i1","i2") - ) + reco_item_interacted_history = ( + self.reco_df.withColumn("i1", F.col(self.item_col)) + .join( + self.train_df.withColumn("i2", F.col(self.item_col)), on=[self.user_col] + ) + .select(self.user_col, "i1", "i2") + ) cossim_full = self.get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") - join_sim = (reco_item_interacted_history - .join(cossim_full, on=["i1", "i2"], how="left").fillna(0) - .groupBy(self.user_col, "i1") - .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) - .withColumn(self.item_col, F.col("i1")) - .drop("i1") - ) - return ( - join_sim.join(self.reco_df, on=[self.user_col, self.item_col]) - .withColumn("user_item_serendipity",(1-F.col("avg_item2interactedHistory_sim"))*F.col(self.relevance_col)) + join_sim = ( + reco_item_interacted_history.join(cossim_full, on=["i1", "i2"], how="left") + .fillna(0) + .groupBy(self.user_col, "i1") + .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) + .withColumn(self.item_col, F.col("i1")) + .drop("i1") ) - - + return join_sim.join( + self.reco_df, on=[self.user_col, self.item_col] + ).withColumn( + "user_item_serendipity", + (1 - F.col("avg_item2interactedHistory_sim")) * F.col(self.relevance_col), + ) + def user_item_serendipity(self): - if (self.df_user_item_serendipity == None): + if self.df_user_item_serendipity == None: self.df_user_item_serendipity = self.get_user_item_serendipity() - - return ( - self.df_user_item_serendipity - .select(self.user_col, self.item_col, "user_item_serendipity") - .orderBy(self.user_col, self.item_col) - ) - + + return self.df_user_item_serendipity.select( + self.user_col, self.item_col, "user_item_serendipity" + ).orderBy(self.user_col, self.item_col) + def user_serendipity(self): - if (self.df_user_item_serendipity == None): + if self.df_user_item_serendipity == None: self.df_user_item_serendipity = self.get_user_item_serendipity() - + return ( - self.df_user_item_serendipity - .groupBy(self.user_col) + self.df_user_item_serendipity.groupBy(self.user_col) .agg(F.mean("user_item_serendipity").alias("user_serendipity")) .orderBy(self.user_col) ) - + def serendipity(self): # TODO: add error handling logic for any other conditions - if (self.df_user_item_serendipity == None): + if self.df_user_item_serendipity == None: self.df_user_item_serendipity = self.get_user_item_serendipity() - - return ( - self.df_user_item_serendipity - .agg(F.mean("user_item_serendipity").alias("serendipity")) + + return self.df_user_item_serendipity.agg( + F.mean("user_item_serendipity").alias("serendipity") ) - - - # coverage metrics + + # coverage metrics def catalog_coverage(self): # distinct item count in reco_df - count_distinct_item_reco = self.reco_df.select(F.countDistinct(self.item_col)).collect()[0][0] + count_distinct_item_reco = self.reco_df.select( + F.countDistinct(self.item_col) + ).collect()[0][0] # distinct item count in train_df - count_distinct_item_train = self.train_df.select(F.countDistinct(self.item_col)).collect()[0][0] - + count_distinct_item_train = self.train_df.select( + F.countDistinct(self.item_col) + ).collect()[0][0] + # cagalog coverage - c_coverage= count_distinct_item_reco/count_distinct_item_train - return c_coverage - - def distributional_coverage(self): - # In reco_df, how many times each item_col is being recommended + c_coverage = 
count_distinct_item_reco / count_distinct_item_train + return c_coverage + + def distributional_coverage(self): + # In reco_df, how many times each item_col is being recommended df_itemcnt_reco = self.reco_df.groupBy(self.item_col).count() # distinct item count in train_df - count_distinct_item_train = self.train_df.select(F.countDistinct(self.item_col)).collect()[0][0] + count_distinct_item_train = self.train_df.select( + F.countDistinct(self.item_col) + ).collect()[0][0] # the number of total recommendations count_row_reco = self.reco_df.count() - df_entropy=(df_itemcnt_reco - .withColumn("p(i)",F.col("count")/count_row_reco) - .withColumn("entropy(i)", F.col("p(i)")*F.log2(F.col("p(i)"))) - ) + df_entropy = df_itemcnt_reco.withColumn( + "p(i)", F.col("count") / count_row_reco + ).withColumn("entropy(i)", F.col("p(i)") * F.log2(F.col("p(i)"))) # distributional coverage - d_coverage=(-2/count_distinct_item_train)*df_entropy.agg(F.sum("entropy(i)")).collect()[0][0] - - return d_coverage \ No newline at end of file + d_coverage = (-2 / count_distinct_item_train) * df_entropy.agg( + F.sum("entropy(i)") + ).collect()[0][0] + + return d_coverage From f565ff1b76329d6059c06d807727b7ebc18ddbcc Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 2 Jun 2021 14:34:54 +0000 Subject: [PATCH 04/33] modify licence --- reco_utils/evaluation/diversity_evaluator.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index 9beb1141e7..990c8afd07 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -1,12 +1,6 @@ -""" -Enterprise Customer License: +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. -Copyright (C) Microsoft Corporation. All rights reserved. - -Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, royalty-free right to use, copy, and modify the software code provided by us ("Software Code"). You may not sublicense the Software Code or any use of it (except to your affiliates and to vendors to perform work on your behalf) through distribution, network access, service agreement, lease, rental, or otherwise. This license does not purport to express any claim of ownership over data you may have shared with Microsoft in the creation of the Software Code. Unless applicable law gives you more rights, Microsoft reserves all other rights not expressly granted herein, whether by implication, estoppel or otherwise. - -THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" from pyspark.sql.types import * from pyspark.sql import functions as F From ec93a794ada9eaff74095d3796f0783014378ee5 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 2 Jun 2021 15:46:15 +0000 Subject: [PATCH 05/33] replace == None with is None --- reco_utils/evaluation/diversity_evaluator.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index 990c8afd07..ac5eedcf15 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -42,7 +42,7 @@ def __init__( self.df_item_novelty = None self.df_intralist_similarity = None - if relevance_col == None: + if relevance_col is None: self.relevance_col = "relevance" # relevance term, default is 1 (relevent) for all self.reco_df = reco_df.select( @@ -150,7 +150,7 @@ def get_intralist_similarity(self, df, similarity_df): ) def user_diversity(self): - if self.df_intralist_similarity == None: + if self.df_intralist_similarity is None: cossim = self.get_cosine_similarity().orderBy("i1", "i2") self.df_intralist_similarity = self.get_intralist_similarity( df=self.reco_df, similarity_df=cossim @@ -165,7 +165,7 @@ def user_diversity(self): def diversity(self): # TODO: add error handling logic for conditions where user_id is not valid - if self.df_intralist_similarity == None: + if self.df_intralist_similarity is None: cossim = self.get_cosine_similarity().orderBy("i1", "i2") self.df_intralist_similarity = self.get_intralist_similarity( df=self.reco_df, similarity_df=cossim @@ -196,14 +196,14 @@ def get_item_novelty(self): ) def item_novelty(self): - if self.df_item_novelty == None: + if self.df_item_novelty is None: self.df_item_novelty = self.get_item_novelty() return self.df_item_novelty.select(self.item_col, "item_novelty").orderBy( self.item_col ) def user_novelty(self): - if self.df_item_novelty == None: + if self.df_item_novelty is None: self.df_item_novelty = self.get_item_novelty() return ( self.reco_df.join(self.df_item_novelty, on=self.item_col) @@ -214,7 +214,7 @@ def user_novelty(self): def novelty(self): # TODO: add error handling logic for any other conditions - if self.df_item_novelty == None: + if self.df_item_novelty is None: self.df_item_novelty = self.get_item_novelty() return self.reco_df.join(self.df_item_novelty, on=self.item_col).agg( F.mean("item_novelty").alias("novelty") @@ -250,7 +250,7 @@ def get_user_item_serendipity(self): ) def user_item_serendipity(self): - if self.df_user_item_serendipity == None: + if self.df_user_item_serendipity is None: self.df_user_item_serendipity = self.get_user_item_serendipity() return self.df_user_item_serendipity.select( @@ -258,7 +258,7 @@ def user_item_serendipity(self): ).orderBy(self.user_col, self.item_col) def user_serendipity(self): - if self.df_user_item_serendipity == None: + if self.df_user_item_serendipity is None: self.df_user_item_serendipity = self.get_user_item_serendipity() return ( @@ -269,7 +269,7 @@ def user_serendipity(self): def serendipity(self): # TODO: add error handling logic for any other conditions - if self.df_user_item_serendipity == None: + if self.df_user_item_serendipity is None: self.df_user_item_serendipity = self.get_user_item_serendipity() return self.df_user_item_serendipity.agg( From 1fe412fccc59812f0e5eca6d5e2836d5a218867d Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 2 Jun 2021 20:53:51 +0000 Subject: [PATCH 06/33] enhance code --- 
reco_utils/evaluation/diversity_evaluator.py | 77 +++++++++----------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index ac5eedcf15..eee8627b6a 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -39,8 +39,11 @@ def __init__( self.item_col = item_col self.sim_col = "sim" self.df_user_item_serendipity = None + self.df_user_serendipity = None self.df_item_novelty = None + self.df_user_novelty = None self.df_intralist_similarity = None + self.df_user_diversity = None if relevance_col is None: self.relevance_col = "relevance" @@ -66,14 +69,14 @@ def __init__( "reco_df should not contain any user_item pairs that are already shown train_df" ) - def get_all_user_item_pairs(self, df): + def _get_all_user_item_pairs(self, df): return ( df.select(self.user_col) .distinct() .join(df.select(self.item_col).distinct()) ) - def get_pairwise_items(self, df, full_matrix=False): + def _get_pairwise_items(self, df, full_matrix=False): if full_matrix == False: return ( df.select(self.user_col, F.col(self.item_col).alias("i1")) @@ -100,11 +103,11 @@ def get_pairwise_items(self, df, full_matrix=False): ).select(self.user_col, "i1", "i2") ) - def get_cosine_similarity(self, full_matrix=False, n_partitions=200): + def _get_cosine_similarity(self, full_matrix=False, n_partitions=200): # TODO: make sure there are no null values in user or item columns # TODO: make sure temporary column names don't match existing user or item column names - pairs = self.get_pairwise_items(df=self.train_df, full_matrix=full_matrix) + pairs = self._get_pairwise_items(df=self.train_df, full_matrix=full_matrix) item_count = self.train_df.groupBy(self.item_col).count() return ( @@ -136,8 +139,8 @@ def get_cosine_similarity(self, full_matrix=False, n_partitions=200): ) # diversity metrics - def get_intralist_similarity(self, df, similarity_df): - pairs = self.get_pairwise_items(df=df) + def _get_intralist_similarity(self, df, similarity_df): + pairs = self._get_pairwise_items(df=df) return ( pairs.join(similarity_df, on=["i1", "i2"], how="left") .fillna( @@ -151,33 +154,28 @@ def get_intralist_similarity(self, df, similarity_df): def user_diversity(self): if self.df_intralist_similarity is None: - cossim = self.get_cosine_similarity().orderBy("i1", "i2") - self.df_intralist_similarity = self.get_intralist_similarity( + cossim = self._get_cosine_similarity().orderBy("i1", "i2") + self.df_intralist_similarity = self._get_intralist_similarity( df=self.reco_df, similarity_df=cossim ) return ( self.df_intralist_similarity.withColumn( - "diversity", 1 - F.col("avg_il_sim") + "user_diversity", 1 - F.col("avg_il_sim") ) - .select(self.user_col, "diversity") + .select(self.user_col, "user_diversity") .orderBy(self.user_col) ) def diversity(self): - # TODO: add error handling logic for conditions where user_id is not valid - if self.df_intralist_similarity is None: - cossim = self.get_cosine_similarity().orderBy("i1", "i2") - self.df_intralist_similarity = self.get_intralist_similarity( - df=self.reco_df, similarity_df=cossim - ) - return self.df_intralist_similarity.withColumn( - "diversity", 1 - F.col("avg_il_sim") - ).select(F.mean("diversity").alias("diversity")) + if self.df_user_diversity is None: + self.df_user_diversity = self.user_diversity() + return self.df_user_diversity.select(F.mean("user_diversity").alias("diversity")) # novelty metrics - def get_item_novelty(self): 
- train_pairs = self.get_all_user_item_pairs(df=self.train_df) - return ( + def item_novelty(self): + if self.df_item_novelty is None: + train_pairs = self._get_all_user_item_pairs(df=self.train_df) + self.df_item_novelty = ( train_pairs.join( self.train_df.withColumn("seen", F.lit(1)), on=[self.user_col, self.item_col], @@ -193,18 +191,15 @@ def get_item_novelty(self): on=self.item_col, ) .withColumn("item_novelty", -F.log2(F.col("reco_count") / F.col("count"))) + .select(self.item_col, "item_novelty") + .orderBy(self.item_col) ) - - def item_novelty(self): - if self.df_item_novelty is None: - self.df_item_novelty = self.get_item_novelty() - return self.df_item_novelty.select(self.item_col, "item_novelty").orderBy( - self.item_col - ) + return self.df_item_novelty + def user_novelty(self): if self.df_item_novelty is None: - self.df_item_novelty = self.get_item_novelty() + self.df_item_novelty = self.item_novelty() return ( self.reco_df.join(self.df_item_novelty, on=self.item_col) .groupBy(self.user_col) @@ -213,17 +208,14 @@ def user_novelty(self): ) def novelty(self): - # TODO: add error handling logic for any other conditions - if self.df_item_novelty is None: - self.df_item_novelty = self.get_item_novelty() - return self.reco_df.join(self.df_item_novelty, on=self.item_col).agg( - F.mean("item_novelty").alias("novelty") + if self.df_user_novelty is None: + self.df_user_novelty = self.user_novelty() + return self.df_user_novelty.agg( + F.mean("user_novelty").alias("novelty") ) # serendipity metrics def get_user_item_serendipity(self): - # TODO: add relevance term as input parameter - # for every user_col, item_col in reco_df, join all interacted items from train_df. # These interacted items are reapeated for each item in reco_df for a specific user. 
reco_item_interacted_history = ( @@ -233,7 +225,7 @@ def get_user_item_serendipity(self): ) .select(self.user_col, "i1", "i2") ) - cossim_full = self.get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") + cossim_full = self._get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") join_sim = ( reco_item_interacted_history.join(cossim_full, on=["i1", "i2"], how="left") .fillna(0) @@ -268,12 +260,11 @@ def user_serendipity(self): ) def serendipity(self): - # TODO: add error handling logic for any other conditions - if self.df_user_item_serendipity is None: - self.df_user_item_serendipity = self.get_user_item_serendipity() + if self.df_user_serendipity is None: + self.df_user_serendipity = self.user_serendipity() - return self.df_user_item_serendipity.agg( - F.mean("user_item_serendipity").alias("serendipity") + return self.df_user_serendipity.agg( + F.mean("user_serendipity").alias("serendipity") ) # coverage metrics From b968b42a80e88e392ed603a673cb2fc1fd9687b8 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 2 Jun 2021 20:56:01 +0000 Subject: [PATCH 07/33] formatting --- reco_utils/evaluation/diversity_evaluator.py | 45 ++++++++++---------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index eee8627b6a..acd989a653 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -169,33 +169,36 @@ def user_diversity(self): def diversity(self): if self.df_user_diversity is None: self.df_user_diversity = self.user_diversity() - return self.df_user_diversity.select(F.mean("user_diversity").alias("diversity")) + return self.df_user_diversity.select( + F.mean("user_diversity").alias("diversity") + ) # novelty metrics def item_novelty(self): if self.df_item_novelty is None: train_pairs = self._get_all_user_item_pairs(df=self.train_df) self.df_item_novelty = ( - train_pairs.join( - self.train_df.withColumn("seen", F.lit(1)), - on=[self.user_col, self.item_col], - how="left", - ) - .filter(F.col("seen").isNull()) - .groupBy(self.item_col) - .count() - .join( - self.reco_df.groupBy(self.item_col).agg( - F.count(self.user_col).alias("reco_count") - ), - on=self.item_col, + train_pairs.join( + self.train_df.withColumn("seen", F.lit(1)), + on=[self.user_col, self.item_col], + how="left", + ) + .filter(F.col("seen").isNull()) + .groupBy(self.item_col) + .count() + .join( + self.reco_df.groupBy(self.item_col).agg( + F.count(self.user_col).alias("reco_count") + ), + on=self.item_col, + ) + .withColumn( + "item_novelty", -F.log2(F.col("reco_count") / F.col("count")) + ) + .select(self.item_col, "item_novelty") + .orderBy(self.item_col) ) - .withColumn("item_novelty", -F.log2(F.col("reco_count") / F.col("count"))) - .select(self.item_col, "item_novelty") - .orderBy(self.item_col) - ) return self.df_item_novelty - def user_novelty(self): if self.df_item_novelty is None: @@ -210,9 +213,7 @@ def user_novelty(self): def novelty(self): if self.df_user_novelty is None: self.df_user_novelty = self.user_novelty() - return self.df_user_novelty.agg( - F.mean("user_novelty").alias("novelty") - ) + return self.df_user_novelty.agg(F.mean("user_novelty").alias("novelty")) # serendipity metrics def get_user_item_serendipity(self): From 8ae928bd538958d9240fbdd9ed21d1ec913087e1 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 02:27:36 +0000 Subject: [PATCH 08/33] fix --- reco_utils/evaluation/diversity_evaluator.py | 162 ++++++++++--------- 
1 file changed, 82 insertions(+), 80 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index acd989a653..caf0a68143 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -40,10 +40,13 @@ def __init__( self.sim_col = "sim" self.df_user_item_serendipity = None self.df_user_serendipity = None + self.df_serendipity = None self.df_item_novelty = None self.df_user_novelty = None + self.df_novelty = None self.df_intralist_similarity = None self.df_user_diversity = None + self.df_diversity = None if relevance_col is None: self.relevance_col = "relevance" @@ -139,39 +142,39 @@ def _get_cosine_similarity(self, full_matrix=False, n_partitions=200): ) # diversity metrics - def _get_intralist_similarity(self, df, similarity_df): - pairs = self._get_pairwise_items(df=df) - return ( - pairs.join(similarity_df, on=["i1", "i2"], how="left") - .fillna( - 0 - ) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. - .filter(F.col("i1") != F.col("i2")) - .groupBy(self.user_col) - .agg(F.mean(self.sim_col).alias("avg_il_sim")) - .select(self.user_col, "avg_il_sim") - ) - - def user_diversity(self): + def _get_intralist_similarity(self, df): if self.df_intralist_similarity is None: - cossim = self._get_cosine_similarity().orderBy("i1", "i2") - self.df_intralist_similarity = self._get_intralist_similarity( - df=self.reco_df, similarity_df=cossim + pairs = self._get_pairwise_items(df=df) + similarity_df = self._get_cosine_similarity().orderBy("i1", "i2") + self.df_intralist_similarity = ( + pairs.join(similarity_df, on=["i1", "i2"], how="left") + .fillna(0) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. 
+ .filter(F.col("i1") != F.col("i2")) + .groupBy(self.user_col) + .agg(F.mean(self.sim_col).alias("avg_il_sim")) + .select(self.user_col, "avg_il_sim") ) - return ( - self.df_intralist_similarity.withColumn( - "user_diversity", 1 - F.col("avg_il_sim") + return self.df_intralist_similarity + + def user_diversity(self): + if self.df_user_diversity is None: + self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df) + self.df_user_diversity = ( + self.df_intralist_similarity.withColumn( + "user_diversity", 1 - F.col("avg_il_sim") + ) + .select(self.user_col, "user_diversity") + .orderBy(self.user_col) ) - .select(self.user_col, "user_diversity") - .orderBy(self.user_col) - ) + return self.df_user_diversity def diversity(self): - if self.df_user_diversity is None: - self.df_user_diversity = self.user_diversity() - return self.df_user_diversity.select( - F.mean("user_diversity").alias("diversity") - ) + if self.df_diversity is None: + self.df_user_diversity = self.user_diversity() + self.df_diversity = self.df_user_diversity.select( + F.mean("user_diversity").alias("diversity") + ) + return self.df_diversity # novelty metrics def item_novelty(self): @@ -201,72 +204,71 @@ def item_novelty(self): return self.df_item_novelty def user_novelty(self): - if self.df_item_novelty is None: - self.df_item_novelty = self.item_novelty() - return ( - self.reco_df.join(self.df_item_novelty, on=self.item_col) - .groupBy(self.user_col) - .agg(F.mean("item_novelty").alias("user_novelty")) - .orderBy(self.user_col) - ) + if self.df_user_novelty is None: + self.df_item_novelty = self.item_novelty() + self.df_user_novelty = ( + self.reco_df.join(self.df_item_novelty, on=self.item_col) + .groupBy(self.user_col) + .agg(F.mean("item_novelty").alias("user_novelty")) + .orderBy(self.user_col) + ) + return self.df_user_novelty def novelty(self): - if self.df_user_novelty is None: + if self.df_novelty is None: self.df_user_novelty = self.user_novelty() - return self.df_user_novelty.agg(F.mean("user_novelty").alias("novelty")) + self.df_novelty = self.df_user_novelty.agg(F.mean("user_novelty").alias("novelty")) + return self.df_novelty # serendipity metrics - def get_user_item_serendipity(self): + def user_item_serendipity(self): # for every user_col, item_col in reco_df, join all interacted items from train_df. # These interacted items are reapeated for each item in reco_df for a specific user. 
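
For reference while reading this hunk, the quantities the evaluator computes
can be written as follows (notation is ours, not the patch's: $L_u$ is user
$u$'s recommendation list, $H_u$ the user's training history, $U_i$ the set of
training users who interacted with item $i$, and $r(i)$ the number of times
item $i$ is recommended):

    \mathrm{sim}(i,j) = \frac{|U_i \cap U_j|}{\sqrt{|U_i|\,|U_j|}}
    \qquad
    \mathrm{user\_diversity}(u) = 1 - \operatorname{mean}_{i<j \in L_u} \mathrm{sim}(i,j)

    \mathrm{item\_novelty}(i) = -\log_2 \frac{r(i)}{n(i)},
    \quad n(i) = \#\{\text{users with no train interaction with } i\}

    \mathrm{serendipity}(u,i) = \Big(1 - \operatorname{mean}_{j \in H_u} \mathrm{sim}(i,j)\Big) \cdot \mathrm{relevance}(u,i)

    \mathrm{catalog\_coverage} = \frac{|I_{\mathrm{reco}}|}{|I_{\mathrm{train}}|},
    \qquad
    \mathrm{dist\_coverage} = -\frac{2}{|I_{\mathrm{train}}|} \sum_i p(i)\log_2 p(i),
    \quad p(i) = \frac{r(i)}{\text{total recommendations}}

Each top-level metric (diversity, novelty, serendipity) is then the mean of
the per-user value over all users.
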
- reco_item_interacted_history = ( - self.reco_df.withColumn("i1", F.col(self.item_col)) - .join( - self.train_df.withColumn("i2", F.col(self.item_col)), on=[self.user_col] - ) - .select(self.user_col, "i1", "i2") - ) - cossim_full = self._get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") - join_sim = ( - reco_item_interacted_history.join(cossim_full, on=["i1", "i2"], how="left") - .fillna(0) - .groupBy(self.user_col, "i1") - .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) - .withColumn(self.item_col, F.col("i1")) - .drop("i1") - ) - return join_sim.join( - self.reco_df, on=[self.user_col, self.item_col] - ).withColumn( - "user_item_serendipity", - (1 - F.col("avg_item2interactedHistory_sim")) * F.col(self.relevance_col), - ) - - def user_item_serendipity(self): if self.df_user_item_serendipity is None: - self.df_user_item_serendipity = self.get_user_item_serendipity() + reco_item_interacted_history = ( + self.reco_df.withColumn("i1", F.col(self.item_col)) + .join( + self.train_df.withColumn("i2", F.col(self.item_col)), on=[self.user_col] + ) + .select(self.user_col, "i1", "i2") + ) + cossim_full = self._get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") + join_sim = ( + reco_item_interacted_history.join(cossim_full, on=["i1", "i2"], how="left") + .fillna(0) + .groupBy(self.user_col, "i1") + .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) + .withColumn(self.item_col, F.col("i1")) + .drop("i1") + ) + self.df_user_item_serendipity = join_sim.join( + self.reco_df, on=[self.user_col, self.item_col] + ).withColumn( + "user_item_serendipity", + (1 - F.col("avg_item2interactedHistory_sim")) * F.col(self.relevance_col), + ).select( + self.user_col, self.item_col, "user_item_serendipity" + ).orderBy(self.user_col, self.item_col) + return self.df_user_item_serendipity - return self.df_user_item_serendipity.select( - self.user_col, self.item_col, "user_item_serendipity" - ).orderBy(self.user_col, self.item_col) def user_serendipity(self): - if self.df_user_item_serendipity is None: - self.df_user_item_serendipity = self.get_user_item_serendipity() - - return ( - self.df_user_item_serendipity.groupBy(self.user_col) - .agg(F.mean("user_item_serendipity").alias("user_serendipity")) - .orderBy(self.user_col) - ) + if self.df_user_serendipity is None: + self.df_user_item_serendipity = self.user_item_serendipity() + self.df_user_serendipity = ( + self.df_user_item_serendipity.groupBy(self.user_col) + .agg(F.mean("user_item_serendipity").alias("user_serendipity")) + .orderBy(self.user_col) + ) + return self.df_user_serendipity def serendipity(self): - if self.df_user_serendipity is None: + if self.df_serendipity is None: self.df_user_serendipity = self.user_serendipity() - - return self.df_user_serendipity.agg( - F.mean("user_serendipity").alias("serendipity") - ) + self.df_serendipity = self.df_user_serendipity.agg( + F.mean("user_serendipity").alias("serendipity") + ) + return self.df_serendipity # coverage metrics def catalog_coverage(self): From d75370db1658f871570cf631b53f484296ce70cd Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 02:37:33 +0000 Subject: [PATCH 09/33] fix --- reco_utils/evaluation/diversity_evaluator.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index caf0a68143..7d7d98c201 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ 
-273,13 +273,10 @@ def serendipity(self): # coverage metrics def catalog_coverage(self): # distinct item count in reco_df - count_distinct_item_reco = self.reco_df.select( - F.countDistinct(self.item_col) - ).collect()[0][0] + count_distinct_item_reco = self.reco_df.select(self.item_col).distinct().count() # distinct item count in train_df - count_distinct_item_train = self.train_df.select( - F.countDistinct(self.item_col) - ).collect()[0][0] + count_distinct_item_train = self.train_df.select(self.item_col).distinct().count() + # cagalog coverage c_coverage = count_distinct_item_reco / count_distinct_item_train @@ -289,9 +286,7 @@ def distributional_coverage(self): # In reco_df, how many times each item_col is being recommended df_itemcnt_reco = self.reco_df.groupBy(self.item_col).count() # distinct item count in train_df - count_distinct_item_train = self.train_df.select( - F.countDistinct(self.item_col) - ).collect()[0][0] + count_distinct_item_train = self.train_df.select(self.item_col).distinct().count() # the number of total recommendations count_row_reco = self.reco_df.count() df_entropy = df_itemcnt_reco.withColumn( From c21ce802d9183a18ed997bb728ef9ee5cd5f5dd0 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 15:42:52 +0000 Subject: [PATCH 10/33] optimize logic for calculating serendipity, removing full matrix of similarity matrix --- reco_utils/evaluation/diversity_evaluator.py | 118 ++++++++----------- 1 file changed, 48 insertions(+), 70 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index 7d7d98c201..d9e1218c39 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -38,6 +38,7 @@ def __init__( self.user_col = user_col self.item_col = item_col self.sim_col = "sim" + self.df_cosine_similariy = None self.df_user_item_serendipity = None self.df_user_serendipity = None self.df_serendipity = None @@ -79,8 +80,8 @@ def _get_all_user_item_pairs(self, df): .join(df.select(self.item_col).distinct()) ) - def _get_pairwise_items(self, df, full_matrix=False): - if full_matrix == False: + def _get_pairwise_items(self, df): + return ( df.select(self.user_col, F.col(self.item_col).alias("i1")) # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) @@ -93,53 +94,41 @@ def _get_pairwise_items(self, df, full_matrix=False): & (F.col("i1") <= F.col("i2")), ).select(self.user_col, "i1", "i2") ) - else: - return ( - df.select(self.user_col, F.col(self.item_col).alias("i1")) - # get pairwise combinations of items per user (including both pairs [1,2] and [2,1]) + + + def _get_cosine_similarity(self, n_partitions=200): + if self.df_cosine_similariy is None: + pairs = self._get_pairwise_items(df=self.train_df) + item_count = self.train_df.groupBy(self.item_col).count() + + self.df_cosine_similariy = ( + pairs.groupBy("i1", "i2") + .count() .join( - df.select( - F.col(self.user_col).alias("_user"), + item_count.select( + F.col(self.item_col).alias("i1"), + F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"), + ), + on="i1", + ) + .join( + item_count.select( F.col(self.item_col).alias("i2"), + F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"), ), - (F.col(self.user_col) == F.col("_user")), - ).select(self.user_col, "i1", "i2") - ) - - def _get_cosine_similarity(self, full_matrix=False, n_partitions=200): - # TODO: make sure there are no null values in user or item columns - # TODO: make sure temporary column names don't match existing 
user or item column names - - pairs = self._get_pairwise_items(df=self.train_df, full_matrix=full_matrix) - item_count = self.train_df.groupBy(self.item_col).count() - - return ( - pairs.groupBy("i1", "i2") - .count() - .join( - item_count.select( - F.col(self.item_col).alias("i1"), - F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"), - ), - on="i1", - ) - .join( - item_count.select( - F.col(self.item_col).alias("i2"), - F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"), - ), - on="i2", - ) - .select( - "i1", - "i2", - ( - F.col("count") / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count")) - ).alias("sim"), + on="i2", + ) + .select( + "i1", + "i2", + ( + F.col("count") / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count")) + ).alias("sim"), + ) + .repartition(n_partitions, "i1", "i2") + .sortWithinPartitions("i1", "i2") ) - .repartition(n_partitions, "i1", "i2") - .sortWithinPartitions("i1", "i2") - ) + return self.df_cosine_similariy # diversity metrics def _get_intralist_similarity(self, df): @@ -223,35 +212,24 @@ def novelty(self): # serendipity metrics def user_item_serendipity(self): # for every user_col, item_col in reco_df, join all interacted items from train_df. - # These interacted items are reapeated for each item in reco_df for a specific user. + # These interacted items are repeated for each item in reco_df for a specific user. if self.df_user_item_serendipity is None: - reco_item_interacted_history = ( - self.reco_df.withColumn("i1", F.col(self.item_col)) - .join( - self.train_df.withColumn("i2", F.col(self.item_col)), on=[self.user_col] - ) - .select(self.user_col, "i1", "i2") - ) - cossim_full = self._get_cosine_similarity(full_matrix=True).orderBy("i1", "i2") - join_sim = ( - reco_item_interacted_history.join(cossim_full, on=["i1", "i2"], how="left") - .fillna(0) - .groupBy(self.user_col, "i1") - .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) - .withColumn(self.item_col, F.col("i1")) - .drop("i1") + self.df_cosine_similariy = self._get_cosine_similarity().orderBy("i1", "i2") + self.df_user_item_serendipity = (self.reco_df + .withColumn("reco_item", F.col(self.item_col)) # duplicate item_col to keep + .select(self.user_col, "reco_item", F.col(self.item_col).alias("reco_item_tmp")) + .join(self.train_df.select(self.user_col, F.col(self.item_col).alias("train_item_tmp")), on=[self.user_col]) + .select(self.user_col, "reco_item", F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias("i1"), F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias("i2")) + .join(self.df_cosine_similariy, on=["i1", "i2"], how="left").fillna(0) + .groupBy(self.user_col,F.col("reco_item").alias(self.item_col)) + .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) + .join(self.reco_df, on=[self.user_col, self.item_col]) + .withColumn("user_item_serendipity",(1-F.col("avg_item2interactedHistory_sim"))*F.col(self.relevance_col)) + .select(self.user_col, self.item_col, "user_item_serendipity") + .orderBy(self.user_col, self.item_col) ) - self.df_user_item_serendipity = join_sim.join( - self.reco_df, on=[self.user_col, self.item_col] - ).withColumn( - "user_item_serendipity", - (1 - F.col("avg_item2interactedHistory_sim")) * F.col(self.relevance_col), - ).select( - self.user_col, self.item_col, "user_item_serendipity" - ).orderBy(self.user_col, self.item_col) return self.df_user_item_serendipity - def user_serendipity(self): if self.df_user_serendipity is None: self.df_user_item_serendipity = self.user_item_serendipity() From 
120490945e74ca99cc3a2ea612d76da579dd89f1 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 16:01:08 +0000 Subject: [PATCH 11/33] add docstring --- reco_utils/evaluation/diversity_evaluator.py | 160 ++++++++++++++----- 1 file changed, 122 insertions(+), 38 deletions(-) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/diversity_evaluator.py index d9e1218c39..9eb6cb81bd 100644 --- a/reco_utils/evaluation/diversity_evaluator.py +++ b/reco_utils/evaluation/diversity_evaluator.py @@ -81,20 +81,18 @@ def _get_all_user_item_pairs(self, df): ) def _get_pairwise_items(self, df): - - return ( - df.select(self.user_col, F.col(self.item_col).alias("i1")) - # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) - .join( - df.select( - F.col(self.user_col).alias("_user"), - F.col(self.item_col).alias("i2"), - ), - (F.col(self.user_col) == F.col("_user")) - & (F.col("i1") <= F.col("i2")), - ).select(self.user_col, "i1", "i2") - ) - + + return ( + df.select(self.user_col, F.col(self.item_col).alias("i1")) + # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) + .join( + df.select( + F.col(self.user_col).alias("_user"), + F.col(self.item_col).alias("i2"), + ), + (F.col(self.user_col) == F.col("_user")) & (F.col("i1") <= F.col("i2")), + ).select(self.user_col, "i1", "i2") + ) def _get_cosine_similarity(self, n_partitions=200): if self.df_cosine_similariy is None: @@ -122,7 +120,8 @@ def _get_cosine_similarity(self, n_partitions=200): "i1", "i2", ( - F.col("count") / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count")) + F.col("count") + / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count")) ).alias("sim"), ) .repartition(n_partitions, "i1", "i2") @@ -137,7 +136,9 @@ def _get_intralist_similarity(self, df): similarity_df = self._get_cosine_similarity().orderBy("i1", "i2") self.df_intralist_similarity = ( pairs.join(similarity_df, on=["i1", "i2"], how="left") - .fillna(0) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. + .fillna( + 0 + ) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. .filter(F.col("i1") != F.col("i2")) .groupBy(self.user_col) .agg(F.mean(self.sim_col).alias("avg_il_sim")) @@ -146,8 +147,13 @@ def _get_intralist_similarity(self, df): return self.df_intralist_similarity def user_diversity(self): + """Calculate average diversity for recommendations for each user. + + Returns: + pyspark.sql.dataframe.DataFrame: user_col, user_diversity + """ if self.df_user_diversity is None: - self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df) + self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df) self.df_user_diversity = ( self.df_intralist_similarity.withColumn( "user_diversity", 1 - F.col("avg_il_sim") @@ -158,15 +164,25 @@ def user_diversity(self): return self.df_user_diversity def diversity(self): + """Calculate average diversity for recommendations across all users. 
+ + Returns: + pyspark.sql.dataframe.DataFrame: diversity + """ if self.df_diversity is None: - self.df_user_diversity = self.user_diversity() - self.df_diversity = self.df_user_diversity.select( + self.df_user_diversity = self.user_diversity() + self.df_diversity = self.df_user_diversity.select( F.mean("user_diversity").alias("diversity") ) return self.df_diversity # novelty metrics def item_novelty(self): + """Calculate novelty for each item in the recommendations. + + Returns: + pyspark.sql.dataframe.DataFrame: item_col, item_novelty + """ if self.df_item_novelty is None: train_pairs = self._get_all_user_item_pairs(df=self.train_df) self.df_item_novelty = ( @@ -193,44 +209,91 @@ def item_novelty(self): return self.df_item_novelty def user_novelty(self): - if self.df_user_novelty is None: - self.df_item_novelty = self.item_novelty() + """Calculate average item novelty for each user's recommendations. + + Returns: + pyspark.sql.dataframe.DataFrame: user_col, user_novelty + """ + if self.df_user_novelty is None: + self.df_item_novelty = self.item_novelty() self.df_user_novelty = ( self.reco_df.join(self.df_item_novelty, on=self.item_col) .groupBy(self.user_col) .agg(F.mean("item_novelty").alias("user_novelty")) .orderBy(self.user_col) ) - return self.df_user_novelty + return self.df_user_novelty def novelty(self): + """Calculate average novelty for recommendations across all users. + + Returns: + pyspark.sql.dataframe.DataFrame: novelty + """ if self.df_novelty is None: self.df_user_novelty = self.user_novelty() - self.df_novelty = self.df_user_novelty.agg(F.mean("user_novelty").alias("novelty")) + self.df_novelty = self.df_user_novelty.agg( + F.mean("user_novelty").alias("novelty") + ) return self.df_novelty # serendipity metrics def user_item_serendipity(self): + """Calculate serendipity of each item in the recommendations for each user. + + Returns: + pyspark.sql.dataframe.DataFrame: user_col, item_col, user_item_serendipity + """ # for every user_col, item_col in reco_df, join all interacted items from train_df. # These interacted items are repeated for each item in reco_df for a specific user. 
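
The join that follows hinges on F.least/F.greatest: every (recommended item,
history item) pair is reordered so the smaller id comes first, which lets it
hit the i1 <= i2 half of the similarity table and avoids materializing the
full symmetric matrix. A runnable toy sketch (data and names are ours, not
the patch's):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    pairs = spark.createDataFrame([(3, 1), (1, 3), (2, 5)], ["a", "b"])

    canonical = pairs.select(
        F.least("a", "b").alias("i1"),     # smaller id first, so that
        F.greatest("a", "b").alias("i2"),  # (3,1) and (1,3) both become (1,3)
    )
    canonical.show()
    # +---+---+
    # | i1| i2|
    # +---+---+
    # |  1|  3|
    # |  1|  3|
    # |  2|  5|
    # +---+---+
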
if self.df_user_item_serendipity is None: self.df_cosine_similariy = self._get_cosine_similarity().orderBy("i1", "i2") - self.df_user_item_serendipity = (self.reco_df - .withColumn("reco_item", F.col(self.item_col)) # duplicate item_col to keep - .select(self.user_col, "reco_item", F.col(self.item_col).alias("reco_item_tmp")) - .join(self.train_df.select(self.user_col, F.col(self.item_col).alias("train_item_tmp")), on=[self.user_col]) - .select(self.user_col, "reco_item", F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias("i1"), F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias("i2")) - .join(self.df_cosine_similariy, on=["i1", "i2"], how="left").fillna(0) - .groupBy(self.user_col,F.col("reco_item").alias(self.item_col)) - .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) - .join(self.reco_df, on=[self.user_col, self.item_col]) - .withColumn("user_item_serendipity",(1-F.col("avg_item2interactedHistory_sim"))*F.col(self.relevance_col)) - .select(self.user_col, self.item_col, "user_item_serendipity") - .orderBy(self.user_col, self.item_col) + self.df_user_item_serendipity = ( + self.reco_df.withColumn( + "reco_item", F.col(self.item_col) + ) # duplicate item_col to keep + .select( + self.user_col, + "reco_item", + F.col(self.item_col).alias("reco_item_tmp"), + ) + .join( + self.train_df.select( + self.user_col, F.col(self.item_col).alias("train_item_tmp") + ), + on=[self.user_col], + ) + .select( + self.user_col, + "reco_item", + F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias( + "i1" + ), + F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias( + "i2" + ), + ) + .join(self.df_cosine_similariy, on=["i1", "i2"], how="left") + .fillna(0) + .groupBy(self.user_col, F.col("reco_item").alias(self.item_col)) + .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) + .join(self.reco_df, on=[self.user_col, self.item_col]) + .withColumn( + "user_item_serendipity", + (1 - F.col("avg_item2interactedHistory_sim")) + * F.col(self.relevance_col), + ) + .select(self.user_col, self.item_col, "user_item_serendipity") + .orderBy(self.user_col, self.item_col) ) return self.df_user_item_serendipity def user_serendipity(self): + """Calculate average serendipity for each user's recommendations. + + Returns: + pyspark.sql.dataframe.DataFrame: user_col, user_serendipity + """ if self.df_user_serendipity is None: self.df_user_item_serendipity = self.user_item_serendipity() self.df_user_serendipity = ( @@ -241,6 +304,11 @@ def user_serendipity(self): return self.df_user_serendipity def serendipity(self): + """Calculate average serentipity for recommendations across all users. + + Returns: + pyspark.sql.dataframe.DataFrame: serendipity + """ if self.df_serendipity is None: self.df_user_serendipity = self.user_serendipity() self.df_serendipity = self.df_user_serendipity.agg( @@ -250,21 +318,37 @@ def serendipity(self): # coverage metrics def catalog_coverage(self): + """Calculate catalog coverage for recommendations across all users. + + Info: + G. Shani and A. Gunawardana, Evaluating Recommendation Systems, Recommender Systems Handbook pp. 257-297, 2010. 
+ + Returns: + float: catalog coverage + """ # distinct item count in reco_df count_distinct_item_reco = self.reco_df.select(self.item_col).distinct().count() # distinct item count in train_df - count_distinct_item_train = self.train_df.select(self.item_col).distinct().count() - + count_distinct_item_train = ( + self.train_df.select(self.item_col).distinct().count() + ) # cagalog coverage c_coverage = count_distinct_item_reco / count_distinct_item_train return c_coverage def distributional_coverage(self): + """Calculate distributional coverage for recommendations across all users. + + Returns: + float: distributional coverage + """ # In reco_df, how many times each item_col is being recommended df_itemcnt_reco = self.reco_df.groupBy(self.item_col).count() # distinct item count in train_df - count_distinct_item_train = self.train_df.select(self.item_col).distinct().count() + count_distinct_item_train = ( + self.train_df.select(self.item_col).distinct().count() + ) # the number of total recommendations count_row_reco = self.reco_df.count() df_entropy = df_itemcnt_reco.withColumn( From ce954e5dde2bd75df93632badcc9fdbf12bd4d26 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 16:07:29 +0000 Subject: [PATCH 12/33] change file name --- .../{diversity_evaluator.py => spark_diversity_evaluator.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename reco_utils/evaluation/{diversity_evaluator.py => spark_diversity_evaluator.py} (100%) diff --git a/reco_utils/evaluation/diversity_evaluator.py b/reco_utils/evaluation/spark_diversity_evaluator.py similarity index 100% rename from reco_utils/evaluation/diversity_evaluator.py rename to reco_utils/evaluation/spark_diversity_evaluator.py From 1e70a718c91d6a4eef3e25e6b02069a4b50d6468 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 16:10:06 +0000 Subject: [PATCH 13/33] fix import --- .../evaluation/test_spark_evaluation_diversity_metrics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py b/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py index a0a0e25165..7bbb6aa92f 100644 --- a/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py +++ b/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py @@ -6,9 +6,12 @@ import numpy as np import pandas as pd from pandas.util.testing import assert_frame_equal -from pyspark.sql import Row -from reco_utils.evaluation.diversity_evaluator import DiversityEvaluator +try: + from pyspark.sql import Row + from reco_utils.evaluation.spark_diversity_evaluator import DiversityEvaluator +except ImportError: + pass # skip this import if we are in pure python environment TOL = 0.0001 From a5631dbaaaeec175069538c410230ec5bc56d27f Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 16:13:17 +0000 Subject: [PATCH 14/33] changed file name --- ...ion_diversity_metrics.py => test_spark_diversity_evaluator.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/reco_utils/evaluation/{test_spark_evaluation_diversity_metrics.py => test_spark_diversity_evaluator.py} (100%) diff --git a/tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py similarity index 100% rename from tests/unit/reco_utils/evaluation/test_spark_evaluation_diversity_metrics.py rename to 
tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py From 42700d5c365e074935693c6d76a502ebd5f64e4d Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 16:24:16 +0000 Subject: [PATCH 15/33] fix --- .../reco_utils/evaluation/test_spark_diversity_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py index 7bbb6aa92f..76638ae6f3 100644 --- a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py +++ b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py @@ -33,10 +33,10 @@ def target_metrics(): dict(diversity=[0.43096]) ), "user_diversity": pd.DataFrame( - dict(UserId=[1, 2, 3], diversity=[0.29289, 1.0, 0.0]) + dict(UserId=[1, 2, 3], user_diversity=[0.29289, 1.0, 0.0]) ), "user_item_serendipity": pd.DataFrame( - dict(UserId=[1, 1, 2, 2, 3, 3], reco_item= [3, 5, 2, 5, 1, 2], user_item_serendipity=[0.72783, 0.80755, 0.71132, 0.35777, 0.80755, 0.80755]) + dict(UserId=[1, 1, 2, 2, 3, 3], ItemId= [3, 5, 2, 5, 1, 2], user_item_serendipity=[0.72783, 0.80755, 0.71132, 0.35777, 0.80755, 0.80755]) ), "user_serendipity": pd.DataFrame( dict(UserId=[1, 2, 3], user_serendipity=[0.76770, 0.53455, 0.80755]) From a2974cd6e40701f538cc5e63b34b65f1d83e9b4b Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 4 Jun 2021 16:28:34 +0000 Subject: [PATCH 16/33] evaluation example notebook --- .../als_movielens_diversity_metrics.ipynb | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index de7f9cd232..08b238353d 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -143,7 +143,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 18.8kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.1kKB/s]\n" ] }, { @@ -265,7 +265,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Took 3.6827992300004553 seconds for training.\n" + "Took 3.238317724000808 seconds for training.\n" ] } ], @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -453,27 +453,28 @@ " relevancy_method=\"top_k\"\n", ")\n", "\n", - "als_diversity_eval = DiversityEvaluator(\n", - " train_df = train_df, \n", - " reco_df = top_k_reco,\n", - " user_col=\"UserId\", \n", - " item_col=\"MovieId\"\n", - ")" + "als_ranking_metrics = get_ranking_results(als_ranking_eval)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "als_ranking_metrics = get_ranking_results(als_ranking_eval)\n", + "als_diversity_eval = DiversityEvaluator(\n", + " train_df = train_df, \n", + " reco_df = top_k_reco,\n", + " user_col=\"UserId\", \n", + " item_col=\"MovieId\"\n", + ")\n", + "\n", "als_diversity_metrics = get_diversity_results(als_diversity_eval)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -489,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -503,28 +504,28 @@ " k=TOP_K,\n", ")\n", "\n", - "random_diversity_eval = DiversityEvaluator(\n", - " 
train_df = train_df, \n", - " reco_df = pred_df, \n", - " user_col=COL_USER, \n", - " item_col=COL_ITEM\n", - ")\n", - " " + "random_ranking_metrics = get_ranking_results(random_ranking_eval)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "random_ranking_metrics = get_ranking_results(random_ranking_eval)\n", + "random_diversity_eval = DiversityEvaluator(\n", + " train_df = train_df, \n", + " reco_df = pred_df, \n", + " user_col=COL_USER, \n", + " item_col=COL_ITEM\n", + ")\n", + " \n", "random_diversity_metrics = get_diversity_results(random_diversity_eval)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -540,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -603,24 +604,24 @@ " 0.005734\n", " 0.381906\n", " 0.009753\n", - " 4.604587\n", + " 4.604989\n", " 0.892036\n", - " 0.880369\n", + " 0.880336\n", " \n", " \n", " 2\n", " 100k\n", " random\n", " 10\n", - " 0.016879\n", - " 0.005343\n", - " 0.017268\n", - " 0.001544\n", - " 0.995750\n", - " 0.012818\n", - " 7.166853\n", - " 0.924271\n", - " 0.894810\n", + " 0.019427\n", + " 0.007366\n", + " 0.019456\n", + " 0.002294\n", + " 0.997571\n", + " 0.012820\n", + " 7.164951\n", + " 0.922180\n", + " 0.892728\n", " \n", " \n", "\n", @@ -629,11 +630,11 @@ "text/plain": [ " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", "1 100k als 10 0.051911 0.017514 0.047460 0.005734 \n", - "2 100k random 10 0.016879 0.005343 0.017268 0.001544 \n", + "2 100k random 10 0.019427 0.007366 0.019456 0.002294 \n", "\n", " catalog_coverage distributional_coverage novelty diversity serendipity \n", - "1 0.381906 0.009753 4.604587 0.892036 0.880369 \n", - "2 0.995750 0.012818 7.166853 0.924271 0.894810 " + "1 0.381906 0.009753 4.604989 0.892036 0.880336 \n", + "2 0.997571 0.012820 7.164951 0.922180 0.892728 " ] }, "execution_count": 22, @@ -647,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ From 2ccae9a29cbdb8a58debbf5d44699b258e84899e Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Sun, 6 Jun 2021 11:23:04 +0000 Subject: [PATCH 17/33] fix --- .../reco_utils/evaluation/test_spark_diversity_evaluator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py index 76638ae6f3..092d3a8356 100644 --- a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py +++ b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py @@ -1,9 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
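
The try/except around the PySpark imports (added back in PATCH 13) is what
keeps this test module collectable in a Spark-less environment; tests that
actually need Spark are then selected through the "spark" pytest marker used
throughout this file. A minimal sketch of the same guard (illustrative test
body, not from the patch):

    import pytest

    try:
        from pyspark.sql import SparkSession
    except ImportError:
        pass  # test collection must still succeed without PySpark installed

    @pytest.mark.spark  # deselected in pure-Python test runs
    def test_spark_session():
        spark = SparkSession.builder.getOrCreate()
        assert spark is not None
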
-import itertools import pytest -import numpy as np import pandas as pd from pandas.util.testing import assert_frame_equal From 72bf0f74fed0883ebc7e2445da90d1c684c03d43 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Sun, 6 Jun 2021 11:25:27 +0000 Subject: [PATCH 18/33] fix --- .../reco_utils/evaluation/test_spark_diversity_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py index 092d3a8356..512690652b 100644 --- a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py +++ b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py @@ -7,7 +7,7 @@ try: from pyspark.sql import Row - from reco_utils.evaluation.spark_diversity_evaluator import DiversityEvaluator + from reco_utils.evaluation.spark_diversity_evaluation import DiversityEvaluation except ImportError: pass # skip this import if we are in pure python environment @@ -72,7 +72,7 @@ def data(spark): @pytest.fixture() def evaluator(data): train_df, reco_df = data - div = DiversityEvaluator(train_df=train_df, reco_df=reco_df, + div = DiversityEvaluation(train_df=train_df, reco_df=reco_df, user_col='UserId', item_col='ItemId') print("init evaluator") return div From 42bc56284d7119551c9380d952346c19cc620528 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Sun, 6 Jun 2021 11:26:50 +0000 Subject: [PATCH 19/33] change file name --- ..._diversity_evaluator.py => test_spark_diversity_evaluation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/reco_utils/evaluation/{test_spark_diversity_evaluator.py => test_spark_diversity_evaluation.py} (100%) diff --git a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py similarity index 100% rename from tests/unit/reco_utils/evaluation/test_spark_diversity_evaluator.py rename to tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py From f4be166a97a3658ddb4138c21d1bbca8f3ff56ba Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Sun, 6 Jun 2021 11:28:05 +0000 Subject: [PATCH 20/33] fix --- reco_utils/evaluation/spark_diversity_evaluator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluator.py b/reco_utils/evaluation/spark_diversity_evaluator.py index 9eb6cb81bd..c84bea249e 100644 --- a/reco_utils/evaluation/spark_diversity_evaluator.py +++ b/reco_utils/evaluation/spark_diversity_evaluator.py @@ -5,7 +5,7 @@ from pyspark.sql import functions as F -class DiversityEvaluator: +class DiversityEvaluation: def __init__( self, train_df, @@ -81,7 +81,6 @@ def _get_all_user_item_pairs(self, df): ) def _get_pairwise_items(self, df): - return ( df.select(self.user_col, F.col(self.item_col).alias("i1")) # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) From 47d4fba5c0a8736eb5ad0966fe0c6d5090ecb990 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Sun, 6 Jun 2021 11:28:55 +0000 Subject: [PATCH 21/33] change file name --- ...spark_diversity_evaluator.py => spark_diversity_evaluation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename reco_utils/evaluation/{spark_diversity_evaluator.py => spark_diversity_evaluation.py} (100%) diff --git a/reco_utils/evaluation/spark_diversity_evaluator.py b/reco_utils/evaluation/spark_diversity_evaluation.py similarity index 100% rename from 
reco_utils/evaluation/spark_diversity_evaluator.py rename to reco_utils/evaluation/spark_diversity_evaluation.py From dae9b0f9fa59429f2ae743f0974102f45450cca7 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Mon, 7 Jun 2021 01:56:03 +0000 Subject: [PATCH 22/33] remove div evaluator fixture --- .../test_spark_diversity_evaluation.py | 70 +++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py index 512690652b..dd8fa23ec2 100644 --- a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py +++ b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py @@ -44,10 +44,8 @@ def target_metrics(): ), } - - @pytest.fixture(scope="module") -def data(spark): +def spark_data(spark): train_df = spark.createDataFrame([ Row(UserId=1, ItemId=1), Row(UserId=1, ItemId=2), @@ -68,68 +66,82 @@ def data(spark): ]) return train_df, reco_df -@pytest.mark.spark -@pytest.fixture() -def evaluator(data): - train_df, reco_df = data - div = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') - print("init evaluator") - return div - - -@pytest.mark.spark -def test_init_spark(spark): - assert spark is not None - @pytest.mark.spark -def test_catalog_coverage(evaluator, target_metrics): - +def test_catalog_coverage(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') c_coverage = evaluator.catalog_coverage() assert c_coverage == target_metrics["c_coverage"] @pytest.mark.spark -def test_distributional_coverage(evaluator, target_metrics): - +def test_distributional_coverage(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') d_coverage = evaluator.distributional_coverage() assert d_coverage == target_metrics["d_coverage"] @pytest.mark.spark -def test_item_novelty(evaluator, target_metrics): +def test_item_novelty(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.item_novelty().toPandas() assert_frame_equal(target_metrics["item_novelty"], actual, check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_user_novelty(evaluator, target_metrics): +def test_user_novelty(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.user_novelty().toPandas() assert_frame_equal(target_metrics["user_novelty"], actual, check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_novelty(evaluator, target_metrics): +def test_novelty(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.novelty().toPandas() assert_frame_equal(target_metrics["novelty"], actual, check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_user_diversity(evaluator, target_metrics): +def test_user_diversity(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', 
item_col='ItemId') actual = evaluator.user_diversity().toPandas() assert_frame_equal(target_metrics["user_diversity"], actual, check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_diversity(evaluator, target_metrics): +def test_diversity(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.diversity().toPandas() assert_frame_equal(target_metrics["diversity"], actual,check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_user_item_serendipity(evaluator, target_metrics): +def test_user_item_serendipity(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.user_item_serendipity().toPandas() assert_frame_equal(target_metrics["user_item_serendipity"], actual, check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_user_serendipity(evaluator, target_metrics): +def test_user_serendipity(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.user_serendipity().toPandas() assert_frame_equal(target_metrics["user_serendipity"], actual, check_exact=False, check_less_precise=4) @pytest.mark.spark -def test_serendipity(evaluator, target_metrics): +def test_serendipity(spark_data, target_metrics): + train_df, reco_df = spark_data + evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, + user_col='UserId', item_col='ItemId') actual = evaluator.serendipity().toPandas() assert_frame_equal(target_metrics["serendipity"], actual, check_exact=False, check_less_precise=4) From 6dc27aa44cbb69086d319eebc035ce6493e9c0cd Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Mon, 7 Jun 2021 11:06:01 +0000 Subject: [PATCH 23/33] fix --- .../als_movielens_diversity_metrics.ipynb | 139 ++++-------------- 1 file changed, 27 insertions(+), 112 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 08b238353d..32bdc05870 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -65,7 +65,7 @@ "from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n", "from reco_utils.common.spark_utils import start_or_get_spark\n", "\n", - "from reco_utils.evaluation.diversity_evaluator import DiversityEvaluator\n", + "from reco_utils.evaluation.spark_diversity_evaluation import DiversityEvaluation\n", "from pyspark.sql.window import Window\n", "\n", "import numpy as np\n", @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "tags": [ "parameters" @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -136,14 +136,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.1kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.8kKB/s]\n" ] }, { @@ 
-203,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -233,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -258,14 +258,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Took 3.238317724000808 seconds for training.\n" + "Took 3.8314080880008987 seconds for training.\n" ] } ], @@ -280,14 +280,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the movie recommendation use case, recommending movies that have been rated by the users do not make sense. Therefore, the rated movies are removed from the recommended items.\n", + "In the movie recommendation use case, recommending movies that have been rated by the users does not make sense. Therefore, the rated movies are removed from the recommended items.\n", "\n", "In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -410,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -458,11 +458,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "als_diversity_eval = DiversityEvaluator(\n", + "als_diversity_eval = DiversityEvaluation(\n", " train_df = train_df, \n", " reco_df = top_k_reco,\n", " user_col=\"UserId\", \n", @@ -474,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -490,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -509,11 +509,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "random_diversity_eval = DiversityEvaluator(\n", + "random_diversity_eval = DiversityEvaluation(\n", " train_df = train_df, \n", " reco_df = pred_df, \n", " user_col=COL_USER, \n", @@ -525,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -541,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -554,94 +554,9 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[HTML rendering of the results table omitted: the markup was garbled in extraction; identical ALS vs. random values appear in the text/plain output below]
" - ], - "text/plain": [ - " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", - "1 100k als 10 0.051911 0.017514 0.047460 0.005734 \n", - "2 100k random 10 0.019427 0.007366 0.019456 0.002294 \n", - "\n", - " catalog_coverage distributional_coverage novelty diversity serendipity \n", - "1 0.381906 0.009753 4.604989 0.892036 0.880336 \n", - "2 0.997571 0.012820 7.164951 0.922180 0.892728 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_results" ] From f9b14d014585139838c8dfd1bf729370910a0446 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 02:28:39 +0000 Subject: [PATCH 24/33] fix input variable names --- .../evaluation/spark_diversity_evaluation.py | 139 +++++++++--------- .../test_spark_diversity_evaluation.py | 20 +-- 2 files changed, 82 insertions(+), 77 deletions(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index c84bea249e..c8548831f0 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -4,15 +4,20 @@ from pyspark.sql.types import * from pyspark.sql import functions as F +from reco_utils.common.constants import ( + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, +) + class DiversityEvaluation: def __init__( self, train_df, reco_df, - user_col="UserId", - item_col="ItemId", - relevance_col=None, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_relevance=None, ): """Diversity evaluator. train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. @@ -26,17 +31,17 @@ def __init__( Args: train_df (pySpark DataFrame): Training set used for the recommender, - containing user_col, item_col. + containing col_user, col_item. reco_df (pySpark DataFrame): Recommender's prediction output, - containing user_col, item_col, relevance_col (optional). - user_col (str): User id column name. - item_col (str): Item id column name. - relevance_col (str): this column indicates whether the recommended item is actually relevent to the user or not. + containing col_user, col_item, col_relevance (optional). + col_user (str): User id column name. + col_item (str): Item id column name. + col_relevance (str): this column indicates whether the recommended item is actually relevent to the user or not. 
""" - self.train_df = train_df.select(user_col, item_col) - self.user_col = user_col - self.item_col = item_col + self.train_df = train_df.select(col_user, col_item) + self.col_user = col_user + self.col_item = col_item self.sim_col = "sim" self.df_cosine_similariy = None self.df_user_item_serendipity = None @@ -49,22 +54,22 @@ def __init__( self.df_user_diversity = None self.df_diversity = None - if relevance_col is None: - self.relevance_col = "relevance" + if col_relevance is None: + self.col_relevance = "relevance" # relevance term, default is 1 (relevent) for all self.reco_df = reco_df.select( - user_col, item_col, F.lit(1.0).alias(self.relevance_col) + col_user, col_item, F.lit(1.0).alias(self.col_relevance) ) else: - self.relevance_col = relevance_col + self.col_relevance = col_relevance self.reco_df = reco_df.select( - user_col, item_col, F.col(self.relevance_col).cast(DoubleType()) + col_user, col_item, F.col(self.col_relevance).cast(DoubleType()) ) # check if reco_df contain any user_item pairs that are already shown train_df count_intersection = ( - self.train_df.select(self.user_col, self.item_col) - .intersect(self.reco_df.select(self.user_col, self.item_col)) + self.train_df.select(self.col_user, self.col_item) + .intersect(self.reco_df.select(self.col_user, self.col_item)) .count() ) @@ -75,42 +80,42 @@ def __init__( def _get_all_user_item_pairs(self, df): return ( - df.select(self.user_col) + df.select(self.col_user) .distinct() - .join(df.select(self.item_col).distinct()) + .join(df.select(self.col_item).distinct()) ) def _get_pairwise_items(self, df): return ( - df.select(self.user_col, F.col(self.item_col).alias("i1")) + df.select(self.col_user, F.col(self.col_item).alias("i1")) # get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1]) .join( df.select( - F.col(self.user_col).alias("_user"), - F.col(self.item_col).alias("i2"), + F.col(self.col_user).alias("_user"), + F.col(self.col_item).alias("i2"), ), - (F.col(self.user_col) == F.col("_user")) & (F.col("i1") <= F.col("i2")), - ).select(self.user_col, "i1", "i2") + (F.col(self.col_user) == F.col("_user")) & (F.col("i1") <= F.col("i2")), + ).select(self.col_user, "i1", "i2") ) def _get_cosine_similarity(self, n_partitions=200): if self.df_cosine_similariy is None: pairs = self._get_pairwise_items(df=self.train_df) - item_count = self.train_df.groupBy(self.item_col).count() + item_count = self.train_df.groupBy(self.col_item).count() self.df_cosine_similariy = ( pairs.groupBy("i1", "i2") .count() .join( item_count.select( - F.col(self.item_col).alias("i1"), + F.col(self.col_item).alias("i1"), F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"), ), on="i1", ) .join( item_count.select( - F.col(self.item_col).alias("i2"), + F.col(self.col_item).alias("i2"), F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"), ), on="i2", @@ -139,9 +144,9 @@ def _get_intralist_similarity(self, df): 0 ) # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items. e.g. i1 and i2 have never occurred together. .filter(F.col("i1") != F.col("i2")) - .groupBy(self.user_col) + .groupBy(self.col_user) .agg(F.mean(self.sim_col).alias("avg_il_sim")) - .select(self.user_col, "avg_il_sim") + .select(self.col_user, "avg_il_sim") ) return self.df_intralist_similarity @@ -149,7 +154,7 @@ def user_diversity(self): """Calculate average diversity for recommendations for each user. 
Returns: - pyspark.sql.dataframe.DataFrame: user_col, user_diversity + pyspark.sql.dataframe.DataFrame: col_user, user_diversity """ if self.df_user_diversity is None: self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df) @@ -157,8 +162,8 @@ def user_diversity(self): self.df_intralist_similarity.withColumn( "user_diversity", 1 - F.col("avg_il_sim") ) - .select(self.user_col, "user_diversity") - .orderBy(self.user_col) + .select(self.col_user, "user_diversity") + .orderBy(self.col_user) ) return self.df_user_diversity @@ -180,30 +185,30 @@ def item_novelty(self): """Calculate novelty for each item in the recommendations. Returns: - pyspark.sql.dataframe.DataFrame: item_col, item_novelty + pyspark.sql.dataframe.DataFrame: col_item, item_novelty """ if self.df_item_novelty is None: train_pairs = self._get_all_user_item_pairs(df=self.train_df) self.df_item_novelty = ( train_pairs.join( self.train_df.withColumn("seen", F.lit(1)), - on=[self.user_col, self.item_col], + on=[self.col_user, self.col_item], how="left", ) .filter(F.col("seen").isNull()) - .groupBy(self.item_col) + .groupBy(self.col_item) .count() .join( - self.reco_df.groupBy(self.item_col).agg( - F.count(self.user_col).alias("reco_count") + self.reco_df.groupBy(self.col_item).agg( + F.count(self.col_user).alias("reco_count") ), - on=self.item_col, + on=self.col_item, ) .withColumn( "item_novelty", -F.log2(F.col("reco_count") / F.col("count")) ) - .select(self.item_col, "item_novelty") - .orderBy(self.item_col) + .select(self.col_item, "item_novelty") + .orderBy(self.col_item) ) return self.df_item_novelty @@ -211,15 +216,15 @@ def user_novelty(self): """Calculate average item novelty for each user's recommendations. Returns: - pyspark.sql.dataframe.DataFrame: user_col, user_novelty + pyspark.sql.dataframe.DataFrame: col_user, user_novelty """ if self.df_user_novelty is None: self.df_item_novelty = self.item_novelty() self.df_user_novelty = ( - self.reco_df.join(self.df_item_novelty, on=self.item_col) - .groupBy(self.user_col) + self.reco_df.join(self.df_item_novelty, on=self.col_item) + .groupBy(self.col_user) .agg(F.mean("item_novelty").alias("user_novelty")) - .orderBy(self.user_col) + .orderBy(self.col_user) ) return self.df_user_novelty @@ -241,29 +246,29 @@ def user_item_serendipity(self): """Calculate serendipity of each item in the recommendations for each user. Returns: - pyspark.sql.dataframe.DataFrame: user_col, item_col, user_item_serendipity + pyspark.sql.dataframe.DataFrame: col_user, col_item, user_item_serendipity """ - # for every user_col, item_col in reco_df, join all interacted items from train_df. + # for every col_user, col_item in reco_df, join all interacted items from train_df. # These interacted items are repeated for each item in reco_df for a specific user. 
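        # [Editor's sketch, not part of this patch] The code below implements
        # serendipity(u, i) = (1 - mean_j sim(i, j)) * relevance(u, i), where j ranges
        # over the items user u interacted with in train_df. A toy calculation for a
        # single (user, item) pair, with hypothetical similarity values:
        #     sims_to_history = [0.2, 0.6]                            # sim(i, j) for u's two train items
        #     avg_sim = sum(sims_to_history) / len(sims_to_history)   # 0.4
        #     serendipity_ui = (1 - avg_sim) * 1.0                    # 0.6; relevance defaults to 1.0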
if self.df_user_item_serendipity is None: self.df_cosine_similariy = self._get_cosine_similarity().orderBy("i1", "i2") self.df_user_item_serendipity = ( self.reco_df.withColumn( - "reco_item", F.col(self.item_col) - ) # duplicate item_col to keep + "reco_item", F.col(self.col_item) + ) # duplicate col_item to keep .select( - self.user_col, + self.col_user, "reco_item", - F.col(self.item_col).alias("reco_item_tmp"), + F.col(self.col_item).alias("reco_item_tmp"), ) .join( self.train_df.select( - self.user_col, F.col(self.item_col).alias("train_item_tmp") + self.col_user, F.col(self.col_item).alias("train_item_tmp") ), - on=[self.user_col], + on=[self.col_user], ) .select( - self.user_col, + self.col_user, "reco_item", F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias( "i1" @@ -274,16 +279,16 @@ def user_item_serendipity(self): ) .join(self.df_cosine_similariy, on=["i1", "i2"], how="left") .fillna(0) - .groupBy(self.user_col, F.col("reco_item").alias(self.item_col)) + .groupBy(self.col_user, F.col("reco_item").alias(self.col_item)) .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) - .join(self.reco_df, on=[self.user_col, self.item_col]) + .join(self.reco_df, on=[self.col_user, self.col_item]) .withColumn( "user_item_serendipity", (1 - F.col("avg_item2interactedHistory_sim")) - * F.col(self.relevance_col), + * F.col(self.col_relevance), ) - .select(self.user_col, self.item_col, "user_item_serendipity") - .orderBy(self.user_col, self.item_col) + .select(self.col_user, self.col_item, "user_item_serendipity") + .orderBy(self.col_user, self.col_item) ) return self.df_user_item_serendipity @@ -291,14 +296,14 @@ def user_serendipity(self): """Calculate average serendipity for each user's recommendations. Returns: - pyspark.sql.dataframe.DataFrame: user_col, user_serendipity + pyspark.sql.dataframe.DataFrame: col_user, user_serendipity """ if self.df_user_serendipity is None: self.df_user_item_serendipity = self.user_item_serendipity() self.df_user_serendipity = ( - self.df_user_item_serendipity.groupBy(self.user_col) + self.df_user_item_serendipity.groupBy(self.col_user) .agg(F.mean("user_item_serendipity").alias("user_serendipity")) - .orderBy(self.user_col) + .orderBy(self.col_user) ) return self.df_user_serendipity @@ -326,10 +331,10 @@ def catalog_coverage(self): float: catalog coverage """ # distinct item count in reco_df - count_distinct_item_reco = self.reco_df.select(self.item_col).distinct().count() + count_distinct_item_reco = self.reco_df.select(self.col_item).distinct().count() # distinct item count in train_df count_distinct_item_train = ( - self.train_df.select(self.item_col).distinct().count() + self.train_df.select(self.col_item).distinct().count() ) # cagalog coverage @@ -342,11 +347,11 @@ def distributional_coverage(self): Returns: float: distributional coverage """ - # In reco_df, how many times each item_col is being recommended - df_itemcnt_reco = self.reco_df.groupBy(self.item_col).count() + # In reco_df, how many times each col_item is being recommended + df_itemcnt_reco = self.reco_df.groupBy(self.col_item).count() # distinct item count in train_df count_distinct_item_train = ( - self.train_df.select(self.item_col).distinct().count() + self.train_df.select(self.col_item).distinct().count() ) # the number of total recommendations count_row_reco = self.reco_df.count() diff --git a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py index dd8fa23ec2..217056ee63 
100644 --- a/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py +++ b/tests/unit/reco_utils/evaluation/test_spark_diversity_evaluation.py @@ -70,7 +70,7 @@ def spark_data(spark): def test_catalog_coverage(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') c_coverage = evaluator.catalog_coverage() assert c_coverage == target_metrics["c_coverage"] @@ -78,7 +78,7 @@ def test_catalog_coverage(spark_data, target_metrics): def test_distributional_coverage(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') d_coverage = evaluator.distributional_coverage() assert d_coverage == target_metrics["d_coverage"] @@ -86,7 +86,7 @@ def test_distributional_coverage(spark_data, target_metrics): def test_item_novelty(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.item_novelty().toPandas() assert_frame_equal(target_metrics["item_novelty"], actual, check_exact=False, check_less_precise=4) @@ -94,7 +94,7 @@ def test_item_novelty(spark_data, target_metrics): def test_user_novelty(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.user_novelty().toPandas() assert_frame_equal(target_metrics["user_novelty"], actual, check_exact=False, check_less_precise=4) @@ -102,7 +102,7 @@ def test_user_novelty(spark_data, target_metrics): def test_novelty(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.novelty().toPandas() assert_frame_equal(target_metrics["novelty"], actual, check_exact=False, check_less_precise=4) @@ -110,7 +110,7 @@ def test_novelty(spark_data, target_metrics): def test_user_diversity(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.user_diversity().toPandas() assert_frame_equal(target_metrics["user_diversity"], actual, check_exact=False, check_less_precise=4) @@ -118,7 +118,7 @@ def test_user_diversity(spark_data, target_metrics): def test_diversity(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.diversity().toPandas() assert_frame_equal(target_metrics["diversity"], actual,check_exact=False, check_less_precise=4) @@ -126,7 +126,7 @@ def test_diversity(spark_data, target_metrics): def test_user_item_serendipity(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.user_item_serendipity().toPandas() 
assert_frame_equal(target_metrics["user_item_serendipity"], actual, check_exact=False, check_less_precise=4) @@ -134,7 +134,7 @@ def test_user_item_serendipity(spark_data, target_metrics): def test_user_serendipity(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.user_serendipity().toPandas() assert_frame_equal(target_metrics["user_serendipity"], actual, check_exact=False, check_less_precise=4) @@ -142,6 +142,6 @@ def test_user_serendipity(spark_data, target_metrics): def test_serendipity(spark_data, target_metrics): train_df, reco_df = spark_data evaluator = DiversityEvaluation(train_df=train_df, reco_df=reco_df, - user_col='UserId', item_col='ItemId') + col_user='UserId', col_item='ItemId') actual = evaluator.serendipity().toPandas() assert_frame_equal(target_metrics["serendipity"], actual, check_exact=False, check_less_precise=4) From c9e85da085494e673f32ee42daf80e8c09d93f32 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 02:40:34 +0000 Subject: [PATCH 25/33] fix typo --- reco_utils/evaluation/spark_diversity_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index c8548831f0..314f3e74c1 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -36,7 +36,7 @@ def __init__( containing col_user, col_item, col_relevance (optional). col_user (str): User id column name. col_item (str): Item id column name. - col_relevance (str): this column indicates whether the recommended item is actually relevent to the user or not. + col_relevance (str): This column indicates whether the recommended item is actually relevant to the user or not. """ self.train_df = train_df.select(col_user, col_item) From 8a126e1814257e1d94ca7bcc6ffae04c11fbec0d Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 03:01:41 +0000 Subject: [PATCH 26/33] improve docstring --- reco_utils/evaluation/spark_diversity_evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index 314f3e74c1..6da64d500e 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -23,10 +23,10 @@ def __init__( train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. Metrics includes: - Coverage - The proportion of items that can be recommended. It includes two metrics: catalog_coverage and distributional_coverage. - Novelty - A more novel item indicates it is less popular. + Coverage - The proportion of items that can be recommended. It includes two metrics: (1) catalog_coverage, which measures the proportion of items that get recommended from the item catalog; (2) distributional_coverage, which measures how unequally different items are recommended in the recommendations to all users. + Novelty - A more novel item indicates it is less popular, i.e., it gets recommended less frequently. Diversity - The dissimilarity of items being recommended. - Serendipity - The “unusualness” or “surprise” of recommendations to a user. + Serendipity - The "unusualness" or "surprise" of recommendations to a user. 
When 'col_relevance' is used, it indicates how "pleasant surprise" of recommendations is to a user. Args: From f3148e73d23b6b98bdc8c8f60e724d630b15cd4c Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 03:03:21 +0000 Subject: [PATCH 27/33] fix --- reco_utils/evaluation/spark_diversity_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index 6da64d500e..fc7f351f09 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -22,7 +22,7 @@ def __init__( """Diversity evaluator. train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. - Metrics includes: + Metrics include: Coverage - The proportion of items that can be recommended. It includes two metrics: (1) catalog_coverage, which measures the proportion of items that get recommended from the item catalog; (2) distributional_coverage, which measures how unequally different items are recommended in the recommendations to all users. Novelty - A more novel item indicates it is less popular, i.e., it gets recommended less frequently. Diversity - The dissimilarity of items being recommended. From 5501d6f35798ed26f1b8d281816bba43f967a8a6 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 03:41:13 +0000 Subject: [PATCH 28/33] improve ALS example notebook --- .../als_movielens_diversity_metrics.ipynb | 174 ++++++++++++++---- 1 file changed, 143 insertions(+), 31 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 32bdc05870..d24e16f620 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -18,11 +18,21 @@ "\n", "We demonstrate how to evaluate a recommender using diversity metrics in addition to commonly used rating/ranking metrics.\n", "\n", + "Diversity metrics include:\n", + "- Coverage - The proportion of items that can be recommended. It includes two metrics: \n", + " - (1) catalog_coverage, which measures the proportion of items that get recommended from the item catalog; \n", + " - (2) distributional_coverage, which measures how unequally different items are recommended in the recommendations to all users.\n", + "- Novelty - A more novel item indicates it is less popular, i.e., it gets recommended less frequently.\n", + "- Diversity - The dissimilarity of items being recommended.\n", + "- Serendipity - The \"unusualness\" or \"surprise\" of recommendations to a user.\n", + "\n", "We compare the performance of two algorithms: ALS recommender and a random recommender. \n", " - Matrix factorization by [ALS](https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/recommendation.html#ALS) (Alternating Least Squares) is a well known collaborative filtering algorithm.\n", " - We also define a random recommender which randomly recommends unseen items to each user. \n", " \n", - "The comparision results show that ALS recommender outperforms random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and\tMean average precision), while random recommender outperforms ALS recommender on diversity metrics (catalog_coverage, distributional_coverage, novelty, diversity, and serendipity)." 
+    "The comparison results show that the ALS recommender outperforms the random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and Mean average precision), while the random recommender outperforms the ALS recommender on diversity metrics. Why does ALS perform better on ranking metrics but worse on diversity metrics? ALS is optimized for estimating item ratings as accurately as possible, so it performs well on accuracy metrics such as precision and recall, and ranking metrics are built upon these accuracy metrics. As a side effect, the recommended items tend to be popular items, i.e., the items most often sold or viewed, which leaves the less popular long-tail items with little chance of being introduced to users. This is why ALS does not perform as well as a random recommender on diversity metrics.\n",
+    "\n",
+    "We understand that there is usually a trade-off between one family of metrics and the other. Which set of metrics to optimize should be decided based on the business scenario."
   ]
  },
  {
@@ -34,7 +44,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
@@ -84,7 +94,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {
    "tags": [
     "parameters"
@@ -115,7 +125,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -136,14 +146,14 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.8kKB/s]\n"
+     "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.3kKB/s]\n"
     ]
    },
    {
@@ -194,6 +204,24 @@
   "data.show()"
  ]
 },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Get all possible user-item pairs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "users = data.select(COL_USER).distinct()\n",
+    "items = data.select(COL_ITEM).distinct()\n",
+    "user_item = users.crossJoin(items)"
+   ]
+  },
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -265,7 +293,7 @@
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "Took 3.8314080880008987 seconds for training.\n"
+    "Took 3.6067020109985606 seconds for training.\n"
    ]
   }
  ],
@@ -295,15 +323,12 @@
    "output_type": "stream",
    "text": [
    "1477928\n",
-    "9448\n"
+    "9430\n"
    ]
   }
  ],
  "source": [
-  "# Get the cross join of all user-item pairs and score them.\n",
-  "users = train.select(COL_USER).distinct()\n",
-  "items = train.select(COL_ITEM).distinct()\n",
-  "user_item = users.crossJoin(items)\n",
+  "# Score all user-item pairs\n",
  "dfs_pred = model.transform(user_item)\n",
  "\n",
  "# Remove seen items.\n",
@@ -319,7 +344,7 @@
  "print(top_all.count())\n",
  " \n",
  "window = Window.partitionBy(COL_USER).orderBy(F.col(\"prediction\").desc()) \n",
-  "top_k_reco = top_all.select(\"*\", F.rank().over(window).alias(\"rank\")).filter(F.col(\"rank\") <= 10).drop(\"rank\")\n",
+  "top_k_reco = top_all.select(\"*\", F.row_number().over(window).alias(\"rank\")).filter(F.col(\"rank\") <= 10).drop(\"rank\")\n",
  " \n",
  "print(top_k_reco.count())"
 ]
@@ -355,12 +380,7 @@
  "pred_df = (\n",
  "    train_df\n",
  "    # join training data with all possible user-item pairs (seen in training)\n",
-  "    .join(train_df\n",
-  "        .select(COL_USER)\n",
-  "        .distinct()\n",
-  "        .join(train_df\n",
-  "            .select(COL_ITEM)\n",
-  "            .distinct()),\n",
+  "    .join(user_item,\n",
  "        on=[COL_USER, COL_ITEM],\n",
  "        how=\"right\"\n",
  "    
)\n", @@ -458,15 +478,15 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "als_diversity_eval = DiversityEvaluation(\n", " train_df = train_df, \n", " reco_df = top_k_reco,\n", - " user_col=\"UserId\", \n", - " item_col=\"MovieId\"\n", + " col_user=\"UserId\", \n", + " col_item=\"MovieId\"\n", ")\n", "\n", "als_diversity_metrics = get_diversity_results(als_diversity_eval)" @@ -474,7 +494,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -490,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -509,15 +529,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "random_diversity_eval = DiversityEvaluation(\n", " train_df = train_df, \n", " reco_df = pred_df, \n", - " user_col=COL_USER, \n", - " item_col=COL_ITEM\n", + " col_user=COL_USER, \n", + " col_item=COL_ITEM\n", ")\n", " \n", "random_diversity_metrics = get_diversity_results(random_diversity_eval)" @@ -525,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -541,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -554,13 +574,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DataAlgoKPrecision@kRecall@kNDCG@kMean average precisioncatalog_coveragedistributional_coveragenoveltydiversityserendipity
1100kals100.0519110.0175140.0474600.0057340.3819060.0097514.6054850.8924490.880236
2100krandom100.0168790.0066590.0181810.0019411.0188220.0128467.1921310.9268000.896563
\n", + "
" + ], + "text/plain": [ + " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", + "1 100k als 10 0.051911 0.017514 0.047460 0.005734 \n", + "2 100k random 10 0.016879 0.006659 0.018181 0.001941 \n", + "\n", + " catalog_coverage distributional_coverage novelty diversity serendipity \n", + "1 0.381906 0.009751 4.605485 0.892449 0.880236 \n", + "2 1.018822 0.012846 7.192131 0.926800 0.896563 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_results" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: the value range of catalog_coverage is [0, 1] with the assumption that training data contains all catalog items. In the above results we observe that catalog_coverage is greater than 1 for random recommender. This is because the above assumption is violated, i.e., the training data does not contain the full catalog of items while the recommender randomly recommends items that are not in the training data. " + ] + }, { "cell_type": "code", "execution_count": 16, From 334e2ba8174faf0ba50f89a8e3f13a11a40a50ba Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 14:41:47 +0000 Subject: [PATCH 29/33] improve ALS example notebook --- .../als_movielens_diversity_metrics.ipynb | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index d24e16f620..f9f34c6d6c 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -60,7 +60,6 @@ "source": [ "# set the environment path to find Recommenders\n", "import sys\n", - "sys.path.append(\"../../\")\n", "import pyspark\n", "from pyspark.ml.recommendation import ALS\n", "import pyspark.sql.functions as F\n", @@ -89,6 +88,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "\n", "Set the default parameters." ] }, @@ -153,7 +153,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.3kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 17.5kKB/s]\n" ] }, { @@ -204,24 +204,6 @@ "data.show()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get all possible user-item pairs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "users = data.select(COL_USER).distinct()\n", - "items = data.select(COL_ITEM).distinct()\n", - "user_item = users.crossJoin(items)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -231,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -249,6 +231,31 @@ "print (\"N test\", test.cache().count())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get all possible user-item pairs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: We have the assumption that training data contains all users and all catalog items. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "users = train.select(COL_USER).distinct()\n", + "items = train.select(COL_ITEM).distinct()\n", + "user_item = users.crossJoin(items)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -293,7 +300,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Took 3.6067020109985606 seconds for training.\n" + "Took 3.2652852770006575 seconds for training.\n" ] } ], @@ -633,15 +640,15 @@ " 100k\n", " random\n", " 10\n", - " 0.016879\n", - " 0.006659\n", - " 0.018181\n", - " 0.001941\n", - " 1.018822\n", - " 0.012846\n", - " 7.192131\n", - " 0.926800\n", - " 0.896563\n", + " 0.016773\n", + " 0.006561\n", + " 0.017131\n", + " 0.002134\n", + " 0.996964\n", + " 0.012811\n", + " 7.160845\n", + " 0.923641\n", + " 0.894481\n", " \n", " \n", "\n", @@ -650,11 +657,11 @@ "text/plain": [ " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", "1 100k als 10 0.051911 0.017514 0.047460 0.005734 \n", - "2 100k random 10 0.016879 0.006659 0.018181 0.001941 \n", + "2 100k random 10 0.016773 0.006561 0.017131 0.002134 \n", "\n", " catalog_coverage distributional_coverage novelty diversity serendipity \n", "1 0.381906 0.009751 4.605485 0.892449 0.880236 \n", - "2 1.018822 0.012846 7.192131 0.926800 0.896563 " + "2 0.996964 0.012811 7.160845 0.923641 0.894481 " ] }, "execution_count": 21, @@ -666,16 +673,9 @@ "df_results" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: the value range of catalog_coverage is [0, 1] with the assumption that training data contains all catalog items. In the above results we observe that catalog_coverage is greater than 1 for random recommender. This is because the above assumption is violated, i.e., the training data does not contain the full catalog of items while the recommender randomly recommends items that are not in the training data. 
" - ] - }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ From cda1bbc966d5e67a9045f94cf8f60dc75b94e12e Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Wed, 9 Jun 2021 14:42:10 +0000 Subject: [PATCH 30/33] optimize code --- reco_utils/evaluation/spark_diversity_evaluation.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index fc7f351f09..5ca827f3ac 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -253,13 +253,10 @@ def user_item_serendipity(self): if self.df_user_item_serendipity is None: self.df_cosine_similariy = self._get_cosine_similarity().orderBy("i1", "i2") self.df_user_item_serendipity = ( - self.reco_df.withColumn( - "reco_item", F.col(self.col_item) - ) # duplicate col_item to keep - .select( + self.reco_df.select( self.col_user, - "reco_item", - F.col(self.col_item).alias("reco_item_tmp"), + self.col_item, + F.col(self.col_item).alias("reco_item_tmp"), # duplicate col_item to keep ) .join( self.train_df.select( @@ -269,7 +266,7 @@ def user_item_serendipity(self): ) .select( self.col_user, - "reco_item", + self.col_item, F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias( "i1" ), @@ -279,7 +276,7 @@ def user_item_serendipity(self): ) .join(self.df_cosine_similariy, on=["i1", "i2"], how="left") .fillna(0) - .groupBy(self.col_user, F.col("reco_item").alias(self.col_item)) + .groupBy(self.col_user, self.col_item) .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim")) .join(self.reco_df, on=[self.col_user, self.col_item]) .withColumn( From 747faf1f933280e443549e545d573094c52af04a Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Thu, 10 Jun 2021 13:06:21 +0000 Subject: [PATCH 31/33] fix --- .../evaluation/spark_diversity_evaluation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index 5ca827f3ac..57e83c46f6 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -154,7 +154,7 @@ def user_diversity(self): """Calculate average diversity for recommendations for each user. Returns: - pyspark.sql.dataframe.DataFrame: col_user, user_diversity + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, user_diversity. """ if self.df_user_diversity is None: self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df) @@ -171,7 +171,7 @@ def diversity(self): """Calculate average diversity for recommendations across all users. Returns: - pyspark.sql.dataframe.DataFrame: diversity + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: diversity. """ if self.df_diversity is None: self.df_user_diversity = self.user_diversity() @@ -185,7 +185,7 @@ def item_novelty(self): """Calculate novelty for each item in the recommendations. Returns: - pyspark.sql.dataframe.DataFrame: col_item, item_novelty + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_item, item_novelty. """ if self.df_item_novelty is None: train_pairs = self._get_all_user_item_pairs(df=self.train_df) @@ -216,7 +216,7 @@ def user_novelty(self): """Calculate average item novelty for each user's recommendations. 
Returns: - pyspark.sql.dataframe.DataFrame: col_user, user_novelty + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, user_novelty. """ if self.df_user_novelty is None: self.df_item_novelty = self.item_novelty() @@ -232,7 +232,7 @@ def novelty(self): """Calculate average novelty for recommendations across all users. Returns: - pyspark.sql.dataframe.DataFrame: novelty + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: novelty. """ if self.df_novelty is None: self.df_user_novelty = self.user_novelty() @@ -246,7 +246,7 @@ def user_item_serendipity(self): """Calculate serendipity of each item in the recommendations for each user. Returns: - pyspark.sql.dataframe.DataFrame: col_user, col_item, user_item_serendipity + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, col_item, user_item_serendipity. """ # for every col_user, col_item in reco_df, join all interacted items from train_df. # These interacted items are repeated for each item in reco_df for a specific user. @@ -293,7 +293,7 @@ def user_serendipity(self): """Calculate average serendipity for each user's recommendations. Returns: - pyspark.sql.dataframe.DataFrame: col_user, user_serendipity + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, user_serendipity. """ if self.df_user_serendipity is None: self.df_user_item_serendipity = self.user_item_serendipity() @@ -308,7 +308,7 @@ def serendipity(self): """Calculate average serentipity for recommendations across all users. Returns: - pyspark.sql.dataframe.DataFrame: serendipity + pyspark.sql.dataframe.DataFrame: A dataframe with following columns: serendipity. """ if self.df_serendipity is None: self.df_user_serendipity = self.user_serendipity() From fa1c4f88a895e372eb0cae44653c31ad2a3536a1 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Thu, 10 Jun 2021 16:01:00 +0000 Subject: [PATCH 32/33] add reference --- .../als_movielens_diversity_metrics.ipynb | 13 ++++++++++++ .../evaluation/spark_diversity_evaluation.py | 20 ++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index f9f34c6d6c..2afbeb6c48 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -673,6 +673,19 @@ "df_results" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reference\n", + "The metric definitions/formulations are based on following reference with modification:\n", + "- G. Shani and A. Gunawardana, Evaluating Recommendation Systems, Recommender Systems Handbook pp. 257-297, 2010.\n", + "- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012\n", + "- P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems: choice, discovery and relevance, ECIR 2011\n", + "- Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems, towards data science, April 2020\n", + "- N. Hurley and M. 
Zhang, Novelty and diversity in top-n recommendation--analysis and evaluation, ACM Transactions, 2011" + ] + }, { "cell_type": "code", "execution_count": 22, diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index 57e83c46f6..316d747e6a 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -26,8 +26,19 @@ def __init__( Coverage - The proportion of items that can be recommended. It includes two metrics: (1) catalog_coverage, which measures the proportion of items that get recommended from the item catalog; (2) distributional_coverage, which measures how unequally different items are recommended in the recommendations to all users. Novelty - A more novel item indicates it is less popular, i.e., it gets recommended less frequently. Diversity - The dissimilarity of items being recommended. - Serendipity - The "unusualness" or "surprise" of recommendations to a user. When 'col_relevance' is used, it indicates how "pleasant surprise" of recommendations is to a user. + Serendipity - The "unusualness" or "surprise" of recommendations to a user. When 'col_relevance' is used, it indicates how "pleasant surprise" of recommendations is to a user. + Info: + The metric definitions/formulations are based on following reference with modification: + - G. Shani and A. Gunawardana, Evaluating Recommendation Systems, Recommender Systems Handbook pp. 257-297, 2010. + + - Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012 + + - P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems: choice, discovery and relevance, ECIR 2011 + + - Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems, towards data science, April 2020 + + - N. Hurley and M. Zhang, Novelty and diversity in top-n recommendation--analysis and evaluation, ACM Transactions, 2011 Args: train_df (pySpark DataFrame): Training set used for the recommender, @@ -256,7 +267,9 @@ def user_item_serendipity(self): self.reco_df.select( self.col_user, self.col_item, - F.col(self.col_item).alias("reco_item_tmp"), # duplicate col_item to keep + F.col(self.col_item).alias( + "reco_item_tmp" + ), # duplicate col_item to keep ) .join( self.train_df.select( @@ -321,9 +334,6 @@ def serendipity(self): def catalog_coverage(self): """Calculate catalog coverage for recommendations across all users. - Info: - G. Shani and A. Gunawardana, Evaluating Recommendation Systems, Recommender Systems Handbook pp. 257-297, 2010. - Returns: float: catalog coverage """ From dcb1b1a2b9119b62932752669b373370bb6980a5 Mon Sep 17 00:00:00 2001 From: YanZhangADS Date: Fri, 11 Jun 2021 19:15:21 +0000 Subject: [PATCH 33/33] improve docstring --- reco_utils/evaluation/spark_diversity_evaluation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/reco_utils/evaluation/spark_diversity_evaluation.py b/reco_utils/evaluation/spark_diversity_evaluation.py index 316d747e6a..1fae0dfbf1 100644 --- a/reco_utils/evaluation/spark_diversity_evaluation.py +++ b/reco_utils/evaluation/spark_diversity_evaluation.py @@ -11,6 +11,8 @@ class DiversityEvaluation: + """Spark Diversity Evaluator""" + def __init__( self, train_df, @@ -19,10 +21,10 @@ def __init__( col_item=DEFAULT_ITEM_COL, col_relevance=None, ): - """Diversity evaluator. 
- train (train_df) and recommendation (reco_df) dataframes should already be groupped by user-item pair. + """Initializer. - Metrics include: + This is the Spark version of diversity metrics evaluator. + The methods of this class calculate following diversity metrics: Coverage - The proportion of items that can be recommended. It includes two metrics: (1) catalog_coverage, which measures the proportion of items that get recommended from the item catalog; (2) distributional_coverage, which measures how unequally different items are recommended in the recommendations to all users. Novelty - A more novel item indicates it is less popular, i.e., it gets recommended less frequently. Diversity - The dissimilarity of items being recommended.
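[Editor's note] To make the formulas in this patch series concrete, here is a small
self-contained sketch that reproduces catalog coverage and item novelty on toy data in
plain Python. It mirrors the Spark logic above but is not part of any patch; all names
and data below are hypothetical.

    import math

    # Hypothetical toy data: sets of (user, item) pairs. reco is disjoint from
    # train, as DiversityEvaluation requires.
    train = {(1, "A"), (1, "B"), (2, "A"), (2, "C"), (3, "B"), (4, "A"), (4, "B")}
    reco = {(1, "C"), (2, "B"), (3, "C")}

    users = {u for u, _ in train}
    train_items = {i for _, i in train}
    reco_items = {i for _, i in reco}

    # catalog_coverage = distinct recommended items / distinct training items
    print("catalog_coverage:", round(len(reco_items) / len(train_items), 3))  # 2 of 3 -> 0.667

    # item_novelty(i) = -log2(reco_count(i) / unseen_count(i)), where unseen_count(i)
    # counts the users with no training interaction with item i.
    for item in sorted(reco_items):
        reco_count = sum(1 for _, i in reco if i == item)
        unseen_count = sum(1 for u in users if (u, item) not in train)
        print(item, "novelty:", round(-math.log2(reco_count / unseen_count), 3))
    # -> B: 0.0 (the one user who has not seen B got it), C: 0.585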