Kdd2020 tutorial updated #1208
Merged: 33 commits, Sep 25, 2020

Commits
- ffbce15: add kdd2020 tutorials for knowledge-aware recommendations (Leavingseason, Jul 25, 2020)
- 141eb91: v0: ready for running (Leavingseason, Jul 25, 2020)
- 184d289: add environment config files (Leavingseason, Jul 25, 2020)
- 8f37eb8: text changes (Leavingseason, Jul 25, 2020)
- 70f0c47: update notebook step1 (Leavingseason, Jul 25, 2020)
- eacac58: update notebook step2 (Leavingseason, Jul 25, 2020)
- 9db5623: update notebook step3 (Leavingseason, Jul 25, 2020)
- a38528d: update notebook steps (Leavingseason, Jul 27, 2020)
- aa6d9d9: add README (yueguoguo, Jul 27, 2020)
- 1949734: update readme (yueguoguo, Jul 27, 2020)
- 6238d41: Merge pull request #1164 from microsoft/le/kdd_tutorial (Leavingseason, Jul 27, 2020)
- 171d244: update notebooks; move functions to utils (Leavingseason, Jul 27, 2020)
- 681239e: update notebook step 3 (Leavingseason, Jul 27, 2020)
- c101ad7: update step1 and step5 (Leavingseason, Jul 31, 2020)
- 5918168: fix LightGCN bug and update step2 step5 (Leavingseason, Jul 31, 2020)
- d840596: add reco_gpu_kdd.yaml (Leavingseason, Jul 31, 2020)
- d7c0c0e: delete unused folder; add cpu yaml (Leavingseason, Aug 24, 2020)
- 1b40882: update reco_cpu_kdd.yaml (Leavingseason, Aug 24, 2020)
- a2679a6: update yaml config: remove pytorch and fastai (Leavingseason, Aug 24, 2020)
- 950dfd8: Update README.md (Leavingseason, Aug 25, 2020)
- a9aa7ed: add scripts for subgraph analysis (Leavingseason, Aug 25, 2020)
- cc9c645: Update reco_gpu_kdd.yaml (miguelgfierro, Aug 25, 2020)
- 03d3b19: Merge branch 'staging' into kdd2020_tutorial (Leavingseason, Sep 19, 2020)
- 283a3bd: Merge branch 'staging' into kdd2020_tutorial (Leavingseason, Sep 24, 2020)
- e884a69: update yaml (Leavingseason, Sep 24, 2020)
- d854c39: Adjust structure; update comments (Leavingseason, Sep 25, 2020)
- df9d996: add test cases (Leavingseason, Sep 25, 2020)
- 9394ede: add gensim to yaml env config (Leavingseason, Sep 25, 2020)
- 464f5fb: add liscense info (Leavingseason, Sep 25, 2020)
- b55f3d3: move the tutorial to examples/07_tutorials (Leavingseason, Sep 25, 2020)
- 7058113: add yaml and sh files (Leavingseason, Sep 25, 2020)
- e13cf67: update step4 (Leavingseason, Sep 25, 2020)
- 2d7249d: update README (Leavingseason, Sep 25, 2020)
11 changes: 11 additions & 0 deletions .gitignore
@@ -156,6 +156,17 @@ ml-20m/
*.model
*.mml
nohup.out

##### kdd 2020 tutorial data folder
scenarios/KDD2020-tutorial/data_folder.zip
scenarios/KDD2020-tutorial/data_folder/
scenarios/KDD2020-tutorial/.ipynb_checkpoints/
scenarios/academic/KDD2020-tutorial/data_folder/
scenarios/academic/KDD2020-tutorial/data_folder.zip
scenarios/academic/KDD2020-tutorial/.ipynb_checkpoints/

*.vec
*.tsv
*.sh

tests/resources/
11 changes: 9 additions & 2 deletions examples/00_quick_start/dkn_MIND.ipynb
@@ -390,12 +390,19 @@
"\\[3\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html <br>\n",
"\\[4\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (reco_gpu)",
"display_name": "reco_gpu",
"language": "python",
"name": "reco_gpu"
},
@@ -409,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
@@ -798,12 +798,19 @@
"\n",
"2. LightGCN implementation [TensorFlow]: https://github.com/kuandeng/lightgcn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
@@ -817,7 +824,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.5.6"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion reco_utils/recommender/deeprec/DataModel/ImplicitCF.py
@@ -176,7 +176,7 @@ def create_norm_adj_mat(self):
print("Already create adjacency matrix.")

rowsum = np.array(adj_mat.sum(1))
d_inv = np.power(rowsum, -0.5).flatten()
d_inv = np.power(rowsum + 1e-9, -0.5).flatten()
d_inv[np.isinf(d_inv)] = 0.0
d_mat_inv = sp.diags(d_inv)
norm_adj_mat = d_mat_inv.dot(adj_mat)
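The one-line change above adds a small offset before the inverse-square-root degree computation. A minimal standalone sketch of why it helps (the toy matrix below is an illustrative assumption, not data from the PR):

```python
import numpy as np
import scipy.sparse as sp

# Toy adjacency matrix with an isolated node (all-zero row), as can happen
# when a user or item has no interactions in the training split.
adj_mat = sp.csr_matrix(np.array([
    [0.0, 1.0, 1.0],
    [1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0],   # isolated node: degree 0
]))

rowsum = np.array(adj_mat.sum(1))  # node degrees, shape (3, 1)

# Without the offset, 0 ** -0.5 emits a divide-by-zero warning and yields inf,
# which the isinf cleanup then has to repair.
d_inv_old = np.power(rowsum, -0.5).flatten()

# With the offset, the power stays finite; the (large) value for the isolated
# node is harmless because its adjacency row is all zeros anyway.
d_inv = np.power(rowsum + 1e-9, -0.5).flatten()
d_inv[np.isinf(d_inv)] = 0.0
norm_adj_mat = sp.diags(d_inv).dot(adj_mat)
print(norm_adj_mat.toarray())
```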
103 changes: 103 additions & 0 deletions reco_utils/recommender/deeprec/io/dkn_item2item_iterator.py
@@ -0,0 +1,103 @@

import tensorflow as tf
import numpy as np

from reco_utils.recommender.deeprec.io.dkn_iterator import DKNTextIterator

r"""
This iterator supports DKN's item-to-item recommendation variant.
The tutorial can be found at: https://github.com/microsoft/recommenders/blob/kdd2020_tutorial/scenarios/academic/KDD2020-tutorial/step4_run_dkn_item2item.ipynb
"""


class DKNItem2itemTextIterator(DKNTextIterator):
    def __init__(self, hparams, graph):
        """
        Compared with user-to-item recommendation, the user-behavior module is not needed,
        so the placeholders are simpler than in the original DKNTextIterator.
        """
        self.hparams = hparams
        self.graph = graph
        self.neg_num = hparams.neg_num
        self.batch_size = hparams.batch_size * (self.neg_num + 2)
        self.doc_size = hparams.doc_size
        with self.graph.as_default():
            self.candidate_news_index_batch = tf.placeholder(
                tf.int64,
                [self.batch_size, self.doc_size],
                name="candidate_news_index",
            )
            self.candidate_news_entity_index_batch = tf.placeholder(
                tf.int64,
                [self.batch_size, self.doc_size],
                name="candidate_news_entity_index",
            )

        self._loading_nessary_files()

    def _loading_nessary_files(self):
        """
        Only one feature file is needed: news_feature_file.
        This function loads each news article's features into two dictionaries:
        self.news_word_index and self.news_entity_index.
        """
        hparams = self.hparams
        self.news_word_index = {}
        self.news_entity_index = {}
        with open(hparams.news_feature_file, "r") as rd:
            while True:
                line = rd.readline()
                if not line:
                    break
                newsid, word_index, entity_index = line.strip().split(' ')
                self.news_word_index[newsid] = [int(item) for item in word_index.split(',')]
                self.news_entity_index[newsid] = [int(item) for item in entity_index.split(',')]

    def load_data_from_file(self, infile):
        """
        Each line of infile is a news article's ID.
        This generator yields mini-batches of features, looked up in the
        news_word_index and news_entity_index dictionaries by article ID.
        """
        newsid_list = []
        candidate_news_index_batch = []
        candidate_news_entity_index_batch = []
        cnt = 0
        with open(infile, "r") as rd:
            while True:
                line = rd.readline()
                if not line:
                    break
                newsid = line.strip()
                word_index, entity_index = self.news_word_index[newsid], self.news_entity_index[newsid]
                newsid_list.append(newsid)

                candidate_news_index_batch.append(word_index)
                candidate_news_entity_index_batch.append(entity_index)

                cnt += 1
                if cnt >= self.batch_size:
                    res = self._convert_infer_data(
                        candidate_news_index_batch,
                        candidate_news_entity_index_batch,
                    )
                    data_size = self.batch_size
                    yield self.gen_infer_feed_dict(res), newsid_list, data_size
                    candidate_news_index_batch = []
                    candidate_news_entity_index_batch = []
                    newsid_list = []
                    cnt = 0

            if cnt > 0:
                # Pad the final partial batch by repeating its entries; data_size
                # records how many entries are real.
                data_size = cnt
                while cnt < self.batch_size:
                    candidate_news_index_batch.append(
                        candidate_news_index_batch[cnt % data_size]
                    )
                    candidate_news_entity_index_batch.append(
                        candidate_news_entity_index_batch[cnt % data_size]
                    )
                    cnt += 1
                res = self._convert_infer_data(
                    candidate_news_index_batch,
                    candidate_news_entity_index_batch,
                )
                yield self.gen_infer_feed_dict(res), newsid_list, data_size
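For orientation, a sketch of the file formats the iterator above expects, inferred from its parsing code; the file names, IDs, and doc_size used here are illustrative assumptions:

```python
# news_feature_file: one article per line with three space-separated fields --
# news ID, comma-separated word indices, comma-separated entity indices
# (both index lists are assumed to be padded to hparams.doc_size, here 4).
with open("news_feature.txt", "w") as wt:
    wt.write("N1 12,8,44,0 3,0,0,0\n")
    wt.write("N2 7,91,5,2 0,0,9,0\n")

# infile passed to load_data_from_file: one news article ID per line.
with open("item_list.txt", "w") as wt:
    wt.write("N1\nN2\n")

# Hypothetical driver loop (hparams, graph, sess and model are assumed to be set
# up as in the tutorial notebook). The last mini-batch is padded, so only the
# first data_size predictions are real.
# iterator = DKNItem2itemTextIterator(hparams, graph)
# for feed_dict, newsid_list, data_size in iterator.load_data_from_file("item_list.txt"):
#     preds = sess.run(model.pred, feed_dict=feed_dict)[:data_size]
```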
98 changes: 98 additions & 0 deletions reco_utils/recommender/deeprec/models/dkn_item2item.py
@@ -0,0 +1,98 @@
import tensorflow as tf
import numpy as np

from reco_utils.recommender.deeprec.models.dkn import DKN
from reco_utils.recommender.deeprec.deeprec_utils import cal_metric

r"""
This new model adapts DKN's structure for item-to-item recommendation.
The tutorial can be found at: https://github.com/microsoft/recommenders/blob/kdd2020_tutorial/scenarios/academic/KDD2020-tutorial/step4_run_dkn_item2item.ipynb
"""


class DKNItem2Item(DKN):
    def _compute_data_loss(self):
        # Negative log-likelihood of the positive item, which sits in column 0
        # of the softmax output.
        logits = self.pred
        data_loss = -1 * tf.reduce_sum(tf.math.log(logits[:, 0] + 1e-10))
        return data_loss

    def _build_dkn(self):
        """The main function to create DKN's logic.

        Returns:
            obj: Prediction of item2item relation scores made by the DKN model,
            in the shape of (batch_size, num_negative + 1).
        """
        news_field_embed_final_batch = self._build_doc_embedding(
            self.iterator.candidate_news_index_batch,
            self.iterator.candidate_news_entity_index_batch,
        )

        self.news_field_embed_final_batch = tf.math.l2_normalize(
            news_field_embed_final_batch, axis=-1, epsilon=1e-12
        )

        item_embs_train = tf.reshape(
            self.news_field_embed_final_batch,
            [-1, self.iterator.neg_num + 2, self.news_field_embed_final_batch.shape[-1]],
        )  # (B, group, D)

        item_embs_source = item_embs_train[:, 0, :]  # the source item
        item_embs_source = tf.expand_dims(item_embs_source, 1)

        item_embs_target = item_embs_train[:, 1:, :]

        item_relation = tf.math.multiply(item_embs_target, item_embs_source)
        item_relation = tf.reduce_sum(item_relation, -1)  # (B, neg_num + 1)

        self.pred_logits = item_relation

        return self.pred_logits

    def _get_pred(self, logit, task):
        return tf.nn.softmax(logit, axis=-1)

    def _build_doc_embedding(self, candidate_word_batch, candidate_entity_batch):
        """
        To keep the document embedding dense, we add one tanh layer on top of the kims_cnn module.
        """
        with tf.variable_scope("kcnn", initializer=self.initializer):
            news_field_embed = self._kims_cnn(
                candidate_word_batch, candidate_entity_batch, self.hparams
            )
            W = tf.get_variable(
                name="W_doc_trans",
                shape=(news_field_embed.shape[-1], self.num_filters_total),
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(uniform=False),
            )
            if W not in self.layer_params:
                self.layer_params.append(W)
            news_field_embed = tf.tanh(tf.matmul(news_field_embed, W))
        return news_field_embed

    def eval(self, sess, feed_dict):
        feed_dict[self.layer_keeps] = self.keep_prob_test
        feed_dict[self.is_train_stage] = False
        preds = sess.run(self.pred, feed_dict=feed_dict)
        # The positive target is always the first item of each group.
        labels = np.zeros_like(preds, dtype=np.int32)
        labels[:, 0] = 1
        return (preds, labels)

    def run_eval(self, filename):
        load_sess = self.sess
        group_preds = []
        group_labels = []

        for batch_data_input, newsid_list, data_size in self.iterator.load_data_from_file(
            filename
        ):
            if batch_data_input:
                step_pred, step_labels = self.eval(load_sess, batch_data_input)
                group_preds.extend(step_pred)
                group_labels.extend(step_labels)

        res = cal_metric(group_labels, group_preds, self.hparams.pairwise_metrics)
        return res
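A small numpy sketch of the group layout and loss used above: each group of neg_num + 2 documents holds the source item at index 0, the positive target at index 1, and negatives afterwards, so once the source is dropped the positive sits in column 0 of the logits (the sizes below are illustrative assumptions):

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

B, neg_num, D = 2, 3, 4
rng = np.random.default_rng(0)

# Stand-in for the L2-normalized document embeddings produced by _build_doc_embedding.
group = rng.normal(size=(B, neg_num + 2, D))
group /= np.linalg.norm(group, axis=-1, keepdims=True)

source = group[:, :1, :]              # (B, 1, D): source item of each group
targets = group[:, 1:, :]             # (B, neg_num + 1, D): positive + negatives
logits = (source * targets).sum(-1)   # cosine scores, as in _build_dkn

probs = softmax(logits)                       # _get_pred
loss = -np.log(probs[:, 0] + 1e-10).sum()     # _compute_data_loss: positive is column 0
print(logits.shape, round(float(loss), 4))
```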
27 changes: 12 additions & 15 deletions reco_utils/recommender/deeprec/models/graphrec/lightgcn.py
@@ -222,9 +222,12 @@ def fit(self):

if self.save_model and epoch % self.save_epoch == 0:
save_path_str = os.path.join(self.model_dir, "epoch_" + str(epoch))
if not os.path.exists(save_path_str):
os.makedirs(save_path_str)
checkpoint_path = self.saver.save(
sess=self.sess, save_path=save_path_str
)
print('Save model to path {0}'.format(os.path.abspath(save_path_str)))

if self.eval_epoch == -1 or epoch % self.eval_epoch != 0:
print(
@@ -380,6 +383,13 @@ def recommend_k_items(

return df.replace(-np.inf, np.nan).dropna()

def output_embeddings(self, idmapper, n, target, user_file):
embeddings = list(target.eval(session=self.sess))
with open(user_file, 'w') as wt:
for i in range(n):
wt.write('{0}\t{1}\n'.format(idmapper[i], ' '.join([str(a) for a in embeddings[i]])))


def infer_embedding(self, user_file, item_file):
"""Export user and item embeddings to csv files.

@@ -398,18 +408,5 @@ def infer_embedding(self, user_file, item_file):

data = self.data

df = pd.DataFrame(
{
data.col_user: [data.id2user[id] for id in range(self.n_users)],
"embedding": list(self.ua_embeddings.eval(session=self.sess)),
}
)
df.to_csv(user_file, sep=" ", index=False)

df = pd.DataFrame(
{
data.col_item: [data.id2item[id] for id in range(self.n_items)],
"embedding": list(self.ia_embeddings.eval(session=self.sess)),
}
)
df.to_csv(item_file, sep=" ", index=False)
self.output_embeddings(data.id2user, self.n_users, self.ua_embeddings, user_file)
self.output_embeddings(data.id2item, self.n_items, self.ia_embeddings, item_file)
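The new output_embeddings helper writes one entity per line: the original ID, a tab, then the space-separated embedding values. A minimal sketch of reading such a file back (the file name is an assumption):

```python
import numpy as np

def load_embeddings(path):
    """Load a file written by output_embeddings: ID, tab, space-separated floats."""
    ids, vectors = [], []
    with open(path) as rd:
        for line in rd:
            entity_id, values = line.rstrip("\n").split("\t")
            ids.append(entity_id)
            vectors.append([float(v) for v in values.split(" ")])
    return ids, np.array(vectors)

# user_ids, user_embs = load_embeddings("user_embeddings.tsv")  # hypothetical path
```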
51 changes: 51 additions & 0 deletions scenarios/academic/KDD2020-tutorial/README.md
@@ -0,0 +1,51 @@
# Environment setup
The following setup instructions assume a Linux system; testing was performed on Ubuntu.
We use conda to install packages and manage the virtual environment. Run `conda list` to check whether conda is available on your machine. If it is not, follow the instructions at https://conda.io/projects/conda/en/latest/user-guide/install/linux.html to install either Miniconda or Anaconda (preferred) before proceeding.

1. Clone the repository
   ```bash
   git clone https://github.com/microsoft/recommenders
   ```

1. Check out the tutorial branch
   ```bash
   cd recommenders
   git checkout kdd2020_tutorial
   ```
   The materials for the tutorial are located under `recommenders/scenarios/academic/KDD2020-tutorial`.
   ```bash
   cd scenarios/academic/KDD2020-tutorial
   ```
1. Download the dataset
   1. Download the dataset for the hands-on experiments and unzip it into data_folder:
      ```bash
      wget https://recodatasets.blob.core.windows.net/kdd2020/data_folder.zip
      unzip data_folder.zip -d data_folder
      ```
      After unzipping, data_folder contains two subfolders: 'raw', with the original txt files from the COVID MAG dataset, and 'my_cached', with pre-processed data files. If you miss a step during the hands-on tutorial, you can catch up by copying the corresponding files from 'my_cached' into the experiment folders.
1. Install the dependencies
   1. The model pre-training uses a tool that converts the original data into embeddings; this tool requires `g++`. The following installs `g++` on a Linux system.
      ```bash
      sudo apt-get install g++
      ```
   1. The Python scripts run in a conda environment with the dependencies installed. Create the environment from the `reco_gpu_kdd.yaml` file provided in the branch subfolder and activate it with the following commands.
      ```bash
      conda env create -n kdd_tutorial_2020 -f reco_gpu_kdd.yaml
      conda activate kdd_tutorial_2020
      ```
   1. The tutorial will be conducted using Jupyter notebooks. Register the newly created conda environment as a kernel with the Jupyter notebook server:
      ```bash
      python -m ipykernel install --user --name kdd_tutorial_2020 --display-name "Python (kdd tutorial)"
      ```

# Tutorial notebooks/scripts
After the setup, users should be able to launch the notebooks locally with the command
```bash
jupyter notebook --port=8080
```
The notebooks can then be opened in a browser at `localhost:8080`.
Alternatively, if the Jupyter notebook server runs on a remote machine, launch it with the following command.
```bash
jupyter notebook --no-browser --ip=10.214.70.89 --port=8080
```
From the local browser, the notebooks can then be opened at `10.214.70.89:8080`.