Kdd2020 tutorial updated #1208

Merged Sep 25, 2020 · 33 commits

Changes from 1 commit

Commits (33)
ffbce15
add kdd2020 tutorials for knowledge-aware recommendations
Leavingseason Jul 25, 2020
141eb91
v0: ready for running
Leavingseason Jul 25, 2020
184d289
add environment config files
Leavingseason Jul 25, 2020
8f37eb8
text changes
Leavingseason Jul 25, 2020
70f0c47
update notebook step1
Leavingseason Jul 25, 2020
eacac58
update notebook step2
Leavingseason Jul 25, 2020
9db5623
update notebook step3
Leavingseason Jul 25, 2020
a38528d
update notebook steps
Leavingseason Jul 27, 2020
aa6d9d9
add README
yueguoguo Jul 27, 2020
1949734
update readme
yueguoguo Jul 27, 2020
6238d41
Merge pull request #1164 from microsoft/le/kdd_tutorial
Leavingseason Jul 27, 2020
171d244
update notebooks; move functions to utils
Leavingseason Jul 27, 2020
681239e
update notebook step 3
Leavingseason Jul 27, 2020
c101ad7
update step1 and step5
Leavingseason Jul 31, 2020
5918168
fix LightGCN bug and update step2 step5
Leavingseason Jul 31, 2020
d840596
add reco_gpu_kdd.yaml
Leavingseason Jul 31, 2020
d7c0c0e
delete unused folder; add cpu yaml
Leavingseason Aug 24, 2020
1b40882
update reco_cpu_kdd.yaml
Leavingseason Aug 24, 2020
a2679a6
update yaml config: remove pytorch and fastai
Leavingseason Aug 24, 2020
950dfd8
Update README.md
Leavingseason Aug 25, 2020
a9aa7ed
add scripts for subgraph analysis
Leavingseason Aug 25, 2020
cc9c645
Update reco_gpu_kdd.yaml
miguelgfierro Aug 25, 2020
03d3b19
Merge branch 'staging' into kdd2020_tutorial
Leavingseason Sep 19, 2020
283a3bd
Merge branch 'staging' into kdd2020_tutorial
Leavingseason Sep 24, 2020
e884a69
update yaml
Leavingseason Sep 24, 2020
d854c39
Adjust structure; update comments
Leavingseason Sep 25, 2020
df9d996
add test cases
Leavingseason Sep 25, 2020
9394ede
add gensim to yaml env config
Leavingseason Sep 25, 2020
464f5fb
add license info
Leavingseason Sep 25, 2020
b55f3d3
move the tutorial to examples/07_tutorials
Leavingseason Sep 25, 2020
7058113
add yaml and sh files
Leavingseason Sep 25, 2020
e13cf67
update step4
Leavingseason Sep 25, 2020
2d7249d
update README
Leavingseason Sep 25, 2020
Commit eacac58f50641b6ea3aed348148d45ab6054d60a: update notebook step2
Leavingseason committed Jul 25, 2020
@@ -803,7 +803,7 @@
 "metadata": {
 "celltoolbar": "Tags",
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3.5",
 "language": "python",
 "name": "python3"
 },
@@ -817,7 +817,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.3"
+"version": "3.5.6"
 }
 },
 "nbformat": 4,
3 changes: 1 addition & 2 deletions scenarios/KDD2020-tutorial/step1_data_preparation.ipynb
@@ -14,7 +14,7 @@
 "metadata": {},
 "source": [
 "# Data manipulation\n",
-"This notebook provides all necessary steps to generate DKN's input dataset from the MAG COVID-19 raw dataset "
+"This notebook provides necessary steps to generate DKN's input dataset from the MAG COVID-19 raw dataset "
 ]
 },
 {
@@ -356,7 +356,6 @@
 }
 ],
 "source": [
-"\n",
 "split_train_valid_file(\n",
 " [Path_paper_pair_cocitation, Path_FirstAuthorPaperPair, Path_paper_pair_coreference],\n",
 " OutFile_dir_DKN\n",
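Note: the cell above calls split_train_valid_file to merge the co-citation, first-author, and co-reference pair files and split them into DKN training and validation sets. The tutorial's actual implementation lives in the repo's utils; the sketch below is only a plausible reading of that helper, and its valid_ratio and seed parameters are assumptions for illustration.

# A minimal sketch of what split_train_valid_file could look like; not the
# tutorial's actual code, and valid_ratio/seed are hypothetical parameters.
import os
import random

def split_train_valid_file(infile_list, out_dir, valid_ratio=0.1, seed=42):
    """Merge several pair files, shuffle, and write train/valid splits."""
    rng = random.Random(seed)
    lines = []
    for path in infile_list:
        with open(path, "r", encoding="utf-8") as f:
            lines.extend(f.readlines())
    rng.shuffle(lines)
    n_valid = int(len(lines) * valid_ratio)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
        f.writelines(lines[n_valid:])
    with open(os.path.join(out_dir, "valid.txt"), "w", encoding="utf-8") as f:
        f.writelines(lines[:n_valid])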
80 changes: 28 additions & 52 deletions scenarios/KDD2020-tutorial/step2_pretraining-embeddings.ipynb
@@ -9,6 +9,14 @@
 "<i>Licensed under the MIT License.</i>\n"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# Pretraining word and entity embeddings\n",
+"This notebook trains word embeddings and entity embeddings for DKN initializations."
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -60,6 +68,13 @@
 "OutFile_dir_DKN = 'data_folder/my/DKN-training-folder'"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We use the word2vec algorithm implemented in Gensim (https://radimrehurek.com/gensim/models/word2vec.html) to generate word embeddings."
+]
+},
 {
 "cell_type": "code",
 "execution_count": 19,
@@ -82,21 +97,29 @@
 "\n",
 " print('start to train word embedding...', end=' ')\n",
 " my_sentences = MySentenceCollection(Path_sentences)\n",
-" model = Word2Vec(my_sentences, size=32, window=5, min_count=1, workers=4, iter=50)\n",
+" model = Word2Vec(my_sentences, size=32, window=5, min_count=1, workers=8, iter=30)\n",
 "\n",
 " model.save(OutFile_word2vec)\n",
 " model.wv.save_word2vec_format(OutFile_word2vec_txt, binary=False)\n",
 " print('\\tdone . ')\n",
 "\n",
 "Path_sentences = os.path.join(InFile_dir, 'sentence.txt')\n",
-"# train_word2vec(Path_sentences, OutFile_dir)\n",
 "\n",
 "t0 = time.time()\n",
 "train_word2vec(Path_sentences, OutFile_dir)\n",
 "t1 = time.time()\n",
 "print('time elapses: {0:.1f}s'.format(t1 - t0))"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We leverage a graph embedding model to encode entities into embedding vectors.\n",
+"<img src=\"https://recodatasets.blob.core.windows.net/kdd2020/images%2Fkg-embedding.JPG\" width=\"600\">\n",
+"We use an open-source implementation of TransE (https://github.com/thunlp/Fast-TransX) for generating knowledge graph embeddings:"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 9,
@@ -122,10 +145,6 @@
 }
 ],
 "source": [
-"## some step in transE training\n",
-"\n",
-"## https://github.com/thunlp/Fast-TransX\n",
-"\n",
 "!bash ./run_transE.sh"
 ]
 },
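Note: the notebook delegates knowledge graph embedding training to Fast-TransX through run_transE.sh rather than implementing TransE in Python. As background only, here is a minimal numpy sketch of the scoring idea TransE optimizes (embed head h, relation r, and tail t so that h + r ≈ t); it is not taken from the Fast-TransX code.

# Illustrative TransE scoring function only; Fast-TransX itself is a separate
# C++ implementation, and this sketch is not its code.
import numpy as np

def transe_score(h, r, t, norm=1):
    """Lower is better: distance between the translated head (h + r) and tail t."""
    return np.linalg.norm(h + r - t, ord=norm)

h = np.array([0.1, 0.3]); r = np.array([0.2, -0.1]); t = np.array([0.3, 0.2])
print(transe_score(h, r, t))  # a small distance means the triple looks plausible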
@@ -137,54 +156,11 @@
 "source": []
 },
 {
-"cell_type": "code",
-"execution_count": 10,
+"cell_type": "markdown",
 "metadata": {},
-"outputs": [],
 "source": [
-"def gen_context_embedding(entity_file, context_file, kg_file):\n",
-" #load embedding_vec\n",
-" entity_index = 0\n",
-" entity_dict = {}\n",
-" fp_entity = open(entity_file, 'r')\n",
-" for line in fp_entity:\n",
-" linesplit = line.strip().split('\\t')[:EMBEDDING_LENGTH]\n",
-" linesplit = list(map(float, linesplit))\n",
-" entity_dict[str(entity_index)] = linesplit\n",
-" entity_index += 1\n",
-" fp_entity.close()\n",
-"\n",
-" #build neighbor for entity in entity_dict\n",
-" fp_kg = open(kg_file, 'r', encoding='utf-8')\n",
-" triple_num = fp_kg.readline()\n",
-" triples = fp_kg.readlines()\n",
-" kg_neighbor_dict = {}\n",
-" for triple in triples:\n",
-" linesplit = triple.strip().split(' ')\n",
-" head = linesplit[0]\n",
-" tail = linesplit[1]\n",
-" if head not in kg_neighbor_dict:\n",
-" kg_neighbor_dict[head] = set()\n",
-" kg_neighbor_dict[head].add(tail)\n",
-"\n",
-" if tail not in kg_neighbor_dict:\n",
-" kg_neighbor_dict[tail] = set()\n",
-" kg_neighbor_dict[tail].add(head)\n",
-" fp_kg.close()\n",
-"\n",
-" context_embeddings = np.zeros([entity_index , EMBEDDING_LENGTH])\n",
-"\n",
-" for entity in entity_dict:\n",
-" if entity in kg_neighbor_dict:\n",
-" context_entity = kg_neighbor_dict[entity]\n",
-" context_vecs = []\n",
-" for c_entity in context_entity:\n",
-" context_vecs.append(entity_dict[c_entity])\n",
-"\n",
-" context_vec = np.mean(np.asarray(context_vecs), axis=0)\n",
-" context_embeddings[int(entity)] = context_vec\n",
-"\n",
-" np.savetxt(context_file, context_embeddings, delimiter='\\t')"
+"DKN takes into consideration both the entity embeddings and their context embeddings.\n",
+"<img src=\"https://recodatasets.blob.core.windows.net/kdd2020/images/context-embedding.JPG\" width=\"600\">"
 ]
 },
 {
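Note: the Word2Vec call tuned in the diff above can be reproduced standalone. The sketch below uses the Gensim 3.x keyword names that appear in the notebook (size and iter; Gensim 4 renamed them to vector_size and epochs), with a toy corpus standing in for the tutorial's sentence.txt.

# Standalone Gensim 3.x sketch mirroring the notebook's updated hyperparameters;
# the toy sentences and output file name are placeholders.
from gensim.models import Word2Vec

sentences = [["knowledge", "graph", "embedding"], ["covid", "paper", "citation"]]
model = Word2Vec(sentences, size=32, window=5, min_count=1, workers=8, iter=30)
model.wv.save_word2vec_format("word2vec.txt", binary=False)  # plain-text vectors, as in the notebook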
45 changes: 44 additions & 1 deletion scenarios/KDD2020-tutorial/utils/task_helper.py
@@ -509,7 +509,50 @@ def format_word_embeddings(word_vecfile, word2id_file, np_file):
     with open(np_file, 'wb') as f:
         np.save(f, word_embeddings)
 
-
+def gen_context_embedding(entity_file, context_file, kg_file):
+    # load embedding_vec
+    entity_index = 0
+    entity_dict = {}
+    fp_entity = open(entity_file, 'r')
+    for line in fp_entity:
+        linesplit = line.strip().split('\t')[:EMBEDDING_LENGTH]
+        linesplit = list(map(float, linesplit))
+        entity_dict[str(entity_index)] = linesplit
+        entity_index += 1
+    fp_entity.close()
+
+    # build neighbor for entity in entity_dict
+    fp_kg = open(kg_file, 'r', encoding='utf-8')
+    triple_num = fp_kg.readline()
+    triples = fp_kg.readlines()
+    kg_neighbor_dict = {}
+    for triple in triples:
+        linesplit = triple.strip().split(' ')
+        head = linesplit[0]
+        tail = linesplit[1]
+        if head not in kg_neighbor_dict:
+            kg_neighbor_dict[head] = set()
+        kg_neighbor_dict[head].add(tail)
+
+        if tail not in kg_neighbor_dict:
+            kg_neighbor_dict[tail] = set()
+        kg_neighbor_dict[tail].add(head)
+    fp_kg.close()
+
+    context_embeddings = np.zeros([entity_index , EMBEDDING_LENGTH])
+
+    for entity in entity_dict:
+        if entity in kg_neighbor_dict:
+            context_entity = kg_neighbor_dict[entity]
+            context_vecs = []
+            for c_entity in context_entity:
+                context_vecs.append(entity_dict[c_entity])
+
+            context_vec = np.mean(np.asarray(context_vecs), axis=0)
+            context_embeddings[int(entity)] = context_vec
+
+    np.savetxt(context_file, context_embeddings, delimiter='\t')
+
 
 ######## data preparation for lightGCN
 def load_instance_file(
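Note: the relocated gen_context_embedding reads the TransE entity vectors, collects each entity's one-hop neighbors from the triple file, and writes the neighbor-averaged vectors as tab-separated context embeddings. A hypothetical call might look as follows; the file names are placeholders rather than paths fixed by the tutorial, and EMBEDDING_LENGTH is assumed to be defined at module level in task_helper.py (32 would match the word2vec dimensionality above).

# Hypothetical usage of the helper after its move into utils/task_helper.py;
# the file names below are illustrative, not mandated by the tutorial.
from utils.task_helper import gen_context_embedding

gen_context_embedding(
    entity_file="entity2vec.vec",    # one tab-separated TransE embedding per line
    context_file="context2vec.vec",  # output: neighbor-averaged context embeddings
    kg_file="train2id.txt",          # first line is the triple count, then "head tail relation"
)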