update notebook steps

recommenders-team · miguelgfierro · Sep 25, 2020 · Jul 25, 2020 · Jul 25, 2020 · Jul 25, 2020
commit a38528d1dc68b2db1ed1954b0b25ca3433e8683c
diff --git a/scenarios/KDD2020-tutorial/step1_data_preparation.ipynb b/scenarios/KDD2020-tutorial/step1_data_preparation.ipynb
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -53,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,15 +77,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "loading file PaperTitleAbs_bySentence.txt...\n",
-      "loading line: 880000, time elapses: 10.3s  \n",
+      "loading line: 880000, time elapses: 11.4s  \n",
       "parsing into feature file  ...\n",
       "parsed paper count: 110000, time elapses: 0.5s \n"
      ]
@@ -111,7 +110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -143,15 +142,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "loading file PaperTitleAbs_bySentence.txt...\n",
-      "loading line: 880000, time elapses: 8.7s "
+      "loading line: 880000, time elapses: 11.6s "
      ]
     }
    ],
@@ -182,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -193,7 +192,7 @@
       "loading Papers.txt...\n",
       "loading PaperReferences.txt...\n",
       "parsing user's reference list ...\n",
-      "parsed user count: 430000, time elapses: 3.6s \n",
+      "parsed user count: 430000, time elapses: 4.6s \n",
       "outputing author reference list\n"
      ]
     }
@@ -237,20 +236,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "expanding user behaviors...\n",
-      "processing user number : 287000, time elapses: 1.7s done. Sample number in train / valid / test is 161272 / 8449 / 8449\n",
+      "processing user number : 287000, time elapses: 1.9s done. Sample number in train / valid / test is 140524 / 7465 / 7465\n",
       "Negative sampling for train...\n",
+      "sampling  140000 / 140524, time elapses: 51.8s \tdone.\n",
       "Negative sampling for validation...\n",
+      "sampling  7000 / 7465, time elapses: 2.6s \tdone.\n",
       "Negative sampling for test...\n",
+      "sampling  7000 / 7465, time elapses: 2.7s \tdone.\n",
       "done.\n",
-      "time elapses for user is : 86.4s\n"
+      "time elapses for user is : 80.4s\n"
      ]
     }
    ],
@@ -282,16 +284,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "loading PaperReferences.txt...\n",
-      "process paper num 53400 / 53452...time elapses: 9.0s\tDone.\n",
-      "process paper num 73600 / 73699...time elapses: 49.2s\tDone.\n",
+      "process paper num 53400 / 53452...time elapses: 10.6s\tDone.\n",
+      "process paper num 73600 / 73699...time elapses: 57.4s\tDone.\n",
       "loading Papers.txt...\n",
       "loading PaperAuthorAffiliations.txt...\n",
       "process author num 435800 / 435822...time elapses: 1.0s"
@@ -301,11 +303,10 @@
    "source": [
     "OutFile_dir_item2item = r'data_folder/my/item2item'\n",
     "create_dir(OutFile_dir_item2item)\n",
-    "Path_PaperFeature\n",
     "item_set = load_has_feature_items(Path_PaperFeature)\n",
     "\n",
     "\n",
-    "Path_PaperReference = os.path.join(InFile_dir, 'PaperReferences.txt')\n",
+    "# Path_PaperReference = os.path.join(InFile_dir, 'PaperReferences.txt')\n",
     "pair2CocitedCnt, pair2CoReferenceCnt = gen_paper_cocitation(Path_PaperReference)\n",
     "\n",
     "Path_paper_pair_cocitation = os.path.join(OutFile_dir_item2item, 'paper_pair_cocitation_cnt.csv')\n",
@@ -341,17 +342,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "negative sampling for file item2item_train.txt...\n",
-      "process line num 182600 / 182645...time elapses: 3.4s\tdone.\n",
+      "process line num 182200 / 182261...time elapses: 4.1s\tdone.\n",
       "negative sampling for file item2item_valid.txt...\n",
-      "process line num 45500 / 45505...time elapses: 0.9s\tdone.\n"
+      "process line num 45800 / 45889...time elapses: 1.0s\tdone.\n"
      ]
     }
    ],
@@ -378,42 +379,57 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Generating the full dataset will take a longer time, let it run in the background freely..."
+    "Generating the full dataset for the user2item recommendation task will take a longer time, so let's defer running it to the end of this notebook ..."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "expanding user behaviors...\n",
+      "processing user number : 287000, time elapses: 2.7s done. Sample number in train / valid / test is 365242 / 23066 / 23066\n",
+      "Negative sampling for train...\n",
+      "sampling  365000 / 365242, time elapses: 283.6s \tdone.\n",
+      "Negative sampling for validation...\n",
+      "sampling  23000 / 23066, time elapses: 18.0s \tdone.\n",
+      "Negative sampling for test...\n",
+      "sampling  23000 / 23066, time elapses: 18.8s \tdone.\n",
+      "done.\n",
+      "time elapses is : 324.1s\n"
+     ]
+    }
+   ],
    "source": [
+    "_t0 = time.time()\n",
     "gen_experiment_splits(\n",
     "    Path_Author2ReferencePapers,\n",
     "    OutFile_dir_DKN,\n",
     "    Path_PaperFeature,\n",
-    "    item_ratio=1.0,\n",
-    "    tag='full'\n",
-    ") \n"
+    "    item_ratio=0.2,\n",
+    "    tag='medium'\n",
+    ") \n",
+    "_t1 = time.time()\n",
+    "print('time elapses is : {0:.1f}s'.format(_t1 - _t0))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "reco_gpu_kdd",
    "language": "python",
    "name": "reco_gpu_kdd"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
   }
  },
  "nbformat": 4,

diff --git a/scenarios/KDD2020-tutorial/step2_pretraining-embeddings.ipynb b/scenarios/KDD2020-tutorial/step2_pretraining-embeddings.ipynb
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -77,15 +77,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "start to train word embedding... \tdone . \n",
-      "time elapses: 649.8s\n"
+      "time elapses: 526.3s\n"
      ]
     }
    ],
@@ -123,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -132,16 +132,16 @@
      "text": [
       "/data/home/jialia/jialia/kdd2020tutorial/formal_02/recommenders/scenarios/KDD2020-tutorial\n",
       "fatal: destination path 'Fast-TransX' already exists and is not an empty directory.\n",
-      "epoch 0 454690.656250\n",
-      "epoch 1 376927.000000\n",
-      "epoch 2 344530.656250\n",
-      "epoch 3 315695.781250\n",
-      "epoch 4 290692.281250\n",
-      "epoch 5 268658.906250\n",
-      "epoch 6 250159.546875\n",
-      "epoch 7 231006.828125\n",
-      "epoch 8 215869.140625\n",
-      "epoch 9 200701.406250\n"
+      "epoch 0 464878.218750\n",
+      "epoch 1 392123.312500\n",
+      "epoch 2 361906.625000\n",
+      "epoch 3 315392.156250\n",
+      "epoch 4 310050.875000\n",
+      "epoch 5 281908.250000\n",
+      "epoch 6 271810.968750\n",
+      "epoch 7 240873.968750\n",
+      "epoch 8 237960.375000\n",
+      "epoch 9 221742.484375\n"
      ]
     }
    ],
@@ -166,9 +166,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'EMBEDDING_LENGTH' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-6-867053a0e641>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mcontext_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mOutFile_dir_KG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'context2vec.vec'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mkg_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mOutFile_dir_KG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'train2id.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mgen_context_embedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mentity_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontext_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkg_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/data/home/jialia/jialia/kdd2020tutorial/formal_02/recommenders/scenarios/KDD2020-tutorial/utils/task_helper.py\u001b[0m in \u001b[0;36mgen_context_embedding\u001b[0;34m(entity_file, context_file, kg_file)\u001b[0m\n\u001b[1;32m    516\u001b[0m     \u001b[0mfp_entity\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mentity_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    517\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfp_entity\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 518\u001b[0;31m         \u001b[0mlinesplit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mEMBEDDING_LENGTH\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    519\u001b[0m         \u001b[0mlinesplit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlinesplit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    520\u001b[0m         \u001b[0mentity_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mentity_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlinesplit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'EMBEDDING_LENGTH' is not defined"
+     ]
+    }
+   ],
    "source": [
     "##### build context embedding\n",
     "EMBEDDING_LENGTH = 32\n",
@@ -180,7 +193,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [