Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kdd2020 tutorial updated #1208

Merged
merged 33 commits into from
Sep 25, 2020
Merged
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
ffbce15
add kdd2020 tutorials for knowledge-aware recommendations
Leavingseason Jul 25, 2020
141eb91
v0: ready for running
Leavingseason Jul 25, 2020
184d289
add environment config files
Leavingseason Jul 25, 2020
8f37eb8
text changes
Leavingseason Jul 25, 2020
70f0c47
update notebook step1
Leavingseason Jul 25, 2020
eacac58
update notebook step2
Leavingseason Jul 25, 2020
9db5623
update notebook step3
Leavingseason Jul 25, 2020
a38528d
update notebook steps
Leavingseason Jul 27, 2020
aa6d9d9
add README
yueguoguo Jul 27, 2020
1949734
update readme
yueguoguo Jul 27, 2020
6238d41
Merge pull request #1164 from microsoft/le/kdd_tutorial
Leavingseason Jul 27, 2020
171d244
update notebooks; move functions to utils
Leavingseason Jul 27, 2020
681239e
update notebook step 3
Leavingseason Jul 27, 2020
c101ad7
update step1 and step5
Leavingseason Jul 31, 2020
5918168
fix LightGCN bug and update step2 step5
Leavingseason Jul 31, 2020
d840596
add reco_gpu_kdd.yaml
Leavingseason Jul 31, 2020
d7c0c0e
delete unused folder; add cpu yaml
Leavingseason Aug 24, 2020
1b40882
update reco_cpu_kdd.yaml
Leavingseason Aug 24, 2020
a2679a6
update yaml config: remove pytorch and fastai
Leavingseason Aug 24, 2020
950dfd8
Update README.md
Leavingseason Aug 25, 2020
a9aa7ed
add scripts for subgraph analysis
Leavingseason Aug 25, 2020
cc9c645
Update reco_gpu_kdd.yaml
miguelgfierro Aug 25, 2020
03d3b19
Merge branch 'staging' into kdd2020_tutorial
Leavingseason Sep 19, 2020
283a3bd
Merge branch 'staging' into kdd2020_tutorial
Leavingseason Sep 24, 2020
e884a69
update yaml
Leavingseason Sep 24, 2020
d854c39
Adjust structure; update comments
Leavingseason Sep 25, 2020
df9d996
add test cases
Leavingseason Sep 25, 2020
9394ede
add gensim to yaml env config
Leavingseason Sep 25, 2020
464f5fb
add liscense info
Leavingseason Sep 25, 2020
b55f3d3
move the tutorial to examples/07_tutorials
Leavingseason Sep 25, 2020
7058113
add yaml and sh files
Leavingseason Sep 25, 2020
e13cf67
update step4
Leavingseason Sep 25, 2020
2d7249d
update README
Leavingseason Sep 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update notebook steps
Leavingseason committed Jul 27, 2020
commit a38528d1dc68b2db1ed1954b0b25ca3433e8683c
94 changes: 55 additions & 39 deletions scenarios/KDD2020-tutorial/step1_data_preparation.ipynb
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -38,7 +38,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -53,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -78,15 +77,15 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading file PaperTitleAbs_bySentence.txt...\n",
"loading line: 880000, time elapses: 10.3s \n",
"loading line: 880000, time elapses: 11.4s \n",
"parsing into feature file ...\n",
"parsed paper count: 110000, time elapses: 0.5s \n"
]
@@ -111,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -143,15 +142,15 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading file PaperTitleAbs_bySentence.txt...\n",
"loading line: 880000, time elapses: 8.7s "
"loading line: 880000, time elapses: 11.6s "
]
}
],
@@ -182,7 +181,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -193,7 +192,7 @@
"loading Papers.txt...\n",
"loading PaperReferences.txt...\n",
"parsing user's reference list ...\n",
"parsed user count: 430000, time elapses: 3.6s \n",
"parsed user count: 430000, time elapses: 4.6s \n",
"outputing author reference list\n"
]
}
@@ -237,20 +236,23 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"expanding user behaviors...\n",
"processing user number : 287000, time elapses: 1.7s done. Sample number in train / valid / test is 161272 / 8449 / 8449\n",
"processing user number : 287000, time elapses: 1.9s done. Sample number in train / valid / test is 140524 / 7465 / 7465\n",
"Negative sampling for train...\n",
"sampling 140000 / 140524, time elapses: 51.8s \tdone.\n",
"Negative sampling for validation...\n",
"sampling 7000 / 7465, time elapses: 2.6s \tdone.\n",
"Negative sampling for test...\n",
"sampling 7000 / 7465, time elapses: 2.7s \tdone.\n",
"done.\n",
"time elapses for user is : 86.4s\n"
"time elapses for user is : 80.4s\n"
]
}
],
@@ -282,16 +284,16 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading PaperReferences.txt...\n",
"process paper num 53400 / 53452...time elapses: 9.0s\tDone.\n",
"process paper num 73600 / 73699...time elapses: 49.2s\tDone.\n",
"process paper num 53400 / 53452...time elapses: 10.6s\tDone.\n",
"process paper num 73600 / 73699...time elapses: 57.4s\tDone.\n",
"loading Papers.txt...\n",
"loading PaperAuthorAffiliations.txt...\n",
"process author num 435800 / 435822...time elapses: 1.0s"
@@ -301,11 +303,10 @@
"source": [
"OutFile_dir_item2item = r'data_folder/my/item2item'\n",
"create_dir(OutFile_dir_item2item)\n",
"Path_PaperFeature\n",
"item_set = load_has_feature_items(Path_PaperFeature)\n",
"\n",
"\n",
"Path_PaperReference = os.path.join(InFile_dir, 'PaperReferences.txt')\n",
"# Path_PaperReference = os.path.join(InFile_dir, 'PaperReferences.txt')\n",
"pair2CocitedCnt, pair2CoReferenceCnt = gen_paper_cocitation(Path_PaperReference)\n",
"\n",
"Path_paper_pair_cocitation = os.path.join(OutFile_dir_item2item, 'paper_pair_cocitation_cnt.csv')\n",
@@ -341,17 +342,17 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"negative sampling for file item2item_train.txt...\n",
"process line num 182600 / 182645...time elapses: 3.4s\tdone.\n",
"process line num 182200 / 182261...time elapses: 4.1s\tdone.\n",
"negative sampling for file item2item_valid.txt...\n",
"process line num 45500 / 45505...time elapses: 0.9s\tdone.\n"
"process line num 45800 / 45889...time elapses: 1.0s\tdone.\n"
]
}
],
@@ -378,42 +379,57 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Generating the full dataset will take a longer time, let it run in the background freely..."
"Generating the full dataset for the user2item recommendation task will take a longer time, so let's run it at the end of this notebook ..."
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"expanding user behaviors...\n",
"processing user number : 287000, time elapses: 2.7s done. Sample number in train / valid / test is 365242 / 23066 / 23066\n",
"Negative sampling for train...\n",
"sampling 365000 / 365242, time elapses: 283.6s \tdone.\n",
"Negative sampling for validation...\n",
"sampling 23000 / 23066, time elapses: 18.0s \tdone.\n",
"Negative sampling for test...\n",
"sampling 23000 / 23066, time elapses: 18.8s \tdone.\n",
"done.\n",
"time elapses is : 324.1s\n"
]
}
],
"source": [
"_t0 = time.time()\n",
"gen_experiment_splits(\n",
" Path_Author2ReferencePapers,\n",
" OutFile_dir_DKN,\n",
" Path_PaperFeature,\n",
" item_ratio=1.0,\n",
" tag='full'\n",
") \n"
" item_ratio=0.2,\n",
" tag='medium'\n",
") \n",
"_t1 = time.time()\n",
"print('time elapses is : {0:.1f}s'.format(_t1 - _t0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "reco_gpu_kdd",
"language": "python",
"name": "reco_gpu_kdd"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
47 changes: 30 additions & 17 deletions scenarios/KDD2020-tutorial/step2_pretraining-embeddings.ipynb
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -77,15 +77,15 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"start to train word embedding... \tdone . \n",
"time elapses: 649.8s\n"
"time elapses: 526.3s\n"
]
}
],
@@ -123,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -132,16 +132,16 @@
"text": [
"/data/home/jialia/jialia/kdd2020tutorial/formal_02/recommenders/scenarios/KDD2020-tutorial\n",
"fatal: destination path 'Fast-TransX' already exists and is not an empty directory.\n",
"epoch 0 454690.656250\n",
"epoch 1 376927.000000\n",
"epoch 2 344530.656250\n",
"epoch 3 315695.781250\n",
"epoch 4 290692.281250\n",
"epoch 5 268658.906250\n",
"epoch 6 250159.546875\n",
"epoch 7 231006.828125\n",
"epoch 8 215869.140625\n",
"epoch 9 200701.406250\n"
"epoch 0 464878.218750\n",
"epoch 1 392123.312500\n",
"epoch 2 361906.625000\n",
"epoch 3 315392.156250\n",
"epoch 4 310050.875000\n",
"epoch 5 281908.250000\n",
"epoch 6 271810.968750\n",
"epoch 7 240873.968750\n",
"epoch 8 237960.375000\n",
"epoch 9 221742.484375\n"
]
}
],
@@ -166,9 +166,22 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'EMBEDDING_LENGTH' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-867053a0e641>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mcontext_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mOutFile_dir_KG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'context2vec.vec'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mkg_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mOutFile_dir_KG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'train2id.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mgen_context_embedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mentity_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontext_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkg_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/data/home/jialia/jialia/kdd2020tutorial/formal_02/recommenders/scenarios/KDD2020-tutorial/utils/task_helper.py\u001b[0m in \u001b[0;36mgen_context_embedding\u001b[0;34m(entity_file, context_file, kg_file)\u001b[0m\n\u001b[1;32m 516\u001b[0m \u001b[0mfp_entity\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mentity_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfp_entity\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 518\u001b[0;31m \u001b[0mlinesplit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mEMBEDDING_LENGTH\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 519\u001b[0m \u001b[0mlinesplit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlinesplit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0mentity_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mentity_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlinesplit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'EMBEDDING_LENGTH' is not defined"
]
}
],
"source": [
"##### build context embedding\n",
"EMBEDDING_LENGTH = 32\n",
@@ -180,7 +193,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Loading