Skip to content

Commit

Permalink
files addes with peter
Browse files Browse the repository at this point in the history
  • Loading branch information
robertour committed Apr 11, 2019
1 parent d2440c5 commit 61492c1
Show file tree
Hide file tree
Showing 4 changed files with 11,736 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
279 changes: 279 additions & 0 deletions notebooks/1_download_rev_content.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import traceback"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def get_contents(baseurl, content, start_rev_id, end_rev_id=\"\"):\n",
" content_url = os.path.join(baseurl, \"rev_content\", content, str(start_rev_id)+\"/\")\n",
" if end_rev_id:\n",
" content_url = os.path.join(content_url, str(end_rev_id)+\"/\")\n",
" params = { \"o_rev_id\": \"false\", \"editor\": \"false\", \"token_id\": \"true\", \"in\": \"false\", \"out\": \"false\" }\n",
" try:\n",
" response = requests.get(content_url, params= params)\n",
" if response.status_code == requests.codes.ok: \n",
" response = response.json()\n",
" if \"revisions\" in response.keys() :\n",
" return response[\"revisions\"]\n",
" elif \"revisions\" not in response.keys() : \n",
" raise AttributeError(\"Server did not return revisions key it returned \\t\"+response.keys())\n",
" elif response.status_code != requests.codes.ok : \n",
" print(content_url)\n",
" raise AttributeError(\"Server returned bad code\\t\"+response.status_code)\n",
" except:\n",
" print(traceback.format_exc())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def tokens_to_df(tokens):\n",
" tokens.insert(0, {'token_id':-1, 'str': \"{st@rt}\"})\n",
" tokens.append({'token_id':-2, 'str': \"{$nd}\"})\n",
" return pd.DataFrame(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def save_content(revison_series, filename, content, step=200, baseurl=\"https://api.wikiwho.net/en/api/v1.0.0-beta/\"):\n",
" end_index = revison_series.size\n",
" from_index = 0\n",
" with pd.HDFStore(filename, 'a') as store:\n",
" try:\n",
" for to_index in range(0, end_index, step): \n",
" rev_contents = get_contents(baseurl, content, str(revison_series[from_index]), str(revison_series[to_index]))\n",
" from_index = to_index\n",
" for rev_content in rev_contents:\n",
" key = \"r\"+list(rev_content.keys())[0]\n",
" df = tokens_to_df(list(rev_content.values())[0][\"tokens\"])\n",
" store.put(key, df, table=False)\n",
" to_index = from_index + (end_index-1)%step\n",
" rev_contents = get_contents(baseurl, content, str(revison_series[from_index]), str(revison_series[to_index]))\n",
" rev_contents.extend(get_contents(baseurl, content, str(revison_series[to_index])))\n",
" for rev_content in rev_contents:\n",
" key = \"r\"+list(rev_content.keys())[0]\n",
" df = tokens_to_df(list(rev_content.values())[0][\"tokens\"])\n",
" store.put(key, df, table=False)\n",
" except:\n",
" print(\"problem \", traceback.format_exc())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def save_article(article_name, baseurl=\"https://api.wikiwho.net/en/api/v1.0.0-beta/\", save_dir = \"../data/content\", step=200):\n",
" params = {\"editor\": \"true\", \"timestamp\": \"true\"}\n",
" filename = article_name + \".h5\"\n",
" revisions_url = os.path.join( baseurl, \"rev_ids\", article_name+\"/\")\n",
" response = requests.get(revisions_url, params= params)\n",
" revisons_list = response.json()[\"revisions\"]\n",
" rev_list_df = pd.DataFrame(revisons_list)\n",
" save_path = os.path.join(save_dir, filename)\n",
" \n",
" all_content_url = os.path.join(baseurl, \"all_content\", article_name +\"/\")\n",
" params = { \"o_rev_id\": \"true\", \"editor\": \"false\", \"token_id\": \"true\", \"in\": \"true\", \"out\": \"true\" }\n",
" all_rev_data = requests.get(all_content_url, params= params)\n",
" all_tokens_df = pd.DataFrame( all_rev_data.json()[\"all_tokens\"] )\n",
" \n",
" with pd.HDFStore(save_path, 'a') as store:\n",
" store.put(\"rev_list\", rev_list_df, table=False)\n",
" store.put(\"all_tokens\", all_tokens_df, table=False)\n",
"\n",
" save_content(rev_list_df[\"id\"], save_path, article_name, step=step)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def tokens_to_list(tokens):\n",
" token_ids = [ token[\"token_id\"] for token in tokens ]\n",
"\n",
" token_ids.insert(0, -1)\n",
" token_ids.append(-2)\n",
" return pd.DataFrame(token_ids)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def save_content_list(revison_series, filename, content, step=200, baseurl=\"https://api.wikiwho.net/en/api/v1.0.0-beta/\"):\n",
" end_index = revison_series.size\n",
" from_index = 0\n",
" with pd.HDFStore(filename, 'a') as store:\n",
" try:\n",
" for to_index in range(0, end_index, step): \n",
" rev_contents = get_contents(baseurl, content, str(revison_series[from_index]), str(revison_series[to_index]))\n",
" from_index = to_index\n",
" for rev_content in rev_contents:\n",
" key = \"r\"+list(rev_content.keys())[0]\n",
" df = tokens_to_list(list(rev_content.values())[0][\"tokens\"])\n",
" store.put(key, df, table=False)\n",
" # \n",
" to_index = from_index + (end_index-1)%step\n",
" rev_contents = get_contents(baseurl, content, str(revison_series[from_index]), str(revison_series[to_index]))\n",
" rev_contents.extend(get_contents(baseurl, content, str(revison_series[to_index])))\n",
" for rev_content in rev_contents:\n",
" key = \"r\"+list(rev_content.keys())[0]\n",
" df = tokens_to_list(list(rev_content.values())[0][\"tokens\"])\n",
" store.put(key, df, table=False)\n",
" except:\n",
" print(\"problem \", traceback.format_exc())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 19.7 ms, sys: 4.04 ms, total: 23.7 ms\n",
"Wall time: 455 ms\n"
]
}
],
"source": [
"%%time\n",
"baseurl = \"https://api.wikiwho.net/en/api/v1.0.0-beta/\"\n",
"content = \"Bioglass\"\n",
"save_dir = \"../data/content\"\n",
"params = {\"editor\": \"true\", \"timestamp\": \"true\"}\n",
"filename = content + \".h5\"\n",
"save_path = os.path.join(save_dir, filename)\n",
"revisions_url = os.path.join( baseurl, \"rev_ids\", content+\"/\")\n",
"response = requests.get(revisions_url, params= params)\n",
"revisons_list = response.json()[\"revisions\"]\n",
"rev_list_df = pd.DataFrame(revisons_list)\n",
"# print(save_path)\n",
"# save_content_list(rev_list_df[\"id\"], save_path, content, step=200)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3 µs, sys: 0 ns, total: 3 µs\n",
"Wall time: 5.72 µs\n"
]
}
],
"source": [
"%%time\n",
"rev_contents = {}\n",
"# for revision in revisons_list[50:160]:\n",
"# rev_id = str(revision[\"id\"])\n",
"# response = get_contents(baseurl, content, rev_id)\n",
"# rev_contents[revision[\"id\"]] = [ token[\"token_id\"] for token in response[0][ rev_id ][\"tokens\"] ]\n",
"# with pd.HDFStore(filename, 'a') as store:\n",
"# store.put(\"rev_list\", rev_list_df, table=False)\n",
"\n",
"# %time save_content(rev_list_df[\"id\"], filename, content, step=200)\n",
"\n",
"# %time save_content(rev_list_df[\"id\"], filename, content, step=50)\n",
"\n",
"# %time save_content(rev_list_df[\"id\"], filename, content, step=20)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#article_series=pd.read_csv(\"../conflicted_article.csv\")[\"articles\"]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ulloaro/.virtualenvs/wikiconflict/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3296: PerformanceWarning: \n",
"your performance may suffer as PyTables will pickle object types that it cannot\n",
"map directly to c-types [inferred_type->mixed,key->block1_values] [items->['in', 'out', 'str']]\n",
"\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
}
],
"source": [
"save_article(\"bioglass\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# %%time\n",
"# for article in article_series[-3:]:\n",
"# print(article)\n",
"# save_article(article)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 61492c1

Please sign in to comment.