-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adds change object calculation script
- Loading branch information
Peter Heuz
committed
Apr 11, 2019
1 parent
f8af451
commit d2440c5
Showing
1 changed file
with
91 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import sys,os | ||
sys.path.append("../") | ||
import pandas as pd | ||
import numpy as np | ||
import requests | ||
import pickle | ||
import traceback | ||
from scripts.wiki import Wiki,Revision | ||
|
||
import ipdb | ||
|
||
def create_change_object(article_name, content_dir = "../data/content/", | ||
change_object_dir = "../data/change objects/", epsilon_size=30, save=False): | ||
|
||
content_filepath = os.path.join(content_dir, article_name+".h5") | ||
change_object_filepath = os.path.join(change_object_dir, article_name+".pkl") | ||
|
||
with pd.HDFStore(content_filepath, 'r') as store: | ||
#retrieving all rev list and change object from file | ||
rev_list = store.get("rev_list") | ||
all_rev = store.get("all_tokens") | ||
all_tokens = all_rev.to_dict(orient="records") | ||
|
||
#making revision objects | ||
revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1) | ||
revs.index = rev_list.id | ||
|
||
# Getting first revision object and adding content ot it | ||
from_rev_id = revs.index[0] | ||
wiki = Wiki(2345, article_name, revs, all_tokens) | ||
wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)] | ||
# adding content to all other revision and finding change object between them. | ||
|
||
for to_rev_id in list(revs.index[1:]): | ||
key="r"+str(to_rev_id) | ||
to_rev_content = store[key] | ||
wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size) | ||
from_rev_id = to_rev_id | ||
|
||
if save: | ||
with open(change_object_filepath, "wb") as file: | ||
pickle.dump(wiki, file) | ||
|
||
return wiki | ||
|
||
|
||
|
||
if __name__ == "__main__": | ||
article_series=pd.read_csv("../conflicted_article.csv")["articles"] | ||
for article in article_series[13:]: | ||
print(article) | ||
|
||
content_dir = "../data/content/" | ||
|
||
len_file = article + "_rev_len.h5" | ||
len_file_path = os.path.join(content_dir, len_file) | ||
|
||
filename = article + ".h5" | ||
filepath = os.path.join(content_dir, filename) | ||
|
||
with pd.HDFStore(filepath, 'r') as store: | ||
#retrieving all rev list and change object from file | ||
rev_list = store.get("rev_list")["id"].values.tolist() | ||
keys = ["r" + str(rev) for rev in rev_list] | ||
rev_len_list = [store.get(key).shape[0] for key in keys] | ||
rev_len_df = pd.DataFrame({"rev_id":rev_list[:-1], "length": rev_len_list[:-1]}) | ||
rev_len_df.to_hdf(len_file_path, "rev_len") | ||
|
||
wiki = create_change_object(article, save=False) | ||
change_objects = [] | ||
wiki.revisions.iloc[:-1].apply(lambda revision: change_objects.append(revision.change_df)) | ||
timestamp_s = pd.to_datetime([ rev.timestamp for rev in wiki.revisions.values.ravel().tolist()]) | ||
time_gap = pd.to_timedelta(timestamp_s[1:]-timestamp_s[:-1]) | ||
|
||
rev_ids = [ rev.id for rev in wiki.revisions.tolist()] | ||
from_rev_ids = rev_ids[:-1] | ||
to_rev_ids= rev_ids[1:] | ||
|
||
editor_s = [ rev.editor for rev in wiki.revisions.tolist()] | ||
|
||
index = list(zip(*[from_rev_ids, to_rev_ids, timestamp_s.tolist()[1:], time_gap, editor_s[1:]])) | ||
cols = change_objects[0].columns | ||
change_df = pd.concat((x[cols] for x in change_objects), sort=False, keys=index, names=["from revision id", "to revision id", "timestamp", "timegap", "editor"]) | ||
change_object_dir = "../data/change objects/" | ||
try: | ||
os.makedirs(change_object_dir) | ||
except FileExistsError: | ||
pass | ||
change_dataframe_path = os.path.join(change_object_dir, article+"_change.h5") | ||
ipdb.set_trace() | ||
change_df.to_hdf(change_dataframe_path, key="data") |