reformat_conv.py
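"""Reformat raw post/conversation/tag files into keyphrase-generation inputs.

For each split (train/valid/test) this script expects three parallel,
line-aligned text files under data/keyphrase/<dataset>/:
    <split>_post.txt  - one post per line
    <split>_conv.txt  - the matching conversation context per line
    <split>_tag.txt   - ';'-separated keyphrases per line
and writes JSON-lines .src/.tgt files under data/keyphrase/meng17/<dataset>/
plus a combined .json file under data/keyphrase/json/<dataset>/.

Usage:
    python reformat_conv.py --dataset twitter_conv
"""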
import os
import argparse
import json


def reformat(dataset, filename):
    # Parallel input files: post text, conversation context, and tags.
    postpath = "data/keyphrase/{}/{}_post.txt".format(dataset, filename)
    convpath = "data/keyphrase/{}/{}_conv.txt".format(dataset, filename)
    trgpath = "data/keyphrase/{}/{}_tag.txt".format(dataset, filename)

    # Build one source record per example by concatenating each post with
    # its conversation context.
    src_list = []
    with open(postpath, "r", encoding='utf-8') as post, \
            open(convpath, "r", encoding='utf-8') as conv:
        for idx, (text_1, text_2) in enumerate(zip(post, conv)):
            curr_dict = {}
            curr_dict["title"] = ""
            curr_dict["abstract"] = ""
            curr_dict["id"] = str(idx)
            curr_dict["src"] = text_1.strip() + " " + text_2.strip()
            src_list.append(curr_dict)

    savepath = "data/keyphrase/meng17/{}".format(dataset)
    if not os.path.exists(savepath):
        os.makedirs(savepath)

    # Write one JSON object per line (JSON Lines) to the .src file.
    savepath_file = "{}/{}_{}.src".format(savepath, dataset, filename)
    with open(savepath_file, 'w', encoding='utf-8') as json_file:
        for item in src_list:
            json.dump(item, json_file)
            json_file.write('\n')

    # Each tag line holds ';'-separated keyphrases; keep them as a list in
    # both the "keywords" and "tgt" fields.
    hashtag_list = []
    with open(trgpath, "r", encoding='utf-8') as trg:
        for idx, tags in enumerate(trg):
            curr_dict = {}
            curr_dict["keywords"] = tags.strip().split(';')
            curr_dict["id"] = str(idx)
            curr_dict["tgt"] = tags.strip().split(';')
            hashtag_list.append(curr_dict)

    # Matching .tgt file, again one JSON object per line.
    savepath_file = "{}/{}_{}.tgt".format(savepath, dataset, filename)
    with open(savepath_file, 'w', encoding='utf-8') as json_file:
        for item in hashtag_list:
            json.dump(item, json_file)
            json_file.write('\n')
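
# For reference, one line of the resulting .src / .tgt pair looks like
# (illustrative values, matching the dicts built in reformat() above):
#   {"title": "", "abstract": "", "id": "0", "src": "<post text> <conv text>"}
#   {"keywords": ["tag1", "tag2"], "id": "0", "tgt": ["tag1", "tag2"]}
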
def reformat_json(dataset, filename):
    # Same inputs as reformat(), but emit a single .json file: the
    # concatenated text goes in "abstract" and the raw (';'-separated)
    # tag string in "keywords".
    postpath = "data/keyphrase/{}/{}_post.txt".format(dataset, filename)
    convpath = "data/keyphrase/{}/{}_conv.txt".format(dataset, filename)
    trgpath = "data/keyphrase/{}/{}_tag.txt".format(dataset, filename)

    src_list = []
    with open(postpath, "r", encoding='utf-8') as post, \
            open(convpath, "r", encoding='utf-8') as conv, \
            open(trgpath, "r", encoding='utf-8') as trg:
        for idx, (text_1, text_2, tag) in enumerate(zip(post, conv, trg)):
            curr_dict = {}
            curr_dict["title"] = ""
            curr_dict["abstract"] = text_1.strip() + " " + text_2.strip()
            curr_dict["id"] = str(idx)
            curr_dict["keywords"] = tag.strip()
            src_list.append(curr_dict)

    savepath = "data/keyphrase/json/{}".format(dataset)
    if not os.path.exists(savepath):
        os.makedirs(savepath)

    savepath_file = "{}/{}_{}.json".format(savepath, dataset, filename)
    with open(savepath_file, 'w', encoding="utf-8") as json_file:
        for item in src_list:
            json.dump(item, json_file)
            json_file.write('\n')
def reformat_to_takg(dataset, filename):
    # Export plain-text sources for TAKG; the ../TAKG/data/<dataset>/
    # directory is assumed to already exist.
    postpath = "data/keyphrase/{}/{}_post.txt".format(dataset, filename)
    convpath = "data/keyphrase/{}/{}_conv.txt".format(dataset, filename)

    src_list = []
    with open(postpath, "r", encoding='utf-8') as post, \
            open(convpath, "r", encoding='utf-8') as conv:
        for text_1, text_2 in zip(post, conv):
            src_list.append(text_1.strip() + " " + text_2.strip())

    savepath = "../TAKG/data/{}/{}_{}.src".format(dataset, dataset, filename)
    with open(savepath, 'w', encoding='utf-8') as f:
        for s in src_list:
            f.write("%s\n" % s)
def main(config):
    dataset = config.dataset.lower()
    # Reformat every split; the TAKG export is left disabled here.
    for data_type in ["test", "valid", "train"]:
        reformat(dataset, data_type)
        reformat_json(dataset, data_type)
        # reformat_to_takg(dataset, data_type)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default="twitter_conv")
    config = parser.parse_args()
    main(config)
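
# A minimal sketch of reading back one emitted record (assumes the default
# --dataset twitter_conv and the "train" split written by reformat()):
#
#     import json
#     path = "data/keyphrase/meng17/twitter_conv/twitter_conv_train.src"
#     with open(path, encoding="utf-8") as f:
#         record = json.loads(f.readline())
#     # "src" holds the concatenated post + conversation text.
#     print(record["id"], record["src"][:60])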