-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_data_csv.py
89 lines (70 loc) · 2.7 KB
/
convert_data_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 3 21:22:38 2014
convert the data from wikivoyage to a csv file
@author: dvats
"""
import re
import pandas as pd
import numpy as np
def CleanData(text):
    """
    Return *text* with known section-heading tags and square brackets removed.

    Every exact occurrence of ``<h2>NAME</h2>`` is deleted, where NAME is one
    of the section names listed below. Afterwards all ``[`` and ``]``
    characters are deleted. Any other markup is left untouched.
    """
    # Build the literal tags once, in the same order they are removed.
    heading_tags = [
        "<h2>" + name + "</h2>"
        for name in ("See", "Do", "Learn",
                     "Eat", "Drink", "Buy",
                     "GuideClass", "LinkBefore", "Region")
    ]
    cleaned = text
    for tag in heading_tags:
        cleaned = cleaned.replace(tag, "")
    # Brackets are stripped only after the heading tags have been removed.
    for bracket in ("[", "]"):
        cleaned = cleaned.replace(bracket, "")
    return cleaned
def GetPattern(text, pattern):
    """
    Return the double-quoted value that follows *pattern* in *text*.

    *pattern* is handed to ``re.search`` as a regular expression and only its
    first match is used. The result is the text between the first ``"`` after
    that match and the next ``"`` (or the end of *text* if there is no closing
    quote). For example, with ``text = '<doc id="12" title="Foo">'`` and
    ``pattern = "id="`` the result is ``"12"``.

    Raises:
        ValueError: if *pattern* does not occur in *text*, or if no ``"``
            follows the match.
    """
    match = re.search(pattern, text)
    if match is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on None.
        raise ValueError("pattern %r not found in %r" % (pattern, text))

    # Everything after the opening quote; `quote` is empty when none exists.
    _, quote, remainder = text[match.end():].partition('"')
    if not quote:
        raise ValueError(
            "no quoted value after pattern %r in %r" % (pattern, text))

    # Keep only the part before the closing quote.
    return remainder.partition('"')[0]
def GetMainData(text):
    """
    Return a dictionary mapping each <h>HEADING</h> label to its section text.

    The first line of *text* is discarded (the script in this file passes
    blocks whose first line is the ``<doc ...>`` tag) and every ``</doc>`` is
    removed. The remainder is cut at each ``<h>`` occurrence; anything before
    the first ``<h>`` is ignored. For every section, the key is its first line
    with ``<h>`` / ``</h>`` removed and surrounding whitespace stripped. The
    value is the rest of the section with newlines and carriage returns
    replaced by spaces and surrounding whitespace stripped.

    A heading with no following text maps to ``""``. If the same label occurs
    more than once, the last section wins. Text without any ``<h>`` yields an
    empty dictionary.
    """
    # partition() never fails, so a text with no newline has an empty body.
    _, _, body = text.partition('\n')
    body = body.replace("</doc>", "")

    # Offsets of every heading, plus the end of the text as a final boundary.
    starts = [m.start() for m in re.finditer('<h>', body)]
    starts.append(len(body))

    MainData = {}
    # Each adjacent pair of offsets delimits one section.
    for begin, end in zip(starts, starts[1:]):
        # First line is the <h>LABEL</h> line; the rest is the section text
        # (empty when the heading is the last line of the document).
        heading_line, _, section = body[begin:end].partition('\n')
        label = heading_line.replace("<h>", "").replace("</h>", "").strip()
        MainData[label] = section.replace('\n', " ").replace('\r', " ").strip()
    return MainData
# Read the whole dump. The context manager closes the handle as soon as the
# read finishes (or fails) instead of leaving it open for the rest of the run.
with open("./data/wikivoyage_data") as ff:
    all_data = ff.read()

# find all instances of <doc and </doc>
start_doc = [m.start() for m in re.finditer('<doc', all_data)]
end_doc = [m.end() for m in re.finditer('</doc>', all_data)]

# separate the data: one string per document, from "<doc" through "</doc>".
# The i-th start is paired with the i-th end, so an IndexError here means the
# file has more "<doc" openings than "</doc>" closings.
sep_data = [all_data[sd:end_doc[i]] for i, sd in enumerate(start_doc)]

# The first line of each document holds the id / title / url attributes;
# extract it once instead of re-splitting for every attribute.
doc_headers = [ss.split('\n', 1)[0] for ss in sep_data]
id_data = np.array([GetPattern(hd, "id=") for hd in doc_headers])
title_data = np.array([GetPattern(hd, "title=") for hd in doc_headers])
url_data = np.array([GetPattern(hd, "url=") for hd in doc_headers])

final_data = {}
final_data["title"] = title_data
final_data["url"] = url_data
final_data["id"] = id_data

num_rows = len(id_data)
section_headings = ["See", "Do", "Learn",
                    "Eat", "Drink", "Buy",
                    "GuideClass", "LinkBefore", "Region"]
# One column per section heading, pre-filled with empty strings so documents
# lacking a section keep '' in that column.
for h in section_headings:
    final_data[h] = [''] * num_rows

# iterate over all elements in sep_data
for i, ss in enumerate(sep_data):
    temp_data = GetMainData(ss)  # return a dictionary
    # NOTE: a heading that is not one of the columns created above raises
    # KeyError here.
    for k, section_text in temp_data.items():
        final_data[k][i] = section_text

# build the table; writing it to a csv file is currently disabled
final_data = pd.DataFrame(final_data)
#final_data.to_csv("./data/TravelData.csv")