-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser_functions.py
100 lines (82 loc) · 2.9 KB
/
parser_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import json
from collections import Counter, defaultdict
from urllib.parse import parse_qs, urlparse
def jloadfunc(filepath):
    """Read a JSON document from ``filepath`` and return the parsed value.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark, if
    present, is dropped before parsing.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(filepath, encoding='utf-8-sig') as handle:
        return json.load(handle)
def rubrics_iterator(jsondict, pathdict):
    """Copy rubric names from ``jsondict`` into ``pathdict``.

    Args:
        jsondict: Source record; its ``'rubrics'`` entry is iterated and each
            item is read through ``.get('name')`` / ``.get('kind')``.
        pathdict: Output mapping, modified in place. Receives
            ``'primary rubric'`` (names of items whose ``'kind'`` equals
            ``'primary'``) and ``'rubrics'`` (names of all items), in that
            insertion order.

    A missing ``'name'`` contributes an empty string; a missing ``'rubrics'``
    entry yields two empty lists.
    """
    entries = jsondict.get('rubrics', '')
    every_name = [entry.get('name', '') for entry in entries]
    primary_names = [
        entry.get('name', '')
        for entry in entries
        if entry.get('kind', '') == 'primary'
    ]
    pathdict['primary rubric'] = primary_names
    pathdict['rubrics'] = every_name
def adm_div_iterator(jsondict, pathdict):
    """Store the country and city names of a record in ``pathdict``.

    Args:
        jsondict: Source record; its ``'adm_div'`` entry is iterated and each
            item contributes a ``type -> name`` pair.
        pathdict: Output mapping, modified in place. Receives ``'country'``
            and ``'city'``; each is an empty string when no item of that
            type exists.

    When several items share a type, the last one wins.
    """
    name_by_type = {
        division.get('type', ''): division.get('name', '')
        for division in jsondict.get('adm_div', '')
    }
    for level in ('country', 'city'):
        pathdict[level] = name_by_type.get(level, '')
def whatsappphone_extractor(str):
    """Return the phone number embedded in a WhatsApp link as ``'+<number>'``.

    Two link shapes are recognised:

    * ``https://wa.me/<number>`` -- the number is the URL path;
    * ``https://api.whatsapp.com/send?phone=<number>`` -- the number is the
      ``phone`` query parameter.

    Args:
        str: The link text. The name shadows the builtin ``str``; it is kept
            so existing keyword callers continue to work, which means the
            builtin must not be used inside this function.

    Returns:
        The number prefixed with exactly one ``'+'``, or ``''`` when the link
        carries no number at all (previously a bare ``'+'`` was returned).
    """
    link = urlparse(str)
    # Drop surrounding slashes and any '+' already present so the result
    # never ends with '/' or starts with '++'.
    number = link.path.strip('/').lstrip('+')
    if not number.isdigit():
        # The path is not a number (e.g. '/send'): prefer the ``phone`` query
        # parameter when the link provides one. A '+' in a query string
        # decodes to a space, hence the strip().
        from_query = parse_qs(link.query).get('phone')
        if from_query:
            number = from_query[0].strip().lstrip('+')
    return '+' + number if number else ''
def manage_duplicates(pairs):
    """Turn ``(key, value)`` pairs into a dict with numbered, unique keys.

    Each key is suffixed with a space and its 1-based occurrence number, so
    repeated keys do not overwrite each other:
    ``[('phone', 'a'), ('phone', 'b')]`` becomes
    ``{'phone 1': 'a', 'phone 2': 'b'}``.

    Args:
        pairs: Iterable of ``(key, value)`` two-tuples.

    Returns:
        A new dict in the order the pairs were seen.
    """
    # A plain Counter already yields 0 for unseen keys; wrapping a
    # defaultdict in it added nothing.
    occurrences = Counter()
    numbered = {}
    for key, value in pairs:
        occurrences[key] += 1
        # format() rather than '+' so a non-string key cannot raise TypeError.
        numbered['{} {}'.format(key, occurrences[key])] = value
    return numbered
def contacts_iterator(jsondict, pathdict):
    """Flatten the contacts of a record into numbered keys on ``pathdict``.

    Every list-valued field of every group in ``jsondict['contact_groups']``
    is scanned for contact dicts. Each contact yields a ``(type, value)``
    pair:

    * ``'website'``  -> the contact's ``'url'``;
    * ``'whatsapp'`` -> the phone extracted from ``'value'`` by
      ``whatsappphone_extractor``;
    * anything else  -> the contact's ``'value'`` unchanged.

    The pairs are passed through ``manage_duplicates`` so that repeated types
    become ``'<type> 1'``, ``'<type> 2'``, ... and the result is merged into
    ``pathdict``.

    Args:
        jsondict: Source record. NOTE(review): groups presumably look like
            ``{'contacts': [...], ...}`` -- confirm against the data source.
        pathdict: Output mapping, modified in place.
    """
    pairs = []
    # ``or []`` also covers an explicit null for 'contact_groups'.
    for group in jsondict.get('contact_groups') or []:
        if not isinstance(group, dict):
            continue
        for entries in group.values():
            # Only lists can hold contact entries. Iterating a string or a
            # nested dict here used to crash with AttributeError on ``.get``.
            if not isinstance(entries, list):
                continue
            for contact in entries:
                if not isinstance(contact, dict):
                    continue
                kind = contact.get('type', '')
                if kind == 'website':
                    value = contact.get('url', '')
                elif kind == 'whatsapp':
                    value = whatsappphone_extractor(contact.get('value', ''))
                else:
                    value = contact.get('value', '')
                pairs.append((kind, value))
    pathdict.update(manage_duplicates(pairs))
def url_generator(jsondict, pathdict):
    """Build the 2GIS firm URL of a record and store it in ``pathdict``.

    The URL is ``'https://2gis.com/firm/'`` followed by the part of
    ``jsondict['id']`` that precedes the first underscore (the whole id when
    it contains none).

    Args:
        jsondict: Source record; must contain ``'id'`` (``KeyError``
            otherwise).
        pathdict: Output mapping, modified in place; receives ``'2GIS URL'``.
    """
    firm_id, _, _ = jsondict['id'].partition('_')
    pathdict['2GIS URL'] = 'https://2gis.com/firm/' + firm_id
def json_processer(filepath):
    """Load a JSON export and convert it into a list of flat record dicts.

    For every item in the loaded data a dict is built from the item's
    ``'org'`` and ``'name_ex'`` sub-dicts and ``'address_name'``, then
    extended in place by ``rubrics_iterator``, ``adm_div_iterator``,
    ``contacts_iterator`` and ``url_generator``.

    Records are de-duplicated on their ``'2GIS URL'``: when several records
    share a URL, the last one is kept, at the position where that URL first
    appeared.

    Args:
        filepath: Path of the JSON file, read via ``jloadfunc``.

    Returns:
        List of record dicts, one per distinct ``'2GIS URL'``.
    """
    unique_by_url = {}
    for item in jloadfunc(filepath):
        org = item['org']
        record = {
            'organization id': 'ID' + org.get('id', ''),
            'branch count': org.get('branch_count', ''),
            'name': org.get('name', ''),
            'description': item['name_ex'].get('extension', ''),
        }
        # The helpers add keys to ``record`` in place; the call order fixes
        # the key order of the resulting dict.
        rubrics_iterator(item, record)
        record['address name'] = item.get('address_name', '')
        adm_div_iterator(item, record)
        contacts_iterator(item, record)
        url_generator(item, record)
        # Re-assigning an existing key keeps its original position but
        # replaces the stored record.
        unique_by_url[record['2GIS URL']] = record
    return list(unique_by_url.values())