-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_rvk_index.py
100 lines (73 loc) · 3.36 KB
/
build_rvk_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# The MIT License
#
# Copyright 2018-2020 Hans-Georg Becker <https://orcid.org/0000-0003-0432-294X>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import collections
from os import listdir
from os.path import isfile, join
import simplejson as json
import subprocess
from urllib.parse import quote_plus
import xmltodict
import config_rvk as config
def url_encode(query=''):
    """Return *query* percent-encoded for use inside a URL.

    The encoding is delegated to ``urllib.parse.quote_plus``: spaces become
    ``+`` and reserved characters are escaped as ``%XX``.

    :param query: text to encode; ``None`` is treated like the empty string
        so a missing value yields ``''`` instead of a ``TypeError``.
    :return: the encoded string.
    """
    if query is None:
        return ''
    return quote_plus(query)
def build_index_data(node):
    """Serialize *node* and all of its descendants as line-delimited JSON.

    Each node becomes one JSON document on its own line with the keys
    ``id`` (URL-encoded notation), ``notation``, ``label`` and, when the
    node has a non-empty ``register`` entry, ``register``.  Documents are
    emitted in pre-order: a node first, then its children in document order.

    :param node: mapping for one parsed ``<node>`` element; must provide
        ``@notation`` and ``@benennung`` and may provide ``register`` and
        ``children`` (whose ``node`` entry is either a list of child
        mappings or a single child mapping).
    :return: the concatenated JSON lines, each terminated by ``\\n``.
    :raises KeyError: if a node lacks ``@notation`` or ``@benennung``, or a
        non-empty ``children`` entry has no ``node`` key.
    """
    lines = []
    # Explicit stack instead of recursion + string concatenation: the output
    # is collected once and joined at the end.
    pending = [node]
    while pending:
        current = pending.pop()

        document = {
            'id': url_encode(current['@notation']),
            'notation': current['@notation'],
            'label': current['@benennung'],
        }
        if current.get('register'):
            document['register'] = current['register']
        lines.append('%s\n' % json.dumps(document))

        # build children
        children = current.get('children')
        if not children:
            continue
        child_nodes = children['node']
        if isinstance(child_nodes, list):
            # Push in reverse so the first child is popped (and emitted) first.
            pending.extend(reversed(child_nodes))
        elif isinstance(child_nodes, dict):
            # A lone child is parsed as a mapping, not a list.  Checking for
            # ``dict`` covers both OrderedDict and the plain dict returned by
            # newer xmltodict releases, so the subtree is never dropped.
            pending.append(child_nodes)

    return ''.join(lines)
def start_solrbulk(data_dir='', solrbulk_bin='/usr/sbin/solrbulk'):
    """Index every regular file in *data_dir* with solrbulk.

    solrbulk (https://github.com/miku/solrbulk) is invoked once per file
    against ``config.SOLR_URL``; example of a manual call::

        solrbulk -verbose -server https://localhost:7007/solr/biblio solr_add_data.json

    Only the first invocation passes ``-purge``.  The ``CompletedProcess``
    of each call is printed; a non-zero exit status does not stop the loop.

    :param data_dir: directory whose regular files are indexed
        (sub-directories are skipped).
    :param solrbulk_bin: path of the solrbulk executable.
    """
    # Sorted so that the file which triggers -purge is deterministic
    # (listdir returns entries in arbitrary order).
    solr_index_files = sorted(
        f for f in listdir(data_dir) if isfile(join(data_dir, f))
    )

    for position, file_name in enumerate(solr_index_files):
        command = [solrbulk_bin]
        if position == 0:
            # -purge only once, otherwise every file would be indexed into a
            # freshly purged index (see the solrbulk README for the flag).
            command.append('-purge')
        command += ['-server', str(config.SOLR_URL), join(data_dir, file_name)]

        result = subprocess.run(command, stdout=subprocess.PIPE)
        print(result)
def main():
    """Convert the RVK XML dump to a JSON index file and index it in Solr.

    Reads ``config.RVK_XML``, writes ``rvk.index.json`` into
    ``config.SOLR_DATA`` and then runs solrbulk on that directory.  If the
    XML cannot be read, the error is printed and nothing is written or
    indexed.
    """
    # read rvko XML data
    with open(config.RVK_XML, 'rb') as xml_file:
        try:
            data = xmltodict.parse(xml_file)['classification_scheme']
        except Exception as e:
            data = None
            print("ERROR: Couldn't read %s as XML." % config.RVK_XML)
            print(e)

    if not data:
        return

    # A single top-level <node> is parsed as a mapping rather than a list;
    # iterating it directly would yield its keys instead of nodes.
    top_nodes = data['node']
    if not isinstance(top_nodes, list):
        top_nodes = [top_nodes]

    # join() keeps the file inside SOLR_DATA whether or not the configured
    # path ends with a slash.
    with open(join(config.SOLR_DATA, 'rvk.index.json'), 'w') as index_file:
        for data_node in top_nodes:
            index_file.write(build_index_data(data_node))

    # Only index when fresh data was written above.
    start_solrbulk(data_dir=config.SOLR_DATA)


if __name__ == '__main__':
    main()