This repository has been archived by the owner on May 9, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecidmap.py
394 lines (345 loc) · 14.3 KB
/
recidmap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
#!/usr/bin/python
"""
Tool to turn bibmatch output into MARCXML uploads
"""
import re
import os
import sys
import getopt
import tempfile
import time
from invenio.invenio_connector import InvenioConnector
from invenio.bibrecord import create_records, record_get_field_value, \
record_add_field, record_delete_field, \
record_has_field, record_xml_output
from invenio.xmlmarc2textmarc import get_sysno_from_record, create_marc_record, get_sysno_generator
re_matched_recid = re.compile("<!-- BibMatch-Matching-Found: http[s:]{1,2}\/\/.+?\/record\/([0-9]*) -->")
#re_original_recid = re.compile("<controlfield tag=\"001\">([0-9]*)<\/controlfield>")
re_original_id = re.compile("035__ .*?\$\$9CDS\$\$[az][\s]*([0-9]*).*?|035__ .*?\$\$[az][\s]*([0-9]*)\$\$9CDS.*?")
re_original_id_spires = re.compile("035__ .*?\$\$9SPIRES\$\$[az][\s]*([0-9]*).*?|035__ .*?\$\$[az][\s]*([0-9]*)\$\$9SPIRES.*?", re.IGNORECASE)
re_original_id_inspire = re.compile("035__ .*?\$\$9Inspire\$\$[az][\s]*([0-9]*).*?|035__ .*?\$\$[az][\s]*([0-9]*)\$\$9Inspire.*?", re.IGNORECASE)
re_original_id_cern = re.compile("595__ .*?CDS\-([0-9]*).*?")
re_matched_mode = re.compile("<!-- BibMatch-Matching-Mode: (.+?) -->")
re_record = re.compile("([0-9]{3}): (.*)")
IDENTIFIER_MAP = {'inspirebeta.net' : 'Inspire', 'cdsweb.cern.ch' : 'CDS'}
def get_record(server, recid, tries=0):
""" Get record by recid from passed Invenio server url """
if tries > 5:
return None
try:
rec = server.get_record(recid)
except URLError:
print "Error: Retrying"
time.sleep(2.0)
return get_record(server, recid, tries + 1)
return rec
def from_bibrec_to_marc(record, sysno="", options={'text-marc':1, 'aleph-marc':0}):
""" This function will convert a BibRec object into textmarc string """
if not sysno:
sysno_gen = get_sysno_generator()
sysno = sysno_gen.next()
return create_marc_record(record, sysno, options)
def inject_recid(data):
""" """
updated_records = []
for match in data:
original_record_bibrec = create_records(match)[0][0]
if not record_has_field(original_record_bibrec, '001'):
rec_id = re_matched_recid.findall(match)[0][1]
record_add_field(original_record_bibrec, tag='001', controlfield_value=rec_id)
updated_records.append(original_record_bibrec)
return updated_records
def parse_resultfile(data, recid_patterns=(re_original_id,), recids=[],
sysno_patterns=None, preserved_tags=[]):
"""
This function will look for the original recid and any matching recids in a
BibMatch result file containing references to matching records in comments before
every record in MARCXML format.
Returns a list of BibRec structure with found recids for original and matching records.
"""
record_pairs = []
sysno_gen = get_sysno_generator()
options = {'text-marc':1, 'aleph-marc':0}
for index, match in enumerate(data):
original_record_bibrec = create_records(match)[0][0]
if record_has_field(original_record_bibrec, '001'):
rec_id = record_get_field_value(original_record_bibrec, '001')
else:
sysno = sysno_gen.next()
original_record_marc = create_marc_record(original_record_bibrec, sysno, options)
rec_id = ""
for pattern in recid_patterns:
matches = pattern.findall(original_record_marc)
if len(matches) > 0:
rec_id = matches[0]
break
if recids:
matching_result_recids = [recids[index]]
else:
matching_result_recids = re_matched_recid.findall(match)
matching_result_sysnos = []
preserved_fields = {}
print preserved_tags
for tag in preserved_tags:
try:
print 'doing it' + tag
preserved_fields[tag] = original_record_bibrec[tag]
except KeyError:
pass
record_pairs.append((rec_id, matching_result_recids, matching_result_sysnos, preserved_fields))
return record_pairs
def parse_noresultfile(data, recid_patterns=(re_original_id,), sysno_patterns=None):
"""
This function will look for the original recid in 001 and any matching recids
from given regular expression patterns in the textmarc format of given record.
Returns a list of BibRec structure with found recids for original and matching records.
"""
record_pairs = []
sysno_gen = get_sysno_generator()
options = {'text-marc':1, 'aleph-marc':0}
for match in data:
original_record_bibrec = create_records(match)[0][0]
rec_id = record_get_field_value(original_record_bibrec, '001')
sysno = sysno_gen.next()
original_record_marc = create_marc_record(original_record_bibrec, sysno, options)
matching_result_recids = []
for pattern in recid_patterns:
matches = pattern.findall(original_record_marc)
for match in matches:
if type(match) is tuple:
for res in match:
if res != "":
matching_result_recids = [res]
break
elif type(match) is str:
matching_result_recids = [match]
break
if len(matching_result_recids) > 0:
break
matching_result_sysnos = []
for pattern in sysno_patterns:
matches = pattern.findall(original_record_marc)
for match in matches:
if type(match) is tuple:
for res in match:
if res != "":
matching_result_sysnos = [res]
break
elif type(match) is str:
matching_result_sysnos = [match]
break
if len(matching_result_sysnos) > 0:
break
record_pairs.append((rec_id, matching_result_recids, matching_result_sysnos))
return record_pairs
def get_sysno_from_recid(server_url, recid):
"""
This function will look for a record with record ID - recid on server - server_url
and return the system number - sysno
"""
server = InvenioConnector(server_url)
rec = server.search_with_retry(p="001:%s" % (recid,))
try:
sysno = rec[0][970][0]['a'][0]
except (KeyError, IndexError):
return None
if 'SPIRES' in sysno:
sysno = sysno.split("-")[1]
elif 'CER' in sysno:
sysno = sysno.split("CER")[0]
return sysno
def get_recid_from_sysno(server_url, sysno):
"""
This function will look for a record with sysno on server - server_url
and return the record id
"""
server = InvenioConnector(server_url)
rec = server.search_with_retry(p="970:%s" % (sysno.strip(),), of='id')
print rec
try:
recid = str(rec[0])
except (KeyError, IndexError):
return ""
return recid
def load_file(filename):
"Loads a file's contents into a string"
fd = open(filename)
res = fd.read()
fd.close()
return res
def get_rec_from_fieldlist(field_list):
""" """
rec = {}
for tag, value in field_list:
if value != "":
rec[tag] = value
return rec
def get_recid_from_result(rec):
""" """
recid = None
if '001' in rec:
recid = rec["001"]
elif '970' in rec:
recid = get_recid_from_sysno('http://cdsweb.cern.ch', eval(rec["970"])[0])
elif '035' in rec:
i = 0
repno_list = eval(rec["035"])
for repno in repno_list:
if repno == 'CDS':
recid = repno_list[i + 1]
break
i += 1
return recid
def main():
usage = """
recidmap.py [--id, --reverse] bibmatch.result
Tool to turn bibmatch output into MARCXML append uploads by mapping
bibmatch match recid to 035 and having 001 as 001
-i, --ident
specify $$9 identifier to be used in 035
-n, --nomatch
specify nomatch parsing
-a, --alter
add any found recid to 001. Use when no 001 is present in original records.
-r, --reverse
reverse the order of which recid to be mapped to 001 and 035
-l, --recids
give a file containing recids to match IN ORDER to found original record IDs. One rec-id per line.
Alternatively give a range of recids like this, 1234->1238
-b, --bare
use when input records are not from BibMatch output.
-p, --preserve
Fields to preserve from the original record by tags, comma seperated values
-c, --cern
look for CERN specific identifiers
--inspire
look for Inspire specific identifiers
--spires
look for SPIRES specific identifiers
Examples:
Add Inspire uploaded record IDs to CDS:
$ recidmap.py -i Inspire --bare --recids=recids.txt --cern ./cds_theses_non_particle/desy-theses/inspire.cds-desy-theses.insert.xml
$ recidmap.py -i Inspire --bare --recids='123->204' --cern ./cds_theses_non_particle/desy-theses/inspire.cds-desy-theses.insert.xml
Add matched record IDs to CDS:
$ recidmap.py -i Inspire --inspire bibmatch.matched
"""
try:
opts, args = getopt.getopt(sys.argv[1:], "hrni:al:pbc", ["help", "reverse", "nomatch", "ident=", "alter", "recids=", "preserve=", "bare", "cern", "inspire", "spires"])
except getopt.GetoptError, e:
sys.stderr.write("Error:" + str(e) + "\n")
print usage
sys.exit(1)
reverse = False
nomatch = False
inject = False
identifier = None
regexps = [re_original_id]
server_url = "http://inspirebeta.net"
sysnos = []
recid_file = ""
recids = []
preserved_tags = []
bare = False
for opt, opt_value in opts:
if opt in ['-h', '--help']:
print usage
sys.exit(0)
if opt in ['-n', '--nomatch']:
nomatch = True
if opt in ['-r', '--reverse']:
reverse = True
if opt in ['-i', '--ident']:
identifier = opt_value
if opt in ['-a', '--alter']:
inject = True
if opt in ['-b', '--bare']:
bare = True
if opt in ['-l', '--recids']:
recid_file = opt_value
if opt in ['-p', '--preserve']:
preserved_tags = opt_value.split(',')
if opt in ['-c', '--cern']:
regexps.append(re_original_id_cern)
if opt in ['--inspire']:
regexps.append(re_original_id_inspire)
if opt in ['--spires']:
sysnos.append(re_original_id_spires)
filename = args[0]
sys.stderr.write("Reading in %s...\n" % (filename,))
data = load_file(filename)
if '->' in recid_file:
sys.stderr.write("Reading in record IDs from %s...\n" % (recid_file,))
start = int(recid_file.split('->')[0].strip())
end = int(recid_file.split('->')[1].strip())
sys.stderr.write("Start ID: %d, End ID: %d...\n" % (start, end))
if start > end:
sys.stderr.write("Error: Start ID larger then end ID...\n")
sys.exit(1)
recids = [str(recid) for recid in range(start, end + 1)]
sys.stderr.write("Found %d record IDs...\n" % (len(recids),))
elif recid_file:
sys.stderr.write("Reading in record IDs from %s...\n" % (recid_file,))
recids = [str(recid) for recid in load_file(recid_file).split('\n') if recid != ""]
sys.stderr.write("Found %d record IDs...\n" % (len(recids),))
nomatch_records = []
sys.stderr.write("Parsing data in %s...\n" % (filename,))
if bare:
records = [rec + "</record>" for rec in data.split("</record>") if rec != "" and rec != "\n" and rec != "\n</collection>\n"]
else:
records = data.split("<!-- BibMatch-Matching-Results: -->\n")[1:]
sys.stderr.write("Found %d records...\n" % (len(records),))
if recids and len(recids) != len(records):
sys.stderr.write("Error parsing data. Records and given Ids is not synced....\n")
sys.exit(1)
if inject:
output_records = inject_recid(records)
else:
if nomatch:
sys.stderr.write("Nomatch parsing...\n")
record_pairs = parse_noresultfile(records, regexps, sysnos)
else:
sys.stderr.write("Match parsing...\n")
record_pairs = parse_resultfile(records, regexps, recids, preserved_tags=preserved_tags)
sys.stderr.write("Found %d\n" % (len(record_pairs),))
sys.stderr.write("Preparing data...\n")
output_records = []
for recid, match_recids, match_sysno, preserve in record_pairs:
print identifier, recid, match_recids, match_sysno, preserve
if match_recids == []:
if match_sysno == []:
# No matches
print "NOMATCH"
break
else:
match_recid = get_recid_from_sysno(server_url, "SPIRES-%s" % (match_sysno[0].strip(),))
if match_recid == "":
# No recid found
nomatch_records.append((recid, match_recid))
continue
match_recids.append(match_recid)
if identifier is None:
ident = IDENTIFIER_MAP[match_recids[0]]
else:
ident = identifier
rec = {}
if reverse:
record_add_field(rec, '001', controlfield_value=match_recids[0].strip())
record_add_field(rec, '035', subfields=[('9', ident), ('a', recid.strip())])
else:
record_add_field(rec, '001', controlfield_value=recid)
record_add_field(rec, '035', subfields=[('9', ident), ('a', match_recids[0].strip())])
for key, value in preserve.iteritems():
rec[key] = value
output_records.append(rec)
timestamp = time.strftime("%Y%m%d%H%M%S")
print output_records
if len(output_records) > 0:
name = "%s_upload_match.xml" % (timestamp,)
fd = open(name, 'w')
fd.write("<collection>\n" + "\n".join([record_xml_output(r) for r in output_records]) + "\n</collection>")
fd.close()
sys.stderr.write("Wrote %s\n" % (name,))
print "\n".join([str(rec) for rec in nomatch_records])
sys.stderr.write("Done.\n")
if __name__ == "__main__":
main()