forked from ckan/ckanext-spatial
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspatial_harvester.py
477 lines (385 loc) · 18 KB
/
spatial_harvester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
from six.moves.urllib.parse import urlparse, urlunparse, urlencode
from ckan import model
from ckan import plugins as p
from ckantoolkit import config
from ckan.plugins.core import SingletonPlugin, implements
from ckanext.spatial.harvesters.base import SpatialHarvester
from ckanext.spatial.interfaces import ISpatialHarvester
import logging
log = logging.getLogger(__name__)
class ISO19115SpatialHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A harvester for ISO19115 metadata.

    Implements ``ISpatialHarvester``, but still extends the concrete
    ``SpatialHarvester`` class because the fallback implementation below
    relies on inherited helpers (``_gen_new_name``, ``_save_object_error``,
    ``_is_wms``, ``_get_user_name``, ``extent_template``, ...).
    '''
    implements(ISpatialHarvester)

    # ISpatialHarvester

    # From parent SpatialHarvester
    def get_package_dict(self, context, data_dict):
        '''
        Allows to modify the dataset dict that will be created or updated.

        This is the dict that the harvesters will pass to the `package_create`
        or `package_update` actions. Extensions can modify it to suit their
        needs, adding or removing fields, modifying the default ones, etc.

        This method should always return a package_dict. Note that, although
        unlikely in a particular instance, this method could be implemented by
        more than one plugin.

        If a dict is not returned by this function, the import stage will be
        cancelled.

        :param context: Contains a reference to the model, eg to
                        perform DB queries, and the user name used for
                        authorization.
        :type context: dict
        :param data_dict: Available data. Contains four keys:

            * `package_dict`
              The default package_dict generated by the harvester. Modify this
              or create a brand new one.
            * `iso_values`
              The parsed ISO XML document values. These contain more fields
              that are not added by default to the ``package_dict``.
            * `xml_tree`
              The full XML etree object. If some values not present in
              ``iso_values`` are needed, these can be extracted via xpath.
            * `harvest_object`
              A ``HarvestObject`` domain object which contains a reference
              to the original metadata document (``harvest_object.content``)
              and the harvest source (``harvest_object.source``).

        :type data_dict: dict
        :returns: A dataset dict ready to be used by ``package_create`` or
                  ``package_update``
        :rtype: dict
        '''
        _dict = data_dict['package_dict']  # NOTE(review): currently unused
        _values = data_dict['iso_values']
        # _tree = data_dict['xml_tree']
        _object = data_dict['harvest_object']
        # _dict2 = elem2dict(_tree)
        # TODO delegate
        # self.source_config = context['config']
        try:
            # Preferred path: delegate to the csw_harvester plugin
            # implementation when that plugin is loaded.
            csw_harvester = p.get_plugin('csw_harvester')
            return csw_harvester.get_package_dict(_values, _object)
        except Exception as e:
            log.error('Failed to get package from base implementation:\n%r', str(e))
            # TODO readme (below)
            # Fallback: use the deprecated local implementation.
            return self._fault_tolerant_get_package_dict(_values, _object)

    def get_validators(self):
        '''
        Allows to register custom Validators that can be applied to harvested
        metadata documents.

        Validators are classes that implement the ``is_valid`` method. Check
        the `Writing custom validators`_ section in the docs to know more
        about writing custom validators.

        :returns: A list of Validator classes
        :rtype: list
        '''
        # Imported lazily so the validators module is only loaded on demand.
        import ckanext.spatial.harvesters.iso19115.validators as validators
        return [
            validators.ISO19115_Schema,
            validators.ISO19115_2_Schema,
            validators.ISO19115_1_Schema,
            validators.ISO19115_Schematron]

    # From parent SpatialHarvester
    # def transform_to_iso(self, original_document, original_format, harvest_object):
    #     '''
    #     Transforms an XML document to ISO 19139
    #     This method will be only called from the import stage if the
    #     harvest_object content is null and original_document and
    #     original_format harvest object extras exist (eg if an FGDC document
    #     was harvested).
    #     In that case, this method should do the necessary to provide an
    #     ISO 19139 like document, otherwise the import process will stop.
    #     :param original_document: Original XML document
    #     :type original_document: string
    #     :param original_format: Original format (eg 'fgdc')
    #     :type original_format: string
    #     :param harvest_object: HarvestObject domain object (with access to
    #                            job and source objects)
    #     :type harvest_object: HarvestObject
    #     :returns: An ISO 19139 document or None if the transformation was not
    #               successful
    #     :rtype: string
    #     '''
    #     return None

    ### TODO provide PR to master and remove
    # TODO removeme
    # We are extending the concrete class SpatialHarvester
    # to delegate to some of the self.... methods used below;
    # this implies being an IHarvester as well.
    # Once the functions below are removed there is no need to extend anymore:
    # we can be a pure ISpatialHarvester.
    def info(self):
        # IHarvester metadata describing this harvester type in the UI/API.
        return {
            'name': 'iso19115',
            'title': 'ISO19115',
            'description': ''
        }

    # source_config = {}
    # force_import = False

    def _fault_tolerant_get_package_dict(self, iso_values, harvest_object):
        '''
        DEPRECATED: should be used until the PRs on master are accepted.

        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details.

        :param iso_values: parsed ISO XML metadata values
        :type iso_values: dict
        :param harvest_object: HarvestObject domain object (with access to
                               job and source objects)
        :returns: a dataset dict ready for package_create / package_update
        :rtype: dict
        '''
        from string import Template
        from datetime import datetime
        import six
        from six.moves.urllib.parse import urlparse
        from six.moves.urllib.request import urlopen  # NOTE(review): unused
        # from owslib import wms
        # from lxml import etree
        from ckanext.harvest.harvesters.base import munge_tag
        from ckan.lib.helpers import json

        # Tags: optionally cleaned via munge_tag, otherwise truncated to
        # CKAN's 100-char tag limit.
        tags = []
        if 'tags' in iso_values:
            do_clean = self.source_config.get('clean_tags')
            tags_val = [munge_tag(tag) if do_clean else tag[:100] for tag in iso_values['tags']]
            tags = [{'name': tag} for tag in tags_val]

        # Add default_tags from config
        default_tags = self.source_config.get('default_tags', [])
        if default_tags:
            for tag in default_tags:
                tags.append({'name': tag})

        package_dict = {
            'title': iso_values['title'],
            'notes': iso_values['abstract'],
            'tags': tags,
            'resources': [],
        }

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        # Package name: keep the existing one unless the title changed (or
        # no package exists yet), in which case generate a fresh unique name.
        package = harvest_object.package
        if package is None or package.title != iso_values['title']:
            name = self._gen_new_name(iso_values['title'])
            if not name:
                name = self._gen_new_name(six.text_type(iso_values['guid']))
            if not name:
                raise Exception('Could not generate a unique name from the title or the GUID. Please choose a more unique title.')
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        extras = {
            'guid': harvest_object.guid,
            'spatial_harvester': True,
        }

        # Just add some of the metadata as extras, not the whole lot
        for name in [
            # Essentials
            'spatial-reference-system',
            'guid',
            # Usefuls
            'dataset-reference-date',
            'metadata-language',  # Language
            'metadata-date',  # Released
            'coupled-resource',
            'contact-email',
            'frequency-of-update',
            'spatial-data-service-type',
        ]:
            # NOTE(review): raises KeyError if a key is missing from
            # iso_values -- presumably the ISO parser always supplies these;
            # confirm against the harvester's document parser.
            extras[name] = iso_values[name]

        if len(iso_values.get('progress', [])):
            extras['progress'] = iso_values['progress'][0]
        else:
            extras['progress'] = ''

        if len(iso_values.get('resource-type', [])):
            extras['resource-type'] = iso_values['resource-type'][0]
        else:
            extras['resource-type'] = ''

        extras['licence'] = iso_values.get('use-constraints', '')

        def _extract_first_license_url(licences):
            # Return the first licence entry that parses as an absolute URL
            # (has both a scheme and a network location), else None.
            for licence in licences:
                o = urlparse(licence)
                if o.scheme and o.netloc:
                    return licence
            return None

        if len(extras['licence']):
            license_url_extracted = _extract_first_license_url(extras['licence'])
            if license_url_extracted:
                extras['licence_url'] = license_url_extracted

        # Metadata license ID check for package: map the first use-constraint
        # matching a registered license (by id or URL) to license_id.
        use_constraints = iso_values.get('use-constraints')
        if use_constraints:
            context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
            license_list = p.toolkit.get_action('license_list')(context, {})
            for constraint in use_constraints:
                package_license = None
                for license in license_list:
                    if constraint.lower() == license.get('id') or constraint == license.get('url'):
                        package_license = license.get('id')
                        break
                if package_license:
                    package_dict['license_id'] = package_license
                    break

        extras['access_constraints'] = iso_values.get('limitations-on-public-access', '')

        # Graphic preview
        browse_graphic = iso_values.get('browse-graphic')
        if browse_graphic:
            # Only the first browse graphic is used.
            browse_graphic = browse_graphic[0]
            extras['graphic-preview-file'] = browse_graphic.get('file')
            if browse_graphic.get('description'):
                extras['graphic-preview-description'] = browse_graphic.get('description')
            if browse_graphic.get('type'):
                extras['graphic-preview-type'] = browse_graphic.get('type')

        for key in ['temporal-extent-begin', 'temporal-extent-end']:
            if len(iso_values.get(key, '')) > 0:
                extras[key] = iso_values[key][0]

        # Save responsible organization roles, grouping roles by organisation.
        if iso_values['responsible-organisation']:
            parties = {}
            for party in iso_values['responsible-organisation']:
                if party['organisation-name'] in parties:
                    if not party['role'] in parties[party['organisation-name']]:
                        parties[party['organisation-name']].append(party['role'])
                else:
                    parties[party['organisation-name']] = [party['role']]
            extras['responsible-party'] = [{'name': k, 'roles': v} for k, v in parties.items()]

        if len(iso_values.get('bbox', [])) > 0:
            # Only the first bounding box is considered.
            bbox = iso_values['bbox'][0]
            extras['bbox-east-long'] = bbox['east']
            extras['bbox-north-lat'] = bbox['north']
            extras['bbox-south-lat'] = bbox['south']
            extras['bbox-west-long'] = bbox['west']

            if iso_values.get('spatial'):
                # A ready-made geometry from the document takes precedence
                # over a geometry derived from the bbox.
                extras['spatial'] = iso_values['spatial']
            else:
                try:
                    xmin = float(bbox['west'])
                    xmax = float(bbox['east'])
                    ymin = float(bbox['south'])
                    ymax = float(bbox['north'])
                except ValueError as e:
                    self._save_object_error('Error parsing bounding box value: {0}'.format(six.text_type(e)),
                                            harvest_object, 'Import')
                else:
                    # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
                    # Some publishers define the same two corners for the bbox (ie a point),
                    # that causes problems in the search if stored as polygon
                    if xmin == xmax or ymin == ymax:
                        extent_string = Template('{"type": "Point", "coordinates": [$x, $y]}').substitute(
                            x=xmin, y=ymin
                        )
                        self._save_object_error('Point extent defined instead of polygon',
                                                harvest_object, 'Import')
                    else:
                        extent_string = self.extent_template.substitute(
                            xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax
                        )
                    extras['spatial'] = extent_string.strip()
        else:
            log.debug('No spatial extent defined for this object')

        resource_locators = iso_values.get('resource-locator', []) +\
            iso_values.get('resource-locator-identification', [])

        if len(resource_locators):
            for resource_locator in resource_locators:
                url = resource_locator.get('url', '').strip()
                if url:
                    resource = {}
                    resource['format'] = _guess_resource_format(url)

                    # Optionally verify WMS endpoints by probing the service.
                    if resource['format'] == 'wms' and config.get('ckanext.spatial.harvest.validate_wms', False):
                        # Check if the service is a view service
                        test_url = url.split('?')[0] if '?' in url else url
                        if self._is_wms(test_url):
                            resource['verified'] = True
                            resource['verified_date'] = datetime.now().isoformat()

                    resource.update(
                        {
                            'url': url,
                            'name': resource_locator.get('name') or p.toolkit._('Unnamed resource'),
                            'description': resource_locator.get('description') or '',
                            'resource_locator_protocol': resource_locator.get('protocol') or '',
                            'resource_locator_function': resource_locator.get('function') or '',
                        })
                    package_dict['resources'].append(resource)

        # Add default_extras from config
        default_extras = self.source_config.get('default_extras', {})
        if default_extras:
            override_extras = self.source_config.get('override_extras', False)
            for key, value in default_extras.items():
                log.debug('Processing extra %s', key)
                if not key in extras or override_extras:
                    # Look for replacement strings
                    if isinstance(value, six.string_types):
                        value = value.format(harvest_source_id=harvest_object.job.source.id,
                                             harvest_source_url=harvest_object.job.source.url.strip('/'),
                                             harvest_source_title=harvest_object.job.source.title,
                                             harvest_job_id=harvest_object.job.id,
                                             harvest_object_id=harvest_object.id)
                    extras[key] = value

        # CKAN expects extras as a list of {'key': ..., 'value': ...} dicts;
        # complex values (lists/dicts) are JSON-encoded.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, (list, dict)):
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})
            else:
                extras_as_dict.append({'key': key, 'value': value})

        package_dict['extras'] = extras_as_dict

        return package_dict
def _guess_resource_format(url, use_mimetypes=True):
'''
DEPRECATED should be removed once PR are accepted on master
Given a URL try to guess the best format to assign to the resource
The function looks for common patterns in popular geospatial services and
file extensions, so it may not be 100% accurate. It just looks at the
provided URL, it does not attempt to perform any remote check.
if 'use_mimetypes' is True (default value), the mimetypes module will be
used if no match was found before.
Returns None if no format could be guessed.
'''
import mimetypes
url = url.lower().strip()
resource_types = {
# OGC
'wms': ('service=wms', 'geoserver/wms', 'mapserver/wmsserver', 'com.esri.wms.Esrimap', 'service/wms'),
'wfs': ('service=wfs', 'geoserver/wfs', 'mapserver/wfsserver', 'com.esri.wfs.Esrimap'),
'wcs': ('service=wcs', 'geoserver/wcs', 'imageserver/wcsserver', 'mapserver/wcsserver'),
'sos': ('service=sos',),
'csw': ('service=csw',),
# ESRI
'kml': ('mapserver/generatekml',),
'arcims': ('com.esri.esrimap.esrimap',),
'arcgis_rest': ('arcgis/rest/services',),
}
for resource_type, parts in resource_types.items():
if any(part in url for part in parts):
return resource_type
file_types = {
'kml' : ('kml',),
'kmz': ('kmz',),
'gml': ('gml',),
}
for file_type, extensions in file_types.items():
if any(url.endswith(extension) for extension in extensions):
return file_type
resource_format, encoding = mimetypes.guess_type(url)
if resource_format:
return resource_format
return None
#####################################################
# TOOLS
#####################################################
def elem2dict(node):
    """
    Convert an lxml.etree node tree into a dict.

    Child element tags become keys (namespace prefixes are stripped). A child
    with non-whitespace text maps to its text; otherwise it is converted
    recursively. Repeated tags are collected into a list, preserving order.

    :param node: an lxml element (anything exposing ``iterchildren``,
                 ``tag`` and ``text``)
    :returns: a (possibly nested) dict of tag -> text/dict/list
    :rtype: dict
    """
    result = {}

    for element in node.iterchildren():
        # Remove namespace prefix, e.g. '{http://ns}tag' -> 'tag'
        key = element.tag.split('}')[1] if '}' in element.tag else element.tag

        # Use the text when the element contains non-whitespace content,
        # otherwise recurse into its children.
        if element.text and element.text.strip():
            value = element.text
        else:
            value = elem2dict(element)

        if key in result:
            # Repeated tag: promote the existing entry to a list.
            # BUG FIX: the original called result[key].copy(), which raised
            # AttributeError when the stored value was a string (str has no
            # .copy() method); the copy was unnecessary in the first place.
            if isinstance(result[key], list):
                result[key].append(value)
            else:
                result[key] = [result[key], value]
        else:
            result[key] = value

    return result