forked from ckan/ckanext-spatial
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspatial_harvester.py
477 lines (385 loc) · 18 KB
/
spatial_harvester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
from six.moves.urllib.parse import urlparse, urlunparse, urlencode
from ckan import model
from ckan import plugins as p
from ckantoolkit import config
from ckan.plugins.core import SingletonPlugin, implements
from ckanext.spatial.harvesters.base import SpatialHarvester
from ckanext.spatial.interfaces import ISpatialHarvester
import logging
log = logging.getLogger(__name__)
class ISO19115SpatialHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A harvester for ISO19115 metadata.

    Implements ``ISpatialHarvester``, but still extends the concrete
    ``SpatialHarvester`` class because the fallback implementation below
    relies on inherited helpers (``_gen_new_name``, ``_save_object_error``,
    ``_is_wms``, ``_get_user_name``, ``extent_template``, ...).
    '''
    implements(ISpatialHarvester)

    # ISpatialHarvester

    # From parent SpatialHarvester
    def get_package_dict(self, context, data_dict):
        '''
        Allows to modify the dataset dict that will be created or updated.

        This is the dict that the harvesters will pass to the `package_create`
        or `package_update` actions. Extensions can modify it to suit their
        needs, adding or removing fields, modifying the default ones, etc.

        This method should always return a package_dict. Note that, although
        unlikely in a particular instance, this method could be implemented by
        more than one plugin.

        If a dict is not returned by this function, the import stage will be
        cancelled.

        :param context: Contains a reference to the model, eg to
                        perform DB queries, and the user name used for
                        authorization.
        :type context: dict
        :param data_dict: Available data. Contains four keys:

            * `package_dict`
              The default package_dict generated by the harvester. Modify this
              or create a brand new one.
            * `iso_values`
              The parsed ISO XML document values. These contain more fields
              that are not added by default to the ``package_dict``.
            * `xml_tree`
              The full XML etree object. If some values not present in
              ``iso_values`` are needed, these can be extracted via xpath.
            * `harvest_object`
              A ``HarvestObject`` domain object which contains a reference
              to the original metadata document (``harvest_object.content``)
              and the harvest source (``harvest_object.source``).

        :type data_dict: dict
        :returns: A dataset dict ready to be used by ``package_create`` or
                  ``package_update``
        :rtype: dict
        '''
        _dict = data_dict['package_dict']  # NOTE(review): currently unused
        _values = data_dict['iso_values']
        # _tree = data_dict['xml_tree']
        _object = data_dict['harvest_object']
        # _dict2 = elem2dict(_tree)
        # TODO delegate
        # self.source_config = context['config']
        try:
            # Preferred path: delegate to the csw_harvester plugin
            # implementation when that plugin is loaded.
            csw_harvester = p.get_plugin('csw_harvester')
            return csw_harvester.get_package_dict(_values, _object)
        except Exception as e:
            log.error('Failed to get package from base implementation:\n%r', str(e))
            # TODO readme (below)
            # Fallback: use the deprecated local implementation.
            return self._fault_tolerant_get_package_dict(_values, _object)

    def get_validators(self):
        '''
        Allows to register custom Validators that can be applied to harvested
        metadata documents.

        Validators are classes that implement the ``is_valid`` method. Check
        the `Writing custom validators`_ section in the docs to know more
        about writing custom validators.

        :returns: A list of Validator classes
        :rtype: list
        '''
        # Imported lazily so the validators module is only loaded on demand.
        import ckanext.spatial.harvesters.iso19115.validators as validators
        return [
            validators.ISO19115_Schema,
            validators.ISO19115_2_Schema,
            validators.ISO19115_1_Schema,
            validators.ISO19115_Schematron]

    # From parent SpatialHarvester
    # def transform_to_iso(self, original_document, original_format, harvest_object):
    #     '''
    #     Transforms an XML document to ISO 19139
    #     This method will be only called from the import stage if the
    #     harvest_object content is null and original_document and
    #     original_format harvest object extras exist (eg if an FGDC document
    #     was harvested).
    #     In that case, this method should do the necessary to provide an
    #     ISO 19139 like document, otherwise the import process will stop.
    #     :param original_document: Original XML document
    #     :type original_document: string
    #     :param original_format: Original format (eg 'fgdc')
    #     :type original_format: string
    #     :param harvest_object: HarvestObject domain object (with access to
    #                            job and source objects)
    #     :type harvest_object: HarvestObject
    #     :returns: An ISO 19139 document or None if the transformation was not
    #               successful
    #     :rtype: string
    #     '''
    #     return None

    ### TODO provide PR to master and remove
    # TODO removeme
    # We are extending the concrete class SpatialHarvester
    # to delegate to some of the self.... methods used below;
    # this implies being an IHarvester as well.
    # Once the functions below are removed there is no need to extend anymore:
    # we can be a pure ISpatialHarvester.
    def info(self):
        # IHarvester metadata describing this harvester type in the UI/API.
        return {
            'name': 'iso19115',
            'title': 'ISO19115',
            'description': ''
        }

    # source_config = {}
    # force_import = False

    def _fault_tolerant_get_package_dict(self, iso_values, harvest_object):
        '''
        DEPRECATED: should be used until the PRs on master are accepted.

        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details.

        :param iso_values: parsed ISO XML metadata values
        :type iso_values: dict
        :param harvest_object: HarvestObject domain object (with access to
                               job and source objects)
        :returns: a dataset dict ready for package_create / package_update
        :rtype: dict
        '''
        from string import Template
        from datetime import datetime
        import six
        from six.moves.urllib.parse import urlparse
        from six.moves.urllib.request import urlopen  # NOTE(review): unused
        # from owslib import wms
        # from lxml import etree
        from ckanext.harvest.harvesters.base import munge_tag
        from ckan.lib.helpers import json

        # Tags: optionally cleaned via munge_tag, otherwise truncated to
        # CKAN's 100-char tag limit.
        tags = []
        if 'tags' in iso_values:
            do_clean = self.source_config.get('clean_tags')
            tags_val = [munge_tag(tag) if do_clean else tag[:100] for tag in iso_values['tags']]
            tags = [{'name': tag} for tag in tags_val]

        # Add default_tags from config
        default_tags = self.source_config.get('default_tags', [])
        if default_tags:
            for tag in default_tags:
                tags.append({'name': tag})

        package_dict = {
            'title': iso_values['title'],
            'notes': iso_values['abstract'],
            'tags': tags,
            'resources': [],
        }

        # We need to get the owner organization (if any) from the harvest
        # source dataset
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

        # Package name: keep the existing one unless the title changed (or
        # no package exists yet), in which case generate a fresh unique name.
        package = harvest_object.package
        if package is None or package.title != iso_values['title']:
            name = self._gen_new_name(iso_values['title'])
            if not name:
                name = self._gen_new_name(six.text_type(iso_values['guid']))
            if not name:
                raise Exception('Could not generate a unique name from the title or the GUID. Please choose a more unique title.')
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        extras = {
            'guid': harvest_object.guid,
            'spatial_harvester': True,
        }

        # Just add some of the metadata as extras, not the whole lot
        for name in [
            # Essentials
            'spatial-reference-system',
            'guid',
            # Usefuls
            'dataset-reference-date',
            'metadata-language',  # Language
            'metadata-date',  # Released
            'coupled-resource',
            'contact-email',
            'frequency-of-update',
            'spatial-data-service-type',
        ]:
            # NOTE(review): raises KeyError if a key is missing from
            # iso_values -- presumably the ISO parser always supplies these;
            # confirm against the harvester's document parser.
            extras[name] = iso_values[name]

        if len(iso_values.get('progress', [])):
            extras['progress'] = iso_values['progress'][0]
        else:
            extras['progress'] = ''

        if len(iso_values.get('resource-type', [])):
            extras['resource-type'] = iso_values['resource-type'][0]
        else:
            extras['resource-type'] = ''

        extras['licence'] = iso_values.get('use-constraints', '')

        def _extract_first_license_url(licences):
            # Return the first licence entry that parses as an absolute URL
            # (has both a scheme and a network location), else None.
            for licence in licences:
                o = urlparse(licence)
                if o.scheme and o.netloc:
                    return licence
            return None

        if len(extras['licence']):
            license_url_extracted = _extract_first_license_url(extras['licence'])
            if license_url_extracted:
                extras['licence_url'] = license_url_extracted

        # Metadata license ID check for package: map the first use-constraint
        # matching a registered license (by id or URL) to license_id.
        use_constraints = iso_values.get('use-constraints')
        if use_constraints:
            context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
            license_list = p.toolkit.get_action('license_list')(context, {})
            for constraint in use_constraints:
                package_license = None
                for license in license_list:
                    if constraint.lower() == license.get('id') or constraint == license.get('url'):
                        package_license = license.get('id')
                        break
                if package_license:
                    package_dict['license_id'] = package_license
                    break

        extras['access_constraints'] = iso_values.get('limitations-on-public-access', '')

        # Graphic preview
        browse_graphic = iso_values.get('browse-graphic')
        if browse_graphic:
            # Only the first browse graphic is used.
            browse_graphic = browse_graphic[0]
            extras['graphic-preview-file'] = browse_graphic.get('file')
            if browse_graphic.get('description'):
                extras['graphic-preview-description'] = browse_graphic.get('description')
            if browse_graphic.get('type'):
                extras['graphic-preview-type'] = browse_graphic.get('type')

        for key in ['temporal-extent-begin', 'temporal-extent-end']:
            if len(iso_values.get(key, '')) > 0:
                extras[key] = iso_values[key][0]

        # Save responsible organization roles, grouping roles by organisation.
        if iso_values['responsible-organisation']:
            parties = {}
            for party in iso_values['responsible-organisation']:
                if party['organisation-name'] in parties:
                    if not party['role'] in parties[party['organisation-name']]:
                        parties[party['organisation-name']].append(party['role'])
                else:
                    parties[party['organisation-name']] = [party['role']]
            extras['responsible-party'] = [{'name': k, 'roles': v} for k, v in parties.items()]

        if len(iso_values.get('bbox', [])) > 0:
            # Only the first bounding box is considered.
            bbox = iso_values['bbox'][0]
            extras['bbox-east-long'] = bbox['east']
            extras['bbox-north-lat'] = bbox['north']
            extras['bbox-south-lat'] = bbox['south']
            extras['bbox-west-long'] = bbox['west']

            if iso_values.get('spatial'):
                # A ready-made geometry from the document takes precedence
                # over a geometry derived from the bbox.
                extras['spatial'] = iso_values['spatial']
            else:
                try:
                    xmin = float(bbox['west'])
                    xmax = float(bbox['east'])
                    ymin = float(bbox['south'])
                    ymax = float(bbox['north'])
                except ValueError as e:
                    self._save_object_error('Error parsing bounding box value: {0}'.format(six.text_type(e)),
                                            harvest_object, 'Import')
                else:
                    # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
                    # Some publishers define the same two corners for the bbox (ie a point),
                    # that causes problems in the search if stored as polygon
                    if xmin == xmax or ymin == ymax:
                        extent_string = Template('{"type": "Point", "coordinates": [$x, $y]}').substitute(
                            x=xmin, y=ymin
                        )
                        self._save_object_error('Point extent defined instead of polygon',
                                                harvest_object, 'Import')
                    else:
                        extent_string = self.extent_template.substitute(
                            xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax
                        )
                    extras['spatial'] = extent_string.strip()
        else:
            log.debug('No spatial extent defined for this object')

        resource_locators = iso_values.get('resource-locator', []) +\
            iso_values.get('resource-locator-identification', [])

        if len(resource_locators):
            for resource_locator in resource_locators:
                url = resource_locator.get('url', '').strip()
                if url:
                    resource = {}
                    resource['format'] = _guess_resource_format(url)

                    # Optionally verify WMS endpoints by probing the service.
                    if resource['format'] == 'wms' and config.get('ckanext.spatial.harvest.validate_wms', False):
                        # Check if the service is a view service
                        test_url = url.split('?')[0] if '?' in url else url
                        if self._is_wms(test_url):
                            resource['verified'] = True
                            resource['verified_date'] = datetime.now().isoformat()

                    resource.update(
                        {
                            'url': url,
                            'name': resource_locator.get('name') or p.toolkit._('Unnamed resource'),
                            'description': resource_locator.get('description') or '',
                            'resource_locator_protocol': resource_locator.get('protocol') or '',
                            'resource_locator_function': resource_locator.get('function') or '',
                        })
                    package_dict['resources'].append(resource)

        # Add default_extras from config
        default_extras = self.source_config.get('default_extras', {})
        if default_extras:
            override_extras = self.source_config.get('override_extras', False)
            for key, value in default_extras.items():
                log.debug('Processing extra %s', key)
                if not key in extras or override_extras:
                    # Look for replacement strings
                    if isinstance(value, six.string_types):
                        value = value.format(harvest_source_id=harvest_object.job.source.id,
                                             harvest_source_url=harvest_object.job.source.url.strip('/'),
                                             harvest_source_title=harvest_object.job.source.title,
                                             harvest_job_id=harvest_object.job.id,
                                             harvest_object_id=harvest_object.id)
                    extras[key] = value

        # CKAN expects extras as a list of {'key': ..., 'value': ...} dicts;
        # complex values (lists/dicts) are JSON-encoded.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, (list, dict)):
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})
            else:
                extras_as_dict.append({'key': key, 'value': value})

        package_dict['extras'] = extras_as_dict

        return package_dict
def _guess_resource_format(url, use_mimetypes=True):
'''
DEPRECATED should be removed once PR are accepted on master
Given a URL try to guess the best format to assign to the resource
The function looks for common patterns in popular geospatial services and
file extensions, so it may not be 100% accurate. It just looks at the
provided URL, it does not attempt to perform any remote check.
if 'use_mimetypes' is True (default value), the mimetypes module will be
used if no match was found before.
Returns None if no format could be guessed.
'''
import mimetypes
url = url.lower().strip()
resource_types = {
# OGC
'wms': ('service=wms', 'geoserver/wms', 'mapserver/wmsserver', 'com.esri.wms.Esrimap', 'service/wms'),
'wfs': ('service=wfs', 'geoserver/wfs', 'mapserver/wfsserver', 'com.esri.wfs.Esrimap'),
'wcs': ('service=wcs', 'geoserver/wcs', 'imageserver/wcsserver', 'mapserver/wcsserver'),
'sos': ('service=sos',),
'csw': ('service=csw',),
# ESRI
'kml': ('mapserver/generatekml',),
'arcims': ('com.esri.esrimap.esrimap',),
'arcgis_rest': ('arcgis/rest/services',),
}
for resource_type, parts in resource_types.items():
if any(part in url for part in parts):
return resource_type
file_types = {
'kml' : ('kml',),
'kmz': ('kmz',),
'gml': ('gml',),
}
for file_type, extensions in file_types.items():
if any(url.endswith(extension) for extension in extensions):
return file_type
resource_format, encoding = mimetypes.guess_type(url)
if resource_format:
return resource_format
return None
#####################################################
# TOOLS
#####################################################
def elem2dict(node):
    """
    Convert an lxml.etree node tree into a dict.

    Child element tags become keys (namespace prefixes are stripped). A child
    with non-whitespace text maps to its text; otherwise it is converted
    recursively. Repeated tags are collected into a list, preserving order.

    :param node: an lxml element (anything exposing ``iterchildren``,
                 ``tag`` and ``text``)
    :returns: a (possibly nested) dict of tag -> text/dict/list
    :rtype: dict
    """
    result = {}

    for element in node.iterchildren():
        # Remove namespace prefix, e.g. '{http://ns}tag' -> 'tag'
        key = element.tag.split('}')[1] if '}' in element.tag else element.tag

        # Use the text when the element contains non-whitespace content,
        # otherwise recurse into its children.
        if element.text and element.text.strip():
            value = element.text
        else:
            value = elem2dict(element)

        if key in result:
            # Repeated tag: promote the existing entry to a list.
            # BUG FIX: the original called result[key].copy(), which raised
            # AttributeError when the stored value was a string (str has no
            # .copy() method); the copy was unnecessary in the first place.
            if isinstance(result[key], list):
                result[key].append(value)
            else:
                result[key] = [result[key], value]
        else:
            result[key] = value

    return result