From 0a3eb2a5c9e5b59c682c3ce62dbd1bef0ea33ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 7 Oct 2021 23:04:17 +0200 Subject: [PATCH 01/47] Add script for analyzing broken/external links --- .../lxl-2483-broken-or-external-links.groovy | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy diff --git a/whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy b/whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy new file mode 100644 index 0000000000..f600ea8433 --- /dev/null +++ b/whelktool/scripts/analysis/lxl-2483-broken-or-external-links.groovy @@ -0,0 +1,31 @@ +import groovy.transform.Memoized +import whelk.util.DocumentUtil + +whelk = getWhelk() + +selectBySqlWhere('deleted is false', silent: true) { doc -> + DocumentUtil.findKey(doc.graph, "@id") { value, path -> + if (is404(value)) { + incrementStats('404', value) + } + } +} + +@Memoized +boolean is404(iri) { + String systemId = whelk.storage.getSystemIdByIri(iri) + return !systemId +} + +def getWhelk() { + // A little hack to get a handle to whelk... 
+ def whelk = null + selectByIds(['https://id.kb.se/marc']) { docItem -> + whelk = docItem.whelk + } + if (!whelk) { + throw new RuntimeException("Could not get Whelk") + } + return whelk +} + From 0a39b255240cce82cf8ec0a65cb6e59bcfa393e4 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 13 Oct 2021 14:04:59 +0200 Subject: [PATCH 02/47] Add QueryRunner class --- .../groovy/whelk/external/QueryRunner.groovy | 77 +++++++++ .../groovy/whelk/external/Wikidata.groovy | 158 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy create mode 100644 whelk-core/src/main/groovy/whelk/external/Wikidata.groovy diff --git a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy new file mode 100644 index 0000000000..4cffcc39f3 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy @@ -0,0 +1,77 @@ +package whelk.external + +import org.apache.jena.query.ParameterizedSparqlString +import org.apache.jena.query.Query +import org.apache.jena.query.QueryExecution +import org.apache.jena.query.QueryExecutionFactory +import org.apache.jena.query.ResultSet +import org.apache.jena.query.ResultSetFactory +import org.apache.jena.rdf.model.Model +import org.apache.jena.rdf.model.RDFNode +import org.apache.jena.shared.PrefixMapping + +class QueryRunner { + static final Map nsPrefixes = + [ + "bd" : "http://www.bigdata.com/rdf#", + "kbv" : "https://id.kb.se/vocab/", + "rdfs" : "http://www.w3.org/2000/01/rdf-schema#", + "skos" : "http://www.w3.org/2004/02/skos/core#", + "wd" : "http://www.wikidata.org/entity/", + "wdt" : "http://www.wikidata.org/prop/direct/", + "wikibase": "http://wikiba.se/ontology#" + ] + + static PrefixMapping prefixMapping = PrefixMapping.Factory.create().setNsPrefixes(nsPrefixes) + + static ResultSet selectQuery(QueryExecution qe) { + ResultSet resultSet + + try { + ResultSet results = qe.execSelect() 
+ resultSet = ResultSetFactory.copyResults(results) + } catch (Exception ex) { + println(ex.getMessage()) + } finally { + qe.close() + } + + return resultSet + } + + static Model constructQuery(QueryExecution qe) { + try { + return qe.execConstruct() + } catch (Exception ex) { + println(ex.getMessage()) + } finally { + qe.close() + } + } + + static boolean askQuery(QueryExecution qe) { + try { + return qe.execAsk() + } catch (Exception ex) { + println(ex.getMessage()) + } finally { + qe.close() + } + } + + static QueryExecution remoteQueryExec(Query query, String sparqlEndpoint) { + return QueryExecutionFactory.sparqlService(sparqlEndpoint, query) + } + + static QueryExecution localQueryExec(Query query, Model graph) { + return QueryExecutionFactory.create(query, graph) + } + + static Query prepareQuery(String command, Collection values = null) { + ParameterizedSparqlString paramString = new ParameterizedSparqlString(command, prefixMapping) + values?.eachWithIndex { v, i -> + paramString.setParam(i, v) + } + return paramString.asQuery() + } +} diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy new file mode 100644 index 0000000000..b5fb0d380e --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -0,0 +1,158 @@ +package whelk.external + +import groovy.transform.Memoized +import org.apache.jena.query.Query +import org.apache.jena.query.QueryExecution +import org.apache.jena.query.ResultSet +import org.apache.jena.rdf.model.Model +import org.apache.jena.rdf.model.ModelFactory +import org.apache.jena.rdf.model.RDFNode + +class Wikidata { + Optional getThing(String iri) { + if (!isWikidataEntity(iri)) { + return Optional.empty() + } + + WikidataEntity wdEntity = new WikidataEntity(iri) + + return Optional.ofNullable(wdEntity.convert()) + } + + boolean isWikidataEntity(String iri) { + iri.startsWith("https://www.wikidata.org") + } +} + +class WikidataEntity { + 
static final String wikidataEndpoint = "https://query.wikidata.org/sparql" + + enum Properties { + PREF_LABEL('skos:prefLabel'), + COUNTRY('wdt:P17'), + PART_OF('wdt:P131') // located in the administrative territorial entity + + String prefixedIri + + private Properties(String prefixedIri) { + this.prefixedIri = prefixedIri + } + } + + enum Type { + PLACE('Q618123'), // Geographical feature + PERSON('Q5'), // Human + OTHER('') + + String baseClass + + private Type(String baseClass) { + this.baseClass = baseClass + } + } + + Model graph = ModelFactory.createDefaultModel() + + String iri + String shortId + + WikidataEntity(String iri) { + this.iri = iri + this.shortId = getShortId(iri) + loadGraph() + } + + private void loadGraph() { + try { + graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.rdf?flavor=dump") + } catch (Exception ex) { + println("Unable to load graph for entity ${iri}") + } + } + + Map convert() { + switch (type()) { + case Type.PLACE: return convertPlace() + case Type.PERSON: return convertPerson() + default: return null + } + } + + Map convertPlace() { + Map place = + [ + '@id' : iri, // Måste vara entity irin! 
+ '@type': "Place" + ] + + List langsOfInterest = ['sv', 'en'] + List prefLabel = getValuesOfProperty(Properties.PREF_LABEL.prefixedIri) + List prefLabelsOfInterest = prefLabel.findAll { it.getLanguage() in langsOfInterest } + if (!prefLabelsOfInterest.isEmpty()) + place['prefLabelByLang'] = prefLabelsOfInterest.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + + List country = getValuesOfProperty(Properties.COUNTRY.prefixedIri) + if (!country.isEmpty()) + place['country'] = country.collect { ['@id': it.toString()] } + + List partOf = getValuesOfProperty(Properties.PART_OF.prefixedIri) - country + if (!partOf.isEmpty()) + place['isPartOf'] = partOf.collect { ['@id': it.toString()] } + + return place + } + + Map convertPerson() { + Map person = + [ + '@id' : iri, + '@type': "Person" + ] + + List langsOfInterest = ['sv', 'en'] + List prefLabel = getValuesOfProperty(Properties.PREF_LABEL.prefixedIri) + List prefLabelsOfInterest = prefLabel.findAll { it.getLanguage() in langsOfInterest } + if (!prefLabelsOfInterest.isEmpty()) + person['prefLabelByLang'] = prefLabelsOfInterest.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + + return person + } + + Type type() { + String command = "SELECT ?type { wd:${shortId} wdt:P31 ?type }" + Query q = QueryRunner.prepareQuery(command) + QueryExecution qExec = QueryRunner.localQueryExec(q, graph) + ResultSet rs = QueryRunner.selectQuery(qExec) + Set wdTypes = rs.collect { it.get("type").toString() } as Set + + return Type.values().find { getSubclasses(it).intersect(wdTypes) } ?: Type.OTHER + } + + List getValuesOfProperty(String prop) { + String queryString = "SELECT ?o { wd:${shortId} ${prop} ?o }" + + Query q = QueryRunner.prepareQuery(queryString) + QueryExecution qExec = QueryRunner.localQueryExec(q, graph) + ResultSet rs = QueryRunner.selectQuery(qExec) + + return rs.collect { it.get("o") } + } + + @Memoized + static Set getSubclasses(Type type) { + if (type == Type.OTHER) { + return Collections.EMPTY_SET 
+ } + + String queryString = "SELECT ?class { ?class wdt:P279* wd:${type.baseClass} }" + Query q = QueryRunner.prepareQuery(queryString) + QueryExecution qExec = QueryRunner.remoteQueryExec(q, wikidataEndpoint) + ResultSet res = QueryRunner.selectQuery(qExec) + return res.collect { it.get("class").toString() }.toSet() + } + + String getShortId(String iri) { + iri.replaceAll(/.*\//, '') + } +} + From b2750bda7ec88387016c4b43ae1440a4869fc2a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 8 Oct 2021 16:21:18 +0200 Subject: [PATCH 03/47] Add prototype for caching external entities --- .../main/groovy/whelk/rest/api/Crud.groovy | 6 + .../groovy/whelk/rest/api/SearchUtils.groovy | 106 +++++++++++------- .../whelk/rest/api/SearchUtilsSpec.groovy | 10 -- .../src/main/groovy/whelk/Document.groovy | 27 ++++- .../src/main/groovy/whelk/External.groovy | 67 +++++++++++ .../src/main/groovy/whelk/JsonLd.groovy | 6 + whelk-core/src/main/groovy/whelk/Whelk.groovy | 105 ++++++++++++++++- .../whelk/component/ElasticSearch.groovy | 8 ++ .../component/PostgreSQLComponent.groovy | 4 + 9 files changed, 281 insertions(+), 58 deletions(-) create mode 100644 whelk-core/src/main/groovy/whelk/External.groovy diff --git a/rest/src/main/groovy/whelk/rest/api/Crud.groovy b/rest/src/main/groovy/whelk/rest/api/Crud.groovy index 8107c3e27b..65b09bb284 100644 --- a/rest/src/main/groovy/whelk/rest/api/Crud.groovy +++ b/rest/src/main/groovy/whelk/rest/api/Crud.groovy @@ -679,7 +679,13 @@ class Crud extends HttpServlet { try { if (doc) { String activeSigel = request.getHeader(XL_ACTIVE_SIGEL_HEADER) + String collection = doc.getLegacyCollection(jsonld) + + if (doc.isCacheRecord()) { + throw new BadRequestException("Cannot POST/PUT cache record") + } + if (isUpdate) { // You are not allowed to change collection when updating a record diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index 
499a7f9e1f..1ca92f3f5e 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -8,7 +8,6 @@ import groovy.util.logging.Log4j2 as Log import whelk.Document import whelk.JsonLd import whelk.Whelk -import whelk.component.DocumentNormalizer import whelk.exception.InvalidQueryException import whelk.exception.WhelkRuntimeException import whelk.search.ESQuery @@ -16,6 +15,10 @@ import whelk.search.ElasticFind import whelk.search.RangeParameterPrefix import whelk.util.DocumentUtil +import static whelk.JsonLd.GRAPH_KEY +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.TYPE_KEY + @Log class SearchUtils { @@ -132,6 +135,16 @@ class SearchUtils { queryParameters['_limit'] = [limit.toString()] Map esResult = esQuery.doQuery(queryParameters, suggest) + + + def e = searchExternal(queryParameters) + if (e) { + esResult['items'] = e + esResult['items'] + if(esResult['totalHits'] == 0) { + esResult['totalHits'] = e.size() + } + } + Lookup lookup = new Lookup() List mappings = [] @@ -159,9 +172,10 @@ class SearchUtils { item.identifiedBy?.with { List ids -> ids.removeAll { (Document.isIsni(it) || Document.isOrcid(it) ) && it.value?.size() == 16+3 } } // This object must be re-added because it might get filtered out in applyLens(). 
- item['reverseLinks'] = it['reverseLinks'] - if (item['reverseLinks'] != null) + if (it['reverseLinks']) { + item['reverseLinks'] = it['reverseLinks'] item['reverseLinks'][JsonLd.ID_KEY] = Document.getBASE_URI().resolve('find?o=' + URLEncoder.encode(it['@id'], 'UTF-8').toString()).toString() + } return item } } @@ -173,13 +187,14 @@ class SearchUtils { k = stripPrefix((String) k, ESQuery.OR_PREFIX) ((List) aggregations[k]?['buckets'])?.removeIf { it['key'] in v } } - + Map stats = null if (addStats == null || (addStats == 'true' || addStats == 'on')) { stats = buildStats(lookup, aggregations, makeFindUrl(SearchType.ELASTIC, stripNonStatsParams(pageParams)), (total > 0 && !predicates) ? reverseObject : null) } + if (!stats) { log.debug("No stats found for query: ${queryParameters}") } @@ -187,7 +202,7 @@ class SearchUtils { (query ? mappings.tail() : mappings).each { Map mapping -> Map params = removeMappingFromParams(pageParams, mapping) String upUrl = makeFindUrl(SearchType.ELASTIC, params, offset) - mapping['up'] = [ (JsonLd.ID_KEY): upUrl ] + mapping['up'] = [ (ID_KEY): upUrl ] } if (reverseObject) { @@ -195,7 +210,7 @@ class SearchUtils { mappings << [ 'variable' : 'o', 'object' : lookup.chip(reverseObject), // TODO: object/predicate/??? - 'up' : [(JsonLd.ID_KEY): upUrl], + 'up' : [(ID_KEY): upUrl], ] } @@ -205,7 +220,7 @@ class SearchUtils { 'variable' : 'p', 'object' : reverseObject, 'predicate': lookup.chip(predicates.first()), - 'up' : [(JsonLd.ID_KEY): upUrl], + 'up' : [(ID_KEY): upUrl], ] } @@ -222,12 +237,41 @@ class SearchUtils { result['_debug'] = esResult['_debug'] } - result['maxItems'] = esQuery.getMaxItems().toString() + result['maxItems'] = esQuery.getMaxItems() lookup.run() return result } + + List searchExternal(Map query) { + if (!query.q || !JsonLd.looksLikeIri(query.q.first())) { + return [] + } + + String iri = query.q.first().trim() + if (iri.contains('|')) { // TODO: cataloging client does "term | term*" in side panel search... 
+ iri = iri.split('\\|').first().trim() + } + + def existing = whelk.getCards([iri]) + boolean existsInWhelk = !existing.isEmpty() && !(new Document(existing[iri]).isPlaceholder()) + if (existsInWhelk) { + return [] + } + + return whelk.external.getEphemeral(iri).map ({ doc -> + def extType = doc.getThingType() + def queryTypes = query[TYPE_KEY] + boolean isAnyTypeOk = !queryTypes || queryTypes.any { it == '*' } + if (isAnyTypeOk || queryTypes.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it)}) { + whelk.embellish(doc) + [JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)] + } else { + [] + } + }).orElse([]) + } Map removeMappingFromParams(Map pageParams, Map mapping) { Map params = pageParams.clone() @@ -235,7 +279,7 @@ class SearchUtils { def param = params[variable] List values = param instanceof List ? param.clone() : param ? [param] : [] if ('object' in mapping) { - def value = mapping.object[JsonLd.ID_KEY] + def value = mapping.object[ID_KEY] values.remove(value) } else if ('value' in mapping) { def value = mapping.value @@ -275,8 +319,8 @@ class SearchUtils { private Map assembleSearchResults(SearchType st, List items, List mappings, Map pageParams, int limit, int offset, int total) { - Map result = [(JsonLd.TYPE_KEY): 'PartialCollectionView'] - result[(JsonLd.ID_KEY)] = makeFindUrl(st, pageParams, offset) + Map result = [(TYPE_KEY): 'PartialCollectionView'] + result[(ID_KEY)] = makeFindUrl(st, pageParams, offset) result['itemOffset'] = offset result['itemsPerPage'] = limit result['totalItems'] = total @@ -292,20 +336,6 @@ class SearchUtils { return result } - /** - * Create ES filter for specified siteBaseUri. 
- * - */ - Map makeSiteFilter(String siteBaseUri) { - return ['should': [ - ['prefix': [(JsonLd.ID_KEY): siteBaseUri]], - // ideally, we'd use ID_KEY here too, but that - // breaks the test case :/ - ['prefix': ['sameAs.@id': siteBaseUri]] - ], - 'minimum_should_match': 1] - } - /** * Build the term aggregation part of an ES query. * @@ -357,7 +387,7 @@ class SearchUtils { String searchPageUrl = "${baseUrlForKey}&${ESQuery.AND_PREFIX}${makeParam(key, itemId)}" Map observation = ['totalItems': bucket.getAt('doc_count'), - 'view': [(JsonLd.ID_KEY): searchPageUrl], + 'view': [(ID_KEY): searchPageUrl], 'object': lookup.chip(itemId)] observations << observation @@ -377,7 +407,7 @@ class SearchUtils { 'dimension' : JsonLd.REVERSE_KEY, 'observation': counts.collect { List relations, long count -> def viewUrl = baseUrl + '&' + - relations.collect{ makeParam('p', it + '.' + JsonLd.ID_KEY) }.join('&') + relations.collect{ makeParam('p', it + '.' + ID_KEY) }.join('&') [ 'totalItems': count, 'view' : ['@id': viewUrl], @@ -456,7 +486,7 @@ class SearchUtils { private int numberOfIncomingLinks(String iri) { try { - def doc = new ElasticFind(esQuery).find([(JsonLd.ID_KEY): [iri]]).first() + def doc = new ElasticFind(esQuery).find([(ID_KEY): [iri]]).first() return doc['reverseLinks']['totalItems'] } catch (Exception e) { @@ -480,8 +510,8 @@ class SearchUtils { if (termKey in ld.vocabIndex) { return ld.vocabIndex[termKey] } - - if (!itemId.startsWith('http') && itemId.contains('.')) { + + if (!JsonLd.looksLikeIri(itemId) && itemId.contains('.')) { String[] parts = itemId.split('\\.') List chain = parts .findAll { it != JsonLd.ID_KEY } @@ -528,7 +558,7 @@ class SearchUtils { } private Map dummyChip(String itemId) { - [(JsonLd.ID_KEY): itemId, 'label': itemId] + [(ID_KEY): itemId, 'label': itemId] } /* @@ -553,7 +583,7 @@ class SearchUtils { // FIXME move to Document or JsonLd private Map getEntry(Map jsonLd, String entryId) { // we rely on this convention for the time being. 
- return jsonLd[(JsonLd.GRAPH_KEY)].find { it[JsonLd.ID_KEY] == entryId } + return jsonLd[(GRAPH_KEY)].find { it[ID_KEY] == entryId } } /** @@ -623,20 +653,20 @@ class SearchUtils { Offsets offsets = new Offsets(total, limit, offset) - result['first'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams)] - result['last'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams, offsets.last)] + result['first'] = [(ID_KEY): makeFindUrl(st, pageParams)] + result['last'] = [(ID_KEY): makeFindUrl(st, pageParams, offsets.last)] if (offsets.prev != null) { if (offsets.prev == 0) { result['previous'] = result['first'] } else { - result['previous'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams, + result['previous'] = [(ID_KEY): makeFindUrl(st, pageParams, offsets.prev)] } } if (offsets.next) { - result['next'] = [(JsonLd.ID_KEY): makeFindUrl(st, pageParams, + result['next'] = [(ID_KEY): makeFindUrl(st, pageParams, offsets.next)] } @@ -723,11 +753,11 @@ class SearchUtils { String valueProp String termKey def value - if (param == JsonLd.TYPE_KEY || param == JsonLd.ID_KEY) { + if (param == TYPE_KEY || param == ID_KEY) { valueProp = 'object' termKey = param value = lookup.chip(val).with { it[JsonLd.ID_KEY] = val; return it } - } else if (param.endsWith(".${JsonLd.ID_KEY}")) { + } else if (param.endsWith(".${ID_KEY}")) { valueProp = 'object' termKey = param[0..-5] value = lookup.chip(val).with { it[JsonLd.ID_KEY] = val; return it } diff --git a/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy b/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy index 1383d7a429..2942566a5e 100644 --- a/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy +++ b/rest/src/test/groovy/whelk/rest/api/SearchUtilsSpec.groovy @@ -23,16 +23,6 @@ class SearchUtilsSpec extends Specification { assert !(urir =~ pattern) } - def "Should make site filter"() { - when: - String url = "http://example.com" - Map expected = ['should': [['prefix': ['@id': url]], - ['prefix': ['sameAs.@id': url]]], - 
'minimum_should_match': 1] - then: - assert search.makeSiteFilter(url) == expected - } - def "Should build aggregation query"() { when: Map tree = ['@type': []] diff --git a/whelk-core/src/main/groovy/whelk/Document.groovy b/whelk-core/src/main/groovy/whelk/Document.groovy index 0580d6de04..a8218e5275 100644 --- a/whelk-core/src/main/groovy/whelk/Document.groovy +++ b/whelk-core/src/main/groovy/whelk/Document.groovy @@ -50,6 +50,7 @@ class Document { static final List thingCarrierTypesPath = ["@graph", 1, "carrierType"] static final List thingInSchemePath = ["@graph",1,"inScheme","@id"] static final List recordIdPath = ["@graph", 0, "@id"] + static final List recordTypePath = ["@graph", 0, "@type"] static final List workIdPath = ["@graph", 1, "instanceOf", "@id"] static final List thingMetaPath = ["@graph", 1, "meta", "@id"] static final List recordSameAsPath = ["@graph", 0, "sameAs"] @@ -171,12 +172,22 @@ class Document { String getThingType() { get(thingTypePath) } + String getRecordType() { get(recordTypePath) } + + String setRecordType(type) { set(recordTypePath, type) } + String getRecordStatus() { return get(statusPath) } void setRecordStatus(status) { set(statusPath, status) } void setThingMeta(meta) { set(thingMetaPath, meta) } + Map getThing() { (Map) get(thingPath) } + + void setThing(thing) { _removeLeafObject(thingPath, data); set(thingPath, thing) } + + void setRecordId(id) { set(recordIdPath, id) } + /** * Will have base URI prepended if not already there */ @@ -342,6 +353,14 @@ class Document { String getLegacyCollection(JsonLd jsonld) { LegacyIntegrationTools.determineLegacyCollection(this, jsonld) } + + boolean isPlaceholder() { + return getRecordType() == JsonLd.PLACEHOLDER_RECORD_TYPE + } + + boolean isCacheRecord() { + return getRecordType() == JsonLd.CACHE_RECORD_TYPE + } String getHeldBySigel() { String uri = get(sigelPath) @@ -714,22 +733,22 @@ class Document { return _get(path, data) } - static Object _get(List path, Object root) { + static 
Object _get(List path, Object root, Object defaultTo = null) { // Start at root data node Object node = root for (Object step : path) { if ((node instanceof Map) && !(step instanceof String)) { log.warn("Needed string as map key, but was given: " + step + ". (path was: " + path + ")") - return null + return defaultTo } else if ((node instanceof List) && !(step instanceof Integer)) { log.warn("Needed integer as list index, but was given: " + step + ". (path was: " + path + ")") - return null + return defaultTo } node = node[step] if (node == null) { - return null + return defaultTo } } diff --git a/whelk-core/src/main/groovy/whelk/External.groovy b/whelk-core/src/main/groovy/whelk/External.groovy new file mode 100644 index 0000000000..2633b51be9 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/External.groovy @@ -0,0 +1,67 @@ +package whelk + +class External { + Whelk whelk + + private static final Map IDS = [ + 'https://www.wikidata.org/wiki/Q54344' : ['@type': 'Place', 'prefLabel': 'Skellefteå', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q104877']]], + 'https://www.wikidata.org/wiki/Q1615045' : ['@type': 'Place', 'prefLabel': 'Robertsfors', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q104877']]], + 'https://www.wikidata.org/wiki/Q660050' : ['@type': 'Place', 'prefLabel': 'Varuträsk'], + 'https://www.wikidata.org/wiki/Q10524743': ['@type': 'Place', 'prefLabel': 'Hjoggböle'], + 'https://www.wikidata.org/wiki/Q26268' : ['@type': 'Place', 'prefLabel': 'Luleå', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q103686']]], + 'https://www.wikidata.org/wiki/Q103686' : ['@type': 'Place', 'prefLabel': 'Norrbotten'], + 'https://www.wikidata.org/wiki/Q104877' : ['@type': 'Place', 'prefLabel': 'Västerbotten'], + 'https://www.wikidata.org/wiki/Olov' : ['@type': 'Place', 'prefLabel': 'Mordor', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q1036456']]], + ] + + External(Whelk whelk) { + this.whelk = whelk + } + + private static Optional getThing(String iri) 
{ + if (iri in IDS) { + return Optional.of( + IDS[iri].with { + it['@id'] = iri + new HashMap<>(it) + } + ) + } + return Optional.empty() + } + + Optional get(String iri) { + getThing(iri).map { document(it, JsonLd.CACHE_RECORD_TYPE) } + } + + Optional getEphemeral(String iri) { + getThing(iri).map { + def d = document(it, JsonLd.CACHE_RECORD_TYPE) + d.setRecordId("${d.getRecordIdentifiers().first()}#record".toString()) + d + } + } + + static Document getPlaceholder(String iri) { + def thing = [ + '@id' : iri, + '@type': JsonLd.PLACEHOLDER_ENTITY_TYPE + ] + + document(thing, JsonLd.PLACEHOLDER_RECORD_TYPE) + } + + private static Document document(Map thing, String recordType) { + new Document([ + '@graph': [ + [ + '@id' : Document.BASE_URI.toString() + IdGenerator.generate(), + '@type' : recordType, + 'mainEntity': ['@id': thing.'@id'], + 'inDataset' : ['@id': 'http://kblocalhost.kb.se:5000/v8h8lr6js3cmfvvd#it'] + ], + thing + ] + ]) + } +} diff --git a/whelk-core/src/main/groovy/whelk/JsonLd.groovy b/whelk-core/src/main/groovy/whelk/JsonLd.groovy index 038bb8234c..7610217316 100644 --- a/whelk-core/src/main/groovy/whelk/JsonLd.groovy +++ b/whelk-core/src/main/groovy/whelk/JsonLd.groovy @@ -42,6 +42,8 @@ class JsonLd { static final String RECORD_TYPE = 'Record' static final String CACHE_RECORD_TYPE = 'CacheRecord' + static final String PLACEHOLDER_RECORD_TYPE = 'PlaceholderRecord' + static final String PLACEHOLDER_ENTITY_TYPE = 'Resource' static final String SEARCH_KEY = "_str" @@ -548,6 +550,10 @@ class JsonLd { static List asList(o) { return (o instanceof List) ? (List) o : o != null ? 
[o] : [] } + + static boolean looksLikeIri(String s) { + s.startsWith('https://') || s.startsWith('http://') + } static boolean looksLikeIri(String s) { s && (s.startsWith('https://') || s.startsWith('http://')) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 51f08c9095..90dbea3c5d 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -2,6 +2,8 @@ package whelk import com.google.common.collect.Iterables import groovy.transform.CompileStatic +import groovy.transform.TypeChecked +import groovy.transform.TypeCheckingMode import groovy.util.logging.Log4j2 as Log import se.kb.libris.Normalizers import whelk.component.CachingPostgreSQLComponent @@ -20,10 +22,12 @@ import whelk.filter.NormalizerChain import whelk.meta.WhelkConstants import whelk.search.ESQuery import whelk.search.ElasticFind +import whelk.util.LegacyIntegrationTools import whelk.util.PropertyLoader import whelk.util.Romanizer import java.time.ZoneId +import java.util.function.Function /** * The Whelk is the root component of the XL system. @@ -57,6 +61,7 @@ class Whelk { RomanizationStep.LanguageResources languageResources ElasticFind elasticFind Relations relations + External external = new External(this) DocumentNormalizer normalizer Romanizer romanizer @@ -333,6 +338,7 @@ class Whelk { Set addedLinks = (postUpdateLinks - preUpdateLinks) Set removedLinks = (preUpdateLinks - postUpdateLinks) + //TODO: fails for placeholders... 
removedLinks.findResults { storage.getSystemIdByIri(it.iri) } .each{id -> elastic.decrementReverseLinks(id) } @@ -346,7 +352,7 @@ class Whelk { // we added a link to a document that includes us in its @reverse relations, reindex it elastic.index(doc, this) } - else { + else if (!doc.isPlaceholder()) { // just update link counter elastic.incrementReverseLinks(id) } @@ -428,6 +434,7 @@ class Whelk { throw new StorageCreateFailedException(document.getShortId(), "Document considered a duplicate of : " + collidingIDs) } + createPlaceholdersAndExternalDocs(document) boolean success = storage.createDocument(document, changedIn, changedBy, collection, deleted) if (success) { indexAsyncOrSync { @@ -457,6 +464,7 @@ class Whelk { preUpdateDoc = doc.clone() updateAgent.update(doc) normalize(doc) + createPlaceholdersAndExternalDocs(doc, preUpdateDoc) }) if (updated == null || preUpdateDoc == null) { @@ -472,6 +480,8 @@ class Whelk { void storeAtomicUpdate(Document doc, boolean minorUpdate, boolean writeIdenticalVersions, String changedIn, String changedBy, String oldChecksum) { normalize(doc) Document preUpdateDoc = storage.load(doc.shortId) + + createPlaceholdersAndExternalDocs(doc, preUpdateDoc) Document updated = storage.storeAtomicUpdate(doc, minorUpdate, writeIdenticalVersions, changedIn, changedBy, oldChecksum) if (updated == null) { @@ -481,7 +491,7 @@ class Whelk { reindexUpdated(updated, preUpdateDoc) sparqlUpdater?.pollNow() } - + /** * This is a variant of createDocument that does no or minimal denormalization or indexing. * It should NOT be used to create records in a production environment. 
Its intended purpose is @@ -508,10 +518,11 @@ class Whelk { updated.getThingIdentifiers()[0] && updated.getThingIdentifiers()[0] != preUpdateDoc.getThingIdentifiers()[0] } - + @TypeChecked(TypeCheckingMode.SKIP) void embellish(Document document, List levels = null) { - def docsByIris = { List iris -> bulkLoad(iris).values().collect{ it.data } } - Embellisher e = new Embellisher(jsonld, docsByIris, storage.&getCards, relations.&getByReverse) + def getDocs = andGetExternal({ List iris -> bulkLoad(iris).values().collect{ it.data } }) + def getCards = andGetExternal(storage.&getCards, true) + Embellisher e = new Embellisher(jsonld, getDocs, getCards, relations.&getByReverse) if (levels) { e.setEmbellishLevels(levels) @@ -523,7 +534,40 @@ class Whelk { e.embellish(document) } - + + //FIXME + @TypeChecked(TypeCheckingMode.SKIP) + private def andGetExternal(Function, Iterable> f, cards = false) { + def thingId = { graph -> (String) Document._get(Document.thingIdPath, graph) } + + return { Iterable iris -> + def result = f.apply(iris).collect { + def d = new Document(it) + if (d.isPlaceholder()) { + external.getEphemeral(d.getThingIdentifiers().first()).ifPresent({ ext -> + d.setThing(cards ? jsonld.toCard(ext.getThing(), false) : ext.getThing()) + }) + d.data + } else { + it + } + } + + // get external for IRIs that don't have placeholders + // TODO: only needed if we don't store placeholders for everything + def found = result.collect(thingId) + def missing = ((iris as Set) - (found as Set)) + def ext = missing + .collect{ external.getEphemeral(it) } + .findAll{ it.isPresent() } + .collect {cards ? 
jsonld.toCard(it.get().data) : it.get().data } + + result += ext + + return result + } + } + /** * Get cards * @param iris @@ -591,4 +635,53 @@ class Whelk { ZoneId getTimezone() { return timezone } + + private void createPlaceholdersAndExternalDocs(Document postUpdateDoc, Document preUpdateDoc = null) { + Set postUpdateLinks = postUpdateDoc.getExternalRefs() + Set preUpdateLinks = preUpdateDoc?.getExternalRefs() ?: new HashSet() //Collections.EMPTY_SET groovy compiler...? + + def iris = { Set s -> s.collect { it.iri } as Set } + Set addedIris = iris(postUpdateLinks) - iris(preUpdateLinks) + + createPlaceholdersAndExternalDocs(iris(postUpdateLinks), !postUpdateDoc.isCacheRecord()) + } + + private void createPlaceholdersAndExternalDocs(Set iris, boolean tryFetchExternal) { + Set brokenOrExternalIris = iris - storage.getSystemIdsByIris(iris).keySet() + + boolean minorUpdate = true + def changedIn = 'xl' + def changedBy = 'https://libris.kb.se/library/SEK' // FIXME... + def collection = LegacyIntegrationTools.NO_MARC_COLLECTION + def deleted = false + + brokenOrExternalIris.each { iri -> + def doc = tryFetchExternal + ? 
external.get(iri).orElse(External.getPlaceholder(iri)) + : External.getPlaceholder(iri) + + try { + createDocument(doc, changedIn, changedBy, collection, deleted) + } + catch (StorageCreateFailedException ignored) { + // Another transaction already created it -> OK + } + } + + // Check if old placeholder records can be replaced with cache records + bulkLoad(iris - brokenOrExternalIris).values() + .findAll{doc -> doc.isPlaceholder() } + .each { doc -> + try { + external.getEphemeral(doc.getThingIdentifiers().first()).ifPresent({ extDoc -> + def checkSum = doc.getChecksum(jsonld) + extDoc.setRecordId(doc.getRecordIdentifiers().first()) + storeAtomicUpdate(extDoc, minorUpdate, changedIn, changedBy, checkSum) + }) + } + catch (Exception e) { // TODO + log.warn("Failed to update ${doc.shortId}: $e", e) + } + } + } } diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 3a796e0030..ea46e06c32 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -180,6 +180,10 @@ class ElasticSearch { void bulkIndex(Collection docs, Whelk whelk) { if (docs) { String bulkString = docs.findResults{ doc -> + if (doc.isPlaceholder()) { + return null + } + try { String shapedData = getShapeForIndex(doc, whelk) String action = createActionRow(doc) @@ -222,6 +226,10 @@ class ElasticSearch { } void index(Document doc, Whelk whelk) { + if (doc.isPlaceholder()) { + return + } + // The justification for this uncomfortable catch-all, is that an index-failure must raise an alert (log entry) // _internally_ but be otherwise invisible to clients (If postgres writing was ok, the save is considered ok). 
try { diff --git a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy index cba3868c74..3a30156e0d 100644 --- a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy +++ b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy @@ -935,6 +935,10 @@ class PostgreSQLComponent { } } + if (preUpdateDoc.isCacheRecord() && !doc.isCacheRecord()) { + throw new RuntimeException("Cannot change cache record to not be cache record (${doc.getShortId()})") + } + if (doVerifyDocumentIdRetention) { verifyDocumentIdRetention(preUpdateDoc, doc, connection) } From a17ee557c99f8b30a70e6e8d0e5e99609d2003a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 14 Oct 2021 00:48:58 +0200 Subject: [PATCH 04/47] Add cache to External --- .../src/main/groovy/whelk/External.groovy | 80 ++++++++++--------- whelk-core/src/main/groovy/whelk/Whelk.groovy | 2 +- .../whelk/component/ElasticSearch.groovy | 2 +- .../groovy/whelk/external/Wikidata.groovy | 8 +- 4 files changed, 48 insertions(+), 44 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/External.groovy b/whelk-core/src/main/groovy/whelk/External.groovy index 2633b51be9..64fc71e3ac 100644 --- a/whelk-core/src/main/groovy/whelk/External.groovy +++ b/whelk-core/src/main/groovy/whelk/External.groovy @@ -1,47 +1,46 @@ package whelk -class External { - Whelk whelk +import com.google.common.cache.CacheBuilder +import com.google.common.cache.CacheLoader +import com.google.common.cache.LoadingCache +import whelk.external.Wikidata - private static final Map IDS = [ - 'https://www.wikidata.org/wiki/Q54344' : ['@type': 'Place', 'prefLabel': 'Skellefteå', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q104877']]], - 'https://www.wikidata.org/wiki/Q1615045' : ['@type': 'Place', 'prefLabel': 'Robertsfors', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q104877']]], - 
'https://www.wikidata.org/wiki/Q660050' : ['@type': 'Place', 'prefLabel': 'Varuträsk'], - 'https://www.wikidata.org/wiki/Q10524743': ['@type': 'Place', 'prefLabel': 'Hjoggböle'], - 'https://www.wikidata.org/wiki/Q26268' : ['@type': 'Place', 'prefLabel': 'Luleå', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q103686']]], - 'https://www.wikidata.org/wiki/Q103686' : ['@type': 'Place', 'prefLabel': 'Norrbotten'], - 'https://www.wikidata.org/wiki/Q104877' : ['@type': 'Place', 'prefLabel': 'Västerbotten'], - 'https://www.wikidata.org/wiki/Olov' : ['@type': 'Place', 'prefLabel': 'Mordor', isPartOf: [['@id': 'https://www.wikidata.org/wiki/Q1036456']]], +class External { + private static final Map mappers = [ + 'http://kblocalhost.kb.se:5000/v8h8lr6js3cmfvvd#it' : new Wikidata(), ] - External(Whelk whelk) { - this.whelk = whelk - } - - private static Optional getThing(String iri) { - if (iri in IDS) { - return Optional.of( - IDS[iri].with { - it['@id'] = iri - new HashMap<>(it) - } - ) - } - return Optional.empty() - } + private static final int CACHE_SIZE = 50_000 + private LoadingCache> cache = CacheBuilder.newBuilder() + .maximumSize(CACHE_SIZE) + .recordStats() + .build(new CacheLoader>() { + @Override + Optional load(String iri) throws Exception { + return getInternal(iri) + } + }) + Optional get(String iri) { - getThing(iri).map { document(it, JsonLd.CACHE_RECORD_TYPE) } + cache.get(iri).map{ it.clone() } } Optional getEphemeral(String iri) { - getThing(iri).map { - def d = document(it, JsonLd.CACHE_RECORD_TYPE) - d.setRecordId("${d.getRecordIdentifiers().first()}#record".toString()) - d + get(iri).map {doc -> + doc.setRecordId("${doc.getThingIdentifiers().first()}#record".toString()) + doc } } + private static Optional getInternal(String iri) { + Document d = mappers.findResult { dataset, mapper -> + mapper.getThing(iri).map{ document(it, JsonLd.CACHE_RECORD_TYPE, dataset) }.orElse(null) + } + + return Optional.ofNullable (d) + } + static Document 
getPlaceholder(String iri) { def thing = [ '@id' : iri, @@ -51,15 +50,20 @@ class External { document(thing, JsonLd.PLACEHOLDER_RECORD_TYPE) } - private static Document document(Map thing, String recordType) { + private static Document document(Map thing, String recordType, String dataset = null) { + def record = [ + '@id' : Document.BASE_URI.toString() + IdGenerator.generate(), + '@type' : recordType, + 'mainEntity': ['@id': thing.'@id'] + ] + + if (dataset) { + record.inDataset = ['@id': dataset] + } + new Document([ '@graph': [ - [ - '@id' : Document.BASE_URI.toString() + IdGenerator.generate(), - '@type' : recordType, - 'mainEntity': ['@id': thing.'@id'], - 'inDataset' : ['@id': 'http://kblocalhost.kb.se:5000/v8h8lr6js3cmfvvd#it'] - ], + record, thing ] ]) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 90dbea3c5d..6e485a664d 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -61,7 +61,7 @@ class Whelk { RomanizationStep.LanguageResources languageResources ElasticFind elasticFind Relations relations - External external = new External(this) + External external = new External() DocumentNormalizer normalizer Romanizer romanizer diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index ea46e06c32..3083b5a5db 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -291,7 +291,7 @@ class ElasticSearch { } } } - + void remove(String identifier) { if (log.isDebugEnabled()) { log.debug("Deleting object with identifier ${toElasticId(identifier)}.") diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index b5fb0d380e..b5dc8ea92e 100644 --- 
a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -7,6 +7,7 @@ import org.apache.jena.query.ResultSet import org.apache.jena.rdf.model.Model import org.apache.jena.rdf.model.ModelFactory import org.apache.jena.rdf.model.RDFNode +import whelk.component.ElasticSearch class Wikidata { Optional getThing(String iri) { @@ -20,7 +21,7 @@ class Wikidata { } boolean isWikidataEntity(String iri) { - iri.startsWith("https://www.wikidata.org") + iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } } @@ -84,10 +85,9 @@ class WikidataEntity { '@id' : iri, // Måste vara entity irin! '@type': "Place" ] - - List langsOfInterest = ['sv', 'en'] + List prefLabel = getValuesOfProperty(Properties.PREF_LABEL.prefixedIri) - List prefLabelsOfInterest = prefLabel.findAll { it.getLanguage() in langsOfInterest } + List prefLabelsOfInterest = prefLabel.findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } if (!prefLabelsOfInterest.isEmpty()) place['prefLabelByLang'] = prefLabelsOfInterest.collectEntries { [it.getLanguage(), it.getLexicalForm()] } From 6338ce1bb632400c96646a757b60525c0297ef89 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 14 Oct 2021 10:51:30 +0200 Subject: [PATCH 05/47] Set correct entity iri plus some clean up --- .../groovy/whelk/external/Wikidata.groovy | 55 ++++++++----------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index b5dc8ea92e..780908b875 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -11,7 +11,7 @@ import whelk.component.ElasticSearch class Wikidata { Optional getThing(String iri) { - if (!isWikidataEntity(iri)) { + if (!isWikidata(iri)) { return Optional.empty() } @@ -20,25 +20,18 @@ class Wikidata { return 
Optional.ofNullable(wdEntity.convert()) } - boolean isWikidataEntity(String iri) { + boolean isWikidata(String iri) { iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } } class WikidataEntity { - static final String wikidataEndpoint = "https://query.wikidata.org/sparql" + static final String WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql" + static final String WIKIDATA_ENTITY_NS = "http://www.wikidata.org/entity/" - enum Properties { - PREF_LABEL('skos:prefLabel'), - COUNTRY('wdt:P17'), - PART_OF('wdt:P131') // located in the administrative territorial entity - - String prefixedIri - - private Properties(String prefixedIri) { - this.prefixedIri = prefixedIri - } - } + static final String PROP_PREF_LABEL = "skos:prefLabel" + static final String PROP_COUNTRY = "wdt:P17" + static final String PROP_IS_PART_OF = "wdt:P131" // located in the administrative territorial entity enum Type { PLACE('Q618123'), // Geographical feature @@ -54,12 +47,12 @@ class WikidataEntity { Model graph = ModelFactory.createDefaultModel() - String iri + String entityIri String shortId WikidataEntity(String iri) { - this.iri = iri this.shortId = getShortId(iri) + this.entityIri = WIKIDATA_ENTITY_NS + shortId loadGraph() } @@ -67,7 +60,7 @@ class WikidataEntity { try { graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.rdf?flavor=dump") } catch (Exception ex) { - println("Unable to load graph for entity ${iri}") + println("Unable to load graph for entity ${entityIri}") } } @@ -82,20 +75,19 @@ class WikidataEntity { Map convertPlace() { Map place = [ - '@id' : iri, // Måste vara entity irin! 
+ '@id' : entityIri, '@type': "Place" ] - - List prefLabel = getValuesOfProperty(Properties.PREF_LABEL.prefixedIri) - List prefLabelsOfInterest = prefLabel.findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } - if (!prefLabelsOfInterest.isEmpty()) - place['prefLabelByLang'] = prefLabelsOfInterest.collectEntries { [it.getLanguage(), it.getLexicalForm()] } - List country = getValuesOfProperty(Properties.COUNTRY.prefixedIri) + List prefLabel = getValuesOfProperty(PROP_PREF_LABEL).findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + if (!prefLabel.isEmpty()) + place['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + + List country = getValuesOfProperty(PROP_COUNTRY) if (!country.isEmpty()) place['country'] = country.collect { ['@id': it.toString()] } - List partOf = getValuesOfProperty(Properties.PART_OF.prefixedIri) - country + List partOf = getValuesOfProperty(PROP_IS_PART_OF) - country if (!partOf.isEmpty()) place['isPartOf'] = partOf.collect { ['@id': it.toString()] } @@ -105,15 +97,13 @@ class WikidataEntity { Map convertPerson() { Map person = [ - '@id' : iri, + '@id' : entityIri, '@type': "Person" ] - List langsOfInterest = ['sv', 'en'] - List prefLabel = getValuesOfProperty(Properties.PREF_LABEL.prefixedIri) - List prefLabelsOfInterest = prefLabel.findAll { it.getLanguage() in langsOfInterest } - if (!prefLabelsOfInterest.isEmpty()) - person['prefLabelByLang'] = prefLabelsOfInterest.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + List prefLabel = getValuesOfProperty(PROP_PREF_LABEL).findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + if (!prefLabel.isEmpty()) + person['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } return person } @@ -146,8 +136,9 @@ class WikidataEntity { String queryString = "SELECT ?class { ?class wdt:P279* wd:${type.baseClass} }" Query q = QueryRunner.prepareQuery(queryString) - QueryExecution qExec = 
QueryRunner.remoteQueryExec(q, wikidataEndpoint) + QueryExecution qExec = QueryRunner.remoteQueryExec(q, WIKIDATA_ENDPOINT) ResultSet res = QueryRunner.selectQuery(qExec) + return res.collect { it.get("class").toString() }.toSet() } From 4c813cdc086cefe65a3dcbfd0abed5c30bf04c9c Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 14 Oct 2021 11:09:54 +0200 Subject: [PATCH 06/47] Read RDF as Turtle --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 780908b875..1345521152 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -58,7 +58,7 @@ class WikidataEntity { private void loadGraph() { try { - graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.rdf?flavor=dump") + graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.ttl?flavor=dump", "Turtle") } catch (Exception ex) { println("Unable to load graph for entity ${entityIri}") } From 16d4719c2ea80075c831b02f3415a18f018d8b7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 14 Oct 2021 13:55:10 +0200 Subject: [PATCH 07/47] Wikidata dataset URI --- whelk-core/src/main/groovy/whelk/External.groovy | 8 ++++---- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/External.groovy b/whelk-core/src/main/groovy/whelk/External.groovy index 64fc71e3ac..68e1203aec 100644 --- a/whelk-core/src/main/groovy/whelk/External.groovy +++ b/whelk-core/src/main/groovy/whelk/External.groovy @@ -6,8 +6,8 @@ import com.google.common.cache.LoadingCache import whelk.external.Wikidata class External { - private static final Map mappers = [ - 'http://kblocalhost.kb.se:5000/v8h8lr6js3cmfvvd#it' 
: new Wikidata(), + private static final List mappers = [ + new Wikidata(), ] private static final int CACHE_SIZE = 50_000 @@ -34,8 +34,8 @@ class External { } private static Optional getInternal(String iri) { - Document d = mappers.findResult { dataset, mapper -> - mapper.getThing(iri).map{ document(it, JsonLd.CACHE_RECORD_TYPE, dataset) }.orElse(null) + Document d = mappers.findResult { mapper -> + mapper.getThing(iri).map{ document(it, JsonLd.CACHE_RECORD_TYPE, mapper.datasetId()) }.orElse(null) } return Optional.ofNullable (d) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 1345521152..eaaf09ce63 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -23,6 +23,10 @@ class Wikidata { boolean isWikidata(String iri) { iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } + + String datasetId() { + 'https://id.kb.se/datasets/wikidata' + } } class WikidataEntity { From 2b498a2da23fbe4bb7c50b5120a34d001377219b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 14 Oct 2021 17:09:20 +0200 Subject: [PATCH 08/47] Handle alias URIs in external search --- .../groovy/whelk/rest/api/SearchUtils.groovy | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index 1ca92f3f5e..d8be811957 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -134,14 +134,14 @@ class SearchUtils { // TODO Only manipulate `_limit` in one place queryParameters['_limit'] = [limit.toString()] - Map esResult = esQuery.doQuery(queryParameters, suggest) + def extItems = searchExternal(queryParameters) // might manipulate q + Map esResult = 
esQuery.doQuery(queryParameters, suggest) - def e = searchExternal(queryParameters) - if (e) { - esResult['items'] = e + esResult['items'] + if (extItems) { + esResult['items'] = extItems + (List) esResult['items'] if(esResult['totalHits'] == 0) { - esResult['totalHits'] = e.size() + esResult['totalHits'] = extItems.size() } } @@ -244,7 +244,7 @@ class SearchUtils { return result } - List searchExternal(Map query) { + List searchExternal(Map query) { if (!query.q || !JsonLd.looksLikeIri(query.q.first())) { return [] } @@ -254,13 +254,21 @@ class SearchUtils { iri = iri.split('\\|').first().trim() } - def existing = whelk.getCards([iri]) - boolean existsInWhelk = !existing.isEmpty() && !(new Document(existing[iri]).isPlaceholder()) - if (existsInWhelk) { + def existsInWhelk = { String i -> + def existing = whelk.getCards([i]) + return !existing.isEmpty() && !(new Document(existing[i]).isPlaceholder()) + } + + if (existsInWhelk(iri)) { return [] } return whelk.external.getEphemeral(iri).map ({ doc -> + if (existsInWhelk(doc.getThingIdentifiers().first())) { // iri was an alias/sameAs + query.q = [doc.getThingIdentifiers().first()] as String[] + return [] + } + def extType = doc.getThingType() def queryTypes = query[TYPE_KEY] boolean isAnyTypeOk = !queryTypes || queryTypes.any { it == '*' } From 090b46b1f700b3d913a51aa0fde903bc02a99a76 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 14 Oct 2021 16:56:27 +0200 Subject: [PATCH 09/47] Add more prefixes and simplify select querying --- .../groovy/whelk/external/QueryRunner.groovy | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy index 4cffcc39f3..ce756806a0 100644 --- a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy +++ b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy @@ -15,6 +15,9 @@ class QueryRunner { [ "bd" : "http://www.bigdata.com/rdf#", "kbv" 
: "https://id.kb.se/vocab/", + "p" : "http://www.wikidata.org/prop/", + "pq" : "http://www.wikidata.org/prop/qualifier/", + "ps" : "http://www.wikidata.org/prop/statement/", "rdfs" : "http://www.w3.org/2000/01/rdf-schema#", "skos" : "http://www.w3.org/2004/02/skos/core#", "wd" : "http://www.wikidata.org/entity/", @@ -24,6 +27,22 @@ class QueryRunner { static PrefixMapping prefixMapping = PrefixMapping.Factory.create().setNsPrefixes(nsPrefixes) + static ResultSet localSelectResult(String queryString, Model graph) { + Query q = prepareQuery(queryString) + QueryExecution qExec = localQueryExec(q, graph) + ResultSet rs = selectQuery(qExec) + + return rs + } + + static ResultSet remoteSelectResult(String queryString, String sparqlEndpoint) { + Query q = prepareQuery(queryString) + QueryExecution qExec = remoteQueryExec(q, sparqlEndpoint) + ResultSet rs = selectQuery(qExec) + + return rs + } + static ResultSet selectQuery(QueryExecution qe) { ResultSet resultSet From 0bc23a6211a4aae9f0740794c000ef55ed716a2e Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 14 Oct 2021 16:57:53 +0200 Subject: [PATCH 10/47] Exclude historical places from isPartOf and reorganize --- .../groovy/whelk/external/Wikidata.groovy | 73 ++++++++++++------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index eaaf09ce63..61d4da5da6 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -1,8 +1,6 @@ package whelk.external import groovy.transform.Memoized -import org.apache.jena.query.Query -import org.apache.jena.query.QueryExecution import org.apache.jena.query.ResultSet import org.apache.jena.rdf.model.Model import org.apache.jena.rdf.model.ModelFactory @@ -33,9 +31,12 @@ class WikidataEntity { static final String WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql" static final 
String WIKIDATA_ENTITY_NS = "http://www.wikidata.org/entity/" - static final String PROP_PREF_LABEL = "skos:prefLabel" - static final String PROP_COUNTRY = "wdt:P17" - static final String PROP_IS_PART_OF = "wdt:P131" // located in the administrative territorial entity + // Wikidata property short ids + static final String COUNTRY = "P17" + static final String END_TIME = "P582" + static final String INSTANCE_OF = "P31" + static final String PART_OF_PLACE = "P131" // located in the administrative territorial entity + static final String SUBCLASS_OF = "P279" enum Type { PLACE('Q618123'), // Geographical feature @@ -83,15 +84,15 @@ class WikidataEntity { '@type': "Place" ] - List prefLabel = getValuesOfProperty(PROP_PREF_LABEL).findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + List prefLabel = getPrefLabel().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } if (!prefLabel.isEmpty()) place['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } - List country = getValuesOfProperty(PROP_COUNTRY) + List country = getCountry() if (!country.isEmpty()) place['country'] = country.collect { ['@id': it.toString()] } - List partOf = getValuesOfProperty(PROP_IS_PART_OF) - country + List partOf = getPartOfPlace() - country if (!partOf.isEmpty()) place['isPartOf'] = partOf.collect { ['@id': it.toString()] } @@ -105,31 +106,50 @@ class WikidataEntity { '@type': "Person" ] - List prefLabel = getValuesOfProperty(PROP_PREF_LABEL).findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + List prefLabel = getPrefLabel().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } if (!prefLabel.isEmpty()) person['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } return person } - Type type() { - String command = "SELECT ?type { wd:${shortId} wdt:P31 ?type }" - Query q = QueryRunner.prepareQuery(command) - QueryExecution qExec = QueryRunner.localQueryExec(q, graph) - 
ResultSet rs = QueryRunner.selectQuery(qExec) - Set wdTypes = rs.collect { it.get("type").toString() } as Set + List getPrefLabel() { + String queryString = "SELECT ?prefLabel { wd:${shortId} skos:prefLabel ?prefLabel }" - return Type.values().find { getSubclasses(it).intersect(wdTypes) } ?: Type.OTHER + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("prefLabel") } + } + + List getCountry() { + String queryString = "SELECT ?country { wd:${shortId} wdt:${COUNTRY} ?country }" + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("country") } } - List getValuesOfProperty(String prop) { - String queryString = "SELECT ?o { wd:${shortId} ${prop} ?o }" + List getPartOfPlace() { + String queryString = """ + SELECT ?place { + wd:${shortId} p:${PART_OF_PLACE} ?stmt . + ?stmt ps:${PART_OF_PLACE} ?place . + FILTER NOT EXISTS { ?place pq:${END_TIME} ?endTime } + } + """ - Query q = QueryRunner.prepareQuery(queryString) - QueryExecution qExec = QueryRunner.localQueryExec(q, graph) - ResultSet rs = QueryRunner.selectQuery(qExec) + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) - return rs.collect { it.get("o") } + return rs.collect { it.get("place") } + } + + Type type() { + String queryString = "SELECT ?type { wd:${shortId} wdt:${INSTANCE_OF} ?type }" + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + Set wdTypes = rs.collect { it.get("type").toString() } as Set + + return Type.values().find { getSubclasses(it).intersect(wdTypes) } ?: Type.OTHER } @Memoized @@ -138,12 +158,11 @@ class WikidataEntity { return Collections.EMPTY_SET } - String queryString = "SELECT ?class { ?class wdt:P279* wd:${type.baseClass} }" - Query q = QueryRunner.prepareQuery(queryString) - QueryExecution qExec = QueryRunner.remoteQueryExec(q, WIKIDATA_ENDPOINT) - ResultSet res = QueryRunner.selectQuery(qExec) + String queryString = "SELECT ?class { ?class wdt:${SUBCLASS_OF}* 
wd:${type.baseClass} }" + + ResultSet rs = QueryRunner.remoteSelectResult(queryString, WIKIDATA_ENDPOINT) - return res.collect { it.get("class").toString() }.toSet() + return rs.collect { it.get("class").toString() }.toSet() } String getShortId(String iri) { From 5fe0ada78aaced3b8820bf3a85a29337133c6cb5 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 14 Oct 2021 17:34:42 +0200 Subject: [PATCH 11/47] Correct variable mistake --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 61d4da5da6..eb3a0be2a2 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -134,7 +134,7 @@ class WikidataEntity { SELECT ?place { wd:${shortId} p:${PART_OF_PLACE} ?stmt . ?stmt ps:${PART_OF_PLACE} ?place . - FILTER NOT EXISTS { ?place pq:${END_TIME} ?endTime } + FILTER NOT EXISTS { ?stmt pq:${END_TIME} ?endTime } } """ From 638fe8e2b936d356bb5462e8052857345f17e3d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 14 Oct 2021 18:04:25 +0200 Subject: [PATCH 12/47] Naming --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 6e485a664d..3d8f8ec567 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -434,7 +434,7 @@ class Whelk { throw new StorageCreateFailedException(document.getShortId(), "Document considered a duplicate of : " + collidingIDs) } - createPlaceholdersAndExternalDocs(document) + createCacheRecordsAndPlaceholders(document) boolean success = storage.createDocument(document, changedIn, changedBy, collection, deleted) if (success) { 
indexAsyncOrSync { @@ -464,7 +464,7 @@ class Whelk { preUpdateDoc = doc.clone() updateAgent.update(doc) normalize(doc) - createPlaceholdersAndExternalDocs(doc, preUpdateDoc) + createCacheRecordsAndPlaceholders(doc, preUpdateDoc) }) if (updated == null || preUpdateDoc == null) { @@ -481,7 +481,7 @@ class Whelk { normalize(doc) Document preUpdateDoc = storage.load(doc.shortId) - createPlaceholdersAndExternalDocs(doc, preUpdateDoc) + createCacheRecordsAndPlaceholders(doc, preUpdateDoc) Document updated = storage.storeAtomicUpdate(doc, minorUpdate, writeIdenticalVersions, changedIn, changedBy, oldChecksum) if (updated == null) { @@ -636,7 +636,7 @@ class Whelk { return timezone } - private void createPlaceholdersAndExternalDocs(Document postUpdateDoc, Document preUpdateDoc = null) { + private void createCacheRecordsAndPlaceholders(Document postUpdateDoc, Document preUpdateDoc = null) { Set postUpdateLinks = postUpdateDoc.getExternalRefs() Set preUpdateLinks = preUpdateDoc?.getExternalRefs() ?: new HashSet() //Collections.EMPTY_SET groovy compiler...? 
From aebda9f8296513604277a5f609c9610a2edcbae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 16:00:36 +0200 Subject: [PATCH 13/47] Collect cache metrics for External --- whelk-core/src/main/groovy/whelk/External.groovy | 7 ++++++- .../main/groovy/whelk/component/DependencyCache.groovy | 10 ++++------ whelk-core/src/main/groovy/whelk/util/Metrics.groovy | 3 +++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/External.groovy b/whelk-core/src/main/groovy/whelk/External.groovy index 68e1203aec..f4e22bad69 100644 --- a/whelk-core/src/main/groovy/whelk/External.groovy +++ b/whelk-core/src/main/groovy/whelk/External.groovy @@ -4,13 +4,14 @@ import com.google.common.cache.CacheBuilder import com.google.common.cache.CacheLoader import com.google.common.cache.LoadingCache import whelk.external.Wikidata +import whelk.util.Metrics class External { private static final List mappers = [ new Wikidata(), ] - private static final int CACHE_SIZE = 50_000 + private static final int CACHE_SIZE = 10_000 private LoadingCache> cache = CacheBuilder.newBuilder() .maximumSize(CACHE_SIZE) @@ -22,6 +23,10 @@ class External { } }) + External() { + Metrics.cacheMetrics.addCache('external', cache) + } + Optional get(String iri) { cache.get(iri).map{ it.clone() } } diff --git a/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy b/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy index 9863eceb38..244047ffb6 100644 --- a/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy +++ b/whelk-core/src/main/groovy/whelk/component/DependencyCache.groovy @@ -7,10 +7,10 @@ import com.google.common.util.concurrent.ListenableFuture import com.google.common.util.concurrent.ListenableFutureTask import com.google.common.util.concurrent.ThreadFactoryBuilder import groovy.util.logging.Log4j2 as Log -import io.prometheus.client.guava.cache.CacheMetricsCollector import 
whelk.Document import whelk.Link import whelk.exception.MissingMainIriException +import whelk.util.Metrics import java.util.concurrent.Callable import java.util.concurrent.Executor @@ -25,9 +25,7 @@ import static whelk.component.PostgreSQLComponent.NotificationType.DEPENDENCY_CA class DependencyCache { private static final int CACHE_SIZE = 50_000 private static final int REFRESH_INTERVAL_MINUTES = 5 - - private static final CacheMetricsCollector cacheMetrics = new CacheMetricsCollector().register() - + PostgreSQLComponent storage private Executor cacheRefresher = Executors.newSingleThreadExecutor( @@ -48,8 +46,8 @@ class DependencyCache { DependencyCache(PostgreSQLComponent storage) { this.storage = storage - cacheMetrics.addCache('dependersCache', dependersCache) - cacheMetrics.addCache('dependencyCache', dependenciesCache) + Metrics.cacheMetrics.addCache('dependersCache', dependersCache) + Metrics.cacheMetrics.addCache('dependencyCache', dependenciesCache) } Set getDependenciesOfType(String iri, String typeOfRelation) { diff --git a/whelk-core/src/main/groovy/whelk/util/Metrics.groovy b/whelk-core/src/main/groovy/whelk/util/Metrics.groovy index 46a741dfb6..7256dacefa 100644 --- a/whelk-core/src/main/groovy/whelk/util/Metrics.groovy +++ b/whelk-core/src/main/groovy/whelk/util/Metrics.groovy @@ -2,8 +2,11 @@ package whelk.util import io.prometheus.client.Counter import io.prometheus.client.Summary +import io.prometheus.client.guava.cache.CacheMetricsCollector class Metrics { + static final CacheMetricsCollector cacheMetrics = new CacheMetricsCollector().register() + static final Summary clientTimer = Summary.build() .labelNames("target", "method") .quantile(0.5, 0.05) From f176d7106d317bef305bc27e33eaac9259fc4313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 16:03:51 +0200 Subject: [PATCH 14/47] Naming --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 9 +++++---- .../ExternalEntities.groovy} | 14 ++++++++------ 2 
files changed, 13 insertions(+), 10 deletions(-) rename whelk-core/src/main/groovy/whelk/{External.groovy => external/ExternalEntities.groovy} (90%) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 3d8f8ec567..a72ae395f9 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -15,8 +15,9 @@ import whelk.component.SparqlUpdater import whelk.converter.marc.MarcFrameConverter import whelk.converter.marc.RomanizationStep import whelk.exception.StorageCreateFailedException -import whelk.filter.LanguageLinker import whelk.exception.WhelkException +import whelk.external.ExternalEntities +import whelk.filter.LanguageLinker import whelk.filter.LinkFinder import whelk.filter.NormalizerChain import whelk.meta.WhelkConstants @@ -61,7 +62,7 @@ class Whelk { RomanizationStep.LanguageResources languageResources ElasticFind elasticFind Relations relations - External external = new External() + ExternalEntities external = new ExternalEntities() DocumentNormalizer normalizer Romanizer romanizer @@ -657,8 +658,8 @@ class Whelk { brokenOrExternalIris.each { iri -> def doc = tryFetchExternal - ? external.get(iri).orElse(External.getPlaceholder(iri)) - : External.getPlaceholder(iri) + ? 
external.get(iri).orElse(ExternalEntities.getPlaceholder(iri)) + : ExternalEntities.getPlaceholder(iri) try { createDocument(doc, changedIn, changedBy, collection, deleted) diff --git a/whelk-core/src/main/groovy/whelk/External.groovy b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy similarity index 90% rename from whelk-core/src/main/groovy/whelk/External.groovy rename to whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy index f4e22bad69..34d3899637 100644 --- a/whelk-core/src/main/groovy/whelk/External.groovy +++ b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy @@ -1,12 +1,14 @@ -package whelk +package whelk.external import com.google.common.cache.CacheBuilder import com.google.common.cache.CacheLoader import com.google.common.cache.LoadingCache -import whelk.external.Wikidata +import whelk.Document +import whelk.IdGenerator +import whelk.JsonLd import whelk.util.Metrics -class External { +class ExternalEntities { private static final List mappers = [ new Wikidata(), ] @@ -22,9 +24,9 @@ class External { return getInternal(iri) } }) - - External() { - Metrics.cacheMetrics.addCache('external', cache) + + ExternalEntities() { + Metrics.cacheMetrics.addCache('external-entities', cache) } Optional get(String iri) { From 22092d04941e642f9b80f34bae12875dc9dc92f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 16:11:42 +0200 Subject: [PATCH 15/47] Only cache results for IRIs that might actually be mapped --- .../whelk/external/ExternalEntities.groovy | 9 +++++++-- .../main/groovy/whelk/external/Mapper.groovy | 7 +++++++ .../main/groovy/whelk/external/Wikidata.groovy | 17 ++++++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 whelk-core/src/main/groovy/whelk/external/Mapper.groovy diff --git a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy index 
34d3899637..d368e69194 100644 --- a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy +++ b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy @@ -9,7 +9,7 @@ import whelk.JsonLd import whelk.util.Metrics class ExternalEntities { - private static final List mappers = [ + private static final List mappers = [ new Wikidata(), ] @@ -30,7 +30,12 @@ class ExternalEntities { } Optional get(String iri) { - cache.get(iri).map{ it.clone() } + if (mappers.any { it.mightHandle(iri) }) { + cache.get(iri).map{ it.clone() } + } + else { + Optional.empty() + } } Optional getEphemeral(String iri) { diff --git a/whelk-core/src/main/groovy/whelk/external/Mapper.groovy b/whelk-core/src/main/groovy/whelk/external/Mapper.groovy new file mode 100644 index 0000000000..65f4a23fbd --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/external/Mapper.groovy @@ -0,0 +1,7 @@ +package whelk.external + +interface Mapper { + boolean mightHandle(String iri) + Optional getThing(String iri) + String datasetId() +} \ No newline at end of file diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index eb3a0be2a2..a1b4aa1d2f 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -7,7 +7,8 @@ import org.apache.jena.rdf.model.ModelFactory import org.apache.jena.rdf.model.RDFNode import whelk.component.ElasticSearch -class Wikidata { +class Wikidata implements Mapper { + @Override Optional getThing(String iri) { if (!isWikidata(iri)) { return Optional.empty() @@ -17,14 +18,20 @@ class Wikidata { return Optional.ofNullable(wdEntity.convert()) } - - boolean isWikidata(String iri) { - iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") - } + @Override + boolean mightHandle(String iri) { + return isWikidata(iri) + } + + @Override String datasetId() { 
'https://id.kb.se/datasets/wikidata' } + + static boolean isWikidata(String iri) { + iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") + } } class WikidataEntity { From 28a3a6c39bc43bedd00e7369479d5f5fe004930b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 16:20:48 +0200 Subject: [PATCH 16/47] Index placeholders --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 5 ++--- .../src/main/groovy/whelk/component/ElasticSearch.groovy | 8 -------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index a72ae395f9..327c5c61ad 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -338,8 +338,7 @@ class Whelk { private void reindexAffected(Document document, Set preUpdateLinks, Set postUpdateLinks) { Set addedLinks = (postUpdateLinks - preUpdateLinks) Set removedLinks = (preUpdateLinks - postUpdateLinks) - - //TODO: fails for placeholders... 
+ removedLinks.findResults { storage.getSystemIdByIri(it.iri) } .each{id -> elastic.decrementReverseLinks(id) } @@ -353,7 +352,7 @@ class Whelk { // we added a link to a document that includes us in its @reverse relations, reindex it elastic.index(doc, this) } - else if (!doc.isPlaceholder()) { + else { // just update link counter elastic.incrementReverseLinks(id) } diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 3083b5a5db..4252773641 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -180,10 +180,6 @@ class ElasticSearch { void bulkIndex(Collection docs, Whelk whelk) { if (docs) { String bulkString = docs.findResults{ doc -> - if (doc.isPlaceholder()) { - return null - } - try { String shapedData = getShapeForIndex(doc, whelk) String action = createActionRow(doc) @@ -226,10 +222,6 @@ class ElasticSearch { } void index(Document doc, Whelk whelk) { - if (doc.isPlaceholder()) { - return - } - // The justification for this uncomfortable catch-all, is that an index-failure must raise an alert (log entry) // _internally_ but be otherwise invisible to clients (If postgres writing was ok, the save is considered ok). 
try { From de19129c89aec52104777031a7bd85dd3e0ca2ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 16:49:26 +0200 Subject: [PATCH 17/47] Exclude country link to self in countries --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index a1b4aa1d2f..f6f464eb3a 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -95,7 +95,7 @@ class WikidataEntity { if (!prefLabel.isEmpty()) place['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } - List country = getCountry() + List country = getCountry().findAll { it.toString() != entityIri } if (!country.isEmpty()) place['country'] = country.collect { ['@id': it.toString()] } From b61e3a9f743fd16de8284c82e6a8f7c4a8eb6f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 17:18:54 +0200 Subject: [PATCH 18/47] Add script for analysing production/manufacture places --- .../lxl-2483-publication-place.groovy | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 whelktool/scripts/analysis/lxl-2483-publication-place.groovy diff --git a/whelktool/scripts/analysis/lxl-2483-publication-place.groovy b/whelktool/scripts/analysis/lxl-2483-publication-place.groovy new file mode 100644 index 0000000000..dbeda0104f --- /dev/null +++ b/whelktool/scripts/analysis/lxl-2483-publication-place.groovy @@ -0,0 +1,34 @@ +import whelk.Document + +prod = ['publication', 'production', 'manufacture'] + +selectByCollection('bib') { doc -> + prod.each { p -> + getPathSafe(doc.graph, [1, p], []).each { + def place = asList(getPathSafe(it, ['place', 'label'])).flatten() + if (place) { + incrementStats(p, place) + } + } + } + +} + +Object 
getPathSafe(item, path, defaultTo = null) { + for (p in path) { + if (item[p] != null) { + item = item[p] + } else { + return defaultTo + } + } + return item +} + +private List asList(Object o) { + if (o == null) + return [] + if (o instanceof List) + return o + return [o] +} \ No newline at end of file From dcf62b89621b819f08f53907323e77c4141fb84c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 15 Oct 2021 18:15:02 +0200 Subject: [PATCH 19/47] Don't send cache records and placeholders to Virtuoso --- .../src/main/groovy/whelk/component/PostgreSQLComponent.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy index 3a30156e0d..2d4f5413fd 100644 --- a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy +++ b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy @@ -1082,7 +1082,7 @@ class PostgreSQLComponent { } } - if (sparqlQueueEnabled) { + if (sparqlQueueEnabled && !doc.isCacheRecord() && !doc.isPlaceholder()) { sparqlQueueAdd(doc.getShortId(), connection) } } From bf02e9d962ca3d32b694c2467e7aab0d29e79a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 20 Oct 2021 22:16:50 +0200 Subject: [PATCH 20/47] Display and index placeholders as actual things if possible --- rest/src/main/groovy/whelk/rest/api/Crud.groovy | 7 +++++++ .../src/main/groovy/whelk/component/ElasticSearch.groovy | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/rest/src/main/groovy/whelk/rest/api/Crud.groovy b/rest/src/main/groovy/whelk/rest/api/Crud.groovy index 65b09bb284..e38f9defca 100644 --- a/rest/src/main/groovy/whelk/rest/api/Crud.groovy +++ b/rest/src/main/groovy/whelk/rest/api/Crud.groovy @@ -191,6 +191,13 @@ class Crud extends HttpServlet { sendGetResponse(response, body, eTag, request.getPath(), 
request.getContentType(), request.getId()) } else { ETag eTag + + if (doc.isPlaceholder()) { + whelk.external.getEphemeral(doc.getThingIdentifiers().first()).ifPresent({ ext -> + doc.setThing(ext.getThing()) + }) + } + if (request.shouldEmbellish()) { String plainChecksum = doc.getChecksum(jsonld) whelk.embellish(doc) diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 4252773641..953e629099 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -308,6 +308,12 @@ class ElasticSearch { } String getShapeForIndex(Document document, Whelk whelk) { + if (document.isPlaceholder()) { + whelk.external.getEphemeral(document.getThingIdentifiers().first()).ifPresent({ ext -> + document.setThing(ext.getThing()) + }) + } + Document copy = document.clone() whelk.embellish(copy, ['search-chips']) From 51e70dd99c8f37e7c8d1500ba454ebab09568f14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 21 Oct 2021 11:16:24 +0200 Subject: [PATCH 21/47] Make "ephemeral" things have record type placeholder --- .../src/main/groovy/whelk/external/ExternalEntities.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy index d368e69194..49bbd6ab69 100644 --- a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy +++ b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy @@ -41,6 +41,7 @@ class ExternalEntities { Optional getEphemeral(String iri) { get(iri).map {doc -> doc.setRecordId("${doc.getThingIdentifiers().first()}#record".toString()) + doc.setRecordType(JsonLd.PLACEHOLDER_RECORD_TYPE) doc } } From 45b61c04a1b13c91fce286b5f692cbd323b1184e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: 
Fri, 22 Oct 2021 09:36:08 +0200 Subject: [PATCH 22/47] Naming --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 327c5c61ad..caa1271c67 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -643,10 +643,10 @@ class Whelk { def iris = { Set s -> s.collect { it.iri } as Set } Set addedIris = iris(postUpdateLinks) - iris(preUpdateLinks) - createPlaceholdersAndExternalDocs(iris(postUpdateLinks), !postUpdateDoc.isCacheRecord()) + createCacheRecordsAndPlaceholders(iris(postUpdateLinks), !postUpdateDoc.isCacheRecord()) } - private void createPlaceholdersAndExternalDocs(Set iris, boolean tryFetchExternal) { + private void createCacheRecordsAndPlaceholders(Set iris, boolean tryFetchExternal) { Set brokenOrExternalIris = iris - storage.getSystemIdsByIris(iris).keySet() boolean minorUpdate = true From 8d02247f48f842688eceb8e37ebb7f037a03d095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 22 Oct 2021 09:37:19 +0200 Subject: [PATCH 23/47] Only create placeholders for added links --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index caa1271c67..203b97e869 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -639,11 +639,11 @@ class Whelk { private void createCacheRecordsAndPlaceholders(Document postUpdateDoc, Document preUpdateDoc = null) { Set postUpdateLinks = postUpdateDoc.getExternalRefs() Set preUpdateLinks = preUpdateDoc?.getExternalRefs() ?: new HashSet() //Collections.EMPTY_SET groovy compiler...? 
- + def iris = { Set s -> s.collect { it.iri } as Set } Set addedIris = iris(postUpdateLinks) - iris(preUpdateLinks) - createCacheRecordsAndPlaceholders(iris(postUpdateLinks), !postUpdateDoc.isCacheRecord()) + createCacheRecordsAndPlaceholders(addedIris, !postUpdateDoc.isCacheRecord()) } private void createCacheRecordsAndPlaceholders(Set iris, boolean tryFetchExternal) { From cd625c7eebd01f070637d47c48bae22d240cfc84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Fri, 22 Oct 2021 23:33:59 +0200 Subject: [PATCH 24/47] Handle redirected external URIs when saving --- .../src/main/groovy/whelk/Document.groovy | 12 ++++++++-- whelk-core/src/main/groovy/whelk/Whelk.groovy | 24 +++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/Document.groovy b/whelk-core/src/main/groovy/whelk/Document.groovy index a8218e5275..9bee8c7661 100644 --- a/whelk-core/src/main/groovy/whelk/Document.groovy +++ b/whelk-core/src/main/groovy/whelk/Document.groovy @@ -913,8 +913,16 @@ class Document { private static boolean isSet(String key, JsonLd jsonLd) { jsonLd && key && jsonLd.isSetContainer(key) } - - public String toVerboseString() { + + String toVerboseString() { return "{completeId=" + getCompleteId() + ", baseUri=" + baseUri.toString() + ", base identifiers:" + getRecordIdentifiers().join(','); } + + void replaceLinks(Map oldToNew) { + DocumentUtil.findKey(data, JsonLd.ID_KEY) { value, path -> + if (oldToNew.containsKey(value)) { + new DocumentUtil.Replace(oldToNew[(String) value]) + } + } + } } diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 203b97e869..4bd305b656 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -28,6 +28,7 @@ import whelk.util.PropertyLoader import whelk.util.Romanizer import java.time.ZoneId +import java.util.function.Consumer import 
java.util.function.Function /** @@ -644,9 +645,15 @@ class Whelk { Set addedIris = iris(postUpdateLinks) - iris(preUpdateLinks) createCacheRecordsAndPlaceholders(addedIris, !postUpdateDoc.isCacheRecord()) + + def redirects = createCacheRecordsAndPlaceholders(addedIris, !postUpdateDoc.isCacheRecord()) + + if (redirects) { + postUpdateDoc.replaceLinks(redirects) + } } - private void createCacheRecordsAndPlaceholders(Set iris, boolean tryFetchExternal) { + private Map createCacheRecordsAndPlaceholders(Set iris, boolean tryFetchExternal) { Set brokenOrExternalIris = iris - storage.getSystemIdsByIris(iris).keySet() boolean minorUpdate = true @@ -655,11 +662,17 @@ class Whelk { def collection = LegacyIntegrationTools.NO_MARC_COLLECTION def deleted = false + Map redirectedIris = [:] + brokenOrExternalIris.each { iri -> def doc = tryFetchExternal ? external.get(iri).orElse(ExternalEntities.getPlaceholder(iri)) : ExternalEntities.getPlaceholder(iri) + if (doc.getThingIdentifiers().first() != iri) { + redirectedIris[iri] = doc.getThingIdentifiers().first() + } + try { createDocument(doc, changedIn, changedBy, collection, deleted) } @@ -673,9 +686,14 @@ class Whelk { .findAll{doc -> doc.isPlaceholder() } .each { doc -> try { - external.getEphemeral(doc.getThingIdentifiers().first()).ifPresent({ extDoc -> + String iri = doc.getThingIdentifiers().first() + external.getEphemeral(iri).ifPresent( (Consumer) { Document extDoc -> def checkSum = doc.getChecksum(jsonld) extDoc.setRecordId(doc.getRecordIdentifiers().first()) + if (extDoc.getThingIdentifiers().first() != iri) { + redirectedIris[iri] = extDoc.getThingIdentifiers().first() + extDoc.addThingIdentifier(iri) + } storeAtomicUpdate(extDoc, minorUpdate, changedIn, changedBy, checkSum) }) } @@ -683,5 +701,7 @@ class Whelk { log.warn("Failed to update ${doc.shortId}: $e", e) } } + + return redirectedIris } } From 8fd0d7dfd384c637ace4faf44a4aaab504020826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: 
Sat, 23 Oct 2021 01:00:41 +0200 Subject: [PATCH 25/47] Map wikidata description --- .../main/groovy/whelk/external/QueryRunner.groovy | 1 + .../src/main/groovy/whelk/external/Wikidata.groovy | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy index ce756806a0..76a5f229ad 100644 --- a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy +++ b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy @@ -19,6 +19,7 @@ class QueryRunner { "pq" : "http://www.wikidata.org/prop/qualifier/", "ps" : "http://www.wikidata.org/prop/statement/", "rdfs" : "http://www.w3.org/2000/01/rdf-schema#", + "sdo" : "http://schema.org/", "skos" : "http://www.w3.org/2004/02/skos/core#", "wd" : "http://www.wikidata.org/entity/", "wdt" : "http://www.wikidata.org/prop/direct/", diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index f6f464eb3a..a58e9fc1bc 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -95,6 +95,10 @@ class WikidataEntity { if (!prefLabel.isEmpty()) place['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + List description = getDescription().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } + if (!prefLabel.isEmpty()) + place['descriptionByLang'] = description.collectEntries { [it.getLanguage(), it.getLexicalForm()] } + List country = getCountry().findAll { it.toString() != entityIri } if (!country.isEmpty()) place['country'] = country.collect { ['@id': it.toString()] } @@ -128,6 +132,14 @@ class WikidataEntity { return rs.collect { it.get("prefLabel") } } + List getDescription() { + String queryString = "SELECT ?description { wd:${shortId} sdo:description ?description }" + + ResultSet rs = 
QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("description") } + } + List getCountry() { String queryString = "SELECT ?country { wd:${shortId} wdt:${COUNTRY} ?country }" From 757b2e84d4bef533b1d084fb1d6f813b047bd06d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Sat, 23 Oct 2021 01:12:46 +0200 Subject: [PATCH 26/47] Fix publication place script --- .../analysis/lxl-2483-publication-place.groovy | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/whelktool/scripts/analysis/lxl-2483-publication-place.groovy b/whelktool/scripts/analysis/lxl-2483-publication-place.groovy index dbeda0104f..0a235479b6 100644 --- a/whelktool/scripts/analysis/lxl-2483-publication-place.groovy +++ b/whelktool/scripts/analysis/lxl-2483-publication-place.groovy @@ -1,17 +1,29 @@ import whelk.Document +errors = getReportWriter("errors.txt") + prod = ['publication', 'production', 'manufacture'] selectByCollection('bib') { doc -> + try { + process(doc) + } + catch (Exception e) { + def m = "${doc.doc.shortId} $e" + println(m) + errors.println(m) + } +} + +void process(doc) { prod.each { p -> getPathSafe(doc.graph, [1, p], []).each { - def place = asList(getPathSafe(it, ['place', 'label'])).flatten() + def place = asList(getPathSafe(it, ['place', 'label'])).flatten().join(' | ') if (place) { incrementStats(p, place) } } } - } Object getPathSafe(item, path, defaultTo = null) { From e2ef6caf603ae612c69ff0f410008266c0747fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Mon, 25 Oct 2021 08:25:17 +0200 Subject: [PATCH 27/47] Collect metrics for wikidata .ttl?flavor=dump --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index a58e9fc1bc..b0fdef5e86 100644 --- 
a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -6,6 +6,7 @@ import org.apache.jena.rdf.model.Model import org.apache.jena.rdf.model.ModelFactory import org.apache.jena.rdf.model.RDFNode import whelk.component.ElasticSearch +import whelk.util.Metrics class Wikidata implements Mapper { @Override @@ -70,7 +71,9 @@ class WikidataEntity { private void loadGraph() { try { - graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.ttl?flavor=dump", "Turtle") + Metrics.clientTimer.labels(Wikidata.class.getSimpleName(), 'ttl-dump').time { + graph.read("https://www.wikidata.org/wiki/Special:EntityData/${shortId}.ttl?flavor=dump", "Turtle") + } } catch (Exception ex) { println("Unable to load graph for entity ${entityIri}") } From 372a17429142c8b27088dc992b52c77622e0a2df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 3 Nov 2021 13:30:49 +0100 Subject: [PATCH 28/47] Add Wikidata search prototype --- .../groovy/whelk/rest/api/SearchUtils.groovy | 73 +++++++++++++++---- .../groovy/whelk/external/Wikidata.groovy | 55 +++++++++++++- 2 files changed, 111 insertions(+), 17 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index d8be811957..fdb58f56c7 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -10,11 +10,14 @@ import whelk.JsonLd import whelk.Whelk import whelk.exception.InvalidQueryException import whelk.exception.WhelkRuntimeException +import whelk.external.Wikidata import whelk.search.ESQuery import whelk.search.ElasticFind import whelk.search.RangeParameterPrefix import whelk.util.DocumentUtil +import java.util.function.Predicate + import static whelk.JsonLd.GRAPH_KEY import static whelk.JsonLd.ID_KEY import static whelk.JsonLd.TYPE_KEY @@ -133,18 +136,31 @@ class SearchUtils { 
// // TODO Only manipulate `_limit` in one place queryParameters['_limit'] = [limit.toString()] - - def extItems = searchExternal(queryParameters) // might manipulate q - - Map esResult = esQuery.doQuery(queryParameters, suggest) - if (extItems) { - esResult['items'] = extItems + (List) esResult['items'] - if(esResult['totalHits'] == 0) { - esResult['totalHits'] = extItems.size() + // TODO external switch + Map esResult + if (queryParameters.q && queryParameters.q.first().trim().startsWith('wiki ')) { // TODO: only for testing + def result = searchWikidata(queryParameters) + esResult = [ + 'items' : result, + 'totalHits': result.size(), + 'aggregations': [:] + ] + } + else { + def extItems = selectExternalByUri(queryParameters) // might manipulate q + + esResult = esQuery.doQuery(queryParameters, suggest) + + if (extItems) { + esResult['items'] = extItems + (List) esResult['items'] + if(esResult['totalHits'] == 0) { + esResult['totalHits'] = extItems.size() + } } } + Lookup lookup = new Lookup() List mappings = [] @@ -244,7 +260,41 @@ class SearchUtils { return result } - List searchExternal(Map query) { + List searchWikidata(Map query) { + if (!query.q) { + return [] + } + + String q = query.q.first().trim() + if (q.startsWith('wiki ')) { + q = q.substring('wiki '.length()) + } + if (q.contains('|')) { // TODO: cataloging client does "term | term*" in side panel search... + q = q.split('\\|').first().trim() + } + + def typeFilter = extTypeFilter(query) + + Wikidata.query(q) + .collect {whelk.external.getEphemeral(it) } // TODO? could get e.g 15 URIs from wikidata and then collect results until we get e.g. 
5 + .findResults { it.orElse(null) } + .findAll {typeFilter.test(it) } + .collect {doc -> + whelk.embellish(doc) + JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) + } + } + + private Predicate extTypeFilter(Map query) { + def queryTypes = query[TYPE_KEY] + boolean isAnyTypeOk = !queryTypes || queryTypes.any { it == '*' } + return { Document doc -> + def extType = doc.getThingType() + isAnyTypeOk || queryTypes.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it)} + } + } + + List selectExternalByUri(Map query) { if (!query.q || !JsonLd.looksLikeIri(query.q.first())) { return [] } @@ -269,10 +319,7 @@ class SearchUtils { return [] } - def extType = doc.getThingType() - def queryTypes = query[TYPE_KEY] - boolean isAnyTypeOk = !queryTypes || queryTypes.any { it == '*' } - if (isAnyTypeOk || queryTypes.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it)}) { + if (extTypeFilter(query).test(doc)) { whelk.embellish(doc) [JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)] } else { diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index b0fdef5e86..9ebe33c960 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -6,8 +6,17 @@ import org.apache.jena.rdf.model.Model import org.apache.jena.rdf.model.ModelFactory import org.apache.jena.rdf.model.RDFNode import whelk.component.ElasticSearch +import whelk.exception.WhelkRuntimeException import whelk.util.Metrics +import java.net.http.HttpClient +import java.net.http.HttpRequest +import java.net.http.HttpResponse +import java.nio.charset.StandardCharsets +import java.time.Duration + +import static whelk.util.Jackson.mapper + class Wikidata implements Mapper { @Override Optional getThing(String iri) { @@ -33,6 +42,38 @@ class Wikidata implements Mapper { static boolean isWikidata(String iri) { 
iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } + + @Override + static List query(String query) { + try { + performQuery(query) + } + catch (Exception e) { + throw new WhelkRuntimeException("Error querying wikidata: $e", e) + } + } + + private static List performQuery(String query) { + HttpClient client = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.NORMAL).build() + def base = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json' + def lang = 'sv' + def limit = 5 + def q = URLEncoder.encode(query, StandardCharsets.UTF_8) + String uri = "$base&limit=$limit&language=$lang&uselang=$lang&search=$q" + + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(uri)) + .timeout(Duration.ofSeconds(30)) + .GET() + .build() + + def httpResponse = client.send(request, HttpResponse.BodyHandlers.ofString()) + def result = mapper.readValue(httpResponse.body(), Map.class) + .get('search') + .collect { (String) it['concepturi'] } + + return result + } } class WikidataEntity { @@ -58,15 +99,21 @@ class WikidataEntity { } } - Model graph = ModelFactory.createDefaultModel() + Model graph String entityIri String shortId WikidataEntity(String iri) { - this.shortId = getShortId(iri) - this.entityIri = WIKIDATA_ENTITY_NS + shortId - loadGraph() + try { + graph = ModelFactory.createDefaultModel() + this.shortId = getShortId(iri) + this.entityIri = WIKIDATA_ENTITY_NS + shortId + loadGraph() + } + catch (ExceptionInInitializerError e) { + e.printStackTrace() + } } private void loadGraph() { From ce61a14eb23099f925d891352778b4696fdd09f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 3 Nov 2021 13:31:21 +0100 Subject: [PATCH 29/47] Bump apache-jena-libs --- whelk-core/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/build.gradle b/whelk-core/build.gradle index 9a91be1367..2526384a86 100644 --- a/whelk-core/build.gradle +++ 
b/whelk-core/build.gradle @@ -95,7 +95,7 @@ dependencies { api 'commons-io:commons-io:2.11.0' implementation "org.apache.httpcomponents:httpclient:${httpComponentsClientVersion}" implementation "org.apache.httpcomponents:httpcore:${httpComponentsCoreVersion}" - api 'org.apache.jena:apache-jena-libs:3.0.1' + implementation 'org.apache.jena:apache-jena-libs:3.17.0' api "org.codehaus.groovy:groovy-json:${groovyVersion}" api "org.codehaus.groovy:groovy-xml:${groovyVersion}" api "org.codehaus.groovy:groovy-yaml:${groovyVersion}" From 731a0a655070f6dd2402ee20592020f11c7c5c72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 3 Nov 2021 13:53:53 +0100 Subject: [PATCH 30/47] Inject docs that exist in whelk in wikidata search results --- .../groovy/whelk/rest/api/SearchUtils.groovy | 18 +++++++++++++----- .../main/groovy/whelk/external/Wikidata.groovy | 3 +-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index fdb58f56c7..aa69acf0a0 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -275,16 +275,24 @@ class SearchUtils { def typeFilter = extTypeFilter(query) - Wikidata.query(q) - .collect {whelk.external.getEphemeral(it) } // TODO? could get e.g 15 URIs from wikidata and then collect results until we get e.g. 5 - .findResults { it.orElse(null) } + def uris = Wikidata.query(q) + def existingInWhelk = whelk.getCards(uris) + + uris + .collect { uri -> + existingInWhelk[uri] + ? new Document(existingInWhelk[uri]) + : whelk.external.getEphemeral(uri).orElse(null) + // TODO? could get e.g 15 URIs from wikidata and then collect results until we get e.g. 
5 + } + .grep() .findAll {typeFilter.test(it) } - .collect {doc -> + .collect { doc -> whelk.embellish(doc) JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) } } - + private Predicate extTypeFilter(Map query) { def queryTypes = query[TYPE_KEY] boolean isAnyTypeOk = !queryTypes || queryTypes.any { it == '*' } diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 9ebe33c960..1521b66a56 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -42,8 +42,7 @@ class Wikidata implements Mapper { static boolean isWikidata(String iri) { iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } - - @Override + static List query(String query) { try { performQuery(query) From beb39b77742b71737a27a0fdd8647f1b83eecbd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 3 Nov 2021 15:27:14 +0100 Subject: [PATCH 31/47] Set changedBy for cache records to same as request that caused creation --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 4bd305b656..402a3e8941 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -435,7 +435,7 @@ class Whelk { throw new StorageCreateFailedException(document.getShortId(), "Document considered a duplicate of : " + collidingIDs) } - createCacheRecordsAndPlaceholders(document) + createCacheRecordsAndPlaceholders(changedBy, document) boolean success = storage.createDocument(document, changedIn, changedBy, collection, deleted) if (success) { indexAsyncOrSync { @@ -465,7 +465,7 @@ class Whelk { preUpdateDoc = doc.clone() updateAgent.update(doc) normalize(doc) - 
createCacheRecordsAndPlaceholders(doc, preUpdateDoc) + createCacheRecordsAndPlaceholders(changedBy, doc, preUpdateDoc) }) if (updated == null || preUpdateDoc == null) { @@ -482,7 +482,7 @@ class Whelk { normalize(doc) Document preUpdateDoc = storage.load(doc.shortId) - createCacheRecordsAndPlaceholders(doc, preUpdateDoc) + createCacheRecordsAndPlaceholders(changedBy, doc, preUpdateDoc) Document updated = storage.storeAtomicUpdate(doc, minorUpdate, writeIdenticalVersions, changedIn, changedBy, oldChecksum) if (updated == null) { @@ -637,28 +637,24 @@ class Whelk { return timezone } - private void createCacheRecordsAndPlaceholders(Document postUpdateDoc, Document preUpdateDoc = null) { + private void createCacheRecordsAndPlaceholders(String changedBy, Document postUpdateDoc, Document preUpdateDoc = null) { Set postUpdateLinks = postUpdateDoc.getExternalRefs() Set preUpdateLinks = preUpdateDoc?.getExternalRefs() ?: new HashSet() //Collections.EMPTY_SET groovy compiler...? def iris = { Set s -> s.collect { it.iri } as Set } Set addedIris = iris(postUpdateLinks) - iris(preUpdateLinks) - - createCacheRecordsAndPlaceholders(addedIris, !postUpdateDoc.isCacheRecord()) - - def redirects = createCacheRecordsAndPlaceholders(addedIris, !postUpdateDoc.isCacheRecord()) + def redirects = createCacheRecordsAndPlaceholders(changedBy, addedIris, !postUpdateDoc.isCacheRecord()) if (redirects) { postUpdateDoc.replaceLinks(redirects) } } - private Map createCacheRecordsAndPlaceholders(Set iris, boolean tryFetchExternal) { + private Map createCacheRecordsAndPlaceholders(String changedBy, Set iris, boolean tryFetchExternal) { Set brokenOrExternalIris = iris - storage.getSystemIdsByIris(iris).keySet() boolean minorUpdate = true - def changedIn = 'xl' - def changedBy = 'https://libris.kb.se/library/SEK' // FIXME... 
+ def changedIn = 'xl' // FIXME def collection = LegacyIntegrationTools.NO_MARC_COLLECTION def deleted = false From 07107f3757238c9e7db487e1dc0cfa9a158b61a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 11 Nov 2021 15:46:33 +0100 Subject: [PATCH 32/47] Move external entity search to separate servlet --- .../rest/api/ExternalEntitiesSearchAPI.groovy | 131 ++++++++++++++++++ .../groovy/whelk/rest/api/SearchUtils.groovy | 103 +------------- rest/src/main/webapp/WEB-INF/web.xml | 9 ++ .../whelk/component/ElasticSearch.groovy | 14 ++ .../whelk/external/ExternalEntities.groovy | 1 - 5 files changed, 155 insertions(+), 103 deletions(-) create mode 100644 rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy new file mode 100644 index 0000000000..43da2c12f2 --- /dev/null +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -0,0 +1,131 @@ +package whelk.rest.api + +import whelk.Document +import whelk.JsonLd +import whelk.Whelk +import whelk.external.Wikidata +import whelk.util.WhelkFactory + +import javax.servlet.ServletException +import javax.servlet.http.HttpServlet +import javax.servlet.http.HttpServletRequest +import javax.servlet.http.HttpServletResponse +import java.util.function.Predicate + +import static whelk.JsonLd.TYPE_KEY + +class ExternalEntitiesSearchAPI extends HttpServlet { + Whelk whelk + + @Override + void init() { + whelk = WhelkFactory.getSingletonWhelk() + } + + @Override + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + String q = request.getParameter('q')?.trim() + def types = request.getParameterMap().get(TYPE_KEY) as List ?: [] + + def items = JsonLd.looksLikeIri(q) + ? 
selectExternal(q, types) + : searchExternal(q, types) + + def mapping = [:] + if (q) { + mapping << ['variable' : 'q', + 'predicate': whelk.jsonld.toTermKey('textQuery'), + 'value' : q] + } + + def result = [ + (TYPE_KEY): 'PartialCollectionView', + 'itemOffset' : 0, + 'totalItems' : items.size(), + 'search': [ + 'mapping' : mapping + ], + 'items' : items + // TODO: other, @type etc + ] + + HttpTools.sendResponse(response, result, MimeTypes.JSONLD) + } + + List searchExternal(String q, Collection types) { + def typeFilter = typeFilter(types) + + def uris = Wikidata.query(q) + def inWhelk = whelk.getCards(uris) + + // TODO? could get e.g 15 URIs from wikidata and then collect results until we get e.g. 5 + uris + .collect { uri -> + if (inWhelk[uri]) { + def doc = new Document(inWhelk[uri]) + insertReverseLinkCount(doc) + doc + } + else { + whelk.external.getEphemeral(uri).orElse(null) + } + } + .grep() + .findAll {typeFilter.test(it) } + .collect { doc -> + whelk.embellish(doc) + JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) + } + } + + private Predicate typeFilter(Collection types) { + boolean isAnyTypeOk = !types || types.any { it == '*' } + return { Document doc -> + def extType = doc.getThingType() + isAnyTypeOk || types.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it)} + } + } + + List selectExternal(String iri, Collection types) { + def typeFilter = typeFilter(types) + + Closure whelkResult = { Map data -> + Document doc = new Document(data) + if (!typeFilter.test(doc)) { + return [] + } + insertReverseLinkCount(doc) + whelk.embellish(doc) + def framed = JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) + return [framed] + } + + def inWhelk = whelk.getCards([iri]) + if (inWhelk[iri]) { + return whelkResult(inWhelk[iri]) + } + + return whelk.external.getEphemeral(iri).map ({ doc -> + def extId = doc.getThingIdentifiers().first() + inWhelk = whelk.getCards([extId]) + if (inWhelk[extId]) { // iri was an alias/sameAs + 
return whelkResult(inWhelk[extId]) + } + + if (typeFilter(types).test(doc)) { + whelk.embellish(doc) + [JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)] + } else { + [] + } + }).orElse([]) + } + + void insertReverseLinkCount(Document doc) { + whelk.elastic.retrieveIndexedDocument(doc.getShortId())?.with { + if (it.reverseLinks) { + doc.data[JsonLd.GRAPH_KEY][1]['reverseLinks'] = it.reverseLinks + } + } + } +} diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index aa69acf0a0..2e8e546cbb 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -10,14 +10,11 @@ import whelk.JsonLd import whelk.Whelk import whelk.exception.InvalidQueryException import whelk.exception.WhelkRuntimeException -import whelk.external.Wikidata import whelk.search.ESQuery import whelk.search.ElasticFind import whelk.search.RangeParameterPrefix import whelk.util.DocumentUtil -import java.util.function.Predicate - import static whelk.JsonLd.GRAPH_KEY import static whelk.JsonLd.ID_KEY import static whelk.JsonLd.TYPE_KEY @@ -137,29 +134,7 @@ class SearchUtils { // TODO Only manipulate `_limit` in one place queryParameters['_limit'] = [limit.toString()] - // TODO external switch - Map esResult - if (queryParameters.q && queryParameters.q.first().trim().startsWith('wiki ')) { // TODO: only for testing - def result = searchWikidata(queryParameters) - esResult = [ - 'items' : result, - 'totalHits': result.size(), - 'aggregations': [:] - ] - } - else { - def extItems = selectExternalByUri(queryParameters) // might manipulate q - - esResult = esQuery.doQuery(queryParameters, suggest) - - if (extItems) { - esResult['items'] = extItems + (List) esResult['items'] - if(esResult['totalHits'] == 0) { - esResult['totalHits'] = extItems.size() - } - } - } - + Map esResult = esQuery.doQuery(queryParameters, suggest) Lookup lookup = new Lookup() @@ 
-260,82 +235,6 @@ class SearchUtils { return result } - List searchWikidata(Map query) { - if (!query.q) { - return [] - } - - String q = query.q.first().trim() - if (q.startsWith('wiki ')) { - q = q.substring('wiki '.length()) - } - if (q.contains('|')) { // TODO: cataloging client does "term | term*" in side panel search... - q = q.split('\\|').first().trim() - } - - def typeFilter = extTypeFilter(query) - - def uris = Wikidata.query(q) - def existingInWhelk = whelk.getCards(uris) - - uris - .collect { uri -> - existingInWhelk[uri] - ? new Document(existingInWhelk[uri]) - : whelk.external.getEphemeral(uri).orElse(null) - // TODO? could get e.g 15 URIs from wikidata and then collect results until we get e.g. 5 - } - .grep() - .findAll {typeFilter.test(it) } - .collect { doc -> - whelk.embellish(doc) - JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) - } - } - - private Predicate extTypeFilter(Map query) { - def queryTypes = query[TYPE_KEY] - boolean isAnyTypeOk = !queryTypes || queryTypes.any { it == '*' } - return { Document doc -> - def extType = doc.getThingType() - isAnyTypeOk || queryTypes.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it)} - } - } - - List selectExternalByUri(Map query) { - if (!query.q || !JsonLd.looksLikeIri(query.q.first())) { - return [] - } - - String iri = query.q.first().trim() - if (iri.contains('|')) { // TODO: cataloging client does "term | term*" in side panel search... 
- iri = iri.split('\\|').first().trim() - } - - def existsInWhelk = { String i -> - def existing = whelk.getCards([i]) - return !existing.isEmpty() && !(new Document(existing[i]).isPlaceholder()) - } - - if (existsInWhelk(iri)) { - return [] - } - - return whelk.external.getEphemeral(iri).map ({ doc -> - if (existsInWhelk(doc.getThingIdentifiers().first())) { // iri was an alias/sameAs - query.q = [doc.getThingIdentifiers().first()] as String[] - return [] - } - - if (extTypeFilter(query).test(doc)) { - whelk.embellish(doc) - [JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)] - } else { - [] - } - }).orElse([]) - } - Map removeMappingFromParams(Map pageParams, Map mapping) { Map params = pageParams.clone() String variable = mapping['variable'] diff --git a/rest/src/main/webapp/WEB-INF/web.xml b/rest/src/main/webapp/WEB-INF/web.xml index 345a555382..b3665c0cd4 100644 --- a/rest/src/main/webapp/WEB-INF/web.xml +++ b/rest/src/main/webapp/WEB-INF/web.xml @@ -62,6 +62,10 @@ RemoteSearch whelk.rest.api.RemoteSearchAPI + + ExternalEntitiesSearchAPI + whelk.rest.api.ExternalEntitiesSearchAPI + MarcConverter whelk.rest.api.ConverterAPI @@ -116,6 +120,11 @@ /_remotesearch + + ExternalEntitiesSearchAPI + /_externalentities + + MarcConverter /_convert diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 953e629099..4d911e8497 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -306,6 +306,20 @@ class ElasticSearch { log.warn("Record with id $identifier was not deleted from the Elasticsearch index: $e") } } + + Map retrieveIndexedDocument(String systemId) { + try { + mapper.readValue(client.performRequest('GET', + "/${indexName}/_doc/$systemId/_source", ''), Map) + } catch (UnexpectedHttpStatusException e) { + if (isMissingDocument(e)) { + return null + } + else { + throw e + } 
+ } + } String getShapeForIndex(Document document, Whelk whelk) { if (document.isPlaceholder()) { diff --git a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy index 49bbd6ab69..c46b26fcaa 100644 --- a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy +++ b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy @@ -28,7 +28,6 @@ class ExternalEntities { ExternalEntities() { Metrics.cacheMetrics.addCache('external-entities', cache) } - Optional get(String iri) { if (mappers.any { it.mightHandle(iri) }) { cache.get(iri).map{ it.clone() } From ae8a6b01c82af425c40fe7ff4bad450e8af24886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 11 Nov 2021 16:09:32 +0100 Subject: [PATCH 33/47] Refactor SearchUtils.Lookup. Use it in ExternalEntitiesSearch --- .../rest/api/ExternalEntitiesSearchAPI.groovy | 24 ++++-- .../groovy/whelk/rest/api/SearchUtils.groovy | 75 +++++++++++-------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy index 43da2c12f2..cc1dfdf5e5 100644 --- a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -12,6 +12,7 @@ import javax.servlet.http.HttpServletRequest import javax.servlet.http.HttpServletResponse import java.util.function.Predicate +import static whelk.JsonLd.CONTEXT_KEY import static whelk.JsonLd.TYPE_KEY class ExternalEntitiesSearchAPI extends HttpServlet { @@ -30,25 +31,32 @@ class ExternalEntitiesSearchAPI extends HttpServlet { def items = JsonLd.looksLikeIri(q) ? 
selectExternal(q, types) : searchExternal(q, types) + + SearchUtils.Lookup lookup = new SearchUtils.Lookup(whelk) - def mapping = [:] + // TODO: proper mapping + def mappings = [] if (q) { - mapping << ['variable' : 'q', - 'predicate': whelk.jsonld.toTermKey('textQuery'), + mappings << ['variable' : 'q', + 'predicate': lookup.chip('textQuery'), 'value' : q] } + def (paramMappings, _) = SearchUtils.mapParams(lookup, request.getParameterMap()) + mappings.addAll(paramMappings) def result = [ - (TYPE_KEY): 'PartialCollectionView', + (CONTEXT_KEY): Crud.CONTEXT_PATH, + (TYPE_KEY) : 'PartialCollectionView', 'itemOffset' : 0, 'totalItems' : items.size(), - 'search': [ - 'mapping' : mapping + 'search' : [ + 'mapping': mappings ], - 'items' : items - // TODO: other, @type etc + 'items' : items ] + lookup.run() + HttpTools.sendResponse(response, result, MimeTypes.JSONLD) } diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index 2e8e546cbb..5466a5d6b3 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -38,7 +38,7 @@ class SearchUtils { Whelk whelk JsonLd ld ESQuery esQuery - URI vocabUri + SearchUtils(Whelk whelk) { this(whelk.jsonld) @@ -48,9 +48,6 @@ class SearchUtils { SearchUtils(JsonLd jsonld) { this.ld = jsonld - if (ld.vocabId) { - vocabUri = new URI(ld.vocabId) - } } Map doSearch(Map queryParameters) { @@ -136,7 +133,7 @@ class SearchUtils { Map esResult = esQuery.doQuery(queryParameters, suggest) - Lookup lookup = new Lookup() + Lookup lookup = new Lookup(whelk) List mappings = [] if (query) { @@ -457,9 +454,21 @@ class SearchUtils { } } - private class Lookup { + static class Lookup { private Multimap iriPos = ArrayListMultimap.create() + + private Whelk whelk + private JsonLd ld + private URI vocabUri + Lookup(Whelk whelk) { + this.whelk = whelk + this.ld = whelk.jsonld + if (ld.vocabId) { + vocabUri = new 
URI(ld.vocabId) + } + } + Map chip(String itemRepr) { boolean matchesTerm = false def itemId = itemRepr @@ -517,37 +526,37 @@ class SearchUtils { it.value.putAll(chip) } } - } - - private Map dummyChip(String itemId) { - [(ID_KEY): itemId, 'label': itemId] - } - /* - * Read vocab term data from storage. - * - * Returns null if not found. - * - */ - private String getFullUri(String id) { - try { - if (vocabUri) { - return vocabUri.resolve(id).toString() + private Map dummyChip(String itemId) { + [(ID_KEY): itemId, 'label': itemId] + } + + /* + * Read vocab term data from storage. + * + * Returns null if not found. + * + */ + private String getFullUri(String id) { + try { + if (vocabUri) { + return vocabUri.resolve(id).toString() + } + } + catch (IllegalArgumentException e) { + // Couldn't resolve, which means id isn't a valid IRI. + // No need to check the db. + return null } } - catch (IllegalArgumentException e) { - // Couldn't resolve, which means id isn't a valid IRI. - // No need to check the db. - return null + + // FIXME move to Document or JsonLd + private Map getEntry(Map jsonLd, String entryId) { + // we rely on this convention for the time being. + return jsonLd[(GRAPH_KEY)].find { it[ID_KEY] == entryId } } } - // FIXME move to Document or JsonLd - private Map getEntry(Map jsonLd, String entryId) { - // we rely on this convention for the time being. - return jsonLd[(GRAPH_KEY)].find { it[ID_KEY] == entryId } - } - /** * Create a URL for '/find' with the specified query parameters. * @@ -702,7 +711,7 @@ class SearchUtils { * filtered out. 
* */ - private Tuple2 mapParams(Lookup lookup, Map params) { + static Tuple2 mapParams(Lookup lookup, Map params) { List result = [] Map pageParams = [:] List reservedParams = getReservedParameters() @@ -753,7 +762,7 @@ class SearchUtils { /* * Return a list of reserved query params */ - private List getReservedParameters() { + private static List getReservedParameters() { return ['q', 'p', 'o', 'value', '_limit', '_offset', '_suggest'] } From 44e5e68a1b68e23dd7e5399354378ebbbe09c658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 11 Nov 2021 16:10:48 +0100 Subject: [PATCH 34/47] Refactor SearchUtils.Lookup. Use it in ExternalEntitiesSearch --- rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy index 5466a5d6b3..61f3e12fac 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils.groovy @@ -711,7 +711,7 @@ class SearchUtils { * filtered out. 
* */ - static Tuple2 mapParams(Lookup lookup, Map params) { + static Tuple2 mapParams(Lookup lookup, Map params) { List result = [] Map pageParams = [:] List reservedParams = getReservedParameters() From 93e4c8a17701aa1690217a45f1f868c3a2bf1d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 11 Nov 2021 16:41:47 +0100 Subject: [PATCH 35/47] Clean up --- .../rest/api/ExternalEntitiesSearchAPI.groovy | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy index cc1dfdf5e5..ac67b0ca69 100644 --- a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -97,27 +97,16 @@ class ExternalEntitiesSearchAPI extends HttpServlet { List selectExternal(String iri, Collection types) { def typeFilter = typeFilter(types) - Closure whelkResult = { Map data -> - Document doc = new Document(data) - if (!typeFilter.test(doc)) { - return [] - } - insertReverseLinkCount(doc) - whelk.embellish(doc) - def framed = JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) - return [framed] - } - def inWhelk = whelk.getCards([iri]) if (inWhelk[iri]) { - return whelkResult(inWhelk[iri]) + return whelkResult(inWhelk[iri], typeFilter) } return whelk.external.getEphemeral(iri).map ({ doc -> def extId = doc.getThingIdentifiers().first() inWhelk = whelk.getCards([extId]) if (inWhelk[extId]) { // iri was an alias/sameAs - return whelkResult(inWhelk[extId]) + return whelkResult(inWhelk[extId], typeFilter) } if (typeFilter(types).test(doc)) { @@ -128,6 +117,17 @@ class ExternalEntitiesSearchAPI extends HttpServlet { } }).orElse([]) } + + List whelkResult(Map data, typeFilter) { + Document doc = new Document(data) + if (!typeFilter.test(doc)) { + return [] + } + insertReverseLinkCount(doc) + 
whelk.embellish(doc) + def framed = JsonLd.frame(doc.getThingIdentifiers().first(), doc.data) + return [framed] + } void insertReverseLinkCount(Document doc) { whelk.elastic.retrieveIndexedDocument(doc.getShortId())?.with { From 038eb1601c3fb2ae2e0a66a4641c485f42558a5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 18 Nov 2021 15:56:13 +0100 Subject: [PATCH 36/47] Language parameter in external search --- .../rest/api/ExternalEntitiesSearchAPI.groovy | 13 +++++------ .../src/main/groovy/whelk/JsonLd.groovy | 2 +- .../groovy/whelk/external/Wikidata.groovy | 22 ++++++++++++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy index ac67b0ca69..9b8a712303 100644 --- a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -25,16 +25,16 @@ class ExternalEntitiesSearchAPI extends HttpServlet { @Override protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { - String q = request.getParameter('q')?.trim() + String q = request.getParameter('q')?.trim() ?: '' def types = request.getParameterMap().get(TYPE_KEY) as List ?: [] + def language = request.getParameter('_lang') ?: 'sv' def items = JsonLd.looksLikeIri(q) ? 
selectExternal(q, types) - : searchExternal(q, types) + : searchExternal(q, types, language) SearchUtils.Lookup lookup = new SearchUtils.Lookup(whelk) - // TODO: proper mapping def mappings = [] if (q) { mappings << ['variable' : 'q', @@ -60,13 +60,12 @@ class ExternalEntitiesSearchAPI extends HttpServlet { HttpTools.sendResponse(response, result, MimeTypes.JSONLD) } - List searchExternal(String q, Collection types) { + List searchExternal(String q, Collection types, languageTag) { def typeFilter = typeFilter(types) - def uris = Wikidata.query(q) + def uris = Wikidata.query(q, languageTag, 5) def inWhelk = whelk.getCards(uris) - - // TODO? could get e.g 15 URIs from wikidata and then collect results until we get e.g. 5 + uris .collect { uri -> if (inWhelk[uri]) { diff --git a/whelk-core/src/main/groovy/whelk/JsonLd.groovy b/whelk-core/src/main/groovy/whelk/JsonLd.groovy index 7610217316..dfec57e287 100644 --- a/whelk-core/src/main/groovy/whelk/JsonLd.groovy +++ b/whelk-core/src/main/groovy/whelk/JsonLd.groovy @@ -552,7 +552,7 @@ class JsonLd { } static boolean looksLikeIri(String s) { - s.startsWith('https://') || s.startsWith('http://') + s && (s.startsWith('https://') || s.startsWith('http://')) } static boolean looksLikeIri(String s) { diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 1521b66a56..4224417b4d 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -43,22 +43,32 @@ class Wikidata implements Mapper { iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } - static List query(String query) { + static List query(String query, String langTag, int limit) { try { - performQuery(query) + performQuery(query, langTag, limit) } catch (Exception e) { throw new WhelkRuntimeException("Error querying wikidata: $e", e) } } - private static List performQuery(String 
query) { + /** + * Search Wikidata using the wbsearchentities API + * Documented here: https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities + * + * Language parameter: "Search in this language. This only affects how entities are selected, not + * the language in which the results are returned: this is controlled by the "uselang" parameter." + * + * @param query the query string + * @param langTag language code for language to search in + * @param limit max number of hits + * @return a list of entity URIs + */ + private static List performQuery(String query, String langTag, int limit) { HttpClient client = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.NORMAL).build() def base = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json' - def lang = 'sv' - def limit = 5 def q = URLEncoder.encode(query, StandardCharsets.UTF_8) - String uri = "$base&limit=$limit&language=$lang&uselang=$lang&search=$q" + String uri = "$base&limit=$limit&language=$langTag&uselang=$langTag&search=$q" HttpRequest request = HttpRequest.newBuilder() .uri(URI.create(uri)) From 949919b1056e457a19fadfe9ed88c511a93a309c Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 22 Nov 2021 14:24:17 +0100 Subject: [PATCH 37/47] Add wdtn prefix --- whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy index 76a5f229ad..2573abfec1 100644 --- a/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy +++ b/whelk-core/src/main/groovy/whelk/external/QueryRunner.groovy @@ -23,6 +23,7 @@ class QueryRunner { "skos" : "http://www.w3.org/2004/02/skos/core#", "wd" : "http://www.wikidata.org/entity/", "wdt" : "http://www.wikidata.org/prop/direct/", + "wdtn" : "http://www.wikidata.org/prop/direct-normalized/", "wikibase": "http://wikiba.se/ontology#" ] From 
260906e6d1fdcba3146d8050df73d3473a33b0f2 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 22 Nov 2021 14:25:24 +0100 Subject: [PATCH 38/47] Map various identifiers --- .../groovy/whelk/external/Wikidata.groovy | 160 +++++++++++++++--- 1 file changed, 133 insertions(+), 27 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 4224417b4d..e8cbbe3730 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -1,6 +1,7 @@ package whelk.external import groovy.transform.Memoized +import org.apache.jena.query.QuerySolution import org.apache.jena.query.ResultSet import org.apache.jena.rdf.model.Model import org.apache.jena.rdf.model.ModelFactory @@ -28,7 +29,7 @@ class Wikidata implements Mapper { return Optional.ofNullable(wdEntity.convert()) } - + @Override boolean mightHandle(String iri) { return isWikidata(iri) @@ -42,7 +43,7 @@ class Wikidata implements Mapper { static boolean isWikidata(String iri) { iri.startsWith("https://www.wikidata.org") || iri.startsWith("http://www.wikidata.org") } - + static List query(String query, String langTag, int limit) { try { performQuery(query, langTag, limit) @@ -55,10 +56,10 @@ class Wikidata implements Mapper { /** * Search Wikidata using the wbsearchentities API * Documented here: https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities - * + * * Language parameter: "Search in this language. This only affects how entities are selected, not * the language in which the results are returned: this is controlled by the "uselang" parameter." 
- * + * * @param query the query string * @param langTag language code for language to search in * @param limit max number of hits @@ -69,7 +70,7 @@ class Wikidata implements Mapper { def base = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json' def q = URLEncoder.encode(query, StandardCharsets.UTF_8) String uri = "$base&limit=$limit&language=$langTag&uselang=$langTag&search=$q" - + HttpRequest request = HttpRequest.newBuilder() .uri(URI.create(uri)) .timeout(Duration.ofSeconds(30)) @@ -80,7 +81,7 @@ class Wikidata implements Mapper { def result = mapper.readValue(httpResponse.body(), Map.class) .get('search') .collect { (String) it['concepturi'] } - + return result } } @@ -91,20 +92,34 @@ class WikidataEntity { // Wikidata property short ids static final String COUNTRY = "P17" + static final String DDC = "P1036" + static final String EDITION = "P747" static final String END_TIME = "P582" + static final String FAST = "P2163" + static final String FREEBASE = "P646" + static final String GEONAMES = "P1566" static final String INSTANCE_OF = "P31" - static final String PART_OF_PLACE = "P131" // located in the administrative territorial entity + static final String LC_AUTH = "P244" + static final String LOCATED_IN = "P131" // located in the administrative territorial entity static final String SUBCLASS_OF = "P279" - - enum Type { - PLACE('Q618123'), // Geographical feature - PERSON('Q5'), // Human + static final String TORA = "P4820" + static final String YSO = "P2347" + + // Wikidata class short ids + static final String GEO_FEATURE = "Q618123" + static final String HUMAN = "Q5" + static final String SWEDISH_MUNI = "Q127448" + static final String SWEDISH_COUNTY = "Q200547" + + enum KbvType { + PLACE(GEO_FEATURE), + PERSON(HUMAN), OTHER('') - String baseClass + String wikidataType - private Type(String baseClass) { - this.baseClass = baseClass + private KbvType(String wikidataType) { + this.wikidataType = wikidataType } } @@ -137,8 +152,8 @@ class 
WikidataEntity { Map convert() { switch (type()) { - case Type.PLACE: return convertPlace() - case Type.PERSON: return convertPerson() + case KbvType.PLACE: return convertPlace() + case KbvType.PERSON: return convertPerson() default: return null } } @@ -155,17 +170,47 @@ class WikidataEntity { place['prefLabelByLang'] = prefLabel.collectEntries { [it.getLanguage(), it.getLexicalForm()] } List description = getDescription().findAll { it.getLanguage() in ElasticSearch.LANGUAGES_TO_INDEX } - if (!prefLabel.isEmpty()) + if (!description.isEmpty()) place['descriptionByLang'] = description.collectEntries { [it.getLanguage(), it.getLexicalForm()] } List country = getCountry().findAll { it.toString() != entityIri } if (!country.isEmpty()) place['country'] = country.collect { ['@id': it.toString()] } - List partOf = getPartOfPlace() - country + List partOf = getLocatedIn() - country if (!partOf.isEmpty()) place['isPartOf'] = partOf.collect { ['@id': it.toString()] } + List ddc = getDdc().collect { code, edition -> + Map bNode = + [ + '@type': "ClassificationDdc", + 'code' : code.toString() + ] + if (edition) + bNode['edition'] = ['@id': edition.toString()] + + return bNode + } + + List lcsh = getLcsh().collect { + ['@id': it.toString()] + } + + List fast = getFast().collect { + ['@id': it.toString()] + } + + List closeMatches = ddc + lcsh + fast + + if (closeMatches) { + place['closeMatch'] = closeMatches + } + + List identifiers = getPlaceIdentifiers() + if (!identifiers.isEmpty()) + place['exactMatch'] = identifiers.collect { ['@id': it.toString()] } + return place } @@ -207,11 +252,11 @@ class WikidataEntity { return rs.collect { it.get("country") } } - List getPartOfPlace() { + List getLocatedIn() { String queryString = """ - SELECT ?place { - wd:${shortId} p:${PART_OF_PLACE} ?stmt . - ?stmt ps:${PART_OF_PLACE} ?place . + SELECT DISTINCT ?place { + wd:${shortId} p:${LOCATED_IN} ?stmt . + ?stmt ps:${LOCATED_IN} ?place . 
FILTER NOT EXISTS { ?stmt pq:${END_TIME} ?endTime } } """ @@ -221,22 +266,83 @@ class WikidataEntity { return rs.collect { it.get("place") } } - Type type() { + List> getDdc() { + String queryString = """ + SELECT ?code ?edition { + wd:${shortId} wdt:${DDC} ?code ; + wdt:${DDC} ?stmt . + OPTIONAL { ?stmt pq:${EDITION} ?edition } + } + """ + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { [it.get("code"), it.get("edition")] } + } + + List getLcsh() { + String queryString = """ + SELECT ?fullId { + wd:${shortId} wdtn:${LC_AUTH} ?fullId ; + wdt:${LC_AUTH} ?shortId . + FILTER(strstarts(?shortId, "sh")) + } + """ + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("fullId") } + } + + List getFast() { + String queryString = """ + SELECT ?fastId { + wd:${shortId} wdtn:${FAST} ?fastId ; + } + """ + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("fastId") } + } + + List getPlaceIdentifiers() { + String queryString = """ + SELECT ?freebaseId ?geonamesId ?toraId { + VALUES ?place { wd:${shortId} } + + OPTIONAL { ?place wdtn:${FREEBASE} ?freebaseId } + OPTIONAL { ?place wdtn:${GEONAMES} ?geonamesId } + OPTIONAL { ?place wdt:${TORA} ?toraShortId } + OPTIONAL { ?place wdtn:${YSO} ?ysoId } + + bind(iri(concat("https://data.riksarkivet.se/tora/", ?toraShortId)) as ?toraId) + } + """ + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + QuerySolution singleRowResult = rs.next() + + return rs.getResultVars().findResults { singleRowResult?.get(it) } + } + + KbvType type() { String queryString = "SELECT ?type { wd:${shortId} wdt:${INSTANCE_OF} ?type }" ResultSet rs = QueryRunner.localSelectResult(queryString, graph) Set wdTypes = rs.collect { it.get("type").toString() } as Set - return Type.values().find { getSubclasses(it).intersect(wdTypes) } ?: Type.OTHER + return KbvType.values().find { getSubclasses(it).intersect(wdTypes) 
} ?: KbvType.OTHER } @Memoized - static Set getSubclasses(Type type) { - if (type == Type.OTHER) { + static Set getSubclasses(KbvType type) { + if (type == KbvType.OTHER) { return Collections.EMPTY_SET } - String queryString = "SELECT ?class { ?class wdt:${SUBCLASS_OF}* wd:${type.baseClass} }" + String queryString = "SELECT ?class { ?class wdt:${SUBCLASS_OF}* wd:${type.wikidataType} }" ResultSet rs = QueryRunner.remoteSelectResult(queryString, WIKIDATA_ENDPOINT) From 72e80d013a594ee464e90efd43923e12f5a1d7d9 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 23 Nov 2021 07:57:45 +0100 Subject: [PATCH 39/47] Select YSO --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index e8cbbe3730..199ff3de29 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -308,7 +308,7 @@ class WikidataEntity { List getPlaceIdentifiers() { String queryString = """ - SELECT ?freebaseId ?geonamesId ?toraId { + SELECT ?freebaseId ?geonamesId ?toraId ?ysoId { VALUES ?place { wd:${shortId} } OPTIONAL { ?place wdtn:${FREEBASE} ?freebaseId } From 01cbfa35f4033d8df8f1d095b46e242ff07ee559 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 23 Nov 2021 11:31:02 +0100 Subject: [PATCH 40/47] Get correct namespace for lcsh URI --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 199ff3de29..6932b1e2f0 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -282,16 +282,16 @@ class WikidataEntity { List getLcsh() { String queryString = """ - SELECT 
?fullId { - wd:${shortId} wdtn:${LC_AUTH} ?fullId ; - wdt:${LC_AUTH} ?shortId . + SELECT ?id { + wd:${shortId} wdt:${LC_AUTH} ?shortId . + bind(iri(concat("http://id.loc.gov/authorities/subjects/", ?shortId)) as ?id) FILTER(strstarts(?shortId, "sh")) } """ ResultSet rs = QueryRunner.localSelectResult(queryString, graph) - return rs.collect { it.get("fullId") } + return rs.collect { it.get("id") } } List getFast() { From 9b4cdcb417f44bf31168a8333984a30af5b73e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 25 Nov 2021 12:52:51 +0100 Subject: [PATCH 41/47] Use locatedIn instead of isPartOf for Places --- whelk-core/src/main/groovy/whelk/external/Wikidata.groovy | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index 6932b1e2f0..ff3d21d068 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -177,9 +177,9 @@ class WikidataEntity { if (!country.isEmpty()) place['country'] = country.collect { ['@id': it.toString()] } - List partOf = getLocatedIn() - country - if (!partOf.isEmpty()) - place['isPartOf'] = partOf.collect { ['@id': it.toString()] } + List locatedIn = getLocatedIn() - country + if (!locatedIn.isEmpty()) + place['locatedIn'] = locatedIn.collect { ['@id': it.toString()] } List ddc = getDdc().collect { code, edition -> Map bNode = From 577990d45be13b1c9b21477fa33606555db3687d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Thu, 25 Nov 2021 14:29:40 +0100 Subject: [PATCH 42/47] Replace Wikidata countries with id.kb.se countries --- .../rest/api/ExternalEntitiesSearchAPI.groovy | 2 + whelk-core/src/main/groovy/whelk/Whelk.groovy | 3 +- .../whelk/external/ExternalEntities.groovy | 44 ++++++++++++++++--- .../groovy/whelk/external/Wikidata.groovy | 24 ++++++++-- 4 files changed, 62 
insertions(+), 11 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy index 9b8a712303..d8b8c3b088 100644 --- a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -64,6 +64,8 @@ class ExternalEntitiesSearchAPI extends HttpServlet { def typeFilter = typeFilter(types) def uris = Wikidata.query(q, languageTag, 5) + uris.removeAll(whelk.external.getBannedImports()) + def inWhelk = whelk.getCards(uris) uris diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 402a3e8941..04bc53f853 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -63,7 +63,7 @@ class Whelk { RomanizationStep.LanguageResources languageResources ElasticFind elasticFind Relations relations - ExternalEntities external = new ExternalEntities() + ExternalEntities external DocumentNormalizer normalizer Romanizer romanizer @@ -222,6 +222,7 @@ class Whelk { elasticFind = new ElasticFind(new ESQuery(this)) initDocumentNormalizers(elasticFind) } + external = new ExternalEntities(this) } // FIXME: de-KBV/Libris-ify: some of these are KBV specific, is that a problem? 
diff --git a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy index c46b26fcaa..8bdea561e4 100644 --- a/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy +++ b/whelk-core/src/main/groovy/whelk/external/ExternalEntities.groovy @@ -6,15 +6,15 @@ import com.google.common.cache.LoadingCache import whelk.Document import whelk.IdGenerator import whelk.JsonLd +import whelk.Whelk import whelk.util.Metrics class ExternalEntities { - private static final List mappers = [ - new Wikidata(), - ] + private final List mappers private static final int CACHE_SIZE = 10_000 - + private final Set bannedImports + private LoadingCache> cache = CacheBuilder.newBuilder() .maximumSize(CACHE_SIZE) .recordStats() @@ -25,9 +25,16 @@ class ExternalEntities { } }) - ExternalEntities() { + ExternalEntities(Whelk whelk) { + Map countryMappings = loadCountryMappings(whelk) + mappers = [ + new Wikidata(countryMappings), + ] + bannedImports = Collections.unmodifiableSet(countryMappings.keySet()) + Metrics.cacheMetrics.addCache('external-entities', cache) } + Optional get(String iri) { if (mappers.any { it.mightHandle(iri) }) { cache.get(iri).map{ it.clone() } @@ -44,8 +51,12 @@ class ExternalEntities { doc } } + + Set getBannedImports() { + return bannedImports + } - private static Optional getInternal(String iri) { + private Optional getInternal(String iri) { Document d = mappers.findResult { mapper -> mapper.getThing(iri).map{ document(it, JsonLd.CACHE_RECORD_TYPE, mapper.datasetId()) }.orElse(null) } @@ -80,4 +91,25 @@ class ExternalEntities { ] ]) } + + private static Map loadCountryMappings(Whelk whelk) { + if (!whelk.elasticFind) { + return [:] + } + + def query = [ + (JsonLd.TYPE_KEY): ['Country'], + "q" : ["*"], + '_sort' : [JsonLd.ID_KEY] + ] + + Map result = [:] + def recordIds = whelk.elasticFind.findIds(query).collect{ whelk.baseUri.toString() + it } + whelk.bulkLoad(recordIds).collect { 
id, doc -> + JsonLd.asList(doc.getThing()['exactMatch']).each { match -> + result[(String) match[JsonLd.ID_KEY]] = doc.getThingIdentifiers().first() + } + } + return result + } } diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index ff3d21d068..bd3d49a4fd 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -9,6 +9,7 @@ import org.apache.jena.rdf.model.RDFNode import whelk.component.ElasticSearch import whelk.exception.WhelkRuntimeException import whelk.util.Metrics +import groovy.util.logging.Log4j2 as Log import java.net.http.HttpClient import java.net.http.HttpRequest @@ -18,14 +19,22 @@ import java.time.Duration import static whelk.util.Jackson.mapper +@Log class Wikidata implements Mapper { + Map countryMap + + Wikidata(Map countryMap) { + this.countryMap = countryMap + log.info("Initialized with ${countryMap.size()} country mappings") + } + @Override Optional getThing(String iri) { if (!isWikidata(iri)) { return Optional.empty() } - WikidataEntity wdEntity = new WikidataEntity(iri) + WikidataEntity wdEntity = new WikidataEntity(iri, countryMap) return Optional.ofNullable(wdEntity.convert()) } @@ -128,7 +137,9 @@ class WikidataEntity { String entityIri String shortId - WikidataEntity(String iri) { + Map countryMap + + WikidataEntity(String iri, Map countryMap) { try { graph = ModelFactory.createDefaultModel() this.shortId = getShortId(iri) @@ -138,6 +149,7 @@ class WikidataEntity { catch (ExceptionInInitializerError e) { e.printStackTrace() } + this.countryMap = countryMap } private void loadGraph() { @@ -175,11 +187,11 @@ class WikidataEntity { List country = getCountry().findAll { it.toString() != entityIri } if (!country.isEmpty()) - place['country'] = country.collect { ['@id': it.toString()] } + place['country'] = country.collect { ['@id': replaceIfCountry(it.toString())] } List locatedIn = 
getLocatedIn() - country if (!locatedIn.isEmpty()) - place['locatedIn'] = locatedIn.collect { ['@id': it.toString()] } + place['locatedIn'] = locatedIn.collect { ['@id': replaceIfCountry(it.toString())] } List ddc = getDdc().collect { code, edition -> Map bNode = @@ -213,6 +225,10 @@ class WikidataEntity { return place } + + String replaceIfCountry(String id) { + return countryMap.getOrDefault(id, id) + } Map convertPerson() { Map person = From c611ac080e2201516703c5b1e93ee373d9bb92ef Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 30 Nov 2021 13:55:41 +0100 Subject: [PATCH 43/47] Add getty closeMatch --- .../groovy/whelk/external/Wikidata.groovy | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy index bd3d49a4fd..b3efd4cbc1 100644 --- a/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy +++ b/whelk-core/src/main/groovy/whelk/external/Wikidata.groovy @@ -107,6 +107,7 @@ class WikidataEntity { static final String FAST = "P2163" static final String FREEBASE = "P646" static final String GEONAMES = "P1566" + static final String GETTY = "P1667" static final String INSTANCE_OF = "P31" static final String LC_AUTH = "P244" static final String LOCATED_IN = "P131" // located in the administrative territorial entity @@ -213,7 +214,11 @@ class WikidataEntity { ['@id': it.toString()] } - List closeMatches = ddc + lcsh + fast + List getty = getGetty().collect { + ['@id': it.toString()] + } + + List closeMatches = ddc + lcsh + fast + getty if (closeMatches) { place['closeMatch'] = closeMatches @@ -322,6 +327,18 @@ class WikidataEntity { return rs.collect { it.get("fastId") } } + List getGetty() { + String queryString = """ + SELECT ?gettyId { + wd:${shortId} wdtn:${GETTY} ?gettyId ; + } + """ + + ResultSet rs = QueryRunner.localSelectResult(queryString, graph) + + return rs.collect { it.get("gettyId") } + } + List
getPlaceIdentifiers() { String queryString = """ SELECT ?freebaseId ?geonamesId ?toraId ?ysoId { From 7b73d1d5ee51284c0ba08899c64854d07f699697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Tue, 30 Nov 2021 17:10:25 +0100 Subject: [PATCH 44/47] Fix select external by id --- .../whelk/rest/api/ExternalEntitiesSearchAPI.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy index d8b8c3b088..faa1771149 100644 --- a/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy +++ b/rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy @@ -96,21 +96,21 @@ class ExternalEntitiesSearchAPI extends HttpServlet { } List selectExternal(String iri, Collection types) { - def typeFilter = typeFilter(types) + def theTypeFilter = typeFilter(types) def inWhelk = whelk.getCards([iri]) if (inWhelk[iri]) { - return whelkResult(inWhelk[iri], typeFilter) + return whelkResult(inWhelk[iri], theTypeFilter) } return whelk.external.getEphemeral(iri).map ({ doc -> def extId = doc.getThingIdentifiers().first() inWhelk = whelk.getCards([extId]) if (inWhelk[extId]) { // iri was an alias/sameAs - return whelkResult(inWhelk[extId], typeFilter) + return whelkResult(inWhelk[extId], theTypeFilter) } - if (typeFilter(types).test(doc)) { + if (theTypeFilter.test(doc)) { whelk.embellish(doc) [JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)] } else { From 87ce7754ce64f81f6ab0a369ee08d5adcde25a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 22 Mar 2023 13:55:37 +0100 Subject: [PATCH 45/47] Fix merge --- whelk-core/src/main/groovy/whelk/JsonLd.groovy | 4 ---- 1 file changed, 4 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/JsonLd.groovy b/whelk-core/src/main/groovy/whelk/JsonLd.groovy index dfec57e287..ac9ab7671d 100644 --- 
a/whelk-core/src/main/groovy/whelk/JsonLd.groovy +++ b/whelk-core/src/main/groovy/whelk/JsonLd.groovy @@ -555,10 +555,6 @@ class JsonLd { s && (s.startsWith('https://') || s.startsWith('http://')) } - static boolean looksLikeIri(String s) { - s && (s.startsWith('https://') || s.startsWith('http://')) - } - static List> findPaths(Map obj, String key, String value) { return findPaths(obj, key, [value].toSet()) } From b153f94aee4f74e3a035e0a84efc733bcaaf473d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 22 Mar 2023 15:34:46 +0100 Subject: [PATCH 46/47] Fix merge --- whelk-core/src/main/groovy/whelk/Whelk.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 04bc53f853..442522326e 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -691,7 +691,7 @@ class Whelk { redirectedIris[iri] = extDoc.getThingIdentifiers().first() extDoc.addThingIdentifier(iri) } - storeAtomicUpdate(extDoc, minorUpdate, changedIn, changedBy, checkSum) + storeAtomicUpdate(extDoc, minorUpdate, false, changedIn, changedBy, checkSum) }) } catch (Exception e) { // TODO From 301bb688c10e9302b8edf854e0248e0e6b0703f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 22 Mar 2023 16:14:25 +0100 Subject: [PATCH 47/47] Fix merge --- whelk-core/build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/whelk-core/build.gradle b/whelk-core/build.gradle index 2526384a86..1c7f2169b8 100644 --- a/whelk-core/build.gradle +++ b/whelk-core/build.gradle @@ -95,6 +95,7 @@ dependencies { api 'commons-io:commons-io:2.11.0' implementation "org.apache.httpcomponents:httpclient:${httpComponentsClientVersion}" implementation "org.apache.httpcomponents:httpcore:${httpComponentsCoreVersion}" + api 'org.apache.jena:apache-jena-libs:3.17.0' implementation 
'org.apache.jena:apache-jena-libs:3.17.0' api "org.codehaus.groovy:groovy-json:${groovyVersion}" api "org.codehaus.groovy:groovy-xml:${groovyVersion}"