Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic mechanisms for using/caching external entities #1009

Draft
wants to merge 47 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
0a3eb2a
Add script for analyzing broken/external links
olovy Oct 7, 2021
0a39b25
Add QueryRunner class
kwahlin Oct 13, 2021
b2750bd
Add prototype for caching external entities
olovy Oct 8, 2021
a17ee55
Add cache to External
olovy Oct 13, 2021
6338ce1
Set correct entity iri plus some clean up
kwahlin Oct 14, 2021
4c813cd
Read RDF as Turtle
kwahlin Oct 14, 2021
16d4719
Wikidata dataset URI
olovy Oct 14, 2021
2b498a2
Handle alias URIs in external search
olovy Oct 14, 2021
090b46b
Add more prefixes and simplify select querying
kwahlin Oct 14, 2021
0bc23a6
Exclude historical places from isPartOf and reorganize
kwahlin Oct 14, 2021
5fe0ada
Correct variable mistake
kwahlin Oct 14, 2021
638fe8e
Naming
olovy Oct 14, 2021
aebda9f
Collect cache metrics for External
olovy Oct 15, 2021
f176d71
Naming
olovy Oct 15, 2021
22092d0
Only cache results for IRIs that might actually be mapped
olovy Oct 15, 2021
28a3a6c
Index placeholders
olovy Oct 15, 2021
de19129
Exclude country link to self in countries
olovy Oct 15, 2021
b61e3a9
Add script for analysing production/manufacture places
olovy Oct 15, 2021
dcf62b8
Don't send cache records and placeholders to Virtuoso
olovy Oct 15, 2021
bf02e9d
Display and index placeholders as actual things if possible
olovy Oct 20, 2021
51e70dd
Make "ephemeral" things have record type placeholder
olovy Oct 21, 2021
45b61c0
Naming
olovy Oct 22, 2021
8d02247
Only create placeholders for added links
olovy Oct 22, 2021
cd625c7
Handle redirected external URIs when saving
olovy Oct 22, 2021
8fd0d7d
Map wikidata description
olovy Oct 22, 2021
757b2e8
Fix publication place script
olovy Oct 22, 2021
e2ef6ca
Collect metrics for wikidata .ttl?flavor=dump
olovy Oct 25, 2021
372a174
Add Wikidata search prototype
olovy Nov 3, 2021
ce61a14
Bump apache-jena-libs
olovy Nov 3, 2021
731a0a6
Inject docs that exist in whelk in wikidata search results
olovy Nov 3, 2021
beb39b7
Set changedBy for cache records to same as request that caused creation
olovy Nov 3, 2021
07107f3
Move external entity search to separate servlet
olovy Nov 11, 2021
ae8a6b0
Refactor SearchUtils.Lookup. Use it in ExternalEntitiesSearch
olovy Nov 11, 2021
44e5e68
Refactor SearchUtils.Lookup. Use it in ExternalEntitiesSearch
olovy Nov 11, 2021
93e4c8a
Clean up
olovy Nov 11, 2021
038eb16
Language parameter in external search
olovy Nov 18, 2021
949919b
Add wdtn prefix
kwahlin Nov 22, 2021
260906e
Map various identifiers
kwahlin Nov 22, 2021
72e80d0
Select YSO
kwahlin Nov 23, 2021
01cbfa3
Get correct namespace for lcsh URI
kwahlin Nov 23, 2021
9b4cdcb
Use locatedIn instead of isPartOf for Places
olovy Nov 25, 2021
577990d
Replace Wikidata countries with id.kb.se countries
olovy Nov 25, 2021
c611ac0
Add getty closeMatch
kwahlin Nov 30, 2021
7b73d1d
Fix select external by id
olovy Nov 30, 2021
87ce775
Fix merge
olovy Mar 22, 2023
b153f94
Fix merge
olovy Mar 22, 2023
301bb68
Fix merge
olovy Mar 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions rest/src/main/groovy/whelk/rest/api/Crud.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,13 @@ class Crud extends HttpServlet {
sendGetResponse(response, body, eTag, request.getPath(), request.getContentType(), request.getId())
} else {
ETag eTag

if (doc.isPlaceholder()) {
whelk.external.getEphemeral(doc.getThingIdentifiers().first()).ifPresent({ ext ->
doc.setThing(ext.getThing())
})
}

if (request.shouldEmbellish()) {
String plainChecksum = doc.getChecksum(jsonld)
whelk.embellish(doc)
Expand Down Expand Up @@ -679,7 +686,13 @@ class Crud extends HttpServlet {
try {
if (doc) {
String activeSigel = request.getHeader(XL_ACTIVE_SIGEL_HEADER)

String collection = doc.getLegacyCollection(jsonld)

if (doc.isCacheRecord()) {
throw new BadRequestException("Cannot POST/PUT cache record")
}

if (isUpdate) {

// You are not allowed to change collection when updating a record
Expand Down
140 changes: 140 additions & 0 deletions rest/src/main/groovy/whelk/rest/api/ExternalEntitiesSearchAPI.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package whelk.rest.api

import whelk.Document
import whelk.JsonLd
import whelk.Whelk
import whelk.external.Wikidata
import whelk.util.WhelkFactory

import javax.servlet.ServletException
import javax.servlet.http.HttpServlet
import javax.servlet.http.HttpServletRequest
import javax.servlet.http.HttpServletResponse
import java.util.function.Predicate

import static whelk.JsonLd.CONTEXT_KEY
import static whelk.JsonLd.TYPE_KEY

/**
 * Servlet that looks up entities in an external source (Wikidata) and returns
 * them as a JSON-LD 'PartialCollectionView'.
 *
 * Request parameters read by {@link #doGet}:
 * <ul>
 *   <li>{@code q} - free-text query, or an IRI to select one specific entity</li>
 *   <li>{@code @type} (JsonLd.TYPE_KEY) - zero or more accepted types; {@code *} accepts any type</li>
 *   <li>{@code _lang} - language tag handed to the external search, defaults to {@code sv}</li>
 * </ul>
 *
 * Entities that already exist in whelk are returned from whelk (with their
 * reverse link count); everything else is returned as an ephemeral document
 * obtained from {@code whelk.external}.
 */
class ExternalEntitiesSearchAPI extends HttpServlet {
    /** Maximum number of hits requested from the external search for one query. */
    private static final int MAX_EXTERNAL_HITS = 5

    Whelk whelk

    /** Binds this servlet to the shared Whelk instance. */
    @Override
    void init() {
        whelk = WhelkFactory.getSingletonWhelk()
    }

    /**
     * Handles a search request and writes the result as JSON-LD.
     *
     * A query that looks like an IRI is resolved directly with {@link #selectExternal};
     * anything else is treated as free text and passed to {@link #searchExternal}.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        String q = request.getParameter('q')?.trim() ?: ''
        def types = request.getParameterMap().get(TYPE_KEY) as List ?: []
        def language = request.getParameter('_lang') ?: 'sv'

        def items = JsonLd.looksLikeIri(q)
                ? selectExternal(q, types)
                : searchExternal(q, types, language)

        SearchUtils.Lookup lookup = new SearchUtils.Lookup(whelk)

        // Describe how the request parameters were interpreted ('search.mapping' in the response).
        def mappings = []
        if (q) {
            mappings << ['variable' : 'q',
                         'predicate': lookup.chip('textQuery'),
                         'value'    : q]
        }
        def (paramMappings, _) = SearchUtils.mapParams(lookup, request.getParameterMap())
        mappings.addAll(paramMappings)

        def result = [
                (CONTEXT_KEY): Crud.CONTEXT_PATH,
                (TYPE_KEY)   : 'PartialCollectionView',
                'itemOffset' : 0,
                'totalItems' : items.size(),
                'search'     : [
                        'mapping': mappings
                ],
                'items'      : items
        ]

        // NOTE(review): presumably this resolves the chips requested through lookup.chip()
        // and mapParams() in place, which is why it must run before the response is
        // serialized - confirm against SearchUtils.Lookup.
        lookup.run()

        HttpTools.sendResponse(response, result, MimeTypes.JSONLD)
    }

    /**
     * Free-text search in Wikidata.
     *
     * @param q           the text to search for; a blank value yields an empty result
     *                    without contacting the external service
     * @param types       accepted types, see {@link #typeFilter}
     * @param languageTag language tag passed on to {@code Wikidata.query}
     * @return framed JSON-LD for every hit that is not a banned import and passes
     *         the type filter, in the order the hits were returned
     */
    List searchExternal(String q, Collection<String> types, languageTag) {
        if (!q?.trim()) {
            return []
        }

        def typeFilter = typeFilter(types)

        // Filter into a new list instead of removeAll(): the list returned by
        // Wikidata.query is not ours to modify.
        def banned = whelk.external.getBannedImports()
        def uris = Wikidata.query(q, languageTag, MAX_EXTERNAL_HITS).findAll { !banned.contains(it) }

        def inWhelk = whelk.getCards(uris)

        return uris.collectMany { uri ->
            def card = inWhelk[uri]
            if (card) {
                // whelkResult applies the type filter before the reverse link lookup,
                // so no index lookups are spent on documents that are filtered out.
                return whelkResult(card, typeFilter)
            }

            def ephemeral = whelk.external.getEphemeral(uri).orElse(null)
            (ephemeral && typeFilter.test(ephemeral)) ? [embellishAndFrame(ephemeral)] : []
        }
    }

    /**
     * Builds a predicate that accepts a document when its thing type equals, or is
     * a subclass of, one of the given types.
     *
     * @param types accepted types; empty/null or containing {@code *} accepts everything
     */
    private Predicate<Document> typeFilter(Collection<String> types) {
        boolean isAnyTypeOk = !types || types.any { it == '*' }
        return { Document doc ->
            def extType = doc.getThingType()
            isAnyTypeOk || types.any { it == extType || whelk.jsonld.isSubClassOf(extType, (String) it) }
        }
    }

    /**
     * Selects a single entity by IRI.
     *
     * Lookup order: the IRI in whelk, then the external source. If the external
     * document's own identifier exists in whelk (the IRI was an alias/sameAs),
     * the whelk version is returned instead of the external one.
     *
     * @param iri   IRI of the wanted entity
     * @param types accepted types, see {@link #typeFilter}
     * @return a list with the framed entity, or an empty list if it was not found
     *         or did not pass the type filter
     */
    List selectExternal(String iri, Collection<String> types) {
        def theTypeFilter = typeFilter(types)

        def inWhelk = whelk.getCards([iri])
        if (inWhelk[iri]) {
            return whelkResult(inWhelk[iri], theTypeFilter)
        }

        return whelk.external.getEphemeral(iri).map({ doc ->
            def extId = doc.getThingIdentifiers().first()
            inWhelk = whelk.getCards([extId])
            if (inWhelk[extId]) { // iri was an alias/sameAs
                return whelkResult(inWhelk[extId], theTypeFilter)
            }

            theTypeFilter.test(doc) ? [embellishAndFrame(doc)] : []
        }).orElse([])
    }

    /**
     * Turns card data for an entity that exists in whelk into a search result.
     *
     * @param data       the card data, used to construct a Document
     * @param typeFilter predicate deciding whether the document is wanted
     * @return a list with the framed entity including its reverse link count,
     *         or an empty list if the document was rejected by the filter
     */
    List whelkResult(Map data, typeFilter) {
        Document doc = new Document(data)
        if (!typeFilter.test(doc)) {
            return []
        }
        insertReverseLinkCount(doc)
        return [embellishAndFrame(doc)]
    }

    /**
     * Copies 'reverseLinks' from the indexed version of the document, when the
     * index has one, onto the second entry of the document's graph.
     * NOTE(review): the second graph entry is presumably the main entity - confirm.
     */
    void insertReverseLinkCount(Document doc) {
        whelk.elastic.retrieveIndexedDocument(doc.getShortId())?.with {
            if (it.reverseLinks) {
                doc.data[JsonLd.GRAPH_KEY][1]['reverseLinks'] = it.reverseLinks
            }
        }
    }

    /** Embellishes the document and frames it around its first thing identifier. */
    private Map embellishAndFrame(Document doc) {
        whelk.embellish(doc)
        return JsonLd.frame(doc.getThingIdentifiers().first(), doc.data)
    }
}
Loading