Skip to content

Commit

Permalink
Bugfix/lxl 4600 unverified ids (#1534)
Browse files Browse the repository at this point in the history
* Filter out surely invalid ids when collecting short ids

* Load only verified xl ids in collectFormBNodeIdToResourceIds

* Don't add prefix to already prefixed types

* Add missing type declaration
  • Loading branch information
kwahlin authored and olovy committed Dec 10, 2024
1 parent 4f11676 commit 954cffa
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 22 deletions.
31 changes: 15 additions & 16 deletions whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import static whelk.JsonLd.RECORD_TYPE
import static whelk.JsonLd.THING_KEY
import static whelk.JsonLd.TYPE_KEY
import static whelk.JsonLd.asList
import static whelk.JsonLd.looksLikeIri
import static whelk.component.SparqlQueryClient.GRAPH_VAR
import static whelk.converter.JsonLDTurtleConverter.toTurtleNoPrelude
import static whelk.util.DocumentUtil.getAtPath
Expand Down Expand Up @@ -80,7 +81,7 @@ class MatchForm {
return path.findAll { it instanceof String } as List<String>
}

private getSubtypes() {
private Set<String> getSubtypes() {
return getSubtypes(form)
}

Expand Down Expand Up @@ -220,8 +221,9 @@ class MatchForm {

private String insertTypeMappings(String sparqlPattern) {
if (shouldMatchSubtypes() && getSubtypes()) {
def baseType = form[TYPE_KEY]
String valuesClause = "VALUES ?$baseType { ${([baseType] + getSubtypes()).collect { ":$it" }.join(" ")} }\n"
String baseType = form[TYPE_KEY]
String values = ([baseType] + getSubtypes()).collect { it.contains(":") ? it : ":$it" }.join(" ")
String valuesClause = "VALUES ?$baseType { $values }\n"
return valuesClause + sparqlPattern
}
return sparqlPattern
Expand Down Expand Up @@ -265,34 +267,31 @@ class MatchForm {
if (!anyOf) {
return
}
def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set<String>
def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as List<String>
if (ids) {
String nodeId = node[BNODE_ID]

def (iris, shortIds) = ids.split(JsonLd::looksLikeIri)
if (shortIds.isEmpty()) {
nodeIdMappings[nodeId] = iris
return
}

if (!idLoader) {
nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT }
nodeIdMappings[nodeId] = ids.findResults {
IdLoader.isXlShortId(it)
? Document.BASE_URI.toString() + it + Document.HASH_IT
: (looksLikeIri(it) ? it : null)
} as Set<String>
return
}

def nodeType = node[TYPE_KEY]
def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null
def xlShortIds = idLoader.collectXlShortIds(shortIds as List<String>, marcCollection)
def xlShortIds = idLoader.collectXlShortIds(ids, marcCollection)
def parentProp = dropIndexes(path).reverse()[1]
def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) }
// TODO: Fix hardcoding
def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata")
def isRecord = whelk.jsonld.isInstanceOf((Map) node, "AdminMetadata")
|| isInRange(RECORD_TYPE)
|| isInRange("AdminMetadata")

nodeIdMappings[nodeId] = iris + xlShortIds.collect {
Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT)
}
nodeIdMappings[nodeId] = idLoader.loadAllIds(xlShortIds)
.collect { isRecord ? it.recordIri() : it.thingIri() } as Set<String>

return new DocumentUtil.Nop()
}
Expand Down
24 changes: 18 additions & 6 deletions whelktool/src/main/java/whelk/datatool/util/IdLoader.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package whelk.datatool.util;

import whelk.JsonLd;
import whelk.component.PostgreSQLComponent;

import java.io.BufferedReader;
Expand All @@ -18,6 +19,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;

import static whelk.datatool.WhelkTool.DEFAULT_FETCH_SIZE;
Expand Down Expand Up @@ -65,20 +67,30 @@ public List<Id> loadAllIds(Collection<String> shortIds) {
}
}

public List<String> collectXlShortIds(Collection<String> xlIds) {
Map<String, String> iriToShortId = findShortIdsForIris(xlIds.stream().filter(id -> id.contains(":")).toList());
return xlIds.stream().map(id -> iriToShortId.getOrDefault(id, id)).toList();
public List<String> collectXlShortIds(Collection<String> ids) {
Map<String, String> iriToShortId = findShortIdsForIris(ids.stream().filter(JsonLd::looksLikeIri).toList());
return ids.stream()
.map(id -> iriToShortId.getOrDefault(id, isXlShortId(id) ? id : null))
.filter(Objects::nonNull)
.toList();
}

public List<String> collectXlShortIds(Collection<String> ids, String marcCollection) {
if (!Arrays.asList("bib", "auth", "hold").contains(marcCollection)) {
return collectXlShortIds(ids);
}
Map<String, String> iriToShortId = findShortIdsForIris(ids.stream().filter(id -> id.contains(":")).toList());
Map<String, String> voyagerIdToXlShortID = findXlShortIdsForVoyagerIds(
Map<String, String> iriToShortId = findShortIdsForIris(ids.stream().filter(JsonLd::looksLikeIri).toList());
Map<String, String> voyagerIdToXlShortId = findXlShortIdsForVoyagerIds(
ids.stream().filter(IdLoader::isVoyagerId).toList(),
marcCollection);
return ids.stream().map(id -> iriToShortId.getOrDefault(id, voyagerIdToXlShortID.getOrDefault(id, id))).toList();
return ids.stream()
.map(id -> iriToShortId.getOrDefault(id, voyagerIdToXlShortId.getOrDefault(id, isXlShortId(id) ? id : null)))
.filter(Objects::nonNull)
.toList();
}

public static boolean isXlShortId(String id) {
return id.matches("[0-9a-z]{15,17}");
}

private Map<String, String> findXlShortIdsForVoyagerIds(Collection<String> voyagerIds, String marcCollection) {
Expand Down

0 comments on commit 954cffa

Please sign in to comment.