From e32f0ba57d2e159fed8546e6dd173a157a4d6e8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Tue, 7 Jun 2022 16:42:33 +0200 Subject: [PATCH 1/3] Use bulkLoad in BlankNodeLinker.loadDefinitions --- .../src/main/groovy/whelk/filter/BlankNodeLinker.groovy | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/filter/BlankNodeLinker.groovy b/whelk-core/src/main/groovy/whelk/filter/BlankNodeLinker.groovy index 21bb14e1b4..69793be56e 100644 --- a/whelk-core/src/main/groovy/whelk/filter/BlankNodeLinker.groovy +++ b/whelk-core/src/main/groovy/whelk/filter/BlankNodeLinker.groovy @@ -1,5 +1,6 @@ package whelk.filter +import com.google.common.collect.Iterables import whelk.Whelk import whelk.search.ESQuery import whelk.search.ElasticFind @@ -63,9 +64,8 @@ class BlankNodeLinker implements DocumentUtil.Linker { '_sort' : [ID_KEY] ] - finder.findIds(q).each { id -> - def doc = whelk.getDocument(id) - if (doc) { + Iterables.partition(finder.findIds(q), 100).each { List i -> + whelk.bulkLoad(i).each { id, doc -> addDefinition(doc.data[GRAPH_KEY][1]) } } From b5e8477626d95e7a5c5728080b656644dd309385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Tue, 7 Jun 2022 17:24:59 +0200 Subject: [PATCH 2/3] Load all documents in one (1) query in whelk.bulkLoad --- .../src/main/groovy/whelk/JsonLd.groovy | 4 +++ whelk-core/src/main/groovy/whelk/Whelk.groovy | 34 ++++++++++++------- .../component/PostgreSQLComponent.groovy | 28 +++++++++++++++ 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/JsonLd.groovy b/whelk-core/src/main/groovy/whelk/JsonLd.groovy index 43149a8350..9b81a2c6f3 100644 --- a/whelk-core/src/main/groovy/whelk/JsonLd.groovy +++ b/whelk-core/src/main/groovy/whelk/JsonLd.groovy @@ -511,6 +511,10 @@ class JsonLd { return (o instanceof List) ? (List) o : o != null ? [o] : [] } + static boolean looksLikeIri(String s) { + s && (s.startsWith('https://') || s.startsWith('http://')) + } + static List> findPaths(Map obj, String key, String value) { return findPaths(obj, key, [value].toSet()) } diff --git a/whelk-core/src/main/groovy/whelk/Whelk.groovy b/whelk-core/src/main/groovy/whelk/Whelk.groovy index 9f5793b66f..0d59f543a2 100644 --- a/whelk-core/src/main/groovy/whelk/Whelk.groovy +++ b/whelk-core/src/main/groovy/whelk/Whelk.groovy @@ -164,23 +164,33 @@ class Whelk { } Map bulkLoad(Collection ids) { - Map result = [:] + def idMap = [:] + def otherIris = [] + List systemIds = [] ids.each { id -> - Document doc - - // Fetch from DB if (id.startsWith(Document.BASE_URI.toString())) { - id = Document.BASE_URI.resolve(id).getPath().substring(1) + def systemId = Document.BASE_URI.resolve(id).getPath().substring(1) + idMap[systemId] = id + systemIds << systemId } - doc = storage.load(id) - if (doc == null) - doc = storage.getDocumentByIri(id) - - if (doc && !doc.deleted) { - result[id] = doc + else if (JsonLd.looksLikeIri(id)) { + otherIris << id + } + else { + systemIds << id } } - return result + if (otherIris) { + Map idToIri = storage.getSystemIdsByIris(otherIris) + .collectEntries { k, v -> [(v): k] } + + systemIds.addAll(idToIri.keySet()) + idMap.putAll(idToIri) + } + + return storage.bulkLoad(systemIds) + .findAll { id, doc -> !doc.deleted } + .collectEntries { id, doc -> [(idMap.getOrDefault(id, id)) : doc]} } private void reindex(Document updated, Document preUpdateDoc) { diff --git a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy index 21ed3b1e16..9bb165c9ad 100644 --- a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy +++ b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy @@ -7,6 +7,7 @@ import com.zaxxer.hikari.metrics.prometheus.PrometheusHistogramMetricsTrackerFac import groovy.json.StringEscapeUtils import groovy.transform.CompileStatic import groovy.util.logging.Log4j2 as Log +import org.apache.jena.atlas.iterator.Iter import org.postgresql.PGStatement import org.postgresql.util.PGobject import org.postgresql.util.PSQLException @@ -118,6 +119,12 @@ class PostgreSQLComponent { private static final String GET_DOCUMENT_VERSION = "SELECT id, data FROM lddb__versions WHERE id = ? AND checksum = ?" + private static final String BULK_LOAD_DOCUMENTS = """ + SELECT id, data, created, modified, deleted + FROM unnest(?) AS in_id, lddb l + WHERE in_id = l.id + """.stripIndent() + private static final String GET_EMBELLISHED_DOCUMENT = "SELECT data from lddb__embellished where id = ?" @@ -1827,6 +1834,27 @@ class PostgreSQLComponent { } return doc } + + Map bulkLoad(Iterable systemIds) { + return withDbConnection { + Connection connection = getMyConnection() + PreparedStatement preparedStatement = null + ResultSet rs = null + try { + preparedStatement = connection.prepareStatement(BULK_LOAD_DOCUMENTS) + preparedStatement.setArray(1, connection.createArrayOf("TEXT", systemIds as String[])) + + rs = preparedStatement.executeQuery() + SortedMap result = new TreeMap<>() + while(rs.next()) { + result[rs.getString("id")] = assembleDocument(rs) + } + return result + } finally { + close(rs, preparedStatement) + } + } + } String getSystemIdByIri(String iri) { return withDbConnection { From e40548acbd4226a8727e26a8da0d3dd1f0d4f3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= Date: Wed, 8 Jun 2022 10:38:38 +0200 Subject: [PATCH 3/3] Remove unused import --- .../src/main/groovy/whelk/component/PostgreSQLComponent.groovy | 1 - 1 file changed, 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy index 9bb165c9ad..2026a4cfa2 100644 --- a/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy +++ b/whelk-core/src/main/groovy/whelk/component/PostgreSQLComponent.groovy @@ -7,7 +7,6 @@ import com.zaxxer.hikari.metrics.prometheus.PrometheusHistogramMetricsTrackerFac import groovy.json.StringEscapeUtils import groovy.transform.CompileStatic import groovy.util.logging.Log4j2 as Log -import org.apache.jena.atlas.iterator.Iter import org.postgresql.PGStatement import org.postgresql.util.PGobject import org.postgresql.util.PSQLException