Skip to content

Commit

Permalink
Merge pull request #1124 from libris/feature/bulk-load
Browse files Browse the repository at this point in the history
Actually bulk load documents in one query in bulkLoad
  • Loading branch information
olovy authored Jun 8, 2022
2 parents 663b8f2 + e40548a commit ecaae76
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 15 deletions.
4 changes: 4 additions & 0 deletions whelk-core/src/main/groovy/whelk/JsonLd.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,10 @@ class JsonLd {
return (o instanceof List) ? (List) o : o != null ? [o] : []
}

/**
 * Cheap heuristic for deciding whether a string is an HTTP(S) IRI.
 *
 * @param s candidate string; may be null or empty
 * @return true when s is non-empty and begins with 'https://' or 'http://'
 */
static boolean looksLikeIri(String s) {
    if (!s) {
        return false
    }
    return ['https://', 'http://'].any { String prefix -> s.startsWith(prefix) }
}

/**
 * Single-value convenience overload: wraps value in a one-element set and
 * delegates to the findPaths overload that receives that set.
 *
 * @param obj   structure to search
 * @param key   key to look for
 * @param value the one value to match
 * @return whatever the delegated findPaths call returns
 */
static List<List<String>> findPaths(Map obj, String key, String value) {
    Set<String> values = new HashSet<String>([value])
    return findPaths(obj, key, values)
}
Expand Down
34 changes: 22 additions & 12 deletions whelk-core/src/main/groovy/whelk/Whelk.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -164,23 +164,33 @@ class Whelk {
}

/**
 * Loads several documents in one storage.bulkLoad call and returns them keyed
 * by the identifier the caller passed in.
 *
 * Each incoming id is classified:
 *  - ids starting with Document.BASE_URI are turned into a system id (the path
 *    of the resolved URI minus its first character) and remembered in idMap;
 *  - other strings accepted by JsonLd.looksLikeIri are collected in otherIris
 *    and resolved through storage.getSystemIdsByIris, whose result is inverted
 *    to systemId -> iri and merged into idMap;
 *  - everything else is used as a system id as-is.
 *
 * Documents whose deleted flag is set are dropped from the result, and each
 * remaining entry is re-keyed with idMap.getOrDefault(id, id).
 *
 * NOTE(review): this listing appears to be a rendered diff; statements from the
 * previous per-id implementation (the `result` map, storage.load /
 * storage.getDocumentByIri, `return result`) are interleaved with the new ones
 * and should be confirmed against the actual file.
 */
Map<String, Document> bulkLoad(Collection<String> ids) {
Map<String, Document> result = [:]
def idMap = [:]
def otherIris = []
List<String> systemIds = []
ids.each { id ->
Document doc

// Fetch from DB
if (id.startsWith(Document.BASE_URI.toString())) {
id = Document.BASE_URI.resolve(id).getPath().substring(1)
def systemId = Document.BASE_URI.resolve(id).getPath().substring(1)
idMap[systemId] = id
systemIds << systemId
}
doc = storage.load(id)
if (doc == null)
doc = storage.getDocumentByIri(id)

if (doc && !doc.deleted) {
result[id] = doc
else if (JsonLd.looksLikeIri(id)) {
otherIris << id
}
else {
systemIds << id
}
}
return result
if (otherIris) {
Map<String, String> idToIri = storage.getSystemIdsByIris(otherIris)
.collectEntries { k, v -> [(v): k] }

systemIds.addAll(idToIri.keySet())
idMap.putAll(idToIri)
}

return storage.bulkLoad(systemIds)
.findAll { id, doc -> !doc.deleted }
.collectEntries { id, doc -> [(idMap.getOrDefault(id, id)) : doc]}
}

private void reindex(Document updated, Document preUpdateDoc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ class PostgreSQLComponent {
// Selects id and data from lddb__versions for one id/checksum pair.
private static final String GET_DOCUMENT_VERSION =
"SELECT id, data FROM lddb__versions WHERE id = ? AND checksum = ?"

// Selects every lddb row whose id occurs in the array bound to the single
// parameter: unnest(?) expands that array into rows (in_id) which are matched
// against l.id. There is no filter on the deleted column, so rows flagged as
// deleted are returned as well.
private static final String BULK_LOAD_DOCUMENTS = """
SELECT id, data, created, modified, deleted
FROM unnest(?) AS in_id, lddb l
WHERE in_id = l.id
""".stripIndent()

// Selects the data column from lddb__embellished for one id.
private static final String GET_EMBELLISHED_DOCUMENT =
"SELECT data from lddb__embellished where id = ?"

Expand Down Expand Up @@ -1827,6 +1833,27 @@ class PostgreSQLComponent {
}
return doc
}

/**
 * Loads several documents with a single BULK_LOAD_DOCUMENTS query.
 *
 * The query does not filter on the deleted column, so callers that want to
 * exclude deleted documents have to do so themselves.
 *
 * @param systemIds ids to match against lddb.id; null or empty yields an empty
 *                  map without acquiring a connection or running the query
 * @return documents keyed by the id column of each returned row, in sorted key
 *         order; ids with no matching row are simply absent
 */
Map<String, Document> bulkLoad(Iterable<String> systemIds) {
    // Nothing to look up: skip the database round trip entirely.
    if (systemIds == null || !systemIds.iterator().hasNext()) {
        return new TreeMap<String, Document>()
    }

    return withDbConnection {
        Connection connection = getMyConnection()
        PreparedStatement preparedStatement = null
        ResultSet rs = null
        try {
            preparedStatement = connection.prepareStatement(BULK_LOAD_DOCUMENTS)
            // All ids travel as one TEXT[] parameter that the query expands with unnest(?).
            preparedStatement.setArray(1, connection.createArrayOf("TEXT", systemIds as String[]))

            rs = preparedStatement.executeQuery()
            SortedMap<String, Document> result = new TreeMap<>()
            while (rs.next()) {
                result[rs.getString("id")] = assembleDocument(rs)
            }
            return result
        } finally {
            close(rs, preparedStatement)
        }
    }
}

String getSystemIdByIri(String iri) {
return withDbConnection {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package whelk.filter

import com.google.common.collect.Iterables
import whelk.Whelk
import whelk.search.ESQuery
import whelk.search.ElasticFind
Expand Down Expand Up @@ -63,9 +64,8 @@ class BlankNodeLinker implements DocumentUtil.Linker {
'_sort' : [ID_KEY]
]

finder.findIds(q).each { id ->
def doc = whelk.getDocument(id)
if (doc) {
Iterables.partition(finder.findIds(q), 100).each { List<String> i ->
whelk.bulkLoad(i).each { id, doc ->
addDefinition(doc.data[GRAPH_KEY][1])
}
}
Expand Down

0 comments on commit ecaae76

Please sign in to comment.