Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Actually bulk load documents in one query in bulkLoad #1124

Merged
merged 3 commits into from
Jun 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions whelk-core/src/main/groovy/whelk/JsonLd.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,10 @@ class JsonLd {
return (o instanceof List) ? (List) o : o != null ? [o] : []
}

static boolean looksLikeIri(String s) {
Copy link
Contributor Author

@olovy olovy Jun 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From #1009

s && (s.startsWith('https://') || s.startsWith('http://'))
}

/**
 * Single-value convenience overload: wraps {@code value} in a one-element set
 * and delegates to the set-based {@code findPaths}.
 */
static List<List<String>> findPaths(Map obj, String key, String value) {
    Set<String> singleValue = [value].toSet()
    return findPaths(obj, key, singleValue)
}
Expand Down
34 changes: 22 additions & 12 deletions whelk-core/src/main/groovy/whelk/Whelk.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -164,23 +164,33 @@ class Whelk {
}

Map<String, Document> bulkLoad(Collection<String> ids) {
Map<String, Document> result = [:]
def idMap = [:]
def otherIris = []
List<String> systemIds = []
ids.each { id ->
Document doc

// Fetch from DB
if (id.startsWith(Document.BASE_URI.toString())) {
id = Document.BASE_URI.resolve(id).getPath().substring(1)
def systemId = Document.BASE_URI.resolve(id).getPath().substring(1)
idMap[systemId] = id
systemIds << systemId
}
doc = storage.load(id)
if (doc == null)
doc = storage.getDocumentByIri(id)

if (doc && !doc.deleted) {
olovy marked this conversation as resolved.
Show resolved Hide resolved
result[id] = doc
else if (JsonLd.looksLikeIri(id)) {
otherIris << id
}
else {
systemIds << id
}
}
return result
if (otherIris) {
Map<String, String> idToIri = storage.getSystemIdsByIris(otherIris)
.collectEntries { k, v -> [(v): k] }

systemIds.addAll(idToIri.keySet())
idMap.putAll(idToIri)
}

return storage.bulkLoad(systemIds)
.findAll { id, doc -> !doc.deleted }
.collectEntries { id, doc -> [(idMap.getOrDefault(id, id)) : doc]}
}

private void reindex(Document updated, Document preUpdateDoc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ class PostgreSQLComponent {
private static final String GET_DOCUMENT_VERSION =
"SELECT id, data FROM lddb__versions WHERE id = ? AND checksum = ?"

// Loads many documents with a single statement: the one parameter is an array
// of ids, which unnest() expands into rows that are joined against lddb on id.
// An id with no matching lddb row yields no result row. The query does not
// filter on the deleted column; it is returned so callers can decide.
private static final String BULK_LOAD_DOCUMENTS = """
SELECT id, data, created, modified, deleted
FROM unnest(?) AS in_id, lddb l
WHERE in_id = l.id
""".stripIndent()

private static final String GET_EMBELLISHED_DOCUMENT =
"SELECT data from lddb__embellished where id = ?"

Expand Down Expand Up @@ -1827,6 +1833,27 @@ class PostgreSQLComponent {
}
return doc
}

/**
 * Loads the documents for the given system ids using one execution of
 * BULK_LOAD_DOCUMENTS, binding all ids as a single TEXT array parameter.
 *
 * @param systemIds ids to look up
 * @return a map, sorted by id, from each id found in the result set to the
 *         Document assembled from its row; ids that produce no row are absent
 */
Map<String, Document> bulkLoad(Iterable<String> systemIds) {
    return withDbConnection {
        Connection conn = getMyConnection()
        PreparedStatement statement = null
        ResultSet rows = null
        try {
            statement = conn.prepareStatement(BULK_LOAD_DOCUMENTS)
            def idArray = conn.createArrayOf("TEXT", systemIds as String[])
            statement.setArray(1, idArray)
            rows = statement.executeQuery()

            SortedMap<String, Document> docsById = new TreeMap<>()
            while (rows.next()) {
                docsById[rows.getString("id")] = assembleDocument(rows)
            }
            return docsById
        } finally {
            // Result set and statement are released on both success and failure.
            close(rows, statement)
        }
    }
}

String getSystemIdByIri(String iri) {
return withDbConnection {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package whelk.filter

import com.google.common.collect.Iterables
import whelk.Whelk
import whelk.search.ESQuery
import whelk.search.ElasticFind
Expand Down Expand Up @@ -63,9 +64,8 @@ class BlankNodeLinker implements DocumentUtil.Linker {
'_sort' : [ID_KEY]
]

finder.findIds(q).each { id ->
def doc = whelk.getDocument(id)
if (doc) {
Iterables.partition(finder.findIds(q), 100).each { List<String> i ->
whelk.bulkLoad(i).each { id, doc ->
addDefinition(doc.data[GRAPH_KEY][1])
}
}
Expand Down