feat(emm): Use AS terms in full dump (#1529)
olovy authored Nov 25, 2024
1 parent e4cca7b commit be8af49
Showing 3 changed files with 63 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ Session.vim
/*.properties
/*.properties-*
!gradle.properties
+emm/exampleclient/libris-cache.sqlite3

# Java/Gradle Artifacts
/.gradle
22 changes: 16 additions & 6 deletions emm/exampleclient/client.py
@@ -26,7 +26,7 @@
# embedded within another entity, or somewhere else)?
# If so, we must update that entity wherever we keep it.
#
-# This is all we need to do to keap our cache up date in perpetuity.
+# This is all we need to do to keep our cache up to date in perpetuity.
#
# For this example client we are going to keep knowledge graphs in an SQLITE
# table called 'entities'. These knowledge graphs will always consist of an
@@ -121,7 +121,7 @@ def download_entity(url):
    req = Request(url)
    req.add_header('accept', 'application/json+ld')
    try:
-        return json.load(urlopen(req))["@graph"][1]
+        return get_main_entity(json.load(urlopen(req)))
    except:
        return None

@@ -218,17 +218,22 @@ def load_dump(connection):
    dumpCreationTime = None
    while next_url:
        with urllib.request.urlopen(next_url) as response:
+            print(f"Getting {next_url}")
            data = json.load(response)
-            dumpCreationTimeOnPage = data["creationTime"]
+            dumpCreationTimeOnPage = data["startTime"]
            if (dumpCreationTime and dumpCreationTime != dumpCreationTimeOnPage):
                print(" DUMP INVALIDATED WHILE DOWNLOADING, TODO: DEAL WITH THIS ")
            dumpCreationTime = dumpCreationTimeOnPage
            if "next" in data:
                next_url = data["next"]
            else:
                next_url = None
-            if "entities" in data:
-                for entity in data["entities"]:
+            if "items" in data:
+                for item in data["items"]:
+                    if "@graph" not in item:  # skip @context
+                        continue
+
+                    entity = get_main_entity(item)
                    embellish(entity, connection)
                    ingest_entity(entity, connection)
    cursor = connection.cursor()
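For orientation, here is a minimal sketch (Python, with made-up URLs and values) of the ActivityStreams-shaped page that load_dump() now consumes: "creationTime" has become "startTime", records arrive under "items" rather than "entities", and the first page leads with the @context document, which carries no "@graph" key and is therefore skipped by the loop above.

# Hypothetical dump page illustrating the new AS terms; URLs and values
# are invented for this sketch (JSON null is None in Python).
example_page = {
    "@context": "https://www.w3.org/ns/activitystreams",
    "@id": "https://libris.example/api/emm?dump=itemAndInstance:sigel&offset=0",
    "type": "CollectionPage",
    "startTime": "2024-11-25T10:00:00Z",
    "_status": "done",
    "totalItems": 2,
    "next": "https://libris.example/api/emm?dump=itemAndInstance:sigel&offset=100",
    "items": [
        # First item on page 0: the @context document (no "@graph",
        # so the loop above skips it).
        {"@id": "https://libris.example/context", "@context": {"ex": "https://example.org/ns#"}},
        # Regular item: a named graph where @graph[0] is the Record
        # and @graph[1] is the main entity.
        {
            "@id": "https://libris.example/abc123",
            "@context": [None, "https://libris.example/context"],
            "@graph": [
                {"@id": "https://libris.example/abc123"},
                {"@id": "https://libris.example/abc123#it"},
            ],
        },
    ],
}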
@@ -244,6 +249,11 @@ def load_dump(connection):
    connection.commit()


+def get_main_entity(named_graph):
+    # FIXME? relying on XL convention @graph[0] = Record, @graph[1] = Main entity
+    return named_graph["@graph"][1]


#
# Given a root entity 'r', and a replacement/update of some embedded entity 'u',
# update the data of 'u' wherever it is copied/embedded into 'r'.
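And a usage sketch for get_main_entity() under the XL convention its comment describes (identifiers are hypothetical):

record_with_entity = {"@graph": [
    {"@id": "https://libris.example/abc123"},     # @graph[0]: the Record
    {"@id": "https://libris.example/abc123#it"},  # @graph[1]: the main entity
]}
assert get_main_entity(record_with_entity)["@id"] == "https://libris.example/abc123#it"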
@@ -344,7 +354,7 @@ def update(connection):
            result = cursor.execute("SELECT julianday(changes_consumed_until) - julianday(?) FROM state", (item["published"],))
            diff = result.fetchone()[0]
            if (float(diff) >= 0.0):
-                print(f"{item["published"]} is before our last taken update, stop here.")
+                print(f"{item['published']} is before our last taken update, stop here.")
                next_url = None
                break
            handle_activity(connection, item)
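The julianday() arithmetic in this hunk converts both timestamps to fractional day counts, so a non-negative difference means item["published"] is at or before changes_consumed_until. A self-contained sketch of the same comparison, with made-up timestamps:

import sqlite3

con = sqlite3.connect(":memory:")
diff = con.execute(
    "SELECT julianday(?) - julianday(?)",
    ("2024-11-25T12:00:00Z",   # changes_consumed_until
     "2024-11-24T12:00:00Z"),  # item["published"]
).fetchone()[0]
print(float(diff) >= 0.0)  # True: this activity is already consumed, so stop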
59 changes: 46 additions & 13 deletions emm/src/main/java/whelk/Dump.java
@@ -27,6 +27,7 @@
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.LinkedHashMap;
+import java.util.List;
import java.util.Map;
import java.util.Set;

@@ -170,26 +171,39 @@ private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String

        BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class);
        Instant dumpCreationTime = attributes.creationTime().toInstant();
-        sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines + EmmChangeSet.TARGET_HITS_PER_PAGE, totalEntityCount, dumpCreationTime);
+        sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime);
    }

-    private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList<String> recordIdsOnPage, HttpServletResponse res, long nextLineOffset, Long totalEntityCount, Instant dumpCreationTime) throws IOException {
+    private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList<String> recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException {
        var responseObject = new LinkedHashMap<>();

-        responseObject.put("creationTime", ZonedDateTime.ofInstant(dumpCreationTime, ZoneOffset.UTC).toString());
+        responseObject.put(JsonLd.CONTEXT_KEY, "https://www.w3.org/ns/activitystreams");
+        responseObject.put(JsonLd.ID_KEY, apiBaseUrl+"?dump="+dump+"&offset="+offset);
+        responseObject.put("type", "CollectionPage");
+        responseObject.put("startTime", ZonedDateTime.ofInstant(dumpCreationTime, ZoneOffset.UTC).toString());
        if (totalEntityCount == null)
-            responseObject.put("status", "generating");
+            responseObject.put("_status", "generating");
        else {
-            responseObject.put("status", "done");
-            responseObject.put("totalEntityCount", totalEntityCount);
+            responseObject.put("_status", "done");
+            responseObject.put("totalItems", totalEntityCount);
        }

-        if (totalEntityCount == null || nextLineOffset < totalEntityCount) {
-            responseObject.put("next", apiBaseUrl+"?dump="+dump+"&offset="+nextLineOffset);
+        long nextOffset = offset + EmmChangeSet.TARGET_HITS_PER_PAGE;
+        if (totalEntityCount == null || nextOffset < totalEntityCount) {
+            responseObject.put("next", apiBaseUrl+"?dump="+dump+"&offset="+nextOffset);
        }

+        var items = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE);
+        responseObject.put("items", items);
+
+        var contextDoc = contextDoc(whelk);
+        if (offset == 0) {
+            items.add(Map.of(
+                    JsonLd.ID_KEY, contextDoc.getRecordIdentifiers().getFirst(),
+                    JsonLd.CONTEXT_KEY, contextDoc.data.get(JsonLd.CONTEXT_KEY)
+            ));
+        }
+
-        var entitiesList = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE);
-        responseObject.put("entities", entitiesList);
        Map<String, Document> idsAndRecords = whelk.bulkLoad(recordIdsOnPage);
        for (Document doc : idsAndRecords.values()) {

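Sketched in Python, the paging rule the rewritten Java now follows: a page carries its own offset in "@id" and only emits "next" while more pages remain; the page size and base URL below are illustrative stand-ins for EmmChangeSet.TARGET_HITS_PER_PAGE and the real API base.

TARGET_HITS_PER_PAGE = 100  # assumed page size for this sketch

def page_links(api_base_url, dump, offset, total_entity_count):
    links = {"@id": f"{api_base_url}?dump={dump}&offset={offset}"}
    next_offset = offset + TARGET_HITS_PER_PAGE
    # While the dump is still generating (total unknown), always offer "next".
    if total_entity_count is None or next_offset < total_entity_count:
        links["next"] = f"{api_base_url}?dump={dump}&offset={next_offset}"
    return links

print(page_links("https://libris.example/api/emm", "itemAndInstance:sigel", 200, 250))
# {'@id': 'https://libris.example/api/emm?dump=itemAndInstance:sigel&offset=200'}
# No "next" key: 200 + 100 >= 250, so this is the last page.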
@@ -207,22 +221,41 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String
                    logger.warn("Bad instance? " + itemOf);
                    continue;
                }
+                // TODO just put instance as its own graph in items?
                var itemOfPath = new ArrayList<>();
                itemOfPath.add("@graph"); itemOfPath.add(1); itemOfPath.add("itemOf"); // unggh..
                doc._set(itemOfPath, instance.getThing(), doc.data);
-                entitiesList.add(doc.getThing());
-            }
+
+                items.add(wrapDoc(doc, contextDoc));
+            }
            // For normal categories
            else {
-                entitiesList.add(doc.getThing());
+                items.add(wrapDoc(doc, contextDoc));
            }

        }

        HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE);
    }

+    private static Object wrapDoc(Document doc, Document contextDoc) {
+        var context = new ArrayList<>();
+        context.add(null);
+        context.add(contextDoc.getRecordIdentifiers().getFirst());
+        return Map.of(
+                JsonLd.ID_KEY, doc.getRecordIdentifiers().getFirst(),
+                JsonLd.CONTEXT_KEY, context,
+                JsonLd.GRAPH_KEY, doc.data.get(JsonLd.GRAPH_KEY)
+        );
+    }
+
+    private static Document contextDoc(Whelk whelk) {
+        // FIXME whelk load by IRI
+        var docs = whelk.bulkLoad(List.of(whelk.getSystemContextUri()));
+        assert docs.size() == 1;
+        return docs.entrySet().stream().findFirst().get().getValue();
+    }

    private static void invalidateIfOld(Path dumpFilePath) {
        try {
            if (!Files.exists(dumpFilePath))
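Each element that wrapDoc() appends to "items" comes out shaped like the sketch below (Python data, hypothetical identifiers). The two-element @context uses JSON-LD's array form: per JSON-LD context processing, the leading null resets the active context before the referenced context document is applied.

wrapped_item = {
    "@id": "https://libris.example/abc123",
    "@context": [None, "https://libris.example/context"],  # null, then the shared context IRI
    "@graph": [
        {"@id": "https://libris.example/abc123"},     # the Record
        {"@id": "https://libris.example/abc123#it"},  # the main entity
    ],
}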
