Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dra 400 check filter for preservica7 #58

Merged
merged 10 commits into from
Apr 15, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.annotation.Target;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import dk.kb.datahandler.oai.OaiResponseFilterPreservicaFive;
import dk.kb.datahandler.oai.OaiResponseFilterPreserviceSeven;
import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.slf4j.Logger;
Expand All @@ -27,7 +28,6 @@
import dk.kb.datahandler.oai.OaiRecord;
import dk.kb.datahandler.oai.OaiResponse;
import dk.kb.datahandler.oai.OaiResponseFilter;
import dk.kb.datahandler.oai.OaiResponseFilterPreservica;
import dk.kb.datahandler.oai.OaiTargetJob;
import dk.kb.datahandler.util.HarvestTimeUtil;
import dk.kb.datahandler.util.SolrUtils;
Expand Down Expand Up @@ -278,7 +278,10 @@ private static Integer oaiIngestPerform(OaiTargetJob job, String from, String un
oaiFilter = new OaiResponseFilter(origin, dsAPI);
break;
case PRESERVICA:
oaiFilter = new OaiResponseFilterPreservica(origin, dsAPI);
oaiFilter = new OaiResponseFilterPreserviceSeven(origin, dsAPI);
break;
case PRESERVICA5:
oaiFilter = new OaiResponseFilterPreservicaFive(origin, dsAPI);
break;
default: throw new UnsupportedOperationException(
"Unknown filter '" + oaiTargetDto.getFilter() + "' for target '" + targetName + "'");
Expand Down
36 changes: 24 additions & 12 deletions src/main/java/dk/kb/datahandler/oai/OaiHarvestClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
import java.util.ArrayList;
import java.util.Base64;

import javax.servlet.ServletRequest;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

Expand Down Expand Up @@ -41,7 +44,7 @@ public class OaiHarvestClient {
private String from;
private String until;

public OaiHarvestClient (OaiTargetJob oaiTargetJob,String from, String until){
public OaiHarvestClient(OaiTargetJob oaiTargetJob, String from, String until){
this.oaiTargetJob=oaiTargetJob;
this.oaiTarget=oaiTargetJob.getDto();
this.from=from;
Expand All @@ -65,13 +68,14 @@ public OaiResponse next() throws Exception{
String metadataPrefix= oaiTarget.getMetadataprefix();

uri=addQueryParamsToUri(uri, set, resumptionToken,metadataPrefix,from, until);
log.info("calling uri:"+uri);
log.info("calling uri:"+uri);
//log.info("resumption token at:"+resumptionToken);
String xmlResponse=getHttpResponse(uri,oaiTarget.getUsername(),oaiTarget.getPassword());
String xmlResponse = getHttpResponse(uri, oaiTarget.getUsername(), oaiTarget.getPassword());
log.info("Does the test get here");
tokee marked this conversation as resolved.
Show resolved Hide resolved

Document document =sanitizeXml(xmlResponse,uri);
Document document = sanitizeXml(xmlResponse,uri);

String errorMessage=getErrorMessage(document);
String errorMessage = getErrorMessage(document);
if (errorMessage != null && errorMessage.trim().length() >1) {
log.info("Error message from OAI server when harvesting set:"+set +" message:"+errorMessage);
oaiTargetJob.setCompletedTime(System.currentTimeMillis());
Expand All @@ -84,9 +88,9 @@ public OaiResponse next() throws Exception{
String resumptionToken= getResumptionToken(document);
oaiResponse.setTotalRecords(getResumptionTotalSize(document));

if (resumptionToken != null && !resumptionToken.equals("")) {
if (resumptionToken != null && !resumptionToken.isEmpty()) {
this.resumptionToken = resumptionToken;
log.info("next resumption token:"+resumptionToken);
log.info("next resumption token:"+resumptionToken);
tokee marked this conversation as resolved.
Show resolved Hide resolved
oaiResponse.setResumptionToken(resumptionToken);
}
else {
Expand Down Expand Up @@ -123,7 +127,6 @@ private String addQueryParamsToUri(String uri,String set, String resumptionToken
uri += "&until="+until;
}


if (metadataPrefix != null && resumptionToken == null) {
uri +="&metadataPrefix="+metadataPrefix;
}
Expand Down Expand Up @@ -217,7 +220,6 @@ public static Document sanitizeXml(String xmlResponse, String uri) throws Except
* The solution is to do both.
*
*/

protected static String getHttpResponse(String uri, String user, String password) throws Exception {
HttpClient client = HttpClient.newBuilder()
.authenticator(new Authenticator() {
Expand All @@ -231,11 +233,21 @@ protected PasswordAuthentication getPasswordAuthentication() {
}).build();


/* The Preservica 7 OAI-PMH endpoint is acting up. According to their documentation, they follow the standard
and support Basic authorization. However, the response does not contain a WWW-Authenticate header when queried.

If their access endpoint is used to obtain an access token, the service works. I got an access token by calling the following:
curl -X 'POST' 'DEVELSERVER' -H 'accept: application/json' -H 'Content-Type: application/x-www-form-urlencoded' -d 'username=USERNAME&password=PASSWORD&cookie=false&includeUserDetails=false'
replacing DEVELSERVER, USERNAME and PASSWORD with real values.

I've written the integration test dk.kb.datahandler.oai.OaiHarvestClientIntegrationTest.testPreservicaSevenAuth,
which makes it seem like this might be a problem on our end, as the test passes.
*/
tokee marked this conversation as resolved.
Show resolved Hide resolved
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(uri))
.header("User-Agent", "Java 11 HttpClient Bot")
.header("Authorization", getBasicAuthenticationHeader(user, password))
.uri(URI.create(uri))
//.header("Preservica-Access-Token", "b8844076-d96d-4fe1-b8fe-e0dd6bee5bbc")
tokee marked this conversation as resolved.
Show resolved Hide resolved
.header("User-Agent", "Java 11 HttpClient Bot")
.header("Authorization", getBasicAuthenticationHeader(user, password))
.build();

HttpResponse<String> response = client.send(request, BodyHandlers.ofString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@
import java.util.regex.Pattern;

/**
* Filtering and delivery of Preservica OAI records. Generates {@code datasource} prefixed IDs,
* Filtering and delivery of Preservica OAI records from Preservica 5. Generates {@code datasource} prefixed IDs,
* derives type from {@link DsRecordDto#getId()} and resolves {@code parent} from content.
*/
public class OaiResponseFilterPreservica extends OaiResponseFilter {
private static final Logger log = LoggerFactory.getLogger(OaiResponseFilterPreservica.class);
public class OaiResponseFilterPreservicaFive extends OaiResponseFilter {
private static final Logger log = LoggerFactory.getLogger(OaiResponseFilterPreservicaFive.class);

private static final Pattern PARENT_PATTERN = Pattern.compile(
"<DeliverableUnitRef>([^<]+)</DeliverableUnitRef>");
Expand All @@ -55,15 +55,15 @@ public class OaiResponseFilterPreservica extends OaiResponseFilter {
"<ManifestationRelRef>1</ManifestationRelRef>");

private static final Pattern METADATA_PATTERN = Pattern.compile(
"<Metadata schemaURI=\"http://www.pbcore.org/PBCore/PBCoreNamespace.html\">");
"<Metadata\\s+schemaUri=\"http://www\\.pbcore\\.org/PBCore/PBCoreNamespace\\.html\">");

protected int emptyMetadataRecords = 0;

/**
* @param datasource source for records. Currently used for {@code origin}.
* @param storage destination for records.
*/
public OaiResponseFilterPreservica(String datasource, DsStorageClient storage) {
public OaiResponseFilterPreservicaFive(String datasource, DsStorageClient storage) {
super(datasource, storage);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
package dk.kb.datahandler.oai;

import dk.kb.storage.invoker.v1.ApiException;
import dk.kb.storage.model.v1.DsRecordDto;
import dk.kb.storage.model.v1.RecordTypeDto;
import dk.kb.storage.util.DsStorageClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Filtering and delivery of Preservica OAI records from Preservica 7. Generates {@code datasource} prefixed IDs,
* derives type from {@link DsRecordDto#getId()} and resolves {@code parent} from content.
*/
public class OaiResponseFilterPreserviceSeven extends OaiResponseFilter{
tokee marked this conversation as resolved.
Show resolved Hide resolved
private static final Logger log = LoggerFactory.getLogger(OaiResponseFilterPreserviceSeven.class);

/**
* Pattern capturing the text content of a {@code DeliverableUnitRef} element as group 1.
* {@code getParentID} uses the first match as the parent reference of a record.
*/
private static final Pattern PARENT_PATTERN = Pattern.compile(
"<DeliverableUnitRef>([^<]+)</DeliverableUnitRef>");

/**
* Pattern for determining if an InformationObject from Preservica 7
* contains metadata about a radio resource. Matches either a {@code formatMediaType} of
* {@code Sound} or a {@code ComponentType} of {@code Audio}.
*/
private static final Pattern RADIO_PATTERN = Pattern.compile(
"<formatMediaType>Sound</formatMediaType>|<ComponentType>Audio</ComponentType>");

/**
* Pattern for determining if an InformationObject from Preservica 7
* contains metadata about a television resource. Matches either a {@code formatMediaType} of
* {@code Moving Image} (any single whitespace character between the words) or a
* {@code ComponentType} of {@code Video}.
*/
private static final Pattern TV_PATTERN = Pattern.compile(
"<formatMediaType>Moving\\sImage</formatMediaType>|<ComponentType>Video</ComponentType>");

/**
* Pattern to match preservation manifestations from Preservica 5 by type
* ({@code ManifestationRelRef} with the value 1).
* TODO: We need data from Preservica 7 with assets before we can see if the filtering of preservation assets is
* even needed.
*/
private static final Pattern PRESERVATION_MANIFESTATION_PATTERN = Pattern.compile(
"<ManifestationRelRef>1</ManifestationRelRef>");

/**
* Pattern used to check that records do in fact contain PBCore metadata, i.e. a {@code Metadata} element
* whose {@code schemaUri} attribute is the PBCore namespace URL.
*/
private static final Pattern METADATA_PATTERN = Pattern.compile(
"<Metadata\\s+schemaUri=\"http://www\\.pbcore\\.org/PBCore/PBCoreNamespace\\.html\">");

/**
* Number of records with an {@code oai:io} ID that were rejected by {@code addToStorage} because
* {@link #METADATA_PATTERN} did not match their metadata.
*/
protected int emptyMetadataRecords = 0;

/**
* Creates a filter for OAI-PMH records harvested from Preservica 7. Both arguments are handed unchanged
* to the {@link OaiResponseFilter} constructor.
* @param datasource source for records. Default implementation uses this for {@code origin}.
* @param storage destination for records.
*/
public OaiResponseFilterPreserviceSeven(String datasource, DsStorageClient storage) {
super(datasource, storage);
}


/**
 * Delivers the records of a Preservica OAI-PMH response to ds-storage, one record at a time.
 * <p>
 * Three kinds of records are dropped instead of being stored:
 * <ul>
 *   <li>records whose ID contains {@code oai:so} (StructuralObjects),</li>
 *   <li>records whose ID contains {@code oai:io} but whose metadata lacks the PBCore {@code Metadata} tag,</li>
 *   <li>records whose ID contains {@code oai:man} and whose metadata marks them as preservation
 *       manifestations.</li>
 * </ul>
 * The {@code processed} counter is increased for stored records and for records dropped because of missing
 * PBCore metadata; the two other kinds of dropped records are not counted.
 *
 * @param response OAI-PMH response containing preservica records.
 * @throws ApiException if the delivery of a record to storage fails.
 */
@Override
public void addToStorage(OaiResponse response) throws ApiException {
    for (OaiRecord candidate : response.getRecords()) {
        final String metadata = candidate.getMetadata();
        final String id = candidate.getId();

        // StructuralObjects only serve as folders in the Preservica GUI, so they are never stored.
        if (id.contains("oai:so")) {
            log.debug("Skipped Structural object with id: '{}'", id);
            continue;
        }

        // DeliverableUnits (Preservica 5) and InformationObjects (Preservica 6/7) must carry the PBCore
        // metadata tag. Manifestations from Preservica 5 do not seem to have it, which is why the tag is
        // only required when the ID identifies an InformationObject.
        final Matcher pbCoreTag = METADATA_PATTERN.matcher(metadata);
        final boolean isInformationObject = id.contains("oai:io");
        if (isInformationObject && !pbCoreTag.find()) {
            processed++;
            emptyMetadataRecords++;
            log.warn("OAI-PMH record '{}' does not contain PBCore metadata and is therefore not added to storage. " +
                            "'{}' empty records have been found and '{}' records have been processed in total.",
                    id, emptyMetadataRecords, processed);
            continue;
        }

        // TODO: This lookup might not be needed for Preservica 7. A record with a presentation asset from
        //  Preservica 7 is needed before this can be determined; hopefully one arrives with their stage env.
        // Manifestations of type 1 are preservation manifestations and must not be stored.
        final Matcher preservationMarker = PRESERVATION_MANIFESTATION_PATTERN.matcher(metadata);
        final boolean isManifestation = id.contains("oai:man");
        if (isManifestation && preservationMarker.find()) {
            log.debug("OAI-PMH record '{}' is a preservation manifestation and is not added to DS-storage.",
                    id);
            continue;
        }

        addToStorage(candidate);
        processed++;
    }
}

/**
 * Derives the origin of a record from its metadata. Radio markers are checked before television markers.
 *
 * @param oaiRecord  record whose metadata is inspected.
 * @param datasource fallback origin, returned when neither a radio nor a television marker is found.
 * @return {@code ds.radio}, {@code ds.tv} or {@code datasource}.
 */
@Override
public String getOrigin(OaiRecord oaiRecord, String datasource) {
    final String metadata = oaiRecord.getMetadata();

    if (RADIO_PATTERN.matcher(metadata).find()) {
        return "ds.radio";
    }
    if (TV_PATTERN.matcher(metadata).find()) {
        return "ds.tv";
    }

    // It is undecided what should happen when nothing matches; for now the datasource is used as origin.
    log.warn("No specific origin has been extracted for preservica record '{}'.",
            oaiRecord.getId());
    return datasource;
}

@Override
public String getParentID(OaiRecord oaiRecord, String origin) {
tokee marked this conversation as resolved.
Show resolved Hide resolved
String xml = oaiRecord.getMetadata();
if (!xml.contains("<xip:Manifestation")) {
return null;
}

Matcher m = PARENT_PATTERN.matcher(xml);
if (!m.find()) {
log.debug("Unable to resolve parent ID for record '{}'", oaiRecord.getId());
return null;
}
String parentID = m.group(1);
if (parentID.length() < 30 || parentID.length() > 40) {
log.warn("ParentID '{}' does not seem to have correct format for record '{}'",
parentID, oaiRecord.getId());
}
return origin + ":oai:du:" + parentID;
}

/**
 * Resolves the record type for a record in hand. For Preservica 7 all ingested records are most likely
 * InformationObjects, which are metadata records and map to the DELIVERABLEUNIT record type. IDs without
 * the {@code oai:io} marker are logged at debug level and receive the same type as fallback.
 * @param dsRecord to specify recordType for. Not inspected by this implementation.
 * @param storageId used to define the type of record.
 * @return always {@link RecordTypeDto#DELIVERABLEUNIT}.
 */
@Override
public RecordTypeDto getRecordType(DsRecordDto dsRecord, String storageId) {
    final boolean isInformationObject = storageId.contains("oai:io");
    if (!isInformationObject) {
        log.debug("Unable to derive record type for id '{}' from datasource '{}'. Falling back to '{}'", storageId, datasource, RecordTypeDto.DELIVERABLEUNIT);
    }
    return RecordTypeDto.DELIVERABLEUNIT;
}
}
2 changes: 1 addition & 1 deletion src/main/openapi/ds-datahandler-openapi_v1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ components:
example: 'H.C. Andersen'
filter:
type: string
enum: ['direct', 'preservica']
enum: ['direct', 'preservica', 'preservica5']
tokee marked this conversation as resolved.
Show resolved Hide resolved
description: |
Special handling of record. Possible values are
* `direct`: No special handling
Expand Down
Loading