From bf413e7b937bbb7e3e52a1d4800576bdbdfc2119 Mon Sep 17 00:00:00 2001
From: Leonid Andreev
Date: Mon, 6 Jun 2022 17:02:29 -0400
Subject: [PATCH] Moved all the handling of custom dataverse_json metadata out
 of OaiHandler and GetRecord parser. (#8372)

---
 .../harvest/client/FastGetRecord.java        | 104 +---------
 .../harvest/client/HarvesterServiceBean.java | 178 ++++++++----------
 2 files changed, 90 insertions(+), 192 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java
index 60abc97bccd..5b3e4df331d 100644
--- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java
+++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java
@@ -72,7 +72,6 @@ public class FastGetRecord {
 
-    private static final String DATAVERSE_EXTENDED_METADATA = "dataverse_json";
     private static final String XML_METADATA_TAG = "metadata";
     private static final String XML_METADATA_TAG_OPEN = "<"+XML_METADATA_TAG+">";
     private static final String XML_METADATA_TAG_CLOSE = "</"+XML_METADATA_TAG+">";
@@ -222,13 +221,7 @@ public void harvestRecord(String baseURL, String identifier, String metadataPref
                     //metadataOut.println(""); /* ? */
 
                     metadataFlag = true;
-                } else if (line.matches(".*<"+XML_METADATA_TAG+" [^>]*>.*")) {
-                    if (metadataPrefix.equals(DATAVERSE_EXTENDED_METADATA)) {
-                        oaiResponseHeader = oaiResponseHeader.concat(line);
-                        metadataWritten = true;
-                        metadataFlag = true;
-                    }
-                }
+                }
             }
 
             //System.out.println(line);
@@ -380,19 +373,12 @@ public void harvestRecord(String baseURL, String identifier, String metadataPref
             try {
                 StringReader reader = new StringReader(oaiResponseHeader);
                 xmlr = xmlInputFactory.createXMLStreamReader(reader);
-                processOAIheader(xmlr, metadataPrefix.equals(DATAVERSE_EXTENDED_METADATA));
+                processOAIheader(xmlr);
 
             } catch (XMLStreamException ex) {
-                //Logger.getLogger("global").log(Level.SEVERE, null, ex);
                 if (this.errorMessage == null) {
                     this.errorMessage = "Malformed GetRecord response; baseURL=" + baseURL + ", identifier=" + identifier + ", metadataPrefix=" + metadataPrefix;
                 }
-
-                // delete the temp metadata file; we won't need it:
-                if (savedMetadataFile != null) {
-                    //savedMetadataFile.delete();
-                }
             }
 
             try {
@@ -414,14 +400,8 @@ public void harvestRecord(String baseURL, String identifier, String metadataPref
 
             if (!(metadataWritten) && !(this.isDeleted())) {
                 this.errorMessage = "Failed to parse GetRecord response; baseURL=" + baseURL + ", identifier=" + identifier + ", metadataPrefix=" + metadataPrefix;
-                //savedMetadataFile.delete();
-            }
-
-            if (this.isDeleted()) {
-                //savedMetadataFile.delete();
             }
-
         } else {
             this.errorMessage = "GetRecord request failed. HTTP error code "+responseCode;
         }
@@ -445,16 +425,16 @@ private static String getRequestURL(String baseURL,
         return requestURL.toString();
     }
 
-    private void processOAIheader (XMLStreamReader xmlr, boolean extensionMode) throws XMLStreamException, IOException {
+    private void processOAIheader (XMLStreamReader xmlr) throws XMLStreamException, IOException {
 
         // is this really a GetRecord response?
        xmlr.nextTag();
        xmlr.require(XMLStreamConstants.START_ELEMENT, null, "OAI-PMH");
-        processOAIPMH(xmlr, extensionMode);
+        processOAIPMH(xmlr);
 
    }
 
-    private void processOAIPMH (XMLStreamReader xmlr, boolean extensionMode) throws XMLStreamException, IOException {
+    private void processOAIPMH (XMLStreamReader xmlr) throws XMLStreamException, IOException {
 
         for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
             if (event == XMLStreamConstants.START_ELEMENT) {
@@ -477,7 +457,7 @@ else if (xmlr.getLocalName().equals("error")) {
                 }
 
                 else if (xmlr.getLocalName().equals("GetRecord")) {
-                    processGetRecordSection(xmlr, extensionMode);
+                    processGetRecordSection(xmlr);
                 }
             } else if (event == XMLStreamConstants.END_ELEMENT) {
                 if (xmlr.getLocalName().equals("OAI-PMH")) return;
@@ -485,11 +465,11 @@ else if (xmlr.getLocalName().equals("GetRecord")) {
         }
     }
 
-    private void processGetRecordSection (XMLStreamReader xmlr, boolean extensionMode) throws XMLStreamException, IOException {
+    private void processGetRecordSection (XMLStreamReader xmlr) throws XMLStreamException, IOException {
         for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
             if (event == XMLStreamConstants.START_ELEMENT) {
                 if (xmlr.getLocalName().equals("record")) {
-                    processRecord(xmlr, extensionMode);
+                    processRecord(xmlr);
                 }
             } else if (event == XMLStreamConstants.END_ELEMENT) {
                 if (xmlr.getLocalName().equals("GetRecord")) return;
@@ -498,7 +478,7 @@ private void processGetRecordSection (XMLStreamReader xmlr, boolean extensionMod
     }
 
-    private void processRecord (XMLStreamReader xmlr, boolean extensionMode) throws XMLStreamException, IOException {
+    private void processRecord (XMLStreamReader xmlr) throws XMLStreamException, IOException {
         for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
             if (event == XMLStreamConstants.START_ELEMENT) {
                 if (xmlr.getLocalName().equals("header")) {
@@ -506,11 +486,6 @@ private void processRecord (XMLStreamReader xmlr, boolean extensionMode) throws
                         this.recordDeleted = true;
                     }
                     processHeader(xmlr);
-                } else if (xmlr.getLocalName().equals("metadata")) {
-                    if (extensionMode) {
-                        String extendedMetadataApiUrl = xmlr.getAttributeValue(null, "directApiCall");
-                        processMetadataExtended(extendedMetadataApiUrl);
-                    }
                 }
             } else if (event == XMLStreamConstants.END_ELEMENT) {
                 if (xmlr.getLocalName().equals("record")) return;
@@ -532,67 +507,6 @@ else if (xmlr.getLocalName().equals("setSpec")) {/*do nothing*/}
         }
     }
 
-    private void processMetadataExtended (String extendedApiUrl) throws IOException {
-        InputStream in = null;
-        int responseCode = 0;
-        HttpURLConnection con = null;
-
-        try {
-            URL url = new URL(extendedApiUrl.replaceAll("&amp;", "&")); // is this necessary?
-
-            con = (HttpURLConnection) url.openConnection();
-            con.setRequestProperty("User-Agent", "DataverseHarvester/3.0");
-            responseCode = con.getResponseCode();
-        } catch (MalformedURLException mue) {
-            throw new IOException ("Bad API URL: "+extendedApiUrl);
-        } catch (FileNotFoundException e) {
-            responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
-        }
-
-        if (responseCode == 200) {
-            in = con.getInputStream();
-            // TODO:
-            /* we should probably still support gzip/compress encoding here - ?
-
-            String contentEncoding = con.getHeaderField("Content-Encoding");
-
-            // support for the standard compress/gzip/deflate compression
-            // schemes:
-
-            if ("compress".equals(contentEncoding)) {
-                ZipInputStream zis = new ZipInputStream(con.getInputStream());
-                zis.getNextEntry();
-                in = zis;
-            } else if ("gzip".equals(contentEncoding)) {
-                in = new GZIPInputStream(con.getInputStream());
-            } else if ("deflate".equals(contentEncoding)) {
-                in = new InflaterInputStream(con.getInputStream());
-            } ...
-            */
-            FileOutputStream tempOut = new FileOutputStream(savedMetadataFile);
-
-            int bufsize;
-            byte[] buffer = new byte[4 * 8192];
-
-            while ((bufsize = in.read(buffer)) != -1) {
-                tempOut.write(buffer, 0, bufsize);
-                tempOut.flush();
-            }
-
-            in.close();
-            tempOut.close();
-            return;
-        }
-
-        throw new IOException("Failed to download extended metadata.");
-
-    }
-
-
     // (from Gustavo's ddiServiceBean -- L.A.)
     //
     /* We had to add this method because the ref getElementText has a bug where it
diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
index 397a90b0c99..86430a25ef0 100644
--- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
@@ -32,7 +32,6 @@
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.TransformerException;
 import org.apache.commons.lang3.mutable.MutableBoolean;
-import org.apache.commons.lang3.mutable.MutableLong;
 import org.xml.sax.SAXException;
 
 import io.gdcc.xoai.model.oaipmh.Header;
@@ -42,8 +41,13 @@
 import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
 import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException;
 import edu.harvard.iq.dataverse.search.IndexServiceBean;
+import java.io.FileOutputStream;
 import java.io.FileWriter;
+import java.io.InputStream;
 import java.io.PrintWriter;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
 import javax.persistence.EntityManager;
 import javax.persistence.PersistenceContext;
@@ -75,13 +79,12 @@ public class HarvesterServiceBean {
     IndexServiceBean indexService;
 
     private static final Logger logger = Logger.getLogger("edu.harvard.iq.dataverse.harvest.client.HarvesterServiceBean");
-    private static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
     private static final SimpleDateFormat logFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss");
 
     public static final String HARVEST_RESULT_SUCCESS="success";
     public static final String HARVEST_RESULT_FAILED="failed";
-    private static final Long INDEXING_CONTENT_BATCH_SIZE = 10000000L;
-
+    public static final String DATAVERSE_PROPRIETARY_METADATA_FORMAT="dataverse_json";
+    public static final String DATAVERSE_PROPRIETARY_METADATA_API="/api/datasets/export?exporter="+DATAVERSE_PROPRIETARY_METADATA_FORMAT;
 
     public HarvesterServiceBean() {
@@ -183,24 +186,7 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId
             hdLogger.log(Level.INFO, "COMPLETED HARVEST, server=" + harvestingClientConfig.getArchiveUrl() + ", metadataPrefix=" + harvestingClientConfig.getMetadataPrefix());
             hdLogger.log(Level.INFO, "Datasets created/updated: " + harvestedDatasetIds.size() + ", datasets deleted: " + deletedIdentifiers.size() + ", datasets failed: " + failedIdentifiers.size());
 
-            // now index all the datasets we have harvested - created, modified or deleted:
-            /* (TODO: may not be needed at all. In Dataverse4, we may be able to get away with the normal
-            reindexing after every import. See the rest of the comments about batch indexing throughout
-            this service bean)
-            if (this.processedSizeThisBatch > 0) {
-                hdLogger.log(Level.INFO, "POST HARVEST, reindexing the remaining studies.");
-                if (this.harvestedDatasetIdsThisBatch != null) {
-                    hdLogger.log(Level.INFO, this.harvestedDatasetIdsThisBatch.size()+" studies in the batch");
-                }
-                hdLogger.log(Level.INFO, this.processedSizeThisBatch + " bytes of content");
-                indexService.updateIndexList(this.harvestedDatasetIdsThisBatch);
-                hdLogger.log(Level.INFO, "POST HARVEST, calls to index finished.");
-            } else {
-                hdLogger.log(Level.INFO, "(All harvested content already reindexed)");
-            }
-            */
         }
-        //mailService.sendHarvestNotification(...getSystemEmail(), harvestingDataverse.getName(), logFileName, logTimestamp, harvestErrorOccurred.booleanValue(), harvestedDatasetIds.size(), failedIdentifiers);
     } catch (Throwable e) {
         harvestErrorOccurred.setValue(true);
         String message = "Exception processing harvest, server= " + harvestingClientConfig.getHarvestingUrl() + ",format=" + harvestingClientConfig.getMetadataPrefix() + " " + e.getClass().getName() + " " + e.getMessage();
@@ -235,7 +221,6 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
 
         logBeginOaiHarvest(hdLogger, harvestingClient);
 
         List<Long> harvestedDatasetIds = new ArrayList<Long>();
-        MutableLong processedSizeThisBatch = new MutableLong(0L);
 
         OaiHandler oaiHandler;
 
         try {
@@ -261,9 +246,8 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
             MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);
 
             // Retrieve and process this record with a separate GetRecord call:
-            Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, processedSizeThisBatch, deletedIdentifiers, dateStamp);
+            Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp);
 
-            hdLogger.info("Total content processed in this batch so far: "+processedSizeThisBatch);
             if (datasetId != null) {
                 harvestedDatasetIds.add(datasetId);
@@ -280,20 +264,6 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
                 //temporary:
                 //throw new IOException("Exception occured, stopping harvest");
             }
-
-            // reindexing in batches? - this is from DVN 3;
-            // we may not need it anymore.
-            if ( processedSizeThisBatch.longValue() > INDEXING_CONTENT_BATCH_SIZE ) {
-
-                hdLogger.log(Level.INFO, "REACHED CONTENT BATCH SIZE LIMIT; calling index ("+ harvestedDatasetIdsThisBatch.size()+" datasets in the batch).");
-                //indexService.updateIndexList(this.harvestedDatasetIdsThisBatch);
-                hdLogger.log(Level.INFO, "REINDEX DONE.");
-
-
-                processedSizeThisBatch.setValue(0L);
-                harvestedDatasetIdsThisBatch = null;
-            }
-
         }
     } catch (OaiHandlerException e) {
         throw new IOException("Failed to run ListIdentifiers: " + e.getMessage());
@@ -303,23 +273,34 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
 
         return harvestedDatasetIds;
 
-    }
-
+    }
 
-
-    private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, MutableLong processedSizeThisBatch, List<String> deletedIdentifiers, Date dateStamp) {
+    private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, List<String> deletedIdentifiers, Date dateStamp) {
         String errMessage = null;
         Dataset harvestedDataset = null;
         logGetRecord(hdLogger, oaiHandler, identifier);
         File tempFile = null;
 
-        try {
-            FastGetRecord record = oaiHandler.runGetRecord(identifier);
-            errMessage = record.getErrorMessage();
+        try {
+            boolean deleted = false;
+
+            if (DATAVERSE_PROPRIETARY_METADATA_FORMAT.equals(oaiHandler.getMetadataPrefix())) {
+                // Make direct call to obtain the proprietary Dataverse metadata
+                // in JSON from the remote Dataverse server:
+                String extendedApiUrl = getProprietaryDataverseMetadataURL(oaiHandler.getBaseOaiUrl(), identifier);
+                tempFile = retrieveProprietaryDataverseMetadata(extendedApiUrl);
+
+            } else {
+                FastGetRecord record = oaiHandler.runGetRecord(identifier);
+                errMessage = record.getErrorMessage();
+                deleted = record.isDeleted();
+                tempFile = record.getMetadataFile();
+            }
 
             if (errMessage != null) {
                 hdLogger.log(Level.SEVERE, "Error calling GetRecord - " + errMessage);
-            } else if (record.isDeleted()) {
+
+            } else if (deleted) {
                 hdLogger.info("Deleting harvesting dataset for "+identifier+", per the OAI server's instructions.");
 
                 Dataset dataset = datasetService.getDatasetByHarvestInfo(oaiHandler.getHarvestingClient().getDataverse(), identifier);
@@ -336,24 +317,21 @@
             } else {
                 hdLogger.info("Successfully retrieved GetRecord response.");
 
-                tempFile = record.getMetadataFile();
                 PrintWriter cleanupLog;
                 harvestedDataset = importService.doImportHarvestedDataset(dataverseRequest,
                         oaiHandler.getHarvestingClient(),
                         identifier,
                         oaiHandler.getMetadataPrefix(),
-                        record.getMetadataFile(),
+                        tempFile,
                         dateStamp,
                         importCleanupLog);
 
                 hdLogger.fine("Harvest Successful for identifier " + identifier);
-                hdLogger.fine("Size of this record: " + record.getMetadataFile().length());
-                processedSizeThisBatch.add(record.getMetadataFile().length());
+                hdLogger.fine("Size of this record: " + tempFile.length());
             }
         } catch (Throwable e) {
             logGetRecordException(hdLogger, oaiHandler, identifier, e);
             errMessage = "Caught exception while executing GetRecord on "+identifier;
-            //logException(e, hdLogger);
 
         } finally {
             if (tempFile != null) {
@@ -364,14 +342,12 @@ private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, P
             }
         }
 
-        // TODO: the message below is taken from DVN3; - figure out what it means...
-        //
         // If we got an Error from the OAI server or an exception happened during import, then
         // set recordErrorOccurred to true (if recordErrorOccurred is being used)
         // otherwise throw an exception (if recordErrorOccurred is not used, i.e null)
 
         if (errMessage != null) {
-            if (recordErrorOccurred != null) {
+            if (recordErrorOccurred != null) {
                 recordErrorOccurred.setValue(true);
             } else {
                 throw new EJBException(errMessage);
@@ -380,6 +356,56 @@
 
         return harvestedDataset != null ? harvestedDataset.getId() : null;
     }
+
+    File retrieveProprietaryDataverseMetadata (String remoteApiUrl) throws IOException {
+        InputStream in;
+        int responseCode = 0;
+        HttpURLConnection con = null;
+
+        File tempMetadataFile = File.createTempFile("meta", ".tmp");
+
+        try {
+            URL url = new URL(remoteApiUrl);
+
+            con = (HttpURLConnection) url.openConnection();
+            con.setRequestProperty("User-Agent", "DataverseHarvester/3.0");
+            responseCode = con.getResponseCode();
+        } catch (MalformedURLException mue) {
+            throw new IOException ("Bad API URL: "+remoteApiUrl);
+        }
+
+        if (responseCode == 200) {
+            in = con.getInputStream();
+
+            FileOutputStream tempOut = new FileOutputStream(tempMetadataFile);
+
+            int bufsize;
+            byte[] buffer = new byte[4 * 8192];
+
+            while ((bufsize = in.read(buffer)) != -1) {
+                tempOut.write(buffer, 0, bufsize);
+                tempOut.flush();
+            }
+
+            in.close();
+            tempOut.close();
+            return tempMetadataFile;
+        }
+
+        throw new IOException("Failed to download extended metadata.");
+
+    }
+
+    private static String getProprietaryDataverseMetadataURL(String baseURL, String identifier) {
+
+        baseURL = baseURL.replaceAll("/oai", "");
+
+        StringBuilder requestURL = new StringBuilder(baseURL);
+        requestURL.append(DATAVERSE_PROPRIETARY_METADATA_API);
+        requestURL.append("&persistentId=").append(identifier);
+
+        return requestURL.toString();
+    }
 
     private void logBeginOaiHarvest(Logger hdLogger, HarvestingClient harvestingClient) {
         hdLogger.log(Level.INFO, "BEGIN HARVEST, oaiUrl="
@@ -448,47 +474,5 @@ private void logException(Throwable e, Logger logger) {
         } while ((e = e.getCause()) != null);
         logger.severe(fullMessage);
     }
-
-    /*
-     some dead code below:
-     this functionality has been moved into OaiHandler.
-     TODO: test that harvesting is still working and remove.
-
-    private ServiceProvider getServiceProvider(String baseOaiUrl, Granularity oaiGranularity) {
-        Context context = new Context();
-
-        context.withBaseUrl(baseOaiUrl);
-        context.withGranularity(oaiGranularity);
-        context.withOAIClient(new HttpOAIClient(baseOaiUrl));
-
-        ServiceProvider serviceProvider = new ServiceProvider(context);
-        return serviceProvider;
-    }
-    */
-
-    /**
-     * Creates an XOAI parameters object for the ListIdentifiers call
-     *
-     * @param metadataPrefix
-     * @param set
-     * @param from
-     * @return ListIdentifiersParameters
-     */
-    /*
-    private ListIdentifiersParameters buildParams(String metadataPrefix, String set, Date from) {
-        ListIdentifiersParameters mip = ListIdentifiersParameters.request();
-        mip.withMetadataPrefix(metadataPrefix);
-
-        if (from != null) {
-            mip.withFrom(from);
-        }
-
-        if (set != null) {
-            mip.withSetSpec(set);
-        }
-        return mip;
-    }
-    */
-
+
 }
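
For reference, a minimal standalone sketch of the URL rewriting that the new
getProprietaryDataverseMetadataURL() performs when a harvesting client is
configured with the dataverse_json metadata format. The remote host and DOI
below are hypothetical examples; the constants mirror
DATAVERSE_PROPRIETARY_METADATA_FORMAT and DATAVERSE_PROPRIETARY_METADATA_API
added in the patch above.

    public class ProprietaryMetadataUrlSketch {
        static final String FORMAT = "dataverse_json";
        static final String API = "/api/datasets/export?exporter=" + FORMAT;

        // Same transformation as in the patch: strip the /oai servlet path
        // from the OAI base URL, then append the native export-API call,
        // passing the harvested record identifier as the persistent id.
        static String metadataUrl(String baseOaiUrl, String identifier) {
            return baseOaiUrl.replaceAll("/oai", "") + API + "&persistentId=" + identifier;
        }

        public static void main(String[] args) {
            // Hypothetical remote Dataverse server and dataset DOI:
            System.out.println(metadataUrl("https://demo.dataverse.org/oai", "doi:10.5072/FK2/EXAMPLE"));
            // prints:
            // https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/EXAMPLE
        }
    }

A non-200 response from that API call makes retrieveProprietaryDataverseMetadata()
throw an IOException, which processRecord() catches and reports through
recordErrorOccurred, so a single unreachable dataset fails that record rather
than aborting the whole harvest.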