Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

10982 Request identifier support for oai dc harvesting #11010

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
import edu.harvard.iq.dataverse.ForeignMetadataFieldMapping;
import edu.harvard.iq.dataverse.ForeignMetadataFormatMapping;
import edu.harvard.iq.dataverse.MetadataBlockServiceBean;
import edu.harvard.iq.dataverse.api.dto.*;
import edu.harvard.iq.dataverse.api.dto.DatasetVersionDTO;
import edu.harvard.iq.dataverse.api.dto.DatasetDTO;
import edu.harvard.iq.dataverse.api.dto.FieldDTO;
import edu.harvard.iq.dataverse.api.dto.MetadataBlockDTO;
import edu.harvard.iq.dataverse.dataset.DatasetTypeServiceBean;
Expand All @@ -29,7 +30,6 @@
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import jakarta.ejb.EJB;
Expand Down Expand Up @@ -155,7 +155,14 @@ public DatasetDTO processXML( XMLStreamReader xmlr, ForeignMetadataFormatMapping
// It is distributed as required content, in reference_data.sql.
// Note that arbitrary formatting tags are supported for the outer xml
// wrapper. -- L.A. 4.5
public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException {
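/**
 * Parses an OAI Dublin Core record into a {@link DatasetDTO}.
 *
 * @param DcXmlToParse metadata contained in the {@code <metadata>} tag
 * @param harvestIdentifier harvesting identifier taken from the record header; may be {@code null}
 * @return the {@code DatasetDTO} with the metadata filled in
 * @throws XMLStreamException if an error occurs while reading the XML
 */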
public DatasetDTO processOAIDCxml(String DcXmlToParse, String harvestIdentifier) throws XMLStreamException {
// look up DC metadata mapping:

ForeignMetadataFormatMapping dublinCoreMapping = findFormatMappingByName(DCTERMS);
Expand Down Expand Up @@ -189,7 +196,7 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException
// as an "other id". In the context of OAI harvesting, we expect
// the identifier to be a global id, so we need to rearrange that:

String identifier = getOtherIdFromDTO(datasetDTO.getDatasetVersion());
String identifier = getIdentifierHarvestableByDataverse(datasetDTO.getDatasetVersion(), harvestIdentifier);
logger.fine("Imported identifier: "+identifier);

String globalIdentifier = reassignIdentifierAsGlobalId(identifier, datasetDTO);
Expand Down Expand Up @@ -335,7 +342,13 @@ private FieldDTO makeDTO(DatasetFieldType dataverseFieldType, FieldDTO value, St
return value;
}

private String getOtherIdFromDTO(DatasetVersionDTO datasetVersionDTO) {
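/**
 * Selects the identifier to use for a harvested dataset from the "other id" values
 * collected from the metadata blocks and from the harvesting header identifier.
 *
 * @param datasetVersionDTO dataset version whose metadata blocks are searched for other ids
 * @param harvestIdentifier header harvesting id; added to the candidates when it is not {@code null}
 * @return the selected identifier (doi or hdl forms are preferred), or {@code null} when no candidate is usable
 */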
public String getIdentifierHarvestableByDataverse(DatasetVersionDTO datasetVersionDTO, String harvestIdentifier) {
List<String> otherIds = new ArrayList<>();
for (Map.Entry<String, MetadataBlockDTO> entry : datasetVersionDTO.getMetadataBlocks().entrySet()) {
String key = entry.getKey();
Expand All @@ -354,6 +367,12 @@ private String getOtherIdFromDTO(DatasetVersionDTO datasetVersionDTO) {
}
}
}

// The identifier is possibly declared only in the header, so we add it to the list
if (harvestIdentifier != null) {
otherIds.add(harvestIdentifier);
}

if (!otherIds.isEmpty()) {
// We prefer doi or hdl identifiers like "doi:10.7910/DVN/1HE30F"
for (String otherId : otherIds) {
Expand Down Expand Up @@ -384,6 +403,11 @@ private String getOtherIdFromDTO(DatasetVersionDTO datasetVersionDTO) {
//ToDo - sync with GlobalId.parsePersistentId(String) ? - that currently doesn't do URL forms, but could
public String reassignIdentifierAsGlobalId(String identifierString, DatasetDTO datasetDTO) {

if (identifierString == null) {
logger.warning("Error parsing identifier: is null");
return null;
}

int index1 = identifierString.indexOf(':');
int index2 = identifierString.indexOf('/');
if (index1==-1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
logger.fine("importing DC "+metadataFile.getAbsolutePath());
try {
String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
dsDTO = importGenericService.processOAIDCxml(xmlToParse);
dsDTO = importGenericService.processOAIDCxml(xmlToParse, harvestIdentifier);
} catch (IOException | XMLStreamException e) {
throw new ImportException("Failed to process Dublin Core XML record: "+ e.getClass() + " (" + e.getMessage() + ")");
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,70 @@
package edu.harvard.iq.dataverse.api.imports;

import edu.harvard.iq.dataverse.api.dto.DatasetDTO;
import edu.harvard.iq.dataverse.api.dto.DatasetVersionDTO;

import org.apache.commons.io.FileUtils;
import com.google.gson.Gson;
import java.io.File;
import java.io.IOException;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.InjectMocks;
import org.mockito.junit.jupiter.MockitoExtension;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.nio.charset.StandardCharsets;

@ExtendWith(MockitoExtension.class)
public class ImportGenericServiceBeanTest {

@InjectMocks
private ImportGenericServiceBean importGenericService;

@Test
public void testReassignIdentifierAsGlobalId() {
void testIdentifierHarvestableWithOtherID() throws IOException {
    // Fixture whose "otherIdValue" holds the value doi:10.7910/DVN/TJCLKP.
    String json = FileUtils.readFileToString(
            new File("src/test/resources/json/importGenericWithOtherId.json"),
            StandardCharsets.UTF_8);
    DatasetVersionDTO versionDto = new Gson().fromJson(json, DatasetVersionDTO.class);

    // The same identifier is expected for every header identifier below:
    // a DOI URL, junk, null, and two unrelated URLs.
    String expected = "doi:10.7910/DVN/TJCLKP";
    String[] headerIdentifiers = {
        "https://doi.org/10.7910/DVN/TJCLKP",
        "junk",
        null,
        "http://www.example.com",
        "https://dataverse.org"
    };

    for (String headerIdentifier : headerIdentifiers) {
        assertEquals(expected,
                importGenericService.getIdentifierHarvestableByDataverse(versionDto, headerIdentifier));
    }
}

@Test
void testIdentifierHarvestableWithoutOtherID() throws IOException {
    // Fixture that does not contain data of type "otherIdValue".
    String json = FileUtils.readFileToString(
            new File("src/test/resources/json/importGenericWithoutOtherId.json"),
            StandardCharsets.UTF_8);
    DatasetVersionDTO versionDto = new Gson().fromJson(json, DatasetVersionDTO.class);

    // Header identifiers that are expected to come back unchanged.
    String[] returnedAsIs = {
        // non-URL
        "doi:10.7910/DVN/TJCLKP",
        "hdl:10.7910/DVN/TJCLKP",
        // HTTPS
        "https://doi.org/10.7910/DVN/TJCLKP",
        "https://dx.doi.org/10.7910/DVN/TJCLKP",
        "https://hdl.handle.net/10.7910/DVN/TJCLKP",
        // HTTP (no S)
        "http://doi.org/10.7910/DVN/TJCLKP",
        "http://dx.doi.org/10.7910/DVN/TJCLKP",
        "http://hdl.handle.net/10.7910/DVN/TJCLKP"
    };
    for (String headerIdentifier : returnedAsIs) {
        assertEquals(headerIdentifier,
                importGenericService.getIdentifierHarvestableByDataverse(versionDto, headerIdentifier));
    }

    // Junk, null and unrelated URLs are expected to yield no identifier.
    String[] rejected = {
        "junk",
        null,
        "http://www.example.com",
        "https://dataverse.org"
    };
    for (String headerIdentifier : rejected) {
        assertNull(importGenericService.getIdentifierHarvestableByDataverse(versionDto, headerIdentifier));
    }
}

@Test
void testReassignIdentifierAsGlobalId() {
// non-URL
assertEquals("doi:10.7910/DVN/TJCLKP", importGenericService.reassignIdentifierAsGlobalId("doi:10.7910/DVN/TJCLKP", new DatasetDTO()));
assertEquals("hdl:10.7910/DVN/TJCLKP", importGenericService.reassignIdentifierAsGlobalId("hdl:10.7910/DVN/TJCLKP", new DatasetDTO()));
Expand All @@ -29,6 +78,8 @@ public void testReassignIdentifierAsGlobalId() {
assertEquals("hdl:10.7910/DVN/TJCLKP", importGenericService.reassignIdentifierAsGlobalId("http://hdl.handle.net/10.7910/DVN/TJCLKP", new DatasetDTO()));
// junk
assertNull(importGenericService.reassignIdentifierAsGlobalId("junk", new DatasetDTO()));
assertNull(importGenericService.reassignIdentifierAsGlobalId("http://www.example.com", new DatasetDTO()));
assertNull(importGenericService.reassignIdentifierAsGlobalId("https://dataverse.org", new DatasetDTO()));
}

}
Loading
Loading