Skip to content

Commit

Permalink
Merge pull request ORCID#6769 from ORCID/8563-fix-fundref-ror-matchin…
Browse files Browse the repository at this point in the history
…g-eg-duplicate-organization-deutsches-museum

ROR uploader changes to take into account the external identifier updates
  • Loading branch information
amontenegro authored Apr 13, 2023
2 parents 9a8b373 + f2be0ae commit 001f7d4
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,10 @@ private OrgDisambiguated convertEntity(OrgDisambiguatedEntity orgDisambiguatedEn
String type = extIdEntity.getIdentifierType();
String identifier = extIdEntity.getIdentifier();
Boolean preferred = extIdEntity.getPreferred();

if(preferred == null) {
preferred = Boolean.FALSE;
}

OrgDisambiguatedExternalIdentifiers extId = null;

if (externalIdsMap.containsKey(type)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -58,7 +59,7 @@ public class RinggoldOrgLoadSource implements OrgLoadSource {

@Resource
private OrgDisambiguatedDao orgDisambiguatedDao;

@Resource
private OrgDisambiguatedManager orgDisambiguatedManager;

Expand All @@ -83,6 +84,8 @@ public class RinggoldOrgLoadSource implements OrgLoadSource {

private static final List<String> ALLOWED_EXTERNAL_IDENTIFIERS = Arrays.asList("ISNI", "IPED", "NCES", "OFR");

// Ids of the disambiguated orgs recorded while external identifiers are processed
// (see generateExternalIdentifiers). importData assigns a fresh HashSet on every run,
// and groupRinggoldsWithUpdatedExternalModifiers iterates it once processing is done.
// NOTE(review): upper-case name, but this is mutable per-run state, not a constant.
private Set<Long> UPDATED_RINGGOLDS;

@Override
public String getSourceName() {
return "RINGGOLD";
Expand All @@ -104,10 +107,11 @@ public boolean downloadOrgData() {
}

private boolean importData() {
Map<Integer, List<JsonNode>> altNamesMap = new HashMap<>();
Map<Integer, List<JsonNode>> identifiersMap = new HashMap<>();
Map<Integer, JsonNode> dnNameMap = new HashMap<>();
Map<Integer, Integer> deletedElementsMap = new HashMap<>();
Map<Integer, List<JsonNode>> altNamesMap = new HashMap<Integer, List<JsonNode>>();
Map<Integer, List<JsonNode>> identifiersMap = new HashMap<Integer, List<JsonNode>>();
Map<Integer, JsonNode> dnNameMap = new HashMap<Integer, JsonNode>();
Map<Integer, Integer> deletedElementsMap = new HashMap<Integer, Integer>();
UPDATED_RINGGOLDS = new HashSet<Long>();

LOGGER.info("Starting the importData process");
try (ZipFile zip = new ZipFile(ftpsFileDownloader.getLocalFilePath())) {
Expand All @@ -116,6 +120,7 @@ private boolean importData() {
processDeletedElementsFile(zip, deletedElementsMap);
processInstitutions(zip, altNamesMap, identifiersMap, dnNameMap);
processDeletedElements(deletedElementsMap);
groupRinggoldsWithUpdatedExternalModifiers();
return true;
} catch (Exception e) {
LOGGER.error("Error importing RINGGOLD data", e);
Expand All @@ -124,13 +129,13 @@ private boolean importData() {
LOGGER.warn("Ringgold import completed");
}
}

/**
 * Reads a single entry of the given zip archive and parses its content as JSON.
 *
 * @param zip   the open archive that contains {@code entry}
 * @param entry the archive entry whose content is parsed
 * @return the node returned by {@code JsonUtils.read} for the entry content
 * @throws IOException if the entry stream cannot be opened or the reader fails to close
 * @throws UnsupportedEncodingException declared by the original signature; presumably raised
 *         when {@code RINGGOLD_CHARACTER_ENCODING} names an unsupported charset (its
 *         declaration is not visible here)
 */
private JsonNode getJsonNode(ZipFile zip, ZipEntry entry) throws IOException, UnsupportedEncodingException {
    LOGGER.info("Generating json node for: " + entry.getName());
    // Exactly one try-with-resources: the reader (and the entry stream it wraps) is opened
    // once and closed automatically when the parsed node is returned.
    try (Reader reader = new InputStreamReader(zip.getInputStream(entry), RINGGOLD_CHARACTER_ENCODING)) {
        return JsonUtils.read(reader);
    }
}

private void processAltNamesFile(ZipFile mainFile, Map<Integer, List<JsonNode>> altNamesMap, Map<Integer, JsonNode> dnNameMap)
Expand Down Expand Up @@ -217,8 +222,8 @@ private void processInstitutions(JsonNode institutions, Map<Integer, List<JsonNo
LOGGER.info("Processing institutions");
institutions.forEach(institution -> {
Integer ringgoldId = institution.get("ringgold_id").asInt();
OrgDisambiguatedEntity entity = processInstitution(institution, dnNameMap);
generateExternalIdentifiers(entity, identifiersMap.get(ringgoldId));
OrgDisambiguatedEntity entity = processInstitution(institution, dnNameMap, identifiersMap);


// Create orgs based on the alt names information
if (altNamesMap.containsKey(ringgoldId)) {
Expand All @@ -241,7 +246,7 @@ private void processDeletedElements(Map<Integer, Integer> deletedElementsMap) {
// Check if the status is up to date, if not, update it
if (!status.name().equals(existingEntity.getStatus())) {
existingEntity.setStatus(status.name());
existingEntity.setIndexingStatus(IndexingStatus.REINDEX);
existingEntity.setIndexingStatus(IndexingStatus.PENDING);
if (newId != null) {
existingEntity.setSourceParentId(String.valueOf(newId));
}
Expand All @@ -259,8 +264,9 @@ private void processDeletedElements(Map<Integer, Integer> deletedElementsMap) {
});
}

private OrgDisambiguatedEntity processInstitution(JsonNode institution, Map<Integer, JsonNode> dnNameMap) {
private OrgDisambiguatedEntity processInstitution(JsonNode institution, Map<Integer, JsonNode> dnNameMap, Map<Integer, List<JsonNode>> identifiersMap) {
Integer recId = institution.get("rec_id").asInt();

Integer ringgoldId = institution.get("ringgold_id").asInt();
LOGGER.info("Processing ringgold_id: {} rec_id: {}", ringgoldId, recId);
Integer parentId = institution.get("parent_ringgold_id").asInt() == 0 ? null : institution.get("parent_ringgold_id").asInt();
Expand All @@ -281,6 +287,7 @@ private OrgDisambiguatedEntity processInstitution(JsonNode institution, Map<Inte

OrgDisambiguatedEntity entity = orgDisambiguatedDao.findBySourceIdAndSourceType(String.valueOf(ringgoldId), OrgDisambiguatedSourceType.RINGGOLD.name());
Date now = new Date();

if (entity == null) {
entity = new OrgDisambiguatedEntity();
entity.setLastIndexedDate(now);
Expand All @@ -295,34 +302,34 @@ private OrgDisambiguatedEntity processInstitution(JsonNode institution, Map<Inte
}
entity.setSourceType(OrgDisambiguatedSourceType.RINGGOLD.name());
entity.setIndexingStatus(IndexingStatus.PENDING);
OrgDisambiguatedEntity newEntity= orgDisambiguatedManager.createOrgDisambiguated(entity);

OrgDisambiguatedEntity newEntity = orgDisambiguatedManager.createOrgDisambiguated(entity);
try {
//mark group for indexing
// mark group for indexing
new OrgGrouping(newEntity, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);

}
catch (Exception ex) {
} catch (Exception ex) {
LOGGER.error("Error when grouping by ROR and marking group orgs for reindexing, eating the exception", ex);
}
} else {
// If the element have changed
if (changed(entity, parentId, name, country, city, state, type)) {
entity.setCity(city);
entity.setCountry(country.name());
entity.setName(name);
entity.setOrgType(type);
entity.setRegion(state);
entity.setIndexingStatus(IndexingStatus.REINDEX);
entity = orgDisambiguatedManager.updateOrgDisambiguated(entity);
try {
// mark group for indexing
new OrgGrouping(entity, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);
if(entity != null) {
new OrgGrouping(entity, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);
}

} catch (Exception ex) {
LOGGER.error("Error when grouping by ROR and marking group orgs for reindexing, eating the exception", ex);
}
orgDisambiguatedManager.updateOrgDisambiguated(entity);


}
}

Expand All @@ -331,6 +338,9 @@ private OrgDisambiguatedEntity processInstitution(JsonNode institution, Map<Inte
if (originalName != null) {
generateOrganizationFromInstitutionNode(entity, originalName, country, city, state);
}
if(entity != null && identifiersMap.get(ringgoldId) != null) {
generateExternalIdentifiers(entity, identifiersMap.get(ringgoldId));
}

return entity;
}
Expand Down Expand Up @@ -365,25 +375,27 @@ private void generateExternalIdentifiers(OrgDisambiguatedEntity disambiguatedEnt
} else {
existingExternalIdentifiersMap.remove(id);
}
UPDATED_RINGGOLDS.add(disambiguatedEntity.getId());
});
}

// Then, remove all existing external identifiers that are not present
// in the ringgold data anymore
for (OrgDisambiguatedExternalIdentifierEntity extIdToBeRemoved : existingExternalIdentifiersMap.values()) {
orgDisambiguatedExternalIdentifierDao.remove(extIdToBeRemoved.getId());
UPDATED_RINGGOLDS.add(disambiguatedEntity.getId());
}
}

private void generateOrganizations(OrgDisambiguatedEntity disambiguatedEntity, List<JsonNode> altNames) {
altNames.forEach(altName -> {
String name = altName.get("name").asText();
LOGGER.info("Processing organization {} for {}", name, disambiguatedEntity.getId());
String city = altName.get("city").asText();
Iso3166Country country = Iso3166Country.fromValue(altName.get("country").asText());
OrgEntity existingOrg = orgDao.findByNameCityRegionCountryAndType(name, city, "", country.name(), "RINGGOLD");
if (existingOrg != null) {
if (existingOrg.getOrgDisambiguated() == null) {
LOGGER.info("Processing organization {} for {}", name, disambiguatedEntity.getId());
existingOrg.setOrgDisambiguated(disambiguatedEntity);
orgDao.merge(existingOrg);
}
Expand Down Expand Up @@ -416,6 +428,24 @@ private void generateOrganizationFromInstitutionNode(OrgDisambiguatedEntity disa
}
}

/**
 * Walks the ids collected in {@code UPDATED_RINGGOLDS} and queues each matching org for indexing.
 *
 * For every id that {@code orgDisambiguatedDao.find} resolves, the entity's indexing status is
 * set to {@code PENDING}, {@code OrgGrouping.markGroupForIndexing} is invoked for it (any
 * exception from that step is logged and ignored), and the entity is then passed to
 * {@code orgDisambiguatedManager.updateOrgDisambiguated}. Ids that resolve to nothing are skipped.
 */
private void groupRinggoldsWithUpdatedExternalModifiers() {
    for (Long updatedId : UPDATED_RINGGOLDS) {
        OrgDisambiguatedEntity org = orgDisambiguatedDao.find(updatedId);
        if (org == null) {
            // No entity found for this id; continue with the next one.
            continue;
        }

        org.setIndexingStatus(IndexingStatus.PENDING);
        try {
            // Best effort: a grouping failure must not stop the remaining ids from being processed.
            OrgGrouping grouping = new OrgGrouping(org, orgDisambiguatedManager);
            grouping.markGroupForIndexing(orgDisambiguatedDao);
        } catch (Exception ex) {
            LOGGER.error("Error when grouping by ROR and marking group orgs for reindexing, eating the exception", ex);
        }
        // The returned entity is not needed afterwards, so the result is not kept.
        orgDisambiguatedManager.updateOrgDisambiguated(org);
    }
}

private boolean changed(OrgDisambiguatedEntity entity, Integer parentId, String name, Iso3166Country country, String city, String state, String type) {
if (!name.equals(entity.getName()) || !country.name().equals(entity.getCountry()) || !city.equals(entity.getCity()) || !type.equals(entity.getOrgType())) {
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.StringJoiner;
import java.util.zip.ZipEntry;
Expand Down Expand Up @@ -79,6 +81,8 @@ public class RorOrgLoadSource implements OrgLoadSource {

@Resource
private FileRotator fileRotator;

// Ids of the orgs recorded by processExternalIdentifiers whenever an external identifier or
// Wikipedia URL is set or created. loadData assigns a fresh HashSet before iterating the
// institutes, and groupRORsWithUpdatedExternalModifiers iterates it afterwards.
// NOTE(review): upper-case name, but this is mutable per-run state, not a constant.
private Set<Long> UPDATED_RORS;

@Override
public String getSourceName() {
Expand Down Expand Up @@ -154,6 +158,7 @@ private boolean loadData() {

//ror returns the JSON as Array of institutes
JsonNode rootNode = JsonUtils.read(fileToLoad);
UPDATED_RORS = new HashSet<Long>();

rootNode.forEach(institute -> {
String sourceId = institute.get("id").isNull() ? null : institute.get("id").asText();
Expand Down Expand Up @@ -203,6 +208,9 @@ private boolean loadData() {
LOGGER.error("Illegal status '" + status + "' for institute " + sourceId);
}
});

// Check if any RORs with external identifiers updated and group them
groupRORsWithUpdatedExternalModifiers();

LOGGER.info("Time taken to process the data: {}", Duration.between(start, Instant.now()).toString());
return true;
Expand Down Expand Up @@ -260,11 +268,13 @@ private void processExternalIdentifiers(OrgDisambiguatedEntity org, JsonNode ins
if(StringUtils.equalsIgnoreCase(OrgDisambiguatedSourceType.GRID.name(), identifierTypeName)) {
JsonNode extId = (JsonNode) entry.getValue().get("all");
setExternalId(org, identifierTypeName, preferredId, extId);
UPDATED_RORS.add(org.getId());
}
else {
ArrayNode elements = (ArrayNode) entry.getValue().get("all");
for (JsonNode extId : elements) {
setExternalId(org, identifierTypeName, preferredId, extId);
UPDATED_RORS.add(org.getId());
}
}
}
Expand All @@ -275,6 +285,7 @@ private void processExternalIdentifiers(OrgDisambiguatedEntity org, JsonNode ins
// If the external identifier doesn't exists yet
if (orgDisambiguatedExternalIdentifierDao.findByDetails(org.getId(), url, WIKIPEDIA_URL.toUpperCase()) == null) {
createExternalIdentifier(org, url, WIKIPEDIA_URL.toUpperCase(), true);
UPDATED_RORS.add(org.getId());
} else {
LOGGER.info("Wikipedia URL for {} already exists", org.getId());
}
Expand Down Expand Up @@ -451,4 +462,23 @@ public boolean isEnabled() {
return enabled;
}

}

/**
 * Walks the ids collected in {@code UPDATED_RORS} and queues each matching org for indexing.
 *
 * For every id that {@code orgDisambiguatedDao.find} resolves, the entity's indexing status is
 * set to {@code PENDING}, {@code OrgGrouping.markGroupForIndexing} is invoked for it (any
 * exception from that step is logged and ignored), and the entity is then passed to
 * {@code orgDisambiguatedManager.updateOrgDisambiguated}. Ids that resolve to nothing are skipped.
 */
private void groupRORsWithUpdatedExternalModifiers() {
    for (Long updatedId : UPDATED_RORS) {
        OrgDisambiguatedEntity org = orgDisambiguatedDao.find(updatedId);
        if (org == null) {
            // No entity found for this id; continue with the next one.
            continue;
        }

        org.setIndexingStatus(IndexingStatus.PENDING);
        try {
            // Best effort: a grouping failure must not stop the remaining ids from being processed.
            OrgGrouping grouping = new OrgGrouping(org, orgDisambiguatedManager);
            grouping.markGroupForIndexing(orgDisambiguatedDao);
        } catch (Exception ex) {
            LOGGER.error("Error when grouping by ROR and marking group orgs for reindexing, eating the exception", ex);
        }
        // The returned entity is not needed afterwards, so the result is not kept.
        orgDisambiguatedManager.updateOrgDisambiguated(org);
    }
}

}
Loading

0 comments on commit 001f7d4

Please sign in to comment.