Skip to content

Commit

Permalink
Merge pull request #1760 from alliance-genome/SCRUM-4690
Browse files Browse the repository at this point in the history
SCRUM-4690 Reimplement Expression Atlas xref load
  • Loading branch information
markquintontulloch authored Dec 12, 2024
2 parents 3308686 + 767e014 commit 2f17dcc
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 157 deletions.
Original file line number Diff line number Diff line change
@@ -1,40 +1,32 @@
package org.alliancegenome.curation_api.jobs.executors;

import static org.alliancegenome.curation_api.services.DataProviderService.RESOURCE_DESCRIPTOR_PREFIX;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.alliancegenome.curation_api.model.entities.CrossReference;
import org.alliancegenome.curation_api.model.entities.DataProvider;
import org.alliancegenome.curation_api.model.entities.Organization;
import org.alliancegenome.curation_api.model.entities.ResourceDescriptorPage;
import org.alliancegenome.curation_api.enums.BackendBulkDataProvider;
import org.alliancegenome.curation_api.exceptions.KnownIssueValidationException;
import org.alliancegenome.curation_api.exceptions.ObjectUpdateException;
import org.alliancegenome.curation_api.exceptions.ObjectUpdateException.ObjectUpdateExceptionData;
import org.alliancegenome.curation_api.model.entities.bulkloads.BulkLoadFileHistory;
import org.alliancegenome.curation_api.model.entities.bulkloads.BulkURLLoad;
import org.alliancegenome.curation_api.services.DataProviderService;
import org.alliancegenome.curation_api.services.OrganizationService;
import org.alliancegenome.curation_api.services.ResourceDescriptorPageService;
import org.alliancegenome.curation_api.services.GeneService;
import org.alliancegenome.curation_api.util.ProcessDisplayHelper;
import org.jetbrains.annotations.NotNull;
import org.apache.commons.collections.CollectionUtils;

import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;

import io.quarkus.logging.Log;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;

@ApplicationScoped
public class ExpressionAtlasExecutor extends LoadFileExecutor {

@Inject
DataProviderService service;
@Inject
ResourceDescriptorPageService resourceDescriptorPageService;
@Inject
OrganizationService organizationService;
GeneService geneService;

public void execLoad(BulkLoadFileHistory bulkLoadFileHistory) throws IOException {

Expand All @@ -50,53 +42,61 @@ public void execLoad(BulkLoadFileHistory bulkLoadFileHistory) throws IOException

String name = bulkLoadFileHistory.getBulkLoad().getName();
String dataProviderName = name.substring(0, name.indexOf(" "));

BackendBulkDataProvider dataProvider = BackendBulkDataProvider.valueOf(dataProviderName);

Organization organization = organizationService.getByAbbr(dataProviderName).getEntity();
ResourceDescriptorPage ensemblGenePage = resourceDescriptorPageService.getPageForResourceDescriptor("ENSEMBL", "expression_atlas");

List<Long> dataProviderIdsBefore =
new ArrayList<>(service.getDataProviderMap(organization, ensemblGenePage).values().stream().map(DataProvider::getId).toList());
dataProviderIdsBefore.removeIf(Objects::isNull);

List<Long> dataProviderIdsLoaded = new ArrayList<>();
ProcessDisplayHelper ph = new ProcessDisplayHelper();
ph.addDisplayHandler(loadProcessDisplayService);
ph.startProcess(name, accessions.size());
accessions.forEach(accession -> {
CrossReference reference = getCrossReference(ensemblGenePage, accession, organization);
DataProvider provider = new DataProvider();
provider.setSourceOrganization(organization);
provider.setCrossReference(reference);
DataProvider entity = service.insertExpressionAtlasDataProvider(provider).getEntity();
if (entity != null) {
dataProviderIdsLoaded.add(entity.getId());
bulkLoadFileHistory.incrementCompleted();
} else {
bulkLoadFileHistory.incrementSkipped();
}
ph.progressProcess();
});
bulkLoadFileHistory.setTotalCount(accessions.size());
runCleanup(service, bulkLoadFileHistory, dataProviderName, dataProviderIdsBefore, dataProviderIdsLoaded, "Atlas Load Type");
ph.finishProcess();
updateHistory(bulkLoadFileHistory);
runLoad(bulkLoadFileHistory, dataProvider, accessions);

bulkLoadFileHistory.finishLoad();
updateHistory(bulkLoadFileHistory);
updateExceptions(bulkLoadFileHistory);
}

@NotNull
private static CrossReference getCrossReference(ResourceDescriptorPage ensemblGenePage, String accession, Organization organization) {
CrossReference reference = new CrossReference();
if (List.of("FB", "SGD").contains(organization.getAbbreviation())) {
reference.setReferencedCurie(accession);
} else {
reference.setReferencedCurie(RESOURCE_DESCRIPTOR_PREFIX + ":" + accession);

private void runLoad(BulkLoadFileHistory history, BackendBulkDataProvider dataProvider, List<String> identifiers) {
ProcessDisplayHelper ph = new ProcessDisplayHelper();
ph.addDisplayHandler(loadProcessDisplayService);
if (CollectionUtils.isNotEmpty(identifiers)) {
String loadMessage = "Expression Atlas cross-reference update";
if (dataProvider != null) {
loadMessage = loadMessage + " for " + dataProvider.name();
}
ph.startProcess(loadMessage, identifiers.size());

history.setCount(identifiers.size());
updateHistory(history);

for (String identifier : identifiers) {
try {
geneService.addExpressionAtlasXref(identifier, dataProvider);
history.incrementCompleted();
} catch (ObjectUpdateException e) {
history.incrementFailed();
addException(history, e.getData());
} catch (KnownIssueValidationException e) {
Log.debug(e.getMessage());
history.incrementSkipped();
} catch (Exception e) {
e.printStackTrace();
history.incrementFailed();
addException(history, new ObjectUpdateExceptionData(identifier, e.getMessage(), e.getStackTrace()));
}
if (history.getErrorRate() > 0.25) {
Log.error("Failure Rate > 25% aborting load");
updateHistory(history);
updateExceptions(history);
failLoadAboveErrorRateCutoff(history);
return;
}
ph.progressProcess();
if (Thread.currentThread().isInterrupted()) {
Log.info("Thread Interrupted:");
break;
}
}
updateHistory(history);
updateExceptions(history);
ph.finishProcess();
}
reference.setDisplayName(accession);
reference.setResourceDescriptorPage(ensemblGenePage);
return reference;
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,20 @@ public List<CrossReference> getUpdatedXrefList(List<CrossReference> incomingXref
List<CrossReference> finalXrefs = new ArrayList<>();
List<String> addedXrefUniqueIds = new ArrayList<>();

List<CrossReference> combinedXrefs = new ArrayList<>();
combinedXrefs.addAll(incomingXrefs);
if (keepAllExisting && CollectionUtils.isNotEmpty(existingXrefs)) {
incomingXrefs.addAll(existingXrefs);
combinedXrefs.addAll(existingXrefs);
}

if (CollectionUtils.isNotEmpty(incomingXrefs)) {
for (CrossReference incomingXref : incomingXrefs) {
String incomingXrefUniqueId = getCrossReferenceUniqueId(incomingXref);
if (CollectionUtils.isNotEmpty(combinedXrefs)) {
for (CrossReference xref : combinedXrefs) {
String incomingXrefUniqueId = getCrossReferenceUniqueId(xref);
if (!addedXrefUniqueIds.contains(incomingXrefUniqueId)) {
if (existingXrefMap.containsKey(incomingXrefUniqueId)) {
finalXrefs.add(updateCrossReference(existingXrefMap.get(incomingXrefUniqueId), incomingXref));
finalXrefs.add(updateCrossReference(existingXrefMap.get(incomingXrefUniqueId), xref));
} else {
finalXrefs.add(crossReferenceDAO.persist(incomingXref));
finalXrefs.add(crossReferenceDAO.persist(xref));
}
addedXrefUniqueIds.add(incomingXrefUniqueId);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,32 +1,26 @@
package org.alliancegenome.curation_api.services;

import jakarta.annotation.PostConstruct;
import jakarta.enterprise.context.RequestScoped;
import jakarta.inject.Inject;
import jakarta.transaction.Transactional;
import org.alliancegenome.curation_api.auth.AuthenticatedUser;
import org.alliancegenome.curation_api.dao.*;
import org.alliancegenome.curation_api.model.entities.*;
import org.alliancegenome.curation_api.dao.CrossReferenceDAO;
import org.alliancegenome.curation_api.dao.DataProviderDAO;
import org.alliancegenome.curation_api.dao.GeneDAO;
import org.alliancegenome.curation_api.dao.OrganizationDAO;
import org.alliancegenome.curation_api.dao.SpeciesDAO;
import org.alliancegenome.curation_api.model.entities.AllianceMember;
import org.alliancegenome.curation_api.model.entities.DataProvider;
import org.alliancegenome.curation_api.model.entities.Person;
import org.alliancegenome.curation_api.response.ObjectResponse;
import org.alliancegenome.curation_api.services.base.BaseEntityCrudService;
import org.alliancegenome.curation_api.services.validation.DataProviderValidator;
import org.jetbrains.annotations.NotNull;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import jakarta.annotation.PostConstruct;
import jakarta.enterprise.context.RequestScoped;
import jakarta.inject.Inject;
import jakarta.transaction.Transactional;

@RequestScoped
public class DataProviderService extends BaseEntityCrudService<DataProvider, DataProviderDAO> {

public static final String RESOURCE_DESCRIPTOR_PREFIX = "ENSEMBL";
public static final String RESOURCE_DESCRIPTOR_PAGE_NAME = "default";
// <crossReference.referencedCurie, DataProvider>
Map<String, Long> accessionGeneMap = new HashMap<>();
HashMap<String, DataProvider> dataProviderMap = new HashMap<>();


@Inject
@AuthenticatedUser
protected Person authenticatedPerson;
Expand Down Expand Up @@ -70,77 +64,6 @@ public DataProvider getDefaultDataProvider(String sourceOrganizationAbbreviation
return dataProviderDAO.getOrCreateDataProvider(organizationDAO.getOrCreateOrganization(sourceOrganizationAbbreviation));
}

@Transactional
public ObjectResponse<DataProvider> insertExpressionAtlasDataProvider(DataProvider entity) {
String referencedCurie = entity.getCrossReference().getReferencedCurie();
// find associated gene
Long geneID = getAssociatedGeneId(referencedCurie, entity.getSourceOrganization());
// if no gene found skip (= don't import) the accession
if (geneID == null) {
return new ObjectResponse<>();
}

DataProvider dbEntity = getDataProvider(entity.getSourceOrganization(), referencedCurie, entity.getCrossReference().getResourceDescriptorPage());
// we only create new records, no updates
if (dbEntity == null) {
dataProviderDAO.persist(entity);
if (!List.of("FB", "SGD").contains(entity.getSourceOrganization().getAbbreviation())) {
crossReferenceDAO.persistAccessionGeneAssociated(entity.getCrossReference().getId(), geneID);
}
return new ObjectResponse<>(entity);
}
return new ObjectResponse<>(dbEntity);
}

@NotNull
public static String getFullReferencedCurie(String localReferencedCurie) {
return RESOURCE_DESCRIPTOR_PREFIX + ":" + localReferencedCurie;
}

private Long getAssociatedGeneId(String fullReferencedCurie, Organization sourceOrganization) {
String orgAbbreviation = sourceOrganization.getAbbreviation();
if (orgAbbreviation.equals("FB")) {
fullReferencedCurie = orgAbbreviation + ":" + fullReferencedCurie;
}
if (accessionGeneMap.size() == 0) {
if (List.of("FB", "SGD").contains(orgAbbreviation)) {
Map<String, Object> map = new HashMap<>();
map.put("displayName", orgAbbreviation);
Species species = speciesDAO.findByParams(map).getSingleResult();
accessionGeneMap = geneDAO.getAllGeneIdsPerSpecies(species);
} else {
ResourceDescriptorPage page = resourceDescriptorPageService.getPageForResourceDescriptor(RESOURCE_DESCRIPTOR_PREFIX, RESOURCE_DESCRIPTOR_PAGE_NAME);
accessionGeneMap = crossReferenceDAO.getGenesWithCrossRefs(page);
}
}
return accessionGeneMap.get(fullReferencedCurie);
}

private DataProvider getDataProvider(Organization sourceOrganization, String crossReferenceCurie, ResourceDescriptorPage page) {
if (dataProviderMap.size() > 0) {
return dataProviderMap.get(crossReferenceCurie);
}
populateDataProviderMap(sourceOrganization, page);
return dataProviderMap.get(crossReferenceCurie);
}

private void populateDataProviderMap(Organization sourceOrganization, ResourceDescriptorPage page) {
List<DataProvider> allOrgProvider = dataProviderDAO.getAllDataProvider(sourceOrganization, page);
allOrgProvider.stream()
.filter(dataProvider -> dataProvider.getCrossReference() != null && Objects.equals(dataProvider.getCrossReference().getResourceDescriptorPage().getId(), page.getId()))
.forEach(dataProvider -> {
dataProviderMap.put(dataProvider.getCrossReference().getReferencedCurie(), dataProvider);
});
}

public HashMap<String, DataProvider> getDataProviderMap(Organization sourceOrganization, ResourceDescriptorPage page) {
if (dataProviderMap.size() > 0) {
return dataProviderMap;
}
populateDataProviderMap(sourceOrganization, page);
return dataProviderMap;
}

@Transactional
public ObjectResponse<DataProvider> upsert(DataProvider uiEntity) {
ObjectResponse<DataProvider> response = dataProviderValidator.validateDataProvider(uiEntity, null, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,75 @@ public void addGeoXref(String entrezId, BackendBulkDataProvider dataProvider) th
}

if (allianceGene == null) {
throw new KnownIssueValidationException("crossReferences - referencedCurie: " + ValidationConstants.INVALID_MESSAGE + " (" + "NCBI_Gene:" + entrezId + ")");
throw new KnownIssueValidationException("crossReferences - referencedCurie: " + ValidationConstants.INVALID_MESSAGE + " (NCBI_Gene:" + entrezId + ")");
}

geneXrefHelper.addGeoCrossReference(allianceGene, "NCBI_Gene:" + entrezId);
allianceGene = geneXrefHelper.addGeoCrossReference(allianceGene, "NCBI_Gene:" + entrezId);

if (allianceGene == null) {
throw new ObjectValidationException(entrezId, "resourceDescriptorPage: " + ValidationConstants.INVALID_MESSAGE + " (NCBI_Gene:gene/other_expression)");
}
}

@Transactional
public void addExpressionAtlasXref(String identifier, BackendBulkDataProvider dataProvider) throws ValidationException {
NCBITaxonTerm taxon = ncbiTaxonTermService.getByCurie(dataProvider.canonicalTaxonCurie).getEntity();
if (taxon == null) {
throw new ObjectValidationException(identifier, "dataProvider - canonical taxon: " + ValidationConstants.INVALID_MESSAGE + " (" + dataProvider.canonicalTaxonCurie + " not found)");
}


String searchField;
String searchValue;
String referencedCurie;
String resourceDescriptorPrefix;
switch (dataProvider) {
case FB -> {
searchField = "modEntityId";
searchValue = "FB:" + identifier;
referencedCurie = searchValue;
resourceDescriptorPrefix = "FB";
}
case SGD -> {
searchField = "geneSymbol.displayText";
searchValue = identifier;
referencedCurie = "SGD:" + identifier;
resourceDescriptorPrefix = "SGD";
}
default -> {
searchField = "crossReferences.referencedCurie";
searchValue = "ENSEMBL:" + identifier;
referencedCurie = searchValue;
resourceDescriptorPrefix = "ENSEMBL";
}
}

Gene allianceGene = null;
SearchResponse<Gene> searchResponse = findByField(searchField, searchValue);
if (searchResponse != null) {
// Need to check that returned gene belongs to MOD corresponding to taxon
for (Gene searchResult : searchResponse.getResults()) {
String resultDataProviderCoreGenus = BackendBulkDataProvider.getCoreGenus(searchResult.getDataProvider().getSourceOrganization().getAbbreviation());
if (taxon.getName().startsWith(resultDataProviderCoreGenus + " ")) {
allianceGene = searchResult;
break;
}
if (StringUtils.equals(taxon.getCurie(), "NCBITaxon:9606") && StringUtils.equals(searchResult.getDataProvider().getSourceOrganization().getAbbreviation(), "RGD")) {
allianceGene = searchResult;
break;
}
}
}

if (allianceGene == null) {
throw new KnownIssueValidationException(searchField + ": " + ValidationConstants.INVALID_MESSAGE + " (" + searchValue + ")");
}

allianceGene = geneXrefHelper.addExpressionAtlasXref(allianceGene, resourceDescriptorPrefix, referencedCurie);

if (allianceGene == null) {
throw new ObjectValidationException(identifier, "resourceDescriptorPage: " + ValidationConstants.INVALID_MESSAGE + " (" + resourceDescriptorPrefix + ":expression_atlas)");
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public Gene addGenePhenotypeCrossReference(BackendBulkDataProvider dataProvider,
xref.setReferencedCurie(gene.getIdentifier());
xref.setResourceDescriptorPage(rdp);

List<CrossReference> updatedXrefs = xrefService.getUpdatedXrefList(List.of(xref), gene.getCrossReferences());
List<CrossReference> updatedXrefs = xrefService.getUpdatedXrefList(List.of(xref), gene.getCrossReferences(), true);

if (gene.getCrossReferences() != null) {
gene.getCrossReferences().clear();
Expand Down
Loading

0 comments on commit 2f17dcc

Please sign in to comment.