Skip to content

Commit

Permalink
Merge pull request #30 from kermitt2/pii_support
Browse files Browse the repository at this point in the history
adding PII as lookup ID #29
  • Loading branch information
kermitt2 authored May 22, 2019
2 parents 3af4a2b + 207118a commit aed698b
Show file tree
Hide file tree
Showing 11 changed files with 160 additions and 12 deletions.
13 changes: 13 additions & 0 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ __Important Note__: this Docker is a way to test and play with the biblio-glutto
- match record by ISTEX ID
- `GET host:port/service/lookup?istexid=ISTEXID`
- `GET host:port/service/lookup/istexid/{ISTEXID}`

- match record by PII (Publisher Item Identifier, the article ID used by Elsevier)
- `GET host:port/service/lookup?pii=PII`
- `GET host:port/service/lookup/pii/{PII}`

- match record by article title and first author lastname
- `GET host:port/service/lookup?atitle=ARTICLE_TITLE&firstAuthor=FIRST_AUTHOR_SURNAME[?postValidate=true]`
Expand Down Expand Up @@ -141,6 +145,7 @@ In case you are only interested by the Open Access URL for a bibliographical obj
- `GET host:port/service/oa?doi=DOI` return the best Open Access PDF url for a given DOI
- `GET host:port/service/oa?pmid=PMID` return the best Open Access PDF url for a given PMID
- `GET host:port/service/oa?pmc=PMC` return the best Open Access PDF url for a given PMC ID
- `GET host:port/service/oa?pii=PII` return the best Open Access PDF url for a given Elsevier ID

### cURL examples

Expand Down Expand Up @@ -178,6 +183,12 @@ Bibliographical metadata lookup by PMC ID (note that the `PMC` prefix in the ide
curl http://localhost:8080/service/lookup?pmc=PMC1017419
```

Bibliographical metadata lookup by Elsevier ID:

```sh
curl "http://localhost:8080/service/lookup?pii=<PII>"
```

Bibliographical metadata lookup by ISTEX ID:

```sh
Expand All @@ -190,6 +201,8 @@ Open Access resolver by DOI:
curl "http://localhost:8080/service/oa?doi=10.1038/nature12373"
```



## Building the bibliographical data look-up and matching databases

### Architecture
Expand Down
7 changes: 6 additions & 1 deletion lookup/notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ Here a brief description of the API:
- match IDs (istexID, ark, etc...) by DOI:
- `GET host:port/service/lookup/istex/id?istexid=ISTEXID` for a given ISTEX ID returns the mapping IDs: ark, pmid, etc..
- `GET host:port/service/lookup/istex/id/{ISTEXID}` for a given ISTEX ID returns the mapping IDs: ark, pmid, etc..


- Open Access API returns the OA pdf link by doi:
- `GET host:port/service/lookup/oa/url?doi=doi` return the best Open Access PDF url for a given DOI
Expand Down Expand Up @@ -150,6 +150,10 @@ For simplification, the API only does look-up of full metadata records (crossref
- match record by PMC ID
- `GET host:port/service/lookup?pmc=PMC`
- `GET host:port/service/lookup/pmc/{PMC}`
- match record by PII (Publisher Item Identifier, the article ID used by Elsevier)
- `GET host:port/service/lookup?pii=PII`
- `GET host:port/service/lookup/pii/{PII}`

- match record by ISTEX ID
- `GET host:port/service/lookup?istexid=ISTEXID`
Expand All @@ -175,6 +179,7 @@ Open Access API returns the OA pdf link (url) by identifier:
- `GET host:port/service/oa?doi=DOI` return the best Open Access PDF url for a given DOI
- `GET host:port/service/oa?pmid=PMID` return the best Open Access PDF url for a given PMID
- `GET host:port/service/oa?pmc=PMC` return the best Open Access PDF url for a given PMC ID
- `GET host:port/service/oa?pii=PII` return the best Open Access PDF url for a given Elsevier ID



Expand Down
13 changes: 10 additions & 3 deletions lookup/src/main/java/com/scienceminer/lookup/data/IstexData.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
package com.scienceminer.lookup.data;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;

import java.io.Serializable;
import java.util.List;

@JsonIgnoreProperties({"pii"})
public class IstexData implements Serializable {
private String corpusName;

Expand All @@ -21,6 +18,8 @@ public class IstexData implements Serializable {

private List<String> mesh;

private List<String> pii;

/**
 * Returns the ISTEX identifier stored on this record.
 *
 * @return the current value of the {@code istexId} field
 */
public String getIstexId() {
    return this.istexId;
}
Expand Down Expand Up @@ -76,4 +75,12 @@ public List<String> getMesh() {
/**
 * Replaces the {@code mesh} entries stored on this record.
 *
 * @param mesh the list to store; the reference is kept as-is, no copy is made
 */
public void setMesh(List<String> mesh) {
this.mesh = mesh;
}

/**
 * Returns the PII values stored on this record.
 *
 * @return the {@code pii} list exactly as stored (no copy is made)
 */
public List<String> getPii() {
    return this.pii;
}

/**
 * Replaces the PII values stored on this record.
 *
 * @param pii the list to store; the reference is kept as-is, no copy is made
 */
public void setPii(List<String> pii) {
this.pii = pii;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,12 @@ public List<Pair<String, String>> retrieveCrossrefRecords(Integer total) {
return metadataLookup.retrieveList(total);
}

/**
 * Lists entries of the PII-to-identifiers ISTEX index by delegating to
 * {@code istexLookup.retrieveList_piiToIds}.
 *
 * @param total passed through unchanged; presumably the maximum number of
 *              entries to return -- TODO confirm against the lookup implementation
 * @return the key/record pairs produced by the ISTEX lookup
 */
public List<Pair<String, IstexData>> retrieveIstexRecords_piiToIds(Integer total) {
    final List<Pair<String, IstexData>> piiEntries = istexLookup.retrieveList_piiToIds(total);
    return piiEntries;
}

//Setters

/**
 * Replaces the Open Access lookup used by this engine.
 *
 * @param oaDoiLookup the lookup instance to use from now on
 */
protected void setOaDoiLookup(OALookup oaDoiLookup) {
this.oaDoiLookup = oaDoiLookup;
}
Expand All @@ -84,5 +89,4 @@ protected void setMetadataLookup(MetadataLookup metadataLookup) {
/**
 * Replaces the PMID lookup used by this engine.
 *
 * @param pmidLookup the lookup instance to use from now on
 */
protected void setPmidLookup(PMIdsLookup pmidLookup) {
this.pmidLookup = pmidLookup;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,24 @@ public String retrieveByIstexid(String istexid, Boolean postValidate, String fir
throw new NotFoundException("Cannot find bibliographical record with ISTEX ID " + istexid);
}

/**
 * Retrieves the bibliographical record for a PII.
 * <p>
 * The PII is resolved to an ISTEX record, whose first DOI is used to fetch the
 * metadata record. That record goes through {@code validateJsonBody} and is then
 * enriched with the ISTEX identifiers and the Open Access link for the DOI.
 *
 * @param pii          the PII to resolve
 * @param postValidate forwarded to {@code validateJsonBody}
 * @param firstAuthor  forwarded to {@code validateJsonBody}
 * @param atitle       forwarded to {@code validateJsonBody}
 * @return the JSON record with the additional identifiers injected
 * @throws NotFoundException if no ISTEX record is indexed under the PII, or the
 *                           record has no non-blank first DOI
 */
public String retrieveByPii(String pii, Boolean postValidate, String firstAuthor, String atitle) {
    final IstexData istexData = istexLookup.retrieveByPii(pii);

    final boolean hasUsableDoi = istexData != null
            && CollectionUtils.isNotEmpty(istexData.getDoi())
            && isNotBlank(istexData.getDoi().get(0));
    if (!hasUsableDoi) {
        throw new NotFoundException("Cannot find bibliographical record by PII " + pii);
    }

    final String doi = istexData.getDoi().get(0);
    final MatchingDocument matched = metadataLookup.retrieveByMetadata(doi);
    final MatchingDocument validated = validateJsonBody(postValidate, firstAuthor, atitle, matched);

    final String oaLink = oaDoiLookup.retrieveOALinkByDoi(doi);
    return injectIdsByIstexData(validated.getJsonObject(), doi, istexData, oaLink);
}


// Intermediate lookups

public PmidData retrievePMidsByDoi(String doi) {
Expand Down Expand Up @@ -265,6 +283,17 @@ public String retrieveOAUrlByPmc(String pmc) {
throw new NotFoundException("Open Access URL was not found for PM ID " + pmc);
}

/**
 * Resolves the Open Access URL for the document identified by a PII.
 * <p>
 * The PII is resolved to an ISTEX record and the record's first DOI is handed
 * to the Open Access lookup.
 *
 * @param pii the PII to resolve
 * @return the value returned by {@code oaDoiLookup.retrieveOALinkByDoi} for the first DOI
 * @throws NotFoundException if no ISTEX record is indexed under the PII, or the
 *                           record has no non-blank first DOI
 */
public String retrieveOAUrlByPii(String pii) {
    final IstexData istexData = istexLookup.retrieveByPii(pii);

    // Same guard as retrieveByPii: a present-but-blank first DOI must be treated as
    // "not found" rather than being sent to the Open Access lookup.
    if (istexData != null
            && CollectionUtils.isNotEmpty(istexData.getDoi())
            && isNotBlank(istexData.getDoi().get(0))) {
        return oaDoiLookup.retrieveOALinkByDoi(istexData.getDoi().get(0));
    }

    throw new NotFoundException("Open Access URL was not found for pii " + pii);
}


public String retrieveByBiblio(String biblio) {
final MatchingDocument outputData = metadataMatching.retrieveByBiblio(biblio);
return injectIdsByDoi(outputData.getJsonObject(), outputData.getDOI());
Expand Down Expand Up @@ -464,6 +493,15 @@ protected String injectIdsByIstexData(String jsonobj, String doi, IstexData iste
sb.append("\"mesh\":\"" + istexData.getMesh().get(0) + "\"");
foundIstexData = true;
}
if (CollectionUtils.isNotEmpty(istexData.getPii())) {
if (!first) {
sb.append(", ");
} else {
first = false;
}
sb.append("\"pii\":\"" + istexData.getPii().get(0) + "\"");
foundIstexData = true;
}
}

if (!pmid || !pmc) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.scienceminer.lookup.storage.lookup;

import com.codahale.metrics.Meter;
import com.google.inject.servlet.ServletScopes;
import com.scienceminer.lookup.data.IstexData;
import com.scienceminer.lookup.exception.ServiceOverloadedException;
import com.scienceminer.lookup.reader.IstexIdsReader;
Expand All @@ -26,9 +27,10 @@
import static org.apache.commons.lang3.StringUtils.lowerCase;

/**
* Lookup:
* Lookups:
* - doi -> istex ID, pmid, ark, etc...
* - istexID -> doi, pmid, ark, etc...
* - pii -> doi, istex ID, pmid, ark, etc...
*/
public class IstexIdsLookup {

Expand All @@ -37,11 +39,13 @@ public class IstexIdsLookup {
protected Env<ByteBuffer> environment;
protected Dbi<ByteBuffer> dbDoiToIds;
protected Dbi<ByteBuffer> dbIstexToIds;
protected Dbi<ByteBuffer> dbPiiToIds;

public static final String ENV_NAME = "istex";

public static final String NAME_DOI2IDS = ENV_NAME + "_doi2ids";
public static final String NAME_ISTEX2IDS = ENV_NAME + "_istex2ids";
public static final String NAME_PII2IDS = ENV_NAME + "_pii2ids";

private final int batchSize;

Expand All @@ -51,6 +55,7 @@ public IstexIdsLookup(StorageEnvFactory storageEnvFactory) {

dbDoiToIds = this.environment.openDbi(NAME_DOI2IDS, DbiFlags.MDB_CREATE);
dbIstexToIds = this.environment.openDbi(NAME_ISTEX2IDS, DbiFlags.MDB_CREATE);
dbPiiToIds = this.environment.openDbi(NAME_PII2IDS, DbiFlags.MDB_CREATE);
}

public void loadFromFile(InputStream is, IstexIdsReader reader, Meter metric) {
Expand All @@ -72,11 +77,19 @@ public void loadFromFile(InputStream is, IstexIdsReader reader, Meter metric) {
}
}

// pii -> ids: index the record under each of its non-blank PIIs (lower-cased key).
// getPii() is null when the record carries no "pii" field, so guard before
// iterating to avoid a NullPointerException that would abort the load.
if (istexData.getPii() != null) {
    for (String pii : istexData.getPii()) {
        if (isNotBlank(pii)) {
            store(dbPiiToIds, lowerCase(pii), istexData, transactionWrapper.tx);
        }
    }
}

// istex id -> ids (no need to unwrap)
if (isNotBlank(istexData.getIstexId())) {
store(dbIstexToIds, istexData.getIstexId(), istexData, transactionWrapper.tx);

}

metric.mark();
counter.incrementAndGet();
}
Expand Down Expand Up @@ -128,6 +141,7 @@ public Map<String, Long> getSize() {
try (final Txn<ByteBuffer> txn = this.environment.txnRead()) {
size.put(NAME_DOI2IDS, dbDoiToIds.stat(txn).entries);
size.put(NAME_ISTEX2IDS, dbIstexToIds.stat(txn).entries);
size.put(NAME_PII2IDS, dbPiiToIds.stat(txn).entries);
} catch (Env.ReadersFullException e) {
throw new ServiceOverloadedException("Not enough readers for LMDB access, increase them or reduce the parallel request rate. ", e);
}
Expand Down Expand Up @@ -186,14 +200,36 @@ record = (IstexData) BinarySerialiser.deserialize(cachedData);
}

return record;
}

/**
 * Fetches the ISTEX identifier record indexed under the given PII.
 * <p>
 * The key is the lower-cased PII, serialised with {@code BinarySerialiser} and
 * looked up in {@code dbPiiToIds} inside a read transaction.
 *
 * @param pii the PII to look up
 * @return the deserialised record, or {@code null} when nothing is stored under
 *         that key or the lookup failed (the failure is logged)
 * @throws ServiceOverloadedException if the read transaction cannot be opened
 *                                    because all readers are in use
 */
public IstexData retrieveByPii(String pii) {
    final ByteBuffer key = allocateDirect(environment.getMaxKeySize());
    IstexData found = null;

    try (Txn<ByteBuffer> readTxn = environment.txnRead()) {
        key.put(BinarySerialiser.serialize(lowerCase(pii))).flip();
        final ByteBuffer stored = dbPiiToIds.get(readTxn, key);
        if (stored != null) {
            found = (IstexData) BinarySerialiser.deserialize(stored);
        }
    } catch (Env.ReadersFullException e) {
        throw new ServiceOverloadedException("Not enough readers for LMDB access, increase them or reduce the parallel request rate. ", e);
    } catch (Exception e) {
        // Best effort: any other failure is logged and reported to the caller as "no record".
        LOGGER.error("Cannot retrieve ISTEX identifiers by pii: " + pii, e);
    }

    return found;
}

/**
 * Lists entries of the DOI-to-identifiers database.
 *
 * @param total forwarded unchanged to {@code retrieveList}
 * @return the key/record pairs read from {@code dbDoiToIds}
 */
public List<Pair<String, IstexData>> retrieveList_doiToIds(Integer total) {
    final List<Pair<String, IstexData>> doiEntries = retrieveList(total, dbDoiToIds);
    return doiEntries;
}

/**
 * Lists entries of the PII-to-identifiers database.
 *
 * @param total forwarded unchanged to {@code retrieveList}
 * @return the key/record pairs read from {@code dbPiiToIds}
 */
public List<Pair<String, IstexData>> retrieveList_piiToIds(Integer total) {
    final List<Pair<String, IstexData>> piiEntries = retrieveList(total, dbPiiToIds);
    return piiEntries;
}

/**
 * Lists entries of the ISTEX-ID-to-identifiers database.
 *
 * @param total forwarded unchanged to {@code retrieveList}
 * @return the key/record pairs read from {@code dbIstexToIds}
 */
public List<Pair<String, IstexData>> retrieveList_istexToIds(Integer total) {
    final List<Pair<String, IstexData>> istexEntries = retrieveList(total, dbIstexToIds);
    return istexEntries;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ public List<Pair<String, IstexData>> getIstexData_doiToIds(@QueryParam("total")
public List<Pair<String, IstexData>> getIstexData_istexIdToIds(@QueryParam("total") Integer total) {
    // Thin pass-through: the "total" query parameter is handed to the storage layer as-is.
    final List<Pair<String, IstexData>> istexEntries = storage.retrieveIstexRecords_istexToIds(total);
    return istexEntries;
}

/**
 * {@code GET /istex/pii} (relative to this resource): lists entries of the
 * PII-to-identifiers ISTEX index as JSON.
 *
 * @param total value of the {@code total} query parameter, handed to the storage layer as-is
 * @return the key/record pairs returned by {@code storage.retrieveIstexRecords_piiToIds}
 */
@GET
@Produces(MediaType.APPLICATION_JSON)
@Path("/istex/pii")
public List<Pair<String, IstexData>> getIstexData_istexpiiToIds(@QueryParam("total") Integer total) {
    final List<Pair<String, IstexData>> piiEntries = storage.retrieveIstexRecords_piiToIds(total);
    return piiEntries;
}

@GET
@Produces(MediaType.APPLICATION_JSON)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ public void getByQueryAsync(
@QueryParam("doi") String doi,
@QueryParam("pmid") String pmid,
@QueryParam("pmc") String pmc,
@QueryParam("pii") String pii,
@QueryParam("istexid") String istexid,
@QueryParam("firstAuthor") String firstAuthor,
@QueryParam("atitle") String atitle,
Expand Down Expand Up @@ -95,14 +96,15 @@ public void getByQueryAsync(
if (postValidate == null) postValidate = Boolean.TRUE;
if (parseReference == null) parseReference = Boolean.TRUE;

getByQuery(doi, pmid, pmc, istexid, firstAuthor, atitle,
getByQuery(doi, pmid, pmc, pii, istexid, firstAuthor, atitle,
postValidate, jtitle, volume, firstPage, biblio, parseReference, asyncResponse);
}

protected void getByQuery(
String doi,
String pmid,
String pmc,
String pii,
String istexid,
String firstAuthor,
String atitle,
Expand Down Expand Up @@ -163,6 +165,20 @@ protected void getByQuery(
}
}

if (isNotBlank(pii)) {
areParametersEnoughToLookup = true;
try {
final String response = lookupEngine.retrieveByPii(pii, postValidate, firstAuthor, atitle);
if (isNotBlank(response)) {
asyncResponse.resume(response);
return;
}

} catch (NotFoundException e) {
LOGGER.warn("PII ID did not matched, move to additional metadata");
}
}

if (isNotBlank(istexid)) {
areParametersEnoughToLookup = true;
try {
Expand Down Expand Up @@ -364,6 +380,13 @@ public String getByPmid(@PathParam("pmid") String pmid) {
return lookupEngine.retrieveByPmid(pmid, false, null, null);
}

/**
 * {@code GET /pii/{pii}} (relative to this resource): returns the bibliographical
 * record for a PII as JSON.
 *
 * @param pii the PII taken from the request path
 * @return the record produced by {@code lookupEngine.retrieveByPii}
 */
@GET
@Produces(MediaType.APPLICATION_JSON)
@Path("/pii/{pii}")
public String getByPii(@PathParam("pii") String pii) {
    // Path-based lookups request no post-validation, so no author/title is supplied.
    final Boolean postValidate = Boolean.FALSE;
    final String firstAuthor = null;
    final String atitle = null;
    return lookupEngine.retrieveByPii(pii, postValidate, firstAuthor, atitle);
}

@GET
@Produces(MediaType.APPLICATION_JSON)
@Path("/pmc/{pmc}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ public OAController(LookupConfiguration configuration, StorageEnvFactory storage
public OAResource getDoiByMetadataDoi(
@QueryParam("doi") String doi,
@QueryParam("pmid") String pmid,
@QueryParam("pmc") String pmc
@QueryParam("pmc") String pmc,
@QueryParam("pii") String pii

) {
if (isNotBlank(doi)) {
Expand All @@ -51,6 +52,10 @@ public OAResource getDoiByMetadataDoi(
return new OAResource(storage.retrieveOAUrlByPmc(pmc));
}

if (isNotBlank(pii)) {
return new OAResource(storage.retrieveOAUrlByPii(pii));
}

throw new ServiceException(400, "The supplied parameters were not sufficient to select the query");
}

Expand All @@ -75,4 +80,11 @@ public OAResource getDoiByMetadataPmid(@PathParam("pmid") String pmid) {
public OAResource getDoiByMetadataPmc(@PathParam("pmc") String pmc) {
    // Resolve the Open Access URL for the PMC ID, then wrap it in the response resource.
    final String oaUrl = storage.retrieveOAUrlByPmc(pmc);
    return new OAResource(oaUrl);
}

/**
 * {@code GET /pii/{pii}} (relative to this resource): returns the Open Access URL
 * for the document identified by a PII.
 *
 * @param pii the PII taken from the request path
 * @return an {@code OAResource} wrapping the URL returned by {@code storage.retrieveOAUrlByPii}
 */
@GET
@Produces(MediaType.APPLICATION_JSON)
@Path("/pii/{pii}")
public OAResource getDoiByMetadataPii(@PathParam("pii") String pii) {
    final String oaUrl = storage.retrieveOAUrlByPii(pii);
    return new OAResource(oaUrl);
}
}
Loading

0 comments on commit aed698b

Please sign in to comment.