-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added Fetcher for ISIDORE #10518
Added Fetcher for ISIDORE #10518
Changes from 32 commits
fb2222e
7d65239
0f87aed
6d02f06
25e4985
060b388
3037d9a
ea6988e
403bf06
6d286fe
e50b370
39496eb
664a933
bcd405c
c08beb0
38b9d33
48def73
b246a0d
86e4ba4
ca69df7
81c2172
8f45b03
e15a3aa
0ca60b9
4dacce8
065c4cf
db4a9ad
620f719
0553443
80e1088
4702d14
1e3f3a3
ae49b38
fdf40fd
b3e7d59
5c09c36
cd6d630
8e3eb37
50e4205
d00eeb1
e96cfef
b8c8f68
92cf060
87f17bc
5b04053
d4d08e0
b71ef06
baa4038
42bac2b
a0dfb2d
247ec3e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,222 @@ | ||||||||||||||||
package org.jabref.logic.importer.fetcher; | ||||||||||||||||
|
||||||||||||||||
import java.io.IOException; | ||||||||||||||||
import java.net.MalformedURLException; | ||||||||||||||||
import java.net.URISyntaxException; | ||||||||||||||||
import java.net.URL; | ||||||||||||||||
import java.util.Collections; | ||||||||||||||||
import java.util.Optional; | ||||||||||||||||
import java.util.StringJoiner; | ||||||||||||||||
|
||||||||||||||||
import javax.xml.parsers.DocumentBuilder; | ||||||||||||||||
import javax.xml.parsers.DocumentBuilderFactory; | ||||||||||||||||
import javax.xml.parsers.ParserConfigurationException; | ||||||||||||||||
|
||||||||||||||||
import org.jabref.logic.help.HelpFile; | ||||||||||||||||
import org.jabref.logic.importer.FetcherException; | ||||||||||||||||
import org.jabref.logic.importer.IdBasedParserFetcher; | ||||||||||||||||
import org.jabref.logic.importer.Parser; | ||||||||||||||||
import org.jabref.logic.net.URLDownload; | ||||||||||||||||
import org.jabref.model.entry.BibEntry; | ||||||||||||||||
import org.jabref.model.entry.field.StandardField; | ||||||||||||||||
import org.jabref.model.entry.types.EntryType; | ||||||||||||||||
import org.jabref.model.entry.types.StandardEntryType; | ||||||||||||||||
|
||||||||||||||||
import org.jooq.lambda.Unchecked; | ||||||||||||||||
import org.w3c.dom.Document; | ||||||||||||||||
import org.w3c.dom.Element; | ||||||||||||||||
import org.w3c.dom.Node; | ||||||||||||||||
import org.w3c.dom.NodeList; | ||||||||||||||||
import org.xml.sax.SAXException; | ||||||||||||||||
|
||||||||||||||||
/** | ||||||||||||||||
* Fetcher for <a href="https://isidore.science">ISIDORE</a>. | ||||||||||||||||
* Accepts either the full link to a document page or the trailing six-character code that identifies the reference. | ||||||||||||||||
* Uses <a href="https://isidore.science/api">ISIDORE's API</a>. */ | ||||||||||||||||
public class ISIDOREFetcher implements IdBasedParserFetcher { | ||||||||||||||||
private static final int LINKLENGTH = 47; | ||||||||||||||||
|
||||||||||||||||
private String URL; | ||||||||||||||||
private Parser parser; | ||||||||||||||||
|
||||||||||||||||
private DocumentBuilderFactory factory; | ||||||||||||||||
|
||||||||||||||||
/**
 * Creates the fetcher with a new {@link DocumentBuilderFactory} and the parser
 * returned by {@link #getParser()}.
 */
public ISIDOREFetcher() {
    factory = DocumentBuilderFactory.newInstance();
    // The parser lambda reads the factory field only when it is invoked, not here.
    parser = getParser();
}
|
||||||||||||||||
@Override | ||||||||||||||||
public URL getUrlForIdentifier(String identifier) throws URISyntaxException, MalformedURLException, FetcherException { | ||||||||||||||||
identifier = identifier.trim(); | ||||||||||||||||
|
||||||||||||||||
if (identifier.length() == 6) { | ||||||||||||||||
// this allows the user to input only the six-digit code at the end. | ||||||||||||||||
identifier = "https://isidore.science/document/10670/1." + identifier; | ||||||||||||||||
} else if (identifier.length() == 8) { | ||||||||||||||||
// allows the user to put in the eight digits including the "1." | ||||||||||||||||
identifier = "https://isidore.science/document/10670/" + identifier; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
if (identifier.startsWith("https://isidore.science/document/10670/1.") && (identifier.length() == LINKLENGTH)) { | ||||||||||||||||
this.URL = identifier; | ||||||||||||||||
// change the link to be the correct link for the api. | ||||||||||||||||
identifier = identifier.replace("/document/", "/resource/content?uri="); | ||||||||||||||||
identifier = identifier.replace("https://isidore.science/", "https://api.isidore.science/"); | ||||||||||||||||
return new URL(identifier); | ||||||||||||||||
} else { | ||||||||||||||||
// Throw an error if the link does not start with the link above | ||||||||||||||||
throw new FetcherException("Could not construct url for ISIDORE"); | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
@Override | ||||||||||||||||
public Parser getParser() { | ||||||||||||||||
return xmlData -> { | ||||||||||||||||
|
||||||||||||||||
try { | ||||||||||||||||
DocumentBuilder builder = this.factory.newDocumentBuilder(); | ||||||||||||||||
Document document = builder.parse(xmlData); | ||||||||||||||||
|
||||||||||||||||
// Assuming the root element represents an entry | ||||||||||||||||
Element entryElement = document.getDocumentElement(); | ||||||||||||||||
|
||||||||||||||||
if (entryElement == null) { | ||||||||||||||||
return Collections.emptyList(); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
return Collections.singletonList(xmlItemToBibEntry(document.getDocumentElement())); | ||||||||||||||||
} catch ( | ||||||||||||||||
ParserConfigurationException | | ||||||||||||||||
IOException | | ||||||||||||||||
SAXException e) { | ||||||||||||||||
Unchecked.throwChecked(new FetcherException("Issue with parsing link")); | ||||||||||||||||
} | ||||||||||||||||
return null; | ||||||||||||||||
}; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
private BibEntry xmlItemToBibEntry(Element itemElement) { | ||||||||||||||||
return new BibEntry(getType(itemElement.getElementsByTagName("types").item(0).getChildNodes())) | ||||||||||||||||
.withField(StandardField.TITLE, itemElement.getElementsByTagName("title").item(0).getTextContent().replaceAll("\"", "")) | ||||||||||||||||
.withField(StandardField.AUTHOR, getAuthor(itemElement.getElementsByTagName("enrichedCreators").item(0))) | ||||||||||||||||
.withField(StandardField.YEAR, itemElement.getElementsByTagName("date").item(0).getChildNodes().item(1).getTextContent().substring(0, 4)) | ||||||||||||||||
.withField(StandardField.JOURNAL, getJournal(itemElement.getElementsByTagName("dc:source"))) | ||||||||||||||||
.withField(StandardField.PUBLISHER, getPublishers(itemElement.getElementsByTagName("publishers").item(0))) | ||||||||||||||||
.withField(StandardField.DOI, getDOI(itemElement.getElementsByTagName("ore").item(0).getChildNodes())) | ||||||||||||||||
.withField(StandardField.URL, this.URL); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
private String getDOI(NodeList list) { | ||||||||||||||||
for (int i = 0; i < list.getLength(); i++) { | ||||||||||||||||
String content = list.item(i).getTextContent(); | ||||||||||||||||
if (content.contains("DOI:")) { | ||||||||||||||||
return content.replace("DOI: ", ""); | ||||||||||||||||
} | ||||||||||||||||
if (list.item(i).getTextContent().contains("doi:")) { | ||||||||||||||||
return content.replace("info:doi:", ""); | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
return ""; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Get the type of the document, ISIDORE only seems to have select types, also their types are different to | ||||||||||||||||
// those used by JabRef. | ||||||||||||||||
private EntryType getType(NodeList list) { | ||||||||||||||||
for (int i = 0; i < list.getLength(); i++) { | ||||||||||||||||
String type = list.item(i).getTextContent(); | ||||||||||||||||
if (type.contains("article") || type.contains("Article")) { | ||||||||||||||||
return StandardEntryType.Article; | ||||||||||||||||
} | ||||||||||||||||
if (type.contains("thesis") || type.contains("Thesis")) { | ||||||||||||||||
return StandardEntryType.Thesis; | ||||||||||||||||
} | ||||||||||||||||
if (type.contains("book") || type.contains("Book")) { | ||||||||||||||||
return StandardEntryType.Book; | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
return StandardEntryType.Misc; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Gets all the authors, separated with the word "and" | ||||||||||||||||
// For some reason the author field sometimes has extra numbers and letters. | ||||||||||||||||
private String getAuthor(Node itemElement) { | ||||||||||||||||
StringJoiner stringJoiner = new StringJoiner(" and "); | ||||||||||||||||
for (int i = 1; i < itemElement.getChildNodes().getLength(); i += 2) { | ||||||||||||||||
String next = removeNumbers(itemElement.getChildNodes().item(i).getTextContent()).replaceAll("\\s+", " "); | ||||||||||||||||
next = next.replace("\n", ""); | ||||||||||||||||
if (next.isBlank()) { | ||||||||||||||||
continue; | ||||||||||||||||
} | ||||||||||||||||
stringJoiner.add(next); | ||||||||||||||||
} | ||||||||||||||||
return (stringJoiner.toString().substring(0, stringJoiner.length())).trim().replaceAll("\\s+", " "); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Remove numbers from a string and everything after the number, (helps with the author field). | ||||||||||||||||
private String removeNumbers(String string) { | ||||||||||||||||
for (int i = 0; i < string.length(); i++) { | ||||||||||||||||
if (Character.isDigit(string.charAt(i))) { | ||||||||||||||||
return string.substring(0, i); | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
return string; | ||||||||||||||||
Comment on lines
+200
to
+205
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this suggestion OK for you or do you see any issues? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, some of the Author nodes are a bit weird. Sometimes they contain a string of numbers and a dash after the name and then repeat the name again for no apparent reason e.g. (Patrick Bonnel becomes Patrick Bonnel 0766-05442 Patrick). So to solve this I simply removed everything after the first number. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is OK. You can even put that as a JavaDoc comment and as a test case. What I meant: Your lines 155 to 160 can be done with a one-line RegEx. Using Here's the private String removeNumbers(String string) {
return string.replaceFirst("\\d.*", "");
} In the regex:
The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have a second ChatGPT suggestion, but I don't know about performance gains. I tend to keep the above suggestion and optimize if there are performance issues import java.util.regex.*;
private String removeNumbers(String string) {
Matcher m = Pattern.compile("^[^\\d]*").matcher(string);
if (m.find()) {
return m.group(0);
}
return string;
} Given your context, the method you've provided returns the portion of the string before the first number. Using a regular expression, we can accomplish the same task more concisely. Here's a refactored version of the import java.util.regex.*;
private String removeNumbers(String string) {
Matcher m = Pattern.compile("^[^\\d]*").matcher(string);
if (m.find()) {
return m.group(0);
}
return string;
} The regular expression
The method works by matching as many non-digit characters as possible from the beginning of the string until it encounters a digit (or the end of the string). If a match is found, it returns that match; otherwise, it simply returns the original string. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The second option is good, if |
||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// In the XML file the publishers node often lists multiple publisher e.g. | ||||||||||||||||
// <publisher origin="HAL CCSD">HAL CCSD</publisher> | ||||||||||||||||
// <publisher origin="Elsevier">Elsevier</publisher> | ||||||||||||||||
// Therefore this function simply gets all of them. | ||||||||||||||||
private String getPublishers(Node itemElement) { | ||||||||||||||||
u7492883 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||
if (itemElement == null) { | ||||||||||||||||
return ""; | ||||||||||||||||
} | ||||||||||||||||
StringJoiner stringJoiner = new StringJoiner(", "); | ||||||||||||||||
for (int i = 0; i < itemElement.getChildNodes().getLength(); i++) { | ||||||||||||||||
if (itemElement.getChildNodes().item(i).getTextContent().isBlank()) { | ||||||||||||||||
continue; | ||||||||||||||||
} | ||||||||||||||||
stringJoiner.add(itemElement.getChildNodes().item(i).getTextContent().trim()); | ||||||||||||||||
} | ||||||||||||||||
return stringJoiner.toString(); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
private String getJournal(NodeList list) { | ||||||||||||||||
// If there is no journal, return an empty string. | ||||||||||||||||
if (list.getLength() == 0) { | ||||||||||||||||
return ""; | ||||||||||||||||
} | ||||||||||||||||
String reference = list.item(list.getLength() - 1).getTextContent(); | ||||||||||||||||
for (int i = 0; i < reference.length(); i++) { | ||||||||||||||||
if (reference.charAt(i) == ',') { | ||||||||||||||||
return reference.substring(0, i); | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
return ""; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
@Override | ||||||||||||||||
public void doPostCleanup(BibEntry entry) { | ||||||||||||||||
IdBasedParserFetcher.super.doPostCleanup(entry); | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The interface has a default method, so if you don't do anything extra, you can remove the method. |
||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/**
 * Looks up an entry by identifier by delegating, unchanged, to the default implementation
 * inherited from {@code IdBasedParserFetcher}.
 *
 * @param identifier the identifier to search for
 * @return whatever the default implementation returns
 * @throws FetcherException as declared by the default implementation
 */
@Override | ||||||||||||||||
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException { | ||||||||||||||||
return IdBasedParserFetcher.super.performSearchById(identifier); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/**
 * Returns the fixed name of this fetcher.
 *
 * @return the string {@code "ISIDORE"}
 */
@Override | ||||||||||||||||
public String getName() { | ||||||||||||||||
return "ISIDORE"; | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/**
 * Returns the help page by delegating, unchanged, to the default implementation
 * inherited from {@code IdBasedParserFetcher}.
 *
 * @return whatever the default implementation returns
 */
@Override | ||||||||||||||||
public Optional<HelpFile> getHelpPage() { | ||||||||||||||||
return IdBasedParserFetcher.super.getHelpPage(); | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
/**
 * Returns the download object for the given URL by delegating, unchanged, to the default
 * implementation inherited from {@code IdBasedParserFetcher}.
 *
 * @param url the URL to download from
 * @return whatever the default implementation returns
 */
@Override | ||||||||||||||||
public URLDownload getUrlDownload(URL url) { | ||||||||||||||||
return IdBasedParserFetcher.super.getUrlDownload(url); | ||||||||||||||||
} | ||||||||||||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.util.Optional; | ||
|
||
import org.jabref.logic.importer.FetcherException; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
import org.jabref.testutils.category.FetcherTest; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
@FetcherTest | ||
public class ISIDOREFetcherTest { | ||
|
||
private ISIDOREFetcher fetcher; | ||
|
||
@BeforeEach | ||
public void setup() { | ||
this.fetcher = new ISIDOREFetcher(); | ||
} | ||
|
||
@Test | ||
public void checkArticle() throws FetcherException { | ||
BibEntry expected = new BibEntry(StandardEntryType.Article) | ||
.withField(StandardField.TITLE, "Investigating day-to-day variability of transit usage on a multimonth scale with smart card data. A case study in Lyon") | ||
.withField(StandardField.AUTHOR, "Oscar Egu and Patrick Bonnel") | ||
.withField(StandardField.YEAR, "2020") | ||
.withField(StandardField.JOURNAL, "Travel Behaviour and Society") | ||
.withField(StandardField.PUBLISHER, "HAL CCSD, Elsevier") | ||
.withField(StandardField.DOI, "10.1016/j.tbs.2019.12.003") | ||
.withField(StandardField.URL, "https://isidore.science/document/10670/1.hrzlqd"); | ||
|
||
Optional<BibEntry> actual = fetcher.performSearchById("https://isidore.science/document/10670/1.hrzlqd"); | ||
|
||
assertEquals(Optional.of(expected), actual); | ||
} | ||
|
||
@Test | ||
public void checkArticle2() throws FetcherException { | ||
BibEntry expected = new BibEntry(StandardEntryType.Article) | ||
.withField(StandardField.TITLE, " Anthony B. Atkinson, Inequality – What Can Be Done ? Cambridge (Mass.) Harvard University Press, 2015, XI-384 p. ") | ||
.withField(StandardField.AUTHOR, "Benoît Rapoport") | ||
.withField(StandardField.YEAR, "2016") | ||
.withField(StandardField.JOURNAL, "Population (édition française)") | ||
.withField(StandardField.PUBLISHER, "HAL CCSD, INED - Institut national d’études démographiques") | ||
.withField(StandardField.DOI, "10.3917/popu.1601.0153") | ||
.withField(StandardField.URL, "https://isidore.science/document/10670/1.d2vlam"); | ||
|
||
Optional<BibEntry> actual = fetcher.performSearchById("d2vlam"); | ||
|
||
assertEquals(Optional.of(expected), actual); | ||
} | ||
|
||
@Test | ||
public void checkThesis() throws FetcherException { | ||
BibEntry expected = new BibEntry(StandardEntryType.Thesis) | ||
.withField(StandardField.TITLE, "Mapping English L2 errors : an integrated system and textual approach") | ||
.withField(StandardField.AUTHOR, "Clive Hamilton") | ||
.withField(StandardField.YEAR, "2015") | ||
.withField(StandardField.URL, "https://isidore.science/document/10670/1.m05oth"); | ||
|
||
Optional<BibEntry> actual = fetcher.performSearchById("1.m05oth"); | ||
|
||
assertEquals(Optional.of(expected), actual); | ||
} | ||
|
||
@Test | ||
public void checkArticle3() throws FetcherException { | ||
BibEntry expected = new BibEntry(StandardEntryType.Article) | ||
.withField(StandardField.TITLE, "Salvage Lymph Node Dissection for Nodal Recurrent Prostate Cancer: A Systematic Review.") | ||
.withField(StandardField.AUTHOR, "G. Ploussard and G. Gandaglia and H. Borgmann and P. de Visschere and I. Heidegger and A. Kretschmer and R. Mathieu and C. Surcel and D. Tilki and I. Tsaur and M. Valerio and R. van den Bergh and P. Ost and A. Briganti") | ||
.withField(StandardField.YEAR, "2019") | ||
.withField(StandardField.JOURNAL, "European urology") | ||
.withField(StandardField.DOI, "10.1016/j.eururo.2018.10.041") | ||
.withField(StandardField.URL, "https://isidore.science/document/10670/1.zm7q2x"); | ||
|
||
Optional<BibEntry> actual = fetcher.performSearchById("https://isidore.science/document/10670/1.zm7q2x"); | ||
|
||
assertEquals(Optional.of(expected), actual); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should also implement
PagedSearchBasedParserFetcher
to allow searching for a term. That makes more sense as the vast majority of users won't have an ISIDORE reference id.