Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve arXiv fetcher #6113

Merged
merged 2 commits into from
Mar 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Changed

- We improved the arXiv fetcher. It should now find entries more reliably and no longer includes the version (e.g., `v1`) in the `eprint` field. [forum#1941](https://discourse.jabref.org/t/remove-version-in-arxiv-import/1941)
- We moved the group search bar and the button "New group" from bottom to top position to make it more prominent. [#6112](https://github.com/JabRef/jabref/pull/6112)


### Fixed

- We fixed an issue where opening a library from the recent libraries menu was not possible. [#5939](https://github.com/JabRef/jabref/issues/5939)
Expand Down
25 changes: 4 additions & 21 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.xml.parsers.DocumentBuilder;
Expand Down Expand Up @@ -59,8 +57,6 @@ public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetche
private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class);

private static final String API_URL = "https://export.arxiv.org/api/query";
private static final String ARXIV_URL_PREFIX_FOR_ID = "(https?://arxiv.org/abs/)";
private static final Pattern URL_PATTERN = Pattern.compile(ARXIV_URL_PREFIX_FOR_ID);

private final ImportFormatPreferences importFormatPreferences;

Expand Down Expand Up @@ -106,7 +102,7 @@ private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherEx

private Optional<ArXivEntry> searchForEntryById(String id) throws FetcherException {
Optional<ArXivIdentifier> identifier = ArXivIdentifier.parse(id);
if (!identifier.isPresent()) {
if (identifier.isEmpty()) {
return Optional.empty();
}

Expand Down Expand Up @@ -263,10 +259,8 @@ public List<BibEntry> performSearch(String query) throws FetcherException {

@Override
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
String cleanedIdentifier = identifier.replaceAll(" ", "");
cleanedIdentifier = ArXivEntry.createIdString(cleanedIdentifier);

return searchForEntryById(cleanedIdentifier).map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
return searchForEntryById(identifier)
.map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
}

@Override
Expand Down Expand Up @@ -372,18 +366,7 @@ public Optional<URL> getPdfUrl() {
* Returns the arXiv identifier
*/
public Optional<String> getIdString() {
return urlAbstractPage.map(ArXivEntry::createIdString);
}

public static String createIdString(String id) {
Matcher matcher = URL_PATTERN.matcher(id);
if (matcher.find()) {
// Remove leading http(s)://arxiv.org/abs/ from abstract url to get arXiv ID
return id.substring(matcher.group(1).length());
} else {
return id;
}

return urlAbstractPage.flatMap(ArXivIdentifier::parse).map(ArXivIdentifier::getNormalizedWithoutVersion);
}

public Optional<ArXivIdentifier> getId() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,49 +9,59 @@

import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.strings.StringUtil;

/**
* Identifier for the arXiv. See https://arxiv.org/help/arxiv_identifier
*/
public class ArXivIdentifier implements Identifier {

private static final String ARXIV_PREFIX = "http(s)?://arxiv.org/(abs|pdf)/|arxiv|arXiv";
private final String identifier;
private final String classification;
private final String version;

ArXivIdentifier(String identifier) {
this(identifier, "");
this(identifier, "", "");
}

ArXivIdentifier(String identifier, String classification) {
this(identifier, "", classification);
}

ArXivIdentifier(String identifier, String version, String classification) {
this.identifier = identifier.trim();
this.version = version.trim();
this.classification = classification.trim();
}

public static Optional<ArXivIdentifier> parse(String value) {
Pattern identifierPattern = Pattern.compile("(arxiv|arXiv)?\\s?:?\\s?(?<id>\\d{4}.\\d{4,5}(v\\d+)?)\\s?(\\[(?<classification>\\S+)\\])?");
Matcher identifierMatcher = identifierPattern.matcher(value);
String identifier = value.replaceAll(" ", "");
Pattern identifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>\\d{4}.\\d{4,5})(v(?<version>\\d+))?\\s?(\\[(?<classification>\\S+)\\])?");
Matcher identifierMatcher = identifierPattern.matcher(identifier);
if (identifierMatcher.matches()) {
String id = identifierMatcher.group("id");
String classification = identifierMatcher.group("classification");
if (classification == null) {
classification = "";
}
return Optional.of(new ArXivIdentifier(id, classification));
String version = identifierMatcher.group("version");
if (version == null) {
version = "";
}
return Optional.of(new ArXivIdentifier(id, version, classification));
}

Pattern oldIdentifierPattern = Pattern.compile("(arxiv|arXiv)?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})");
Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(value);
Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?<version>\\d+))?");
Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier);
if (oldIdentifierMatcher.matches()) {
String id = oldIdentifierMatcher.group("id");
String classification = oldIdentifierMatcher.group("classification");
return Optional.of(new ArXivIdentifier(id, classification));
}

Pattern urlPattern = Pattern.compile("(http://arxiv.org/abs/)(?<id>\\S+)");
Matcher urlMatcher = urlPattern.matcher(value);
if (urlMatcher.matches()) {
String id = urlMatcher.group("id");
return Optional.of(new ArXivIdentifier(id));
String version = oldIdentifierMatcher.group("version");
if (version == null) {
version = "";
}
return Optional.of(new ArXivIdentifier(id, version, classification));
}

return Optional.empty();
Expand Down Expand Up @@ -99,6 +109,14 @@ public Field getDefaultField() {

/**
 * Returns the normalized form of this identifier.
 * <p>
 * When {@code StringUtil.isNotBlank(version)} holds, the result is the identifier
 * followed by {@code "v"} and the version; otherwise it is the identifier alone.
 *
 * @return the identifier, with a {@code v<version>} suffix if a version is set
 */
@Override
public String getNormalized() {
    boolean hasVersion = StringUtil.isNotBlank(version);
    return hasVersion ? identifier + "v" + version : identifier;
}

/**
 * Returns the stored identifier as-is, without appending any version suffix.
 *
 * @return the value of the {@code identifier} field
 */
public String getNormalizedWithoutVersion() {
    return this.identifier;
}

Expand Down
60 changes: 30 additions & 30 deletions src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
import static org.mockito.Mockito.when;

@FetcherTest
public class ArXivTest {
class ArXivTest {

private ArXiv finder;
private BibEntry entry;
private BibEntry sliceTheoremPaper;

@BeforeEach
public void setUp() {
void setUp() {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class);
when(importFormatPreferences.getKeywordSeparator()).thenReturn(',');
finder = new ArXiv(importFormatPreferences);
Expand All @@ -41,107 +41,107 @@ public void setUp() {
sliceTheoremPaper.setField(StandardField.TITLE, "Slice theorem for Fréchet group actions and covariant symplectic field theory");
sliceTheoremPaper.setField(StandardField.DATE, "2014-05-09");
sliceTheoremPaper.setField(StandardField.ABSTRACT, "A general slice theorem for the action of a Fr\\'echet Lie group on a Fr\\'echet manifolds is established. The Nash-Moser theorem provides the fundamental tool to generalize the result of Palais to this infinite-dimensional setting. The presented slice theorem is illustrated by its application to gauge theories: the action of the gauge transformation group admits smooth slices at every point and thus the gauge orbit space is stratified by Fr\\'echet manifolds. Furthermore, a covariant and symplectic formulation of classical field theory is proposed and extensively discussed. At the root of this novel framework is the incorporation of field degrees of freedom F and spacetime M into the product manifold F * M. The induced bigrading of differential forms is used in order to carry over the usual symplectic theory to this new setting. The examples of the Klein-Gordon field and general Yang-Mills theory illustrate that the presented approach conveniently handles the occurring symmetries.");
sliceTheoremPaper.setField(StandardField.EPRINT, "1405.2249v1");
sliceTheoremPaper.setField(StandardField.EPRINT, "1405.2249");
sliceTheoremPaper.setField(StandardField.FILE, ":http\\://arxiv.org/pdf/1405.2249v1:PDF");
sliceTheoremPaper.setField(StandardField.EPRINTTYPE, "arXiv");
sliceTheoremPaper.setField(StandardField.EPRINTCLASS, "math-ph");
sliceTheoremPaper.setField(StandardField.KEYWORDS, "math-ph, math.DG, math.MP, math.SG, 58B99, 58Z05, 58B25, 22E65, 58D19, 53D20, 53D42");
}

@Test
public void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException {
void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void findFullTextRejectsNullParameter() {
void findFullTextRejectsNullParameter() {
assertThrows(NullPointerException.class, () -> finder.findFullText(null));
}

@Test
public void findFullTextByDOI() throws IOException {
void findFullTextByDOI() throws IOException {
entry.setField(StandardField.DOI, "10.1529/biophysj.104.047340");
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByEprint() throws IOException {
void findFullTextByEprint() throws IOException {
entry.setField(StandardField.EPRINT, "1603.06570");
assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByEprintWithPrefix() throws IOException {
void findFullTextByEprintWithPrefix() throws IOException {
entry.setField(StandardField.EPRINT, "arXiv:1603.06570");
assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByEprintWithUnknownDOI() throws IOException {
void findFullTextByEprintWithUnknownDOI() throws IOException {
entry.setField(StandardField.DOI, "10.1529/unknown");
entry.setField(StandardField.EPRINT, "1603.06570");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByTitle() throws IOException {
void findFullTextByTitle() throws IOException {
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void findFullTextByTitleAndPartOfAuthor() throws IOException {
void findFullTextByTitleAndPartOfAuthor() throws IOException {
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");
entry.setField(StandardField.AUTHOR, "Weeks and Lucks");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void notFindFullTextByUnknownDOI() throws IOException {
void notFindFullTextByUnknownDOI() throws IOException {
entry.setField(StandardField.DOI, "10.1529/unknown");
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void notFindFullTextByUnknownId() throws IOException {
void notFindFullTextByUnknownId() throws IOException {
entry.setField(StandardField.EPRINT, "1234.12345");
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void findFullTextByDOINotAvailableInCatalog() throws IOException {
void findFullTextByDOINotAvailableInCatalog() throws IOException {
entry.setField(StandardField.DOI, "10.1016/0370-2693(77)90015-6");
entry.setField(StandardField.TITLE, "Superspace formulation of supergravity");

assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void searchEntryByPartOfTitle() throws Exception {
void searchEntryByPartOfTitle() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
finder.performSearch("ti:\"slice theorem for Frechet\""));
}

@Test
public void searchEntryByPartOfTitleWithAcuteAccent() throws Exception {
void searchEntryByPartOfTitleWithAcuteAccent() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
finder.performSearch("ti:\"slice theorem for Fréchet\""));
}

@Test
public void searchEntryByOldId() throws Exception {
void searchEntryByOldId() throws Exception {
BibEntry expected = new BibEntry();
expected.setType(StandardEntryType.Article);
expected.setField(StandardField.AUTHOR, "H1 Collaboration");
expected.setField(StandardField.TITLE, "Multi-Electron Production at High Transverse Momenta in ep Collisions at HERA");
expected.setField(StandardField.DATE, "2003-07-07");
expected.setField(StandardField.ABSTRACT, "Multi-electron production is studied at high electron transverse momentum in positron- and electron-proton collisions using the H1 detector at HERA. The data correspond to an integrated luminosity of 115 pb-1. Di-electron and tri-electron event yields are measured. Cross sections are derived in a restricted phase space region dominated by photon-photon collisions. In general good agreement is found with the Standard Model predictions. However, for electron pair invariant masses above 100 GeV, three di-electron events and three tri-electron events are observed, compared to Standard Model expectations of 0.30 \\pm 0.04 and 0.23 \\pm 0.04, respectively.");
expected.setField(StandardField.EPRINT, "hep-ex/0307015v1");
expected.setField(StandardField.EPRINT, "hep-ex/0307015");
expected.setField(StandardField.FILE, ":http\\://arxiv.org/pdf/hep-ex/0307015v1:PDF");
expected.setField(StandardField.EPRINTTYPE, "arXiv");
expected.setField(StandardField.EPRINTCLASS, "hep-ex");
Expand All @@ -153,61 +153,61 @@ public void searchEntryByOldId() throws Exception {
}

@Test
public void searchEntryByIdWith4DigitsAndVersion() throws Exception {
void searchEntryByIdWith4DigitsAndVersion() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("1405.2249v1"));
}

@Test
public void searchEntryByIdWith4Digits() throws Exception {
void searchEntryByIdWith4Digits() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("1405.2249"));
}

@Test
public void searchEntryByIdWith4DigitsAndPrefix() throws Exception {
void searchEntryByIdWith4DigitsAndPrefix() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("arXiv:1405.2249"));
}

@Test
public void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception {
void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("arXiv : 1405. 2249"));
}

@Test
public void searchEntryByIdWith5Digits() throws Exception {
void searchEntryByIdWith5Digits() throws Exception {
assertEquals(Optional.of(
"An Optimal Convergence Theorem for Mean Curvature Flow of Arbitrary Codimension in Hyperbolic Spaces"),
finder.performSearchById("1503.06747").flatMap(entry -> entry.getField(StandardField.TITLE)));
}

@Test
public void searchWithMalformedIdThrowsException() throws Exception {
void searchWithMalformedIdThrowsException() throws Exception {
assertThrows(FetcherException.class, () -> finder.performSearchById("123412345"));
}

@Test
public void searchIdentifierForSlicePaper() throws Exception {
void searchIdentifierForSlicePaper() throws Exception {
sliceTheoremPaper.clearField(StandardField.EPRINT);

assertEquals(ArXivIdentifier.parse("1405.2249v1"), finder.findIdentifier(sliceTheoremPaper));
assertEquals(ArXivIdentifier.parse("1405.2249"), finder.findIdentifier(sliceTheoremPaper));
}

@Test
public void searchEmptyId() throws Exception {
void searchEmptyId() throws Exception {
assertEquals(Optional.empty(), finder.performSearchById(""));
}

@Test
public void searchWithHttpUrl() throws Exception {
void searchWithHttpUrl() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("http://arxiv.org/abs/1405.2249"));
}

@Test
public void searchWithHttpsUrl() throws Exception {
void searchWithHttpsUrl() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("https://arxiv.org/abs/1405.2249"));
}

@Test
public void searchWithHttpsUrlNotTrimmed() throws Exception {
void searchWithHttpsUrlNotTrimmed() throws Exception {
assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("https : // arxiv . org / abs / 1405 . 2249 "));
}
}
Loading