-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add option to parse new references from plain text using GROBID… (#5614)
GROBID integration using new fetcher Add the possibility to extract references from plain text using the GROBID service. GROBID is called over a custom server. See also pull request #5614 Co-Authored-By: joeyzgraggen <[email protected]> Co-Authored-By: guenesaydin <[email protected]> Co-Authored-By: obsluk00 <[email protected]> Co-Authored-By: nikodemkch <[email protected]>
- Loading branch information
1 parent
e0e837e
commit 5fa1dcf
Showing
14 changed files
with
357 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.stream.Collectors; | ||
|
||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.logic.importer.SearchBasedFetcher; | ||
import org.jabref.logic.importer.fileformat.BibtexParser; | ||
import org.jabref.logic.importer.util.GrobidService; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.util.DummyFileUpdateMonitor; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class GrobidCitationFetcher implements SearchBasedFetcher { | ||
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(GrobidCitationFetcher.class); | ||
private static final String GROBID_URL = "http://grobid.cm.in.tum.de:8070"; | ||
private ImportFormatPreferences importFormatPreferences; | ||
private GrobidService grobidService; | ||
|
||
public GrobidCitationFetcher(ImportFormatPreferences importFormatPreferences) { | ||
this.importFormatPreferences = importFormatPreferences; | ||
this.grobidService = new GrobidService(GROBID_URL); | ||
} | ||
|
||
/** | ||
* Passes request to grobid server, using consolidateCitations option to improve result. | ||
* Takes a while, since the server has to look up the entry. | ||
* @return A BibTeX-String if extraction is successful and an empty String otherwise. | ||
*/ | ||
private String parseUsingGrobid(String plainText) { | ||
try { | ||
return grobidService.processCitation(plainText, GrobidService.ConsolidateCitations.WITH_METADATA); | ||
} catch (IOException e) { | ||
LOGGER.debug("Could not process citation", e); | ||
return ""; | ||
} | ||
} | ||
|
||
private Optional<BibEntry> parseBibToBibEntry(String bibtexString) { | ||
try { | ||
return BibtexParser.singleFromString(bibtexString, | ||
importFormatPreferences, new DummyFileUpdateMonitor()); | ||
} catch (ParseException e) { | ||
return Optional.empty(); | ||
} | ||
} | ||
|
||
@Override | ||
public List<BibEntry> performSearch(String query) { | ||
List<String> plainReferences = Arrays.stream( query.split( "\\r\\r+|\\n\\n+|\\r\\n(\\r\\n)+" ) ) | ||
.map(String::trim) | ||
.filter(str -> !str.isBlank()) | ||
.collect(Collectors.toCollection(ArrayList::new)); | ||
if (plainReferences.isEmpty()) { | ||
return Collections.emptyList(); | ||
} else { | ||
return plainReferences.stream() | ||
.map(reference -> parseBibToBibEntry(parseUsingGrobid(reference))) | ||
.flatMap(Optional::stream) | ||
.collect(Collectors.toList()); | ||
} | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "GROBID"; | ||
} | ||
} |
57 changes: 57 additions & 0 deletions
57
src/main/java/org/jabref/logic/importer/util/GrobidService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package org.jabref.logic.importer.util; | ||
|
||
import java.io.IOException; | ||
import java.net.URLEncoder; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
import org.jabref.logic.net.URLDownload; | ||
|
||
/** | ||
* Implements an API to a GROBID server, as described at | ||
* https://grobid.readthedocs.io/en/latest/Grobid-service/#grobid-web-services | ||
* <p> | ||
* Note: Currently a custom GROBID server is used... | ||
* https://github.com/NikodemKch/grobid | ||
* <p> | ||
* The methods are structured to match the GROBID server api. | ||
* Each method corresponds to a GROBID service request. Only the ones already used are already implemented. | ||
*/ | ||
public class GrobidService { | ||
|
||
public enum ConsolidateCitations { | ||
NO(0), WITH_METADATA(1), WITH_DOI_ONLY(2); | ||
private int code; | ||
|
||
ConsolidateCitations(int code) { | ||
this.code = code; | ||
} | ||
|
||
public int getCode() { | ||
return this.code; | ||
} | ||
} | ||
|
||
String grobidServerURL; | ||
|
||
public GrobidService(String grobidServerURL) { | ||
this.grobidServerURL = grobidServerURL; | ||
} | ||
|
||
/** | ||
* @return A BibTeX-String if extraction is successful and an IOException otherwise. | ||
*/ | ||
public String processCitation(String rawCitation, ConsolidateCitations consolidateCitations) throws IOException { | ||
rawCitation = URLEncoder.encode(rawCitation, StandardCharsets.UTF_8); | ||
URLDownload urlDownload = new URLDownload(grobidServerURL | ||
+ "/api/processCitation"); | ||
//urlDownload.addHeader("Accept", "application/x-bibtex"); //TODO: Uncomment as soon as the default GROBID server is used. | ||
urlDownload.setPostData("citations=" + rawCitation + "&consolidateCitations=" + consolidateCitations); | ||
String httpResponse = urlDownload.asString(); | ||
|
||
if (httpResponse == null || httpResponse.equals("@misc{-1,\n\n}\n")) { //This filters empty BibTeX entries | ||
throw new IOException("The GROBID server response does not contain anything."); | ||
} | ||
|
||
return httpResponse; | ||
} | ||
} |
Oops, something went wrong.