-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5206 from JabRef/bibtexextractor
Bibtexextractor
- Loading branch information
Showing
10 changed files
with
305 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -210,3 +210,4 @@ Yang Zongze | |
Yara Grassi Gouffon | ||
Yifan Peng | ||
Zhang Liang | ||
Nikita Borovikov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
170 changes: 170 additions & 0 deletions
170
src/main/java/org/jabref/gui/bibtexextractor/BibtexExtractor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
package org.jabref.gui.bibtexextractor; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Calendar; | ||
import java.util.List; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.types.EntryType; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
|
||
public class BibtexExtractor { | ||
|
||
private static final String AUTHOR_TAG = "[author_tag]"; | ||
private static final String URL_TAG = "[url_tag]"; | ||
private static final String YEAR_TAG = "[year_tag]"; | ||
private static final String PAGES_TAG = "[pages_tag]"; | ||
|
||
private static final String INITIALS_GROUP = "INITIALS"; | ||
private static final String LASTNAME_GROUP = "LASTNAME"; | ||
|
||
private static final Pattern URL_PATTERN = Pattern.compile( | ||
"(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)" + | ||
"(([\\w\\-]+\\.)+?([\\w\\-.~]+\\/?)*" + | ||
"[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)", | ||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); | ||
|
||
private static final Pattern YEAR_PATTERN = Pattern.compile( | ||
"\\d{4}", | ||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); | ||
|
||
private static final Pattern AUTHOR_PATTERN = Pattern.compile( | ||
"(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+),?\\s(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})" + | ||
"\\s*(and|,|\\.)*", | ||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); | ||
|
||
private static final Pattern AUTHOR_PATTERN_2 = Pattern.compile( | ||
"(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+)" + | ||
"\\s*(and|,|\\.)*", | ||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); | ||
|
||
private static final Pattern PAGES_PATTERN = Pattern.compile( | ||
"(p.)?\\s?\\d+(-\\d+)?", | ||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL); | ||
|
||
private final List<String> urls = new ArrayList<>(); | ||
private final List<String> authors = new ArrayList<>(); | ||
private String year = ""; | ||
private String pages = ""; | ||
private String title = ""; | ||
private boolean isArticle = true; | ||
private String journalOrPublisher = ""; | ||
|
||
public BibEntry extract(String input) { | ||
String inputWithoutUrls = findUrls(input); | ||
String inputWithoutAuthors = findAuthors(inputWithoutUrls); | ||
String inputWithoutYear = findYear(inputWithoutAuthors); | ||
String inputWithoutPages = findPages(inputWithoutYear); | ||
String nonParsed = findParts(inputWithoutPages); | ||
return generateEntity(nonParsed); | ||
} | ||
|
||
private BibEntry generateEntity(String input) { | ||
EntryType type = isArticle ? StandardEntryType.Article : StandardEntryType.Book; | ||
BibEntry extractedEntity = new BibEntry(type); | ||
extractedEntity.setField(StandardField.AUTHOR, String.join(" and ", authors)); | ||
extractedEntity.setField(StandardField.URL, String.join(", ", urls)); | ||
extractedEntity.setField(StandardField.YEAR, year); | ||
extractedEntity.setField(StandardField.PAGES, pages); | ||
extractedEntity.setField(StandardField.TITLE, title); | ||
if (isArticle) { | ||
extractedEntity.setField(StandardField.JOURNAL, journalOrPublisher); | ||
} else { | ||
extractedEntity.setField(StandardField.PUBLISHER, journalOrPublisher); | ||
} | ||
extractedEntity.setField(StandardField.COMMENT, input); | ||
return extractedEntity; | ||
} | ||
|
||
private String findUrls(String input) { | ||
Matcher matcher = URL_PATTERN.matcher(input); | ||
while (matcher.find()) { | ||
urls.add(input.substring(matcher.start(1), matcher.end())); | ||
} | ||
return fixSpaces(matcher.replaceAll(URL_TAG)); | ||
} | ||
|
||
private String findYear(String input) { | ||
Matcher matcher = YEAR_PATTERN.matcher(input); | ||
while (matcher.find()) { | ||
String yearCandidate = input.substring(matcher.start(), matcher.end()); | ||
int intYearCandidate = Integer.parseInt(yearCandidate); | ||
if ((intYearCandidate > 1700) && (intYearCandidate <= Calendar.getInstance().get(Calendar.YEAR))) { | ||
year = yearCandidate; | ||
return fixSpaces(input.replace(year, YEAR_TAG)); | ||
} | ||
} | ||
return input; | ||
} | ||
|
||
private String findAuthors(String input) { | ||
String currentInput = findAuthorsByPattern(input, AUTHOR_PATTERN); | ||
return findAuthorsByPattern(currentInput, AUTHOR_PATTERN_2); | ||
} | ||
|
||
private String findAuthorsByPattern(String input, Pattern pattern) { | ||
Matcher matcher = pattern.matcher(input); | ||
while (matcher.find()) { | ||
authors.add(GenerateAuthor(matcher.group(LASTNAME_GROUP), matcher.group(INITIALS_GROUP))); | ||
} | ||
return fixSpaces(matcher.replaceAll(AUTHOR_TAG)); | ||
} | ||
|
||
private String GenerateAuthor(String lastName, String initials) { | ||
return lastName + ", " + initials; | ||
} | ||
|
||
private String findPages(String input) { | ||
Matcher matcher = PAGES_PATTERN.matcher(input); | ||
if (matcher.find()) { | ||
pages = input.substring(matcher.start(), matcher.end()); | ||
} | ||
return fixSpaces(matcher.replaceFirst(PAGES_TAG)); | ||
} | ||
|
||
private String fixSpaces(String input) { | ||
return input.replaceAll("[,.!?;:]", "$0 ") | ||
.replaceAll("\\p{Lt}", " $0") | ||
.replaceAll("\\s+", " ").trim(); | ||
} | ||
|
||
private String findParts(String input) { | ||
ArrayList<String> lastParts = new ArrayList<>(); | ||
int afterAuthorsIndex = input.lastIndexOf(AUTHOR_TAG); | ||
if (afterAuthorsIndex == -1) { | ||
return input; | ||
} else { | ||
afterAuthorsIndex += AUTHOR_TAG.length(); | ||
} | ||
int delimiterIndex = input.lastIndexOf("//"); | ||
if (delimiterIndex != -1) { | ||
lastParts.add(input.substring(afterAuthorsIndex, delimiterIndex) | ||
.replace(YEAR_TAG, "") | ||
.replace(PAGES_TAG, "")); | ||
lastParts.addAll(Arrays.asList(input.substring(delimiterIndex + 2).split(",|\\."))); | ||
} else { | ||
lastParts.addAll(Arrays.asList(input.substring(afterAuthorsIndex).split(",|\\."))); | ||
} | ||
int nonDigitParts = 0; | ||
for (String part : lastParts) { | ||
if (part.matches(".*\\d.*")) { | ||
break; | ||
} | ||
nonDigitParts++; | ||
} | ||
if (nonDigitParts > 0) { | ||
title = lastParts.get(0); | ||
} | ||
if (nonDigitParts > 1) { | ||
journalOrPublisher = lastParts.get(1); | ||
} | ||
if (nonDigitParts > 2) { | ||
isArticle = false; | ||
} | ||
return fixSpaces(input); | ||
} | ||
} |
42 changes: 42 additions & 0 deletions
42
src/main/java/org/jabref/gui/bibtexextractor/BibtexExtractorViewModel.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package org.jabref.gui.bibtexextractor; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
import javafx.beans.property.SimpleStringProperty; | ||
import javafx.beans.property.StringProperty; | ||
|
||
import org.jabref.Globals; | ||
import org.jabref.model.database.BibDatabaseContext; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.types.EntryType; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
|
||
public class BibtexExtractorViewModel { | ||
|
||
private final StringProperty inputTextProperty = new SimpleStringProperty(""); | ||
private final BibDatabaseContext bibdatabaseContext; | ||
|
||
public BibtexExtractorViewModel(BibDatabaseContext bibdatabaseContext) { | ||
this.bibdatabaseContext = bibdatabaseContext; | ||
} | ||
|
||
public StringProperty inputTextProperty() { | ||
return this.inputTextProperty; | ||
} | ||
|
||
public void startExtraction() { | ||
|
||
BibtexExtractor extractor = new BibtexExtractor(); | ||
BibEntry entity = extractor.extract(inputTextProperty.getValue()); | ||
this.bibdatabaseContext.getDatabase().insertEntry(entity); | ||
trackNewEntry(StandardEntryType.Article); | ||
} | ||
|
||
private void trackNewEntry(EntryType type) { | ||
Map<String, String> properties = new HashMap<>(); | ||
properties.put("EntryType", type.getName()); | ||
|
||
Globals.getTelemetryClient().ifPresent(client -> client.trackEvent("NewEntry", properties, new HashMap<>())); | ||
} | ||
} |
19 changes: 19 additions & 0 deletions
19
src/main/java/org/jabref/gui/bibtexextractor/ExtractBibtexAction.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package org.jabref.gui.bibtexextractor; | ||
|
||
import org.jabref.gui.StateManager; | ||
import org.jabref.gui.actions.SimpleCommand; | ||
|
||
import static org.jabref.gui.actions.ActionHelper.needsDatabase; | ||
|
||
public class ExtractBibtexAction extends SimpleCommand { | ||
|
||
public ExtractBibtexAction(StateManager stateManager) { | ||
this.executable.bind(needsDatabase(stateManager)); | ||
} | ||
|
||
@Override | ||
public void execute() { | ||
ExtractBibtexDialog dlg = new ExtractBibtexDialog(); | ||
dlg.showAndWait(); | ||
} | ||
} |
14 changes: 14 additions & 0 deletions
14
src/main/java/org/jabref/gui/bibtexextractor/ExtractBibtexDialog.fxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
|
||
<?import javafx.scene.control.ButtonType?> | ||
<?import javafx.scene.control.DialogPane?> | ||
<?import javafx.scene.control.TextArea?> | ||
|
||
<!--
  Layout of the "extract BibTeX" dialog, controlled by ExtractBibtexDialog:
  a text area (fx:id "input") as content, plus an OK_DONE button (fx:id "extractButtonType",
  labelled with the resource key "Extract") and the standard CANCEL button.
-->
<DialogPane prefHeight="430.0" prefWidth="586.0" xmlns="http://javafx.com/javafx/8.0.171" | ||
xmlns:fx="http://javafx.com/fxml/1" fx:controller="org.jabref.gui.bibtexextractor.ExtractBibtexDialog"> | ||
<content> | ||
<TextArea fx:id="input" minHeight="-Infinity" prefHeight="350.0" prefWidth="586.0"/> | ||
</content> | ||
<ButtonType fx:id="extractButtonType" buttonData="OK_DONE" text="%Extract"/> | ||
<ButtonType fx:constant="CANCEL"/> | ||
</DialogPane> |
50 changes: 50 additions & 0 deletions
50
src/main/java/org/jabref/gui/bibtexextractor/ExtractBibtexDialog.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
package org.jabref.gui.bibtexextractor; | ||
|
||
import javax.inject.Inject; | ||
|
||
import javafx.fxml.FXML; | ||
import javafx.scene.control.Button; | ||
import javafx.scene.control.ButtonType; | ||
import javafx.scene.control.TextArea; | ||
import javafx.scene.control.Tooltip; | ||
|
||
import org.jabref.gui.StateManager; | ||
import org.jabref.gui.util.BaseDialog; | ||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.model.database.BibDatabaseContext; | ||
|
||
import com.airhacks.afterburner.views.ViewLoader; | ||
|
||
/** | ||
* GUI Dialog for the feature "Extract BibTeX from plain text". | ||
*/ | ||
public class ExtractBibtexDialog extends BaseDialog<Void> { | ||
|
||
private final Button buttonExtract; | ||
@FXML private TextArea input; | ||
@FXML private ButtonType extractButtonType; | ||
private BibtexExtractorViewModel viewModel; | ||
|
||
@Inject private StateManager stateManager; | ||
|
||
public ExtractBibtexDialog() { | ||
|
||
ViewLoader.view(this) | ||
.load() | ||
.setAsDialogPane(this); | ||
|
||
this.setTitle(Localization.lang("Input text to parse")); | ||
buttonExtract = (Button) getDialogPane().lookupButton(extractButtonType); | ||
buttonExtract.setTooltip(new Tooltip((Localization.lang("Starts the extraction of the BibTeX entry")))); | ||
buttonExtract.setOnAction(e -> viewModel.startExtraction()); | ||
buttonExtract.disableProperty().bind(viewModel.inputTextProperty().isEmpty()); | ||
} | ||
|
||
@FXML | ||
private void initialize() { | ||
BibDatabaseContext database = stateManager.getActiveDatabase().orElseThrow(() -> new NullPointerException("Database null")); | ||
this.viewModel = new BibtexExtractorViewModel(database); | ||
|
||
input.textProperty().bindBidirectional(viewModel.inputTextProperty()); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters