Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bibtexextractor #5206

Merged
merged 18 commits into from
Aug 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,4 @@ Yang Zongze
Yara Grassi Gouffon
Yifan Peng
Zhang Liang
Nikita Borovikov
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- The Medline fetcher now normalizes the author names according to the BibTeX-Standard [#4345](https://github.com/JabRef/jabref/issues/4345)
- We added an option to the Linked File Viewer to rename the attached file of an entry directly in JabRef. [#4844](https://github.com/JabRef/jabref/issues/4844)
- We added an option in the preference dialog box that allows users to enable helpful tooltips. [#3599](https://github.com/JabRef/jabref/issues/3599)
- We added a tool for extracting BibTeX entries from plain text. [#5206](https://github.com/JabRef/jabref/pull/5206)
- We moved the dropdown menu for selecting the push-application from the toolbar into the external application preferences. [#674](https://github.com/JabRef/jabref/issues/674)
- We removed the alphabetical ordering of the custom tabs and updated the error message when trying to create a general field with a name containing an illegal character. [#5019](https://github.com/JabRef/jabref/issues/5019)
- We added a context menu to the bib(la)tex-source-editor to copy'n'paste. [#5007](https://github.com/JabRef/jabref/pull/5007)
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/gui/JabRefFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.jabref.gui.actions.SimpleCommand;
import org.jabref.gui.actions.StandardActions;
import org.jabref.gui.auximport.NewSubLibraryAction;
import org.jabref.gui.bibtexextractor.ExtractBibtexAction;
import org.jabref.gui.bibtexkeypattern.BibtexKeyPatternAction;
import org.jabref.gui.contentselector.ManageContentSelectorAction;
import org.jabref.gui.copyfiles.CopyFilesAction;
Expand Down Expand Up @@ -772,6 +773,7 @@ private MenuBar createMenu() {
factory.createMenuItem(StandardActions.FIND_UNLINKED_FILES, new FindUnlinkedFilesAction(this, stateManager)),
factory.createMenuItem(StandardActions.WRITE_XMP, new OldDatabaseCommandWrapper(Actions.WRITE_XMP, this, stateManager)),
factory.createMenuItem(StandardActions.COPY_LINKED_FILES, new CopyFilesAction(stateManager, this.getDialogService())),
factory.createMenuItem(StandardActions.EXTRACT_BIBTEX, new ExtractBibtexAction(stateManager)),

new SeparatorMenuItem(),

Expand Down
1 change: 1 addition & 0 deletions src/main/java/org/jabref/gui/actions/StandardActions.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ public enum StandardActions implements Action {
DOWNLOAD_FULL_TEXT(Localization.lang("Search full text documents online"), IconTheme.JabRefIcons.FILE_SEARCH, KeyBinding.DOWNLOAD_FULL_TEXT),
CLEANUP_ENTRIES(Localization.lang("Cleanup entries"), IconTheme.JabRefIcons.CLEANUP_ENTRIES, KeyBinding.CLEANUP),
SET_FILE_LINKS(Localization.lang("Automatically set file links"), KeyBinding.AUTOMATICALLY_LINK_FILES),
EXTRACT_BIBTEX(Localization.lang("Extract BibTeX from plain text")),

HELP(Localization.lang("Online help"), IconTheme.JabRefIcons.HELP, KeyBinding.HELP),
HELP_KEY_PATTERNS(Localization.lang("Help on key patterns"), IconTheme.JabRefIcons.HELP, KeyBinding.HELP),
Expand Down
170 changes: 170 additions & 0 deletions src/main/java/org/jabref/gui/bibtexextractor/BibtexExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package org.jabref.gui.bibtexextractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

/**
 * Heuristically extracts a single {@link BibEntry} from a plain-text citation.
 *
 * <p>The input is processed in stages. Each stage records what it recognised (URLs, authors,
 * year, pages) and replaces the recognised text with a placeholder tag, so that later stages
 * do not match the same text again. Whatever text remains after all stages is stored in the
 * entry's comment field.
 *
 * <p>Intermediate results are kept in instance fields; the state is reset at the beginning of
 * every {@link #extract(String)} call, so an instance may be reused sequentially. No
 * synchronisation is performed, so an instance should not be shared between threads.
 */
public class BibtexExtractor {

    // Placeholders substituted for recognised fragments of the input.
    private static final String AUTHOR_TAG = "[author_tag]";
    private static final String URL_TAG = "[url_tag]";
    private static final String YEAR_TAG = "[year_tag]";
    private static final String PAGES_TAG = "[pages_tag]";

    // Names of the capture groups shared by both author patterns.
    private static final String INITIALS_GROUP = "INITIALS";
    private static final String LASTNAME_GROUP = "LASTNAME";

    // Group 1 starts at the scheme ("http://", "ftp://", ...) or at "www."; the optional
    // non-word character in front of it is part of the overall match but not of group 1.
    private static final Pattern URL_PATTERN = Pattern.compile(
            "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)" +
            "(([\\w\\-]+\\.)+?([\\w\\-.~]+\\/?)*" +
            "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    // Any run of four digits; plausibility is checked separately in findYear.
    private static final Pattern YEAR_PATTERN = Pattern.compile(
            "\\d{4}",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    // "Lastname, I. I. " style: last name followed by one or two dotted initials.
    // NOTE(review): combined with CASE_INSENSITIVE, \p{Lu} may also accept lower-case
    // letters on some JDK versions - confirm this is intended.
    private static final Pattern AUTHOR_PATTERN = Pattern.compile(
            "(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+),?\\s(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})" +
            "\\s*(and|,|\\.)*",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    // "I. I. Lastname" style: one or two dotted initials followed by the last name.
    private static final Pattern AUTHOR_PATTERN_2 = Pattern.compile(
            "(?<" + INITIALS_GROUP + ">(\\p{Lu}\\.\\s){1,2})(?<" + LASTNAME_GROUP + ">\\p{Lu}\\w+)" +
            "\\s*(and|,|\\.)*",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    // NOTE(review): the dot in "(p.)?" is unescaped, so it matches "p" followed by ANY
    // character, not only a literal "p." - confirm whether "(p\\.)?" was intended.
    private static final Pattern PAGES_PATTERN = Pattern.compile(
            "(p.)?\\s?\\d+(-\\d+)?",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    // Compiled once; used to detect parts that contain at least one digit.
    private static final Pattern DIGIT_PATTERN = Pattern.compile("\\d");

    private final List<String> urls = new ArrayList<>();
    private final List<String> authors = new ArrayList<>();
    private String year = "";
    private String pages = "";
    private String title = "";
    private boolean isArticle = true;
    private String journalOrPublisher = "";

    /**
     * Parses the given citation text and builds an entry from the recognised parts.
     *
     * @param input plain-text citation; must not be {@code null}
     * @return a new entry of type article or book with author, url, year, pages, title and
     *         journal/publisher set from the recognised parts, and the remaining tagged text
     *         stored in the comment field
     */
    public BibEntry extract(String input) {
        // Start from a clean slate so that results of a previous call on the same
        // instance do not leak into this one.
        resetState();

        String inputWithoutUrls = findUrls(input);
        String inputWithoutAuthors = findAuthors(inputWithoutUrls);
        String inputWithoutYear = findYear(inputWithoutAuthors);
        String inputWithoutPages = findPages(inputWithoutYear);
        String nonParsed = findParts(inputWithoutPages);
        return generateEntity(nonParsed);
    }

    /**
     * Restores all extraction results to their initial values.
     */
    private void resetState() {
        urls.clear();
        authors.clear();
        year = "";
        pages = "";
        title = "";
        isArticle = true;
        journalOrPublisher = "";
    }

    /**
     * Creates the entry from the collected results.
     *
     * @param input the text left over after extraction; stored as the comment
     */
    private BibEntry generateEntity(String input) {
        EntryType type = isArticle ? StandardEntryType.Article : StandardEntryType.Book;
        BibEntry extractedEntity = new BibEntry(type);
        extractedEntity.setField(StandardField.AUTHOR, String.join(" and ", authors));
        extractedEntity.setField(StandardField.URL, String.join(", ", urls));
        extractedEntity.setField(StandardField.YEAR, year);
        extractedEntity.setField(StandardField.PAGES, pages);
        extractedEntity.setField(StandardField.TITLE, title);
        if (isArticle) {
            extractedEntity.setField(StandardField.JOURNAL, journalOrPublisher);
        } else {
            extractedEntity.setField(StandardField.PUBLISHER, journalOrPublisher);
        }
        extractedEntity.setField(StandardField.COMMENT, input);
        return extractedEntity;
    }

    /**
     * Collects every URL in the input and replaces each match with {@code URL_TAG}.
     */
    private String findUrls(String input) {
        Matcher matcher = URL_PATTERN.matcher(input);
        while (matcher.find()) {
            // Start at group 1 to drop the optional leading non-word character.
            urls.add(input.substring(matcher.start(1), matcher.end()));
        }
        return fixSpaces(matcher.replaceAll(URL_TAG));
    }

    /**
     * Takes the first four-digit number that lies after 1700 and not after the current
     * calendar year as the publication year and replaces its occurrences with
     * {@code YEAR_TAG}. Returns the input unchanged if no such number exists.
     */
    private String findYear(String input) {
        // The current year does not change while scanning; look it up once.
        int currentYear = Calendar.getInstance().get(Calendar.YEAR);
        Matcher matcher = YEAR_PATTERN.matcher(input);
        while (matcher.find()) {
            String yearCandidate = matcher.group();
            int intYearCandidate = Integer.parseInt(yearCandidate);
            if ((intYearCandidate > 1700) && (intYearCandidate <= currentYear)) {
                year = yearCandidate;
                return fixSpaces(input.replace(year, YEAR_TAG));
            }
        }
        return input;
    }

    /**
     * Collects authors in both supported notations; the "Lastname I." form is tried first.
     */
    private String findAuthors(String input) {
        String currentInput = findAuthorsByPattern(input, AUTHOR_PATTERN);
        return findAuthorsByPattern(currentInput, AUTHOR_PATTERN_2);
    }

    /**
     * Adds every match of {@code pattern} to the author list and replaces it with
     * {@code AUTHOR_TAG}.
     */
    private String findAuthorsByPattern(String input, Pattern pattern) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            authors.add(generateAuthor(matcher.group(LASTNAME_GROUP), matcher.group(INITIALS_GROUP)));
        }
        return fixSpaces(matcher.replaceAll(AUTHOR_TAG));
    }

    /**
     * Formats an author as {@code "lastName, initials"}.
     */
    private String generateAuthor(String lastName, String initials) {
        return lastName + ", " + initials;
    }

    /**
     * Takes the first match of the pages pattern as the page (range) and replaces it with
     * {@code PAGES_TAG}.
     */
    private String findPages(String input) {
        Matcher matcher = PAGES_PATTERN.matcher(input);
        if (matcher.find()) {
            pages = matcher.group();
        }
        // replaceFirst resets the matcher itself and leaves the input untouched if nothing matches.
        return fixSpaces(matcher.replaceFirst(PAGES_TAG));
    }

    /**
     * Normalises whitespace: adds a space after punctuation and before title-case letters,
     * then collapses whitespace runs into a single space and trims the result.
     */
    private String fixSpaces(String input) {
        // NOTE(review): \p{Lt} is the (rare) title-case letter category, not upper case -
        // confirm whether \p{Lu} was meant.
        return input.replaceAll("[,.!?;:]", "$0 ")
                    .replaceAll("\\p{Lt}", " $0")
                    .replaceAll("\\s+", " ").trim();
    }

    /**
     * Derives title, journal/publisher and entry type from the text that follows the last
     * author tag.
     *
     * <p>If a {@code //} delimiter follows the authors, everything between the authors and the
     * delimiter (minus year and pages tags) is the first part and the rest is split at commas
     * and dots. Otherwise the whole remainder is split at commas and dots. Leading parts are
     * counted until the first part containing a digit: the first becomes the title, the second
     * the journal/publisher, and more than two such parts mark the entry as a book.
     *
     * @return the input with normalised spacing, or the input unchanged if it contains no
     *         author tag
     */
    private String findParts(String input) {
        int afterAuthorsIndex = input.lastIndexOf(AUTHOR_TAG);
        if (afterAuthorsIndex == -1) {
            return input;
        }
        afterAuthorsIndex += AUTHOR_TAG.length();

        List<String> lastParts = new ArrayList<>();
        int delimiterIndex = input.lastIndexOf("//");
        // Only honour a delimiter located after the authors. A "//" in front of the last
        // author tag would make substring(afterAuthorsIndex, delimiterIndex) throw
        // StringIndexOutOfBoundsException; a missing delimiter (-1) also ends up in the
        // else branch because afterAuthorsIndex is always positive here.
        if (delimiterIndex >= afterAuthorsIndex) {
            lastParts.add(input.substring(afterAuthorsIndex, delimiterIndex)
                               .replace(YEAR_TAG, "")
                               .replace(PAGES_TAG, ""));
            lastParts.addAll(Arrays.asList(input.substring(delimiterIndex + 2).split(",|\\.")));
        } else {
            lastParts.addAll(Arrays.asList(input.substring(afterAuthorsIndex).split(",|\\.")));
        }

        // Count the leading parts that contain no digit at all.
        int nonDigitParts = 0;
        for (String part : lastParts) {
            if (DIGIT_PATTERN.matcher(part).find()) {
                break;
            }
            nonDigitParts++;
        }

        if (nonDigitParts > 0) {
            title = lastParts.get(0);
        }
        if (nonDigitParts > 1) {
            journalOrPublisher = lastParts.get(1);
        }
        if (nonDigitParts > 2) {
            isArticle = false;
        }
        return fixSpaces(input);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.jabref.gui.bibtexextractor;

import java.util.HashMap;
import java.util.Map;

import javafx.beans.property.SimpleStringProperty;
import javafx.beans.property.StringProperty;

import org.jabref.Globals;
import org.jabref.model.database.BibDatabaseContext;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

/**
 * View model of the "Extract BibTeX from plain text" dialog: holds the text entered by the
 * user and inserts the entry extracted from it into the given database.
 */
public class BibtexExtractorViewModel {

    private final StringProperty inputTextProperty = new SimpleStringProperty("");
    private final BibDatabaseContext bibdatabaseContext;

    /**
     * @param bibdatabaseContext context whose database receives the extracted entry
     */
    public BibtexExtractorViewModel(BibDatabaseContext bibdatabaseContext) {
        this.bibdatabaseContext = bibdatabaseContext;
    }

    /**
     * @return the property holding the plain text to be parsed
     */
    public StringProperty inputTextProperty() {
        return this.inputTextProperty;
    }

    /**
     * Runs the extractor on the current input text, inserts the resulting entry into the
     * database and reports the creation to telemetry.
     */
    public void startExtraction() {
        BibtexExtractor extractor = new BibtexExtractor();
        BibEntry entity = extractor.extract(inputTextProperty.getValue());
        this.bibdatabaseContext.getDatabase().insertEntry(entity);
        // The extractor may produce a book as well as an article, so report the type of
        // the entry that was actually created instead of a hard-coded one.
        trackNewEntry(entity.getType());
    }

    /**
     * Sends a "NewEntry" telemetry event carrying the entry type name, if a telemetry
     * client is available.
     */
    private void trackNewEntry(EntryType type) {
        Map<String, String> properties = new HashMap<>();
        properties.put("EntryType", type.getName());

        Globals.getTelemetryClient().ifPresent(client -> client.trackEvent("NewEntry", properties, new HashMap<>()));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.jabref.gui.bibtexextractor;

import org.jabref.gui.StateManager;
import org.jabref.gui.actions.SimpleCommand;

import static org.jabref.gui.actions.ActionHelper.needsDatabase;

/**
 * Command behind the "Extract BibTeX from plain text" menu item. Its executable state is
 * bound to {@code needsDatabase(stateManager)}.
 */
public class ExtractBibtexAction extends SimpleCommand {

    /**
     * @param stateManager state manager the executable binding is derived from
     */
    public ExtractBibtexAction(StateManager stateManager) {
        this.executable.bind(needsDatabase(stateManager));
    }

    /**
     * Opens the extraction dialog and blocks until it is closed.
     */
    @Override
    public void execute() {
        new ExtractBibtexDialog().showAndWait();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>

<?import javafx.scene.control.ButtonType?>
<?import javafx.scene.control.DialogPane?>
<?import javafx.scene.control.TextArea?>

<!-- Layout of the "Extract BibTeX from plain text" dialog; the controller class is named in fx:controller. -->
<DialogPane prefHeight="430.0" prefWidth="586.0" xmlns="http://javafx.com/javafx/8.0.171"
xmlns:fx="http://javafx.com/fxml/1" fx:controller="org.jabref.gui.bibtexextractor.ExtractBibtexDialog">
<content>
<!-- Text area for the plain text to parse; bound to the controller field "input". -->
<TextArea fx:id="input" minHeight="-Infinity" prefHeight="350.0" prefWidth="586.0"/>
</content>
<!-- Confirm button; "%Extract" is a resource key looked up in the localization bundle. -->
<ButtonType fx:id="extractButtonType" buttonData="OK_DONE" text="%Extract"/>
<ButtonType fx:constant="CANCEL"/>
</DialogPane>
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.jabref.gui.bibtexextractor;

import javax.inject.Inject;

import javafx.fxml.FXML;
import javafx.scene.control.Button;
import javafx.scene.control.ButtonType;
import javafx.scene.control.TextArea;
import javafx.scene.control.Tooltip;

import org.jabref.gui.StateManager;
import org.jabref.gui.util.BaseDialog;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabaseContext;

import com.airhacks.afterburner.views.ViewLoader;

/**
 * Dialog of the feature "Extract BibTeX from plain text": shows a text area for the
 * citation text and an "Extract" button that hands the text to the view model.
 */
public class ExtractBibtexDialog extends BaseDialog<Void> {

    private final Button buttonExtract;
    @FXML private TextArea input;
    @FXML private ButtonType extractButtonType;
    private BibtexExtractorViewModel viewModel;

    @Inject private StateManager stateManager;

    /**
     * Loads the FXML view into this dialog and wires up the "Extract" button.
     */
    public ExtractBibtexDialog() {
        // The view must be loaded first: the button wiring below relies on the view model
        // that initialize() creates, which is expected to have run once load() returns.
        ViewLoader.view(this)
                  .load()
                  .setAsDialogPane(this);

        this.setTitle(Localization.lang("Input text to parse"));

        buttonExtract = (Button) getDialogPane().lookupButton(extractButtonType);
        String tooltipText = Localization.lang("Starts the extraction of the BibTeX entry");
        buttonExtract.setTooltip(new Tooltip(tooltipText));
        buttonExtract.setOnAction(event -> viewModel.startExtraction());
        // Nothing to extract while the input is empty.
        buttonExtract.disableProperty().bind(viewModel.inputTextProperty().isEmpty());
    }

    /**
     * Creates the view model for the active database and binds the text area to it.
     *
     * @throws NullPointerException if there is no active database
     */
    @FXML
    private void initialize() {
        BibDatabaseContext activeDatabase = stateManager.getActiveDatabase()
                                                        .orElseThrow(() -> new NullPointerException("Database null"));
        viewModel = new BibtexExtractorViewModel(activeDatabase);

        input.textProperty().bindBidirectional(viewModel.inputTextProperty());
    }
}
5 changes: 5 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,11 @@ Accept\ changes=Accept changes
Dismiss\ changes=Dismiss changes
The\ library\ has\ been\ modified\ by\ another\ program.=The library has been modified by another program.

Extract=Extract
Extract\ BibTeX\ from\ plain\ text=Extract BibTeX from plain text
Input\ text\ to\ parse=Input text to parse
Starts\ the\ extraction\ of\ the\ BibTeX\ entry=Starts the extraction of the BibTeX entry

Execute\ command=Execute command
Open\ File\ Browser=Open File Browser
Use\ default\ file\ browser=Use default file browser
Expand Down