From 6ceb4096037d1e68afb3f26281c019b9e7afe95d Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Thu, 18 Jul 2024 14:42:00 +0200 Subject: [PATCH 01/18] Move switch for online/offline into ExtractReferencesAction --- .../maintable/ExtractReferencesAction.java | 30 ++++++++----------- .../jabref/gui/maintable/RightClickMenu.java | 6 ++-- .../fileformat/PdfContentImporter.java | 2 ++ 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java b/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java index 392fc4dd0ad..2f84e7f816e 100644 --- a/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java +++ b/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java @@ -15,7 +15,6 @@ import org.jabref.gui.actions.SimpleCommand; import org.jabref.gui.importer.ImportEntriesDialog; import org.jabref.gui.util.BackgroundTask; -import org.jabref.gui.util.TaskExecutor; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter; import org.jabref.logic.importer.util.GrobidService; @@ -33,30 +32,30 @@ /** * SIDE EFFECT: Sets the "cites" field of the entry having the linked files * - * Mode choice A: online or offline - * Mode choice B: complete entry or single file (the latter is not implemented) + * + * + * The mode is selected by the preferences whether to use Grobid or not. * * The different modes should be implemented as sub classes. However, this was too complicated, thus we use variables at the constructor to parameterize this class. 
*/ public class ExtractReferencesAction extends SimpleCommand { private final int FILES_LIMIT = 10; - private final boolean online; private final DialogService dialogService; private final StateManager stateManager; private final PreferencesService preferencesService; private final BibEntry entry; private final LinkedFile linkedFile; - private final TaskExecutor taskExecutor; private final BibliographyFromPdfImporter bibliographyFromPdfImporter; - public ExtractReferencesAction(boolean online, - DialogService dialogService, + public ExtractReferencesAction(DialogService dialogService, StateManager stateManager, - PreferencesService preferencesService, - TaskExecutor taskExecutor) { - this(online, dialogService, stateManager, preferencesService, null, null, taskExecutor); + PreferencesService preferencesService) { + this(dialogService, stateManager, preferencesService, null, null); } /** @@ -65,20 +64,16 @@ public ExtractReferencesAction(boolean online, * @param entry the entry to handle (can be null) * @param linkedFile the linked file (can be null) */ - private ExtractReferencesAction(boolean online, - @NonNull DialogService dialogService, + private ExtractReferencesAction(@NonNull DialogService dialogService, @NonNull StateManager stateManager, @NonNull PreferencesService preferencesService, @Nullable BibEntry entry, - @Nullable LinkedFile linkedFile, - @NonNull TaskExecutor taskExecutor) { - this.online = online; + @Nullable LinkedFile linkedFile) { this.dialogService = dialogService; this.stateManager = stateManager; this.preferencesService = preferencesService; this.entry = entry; this.linkedFile = linkedFile; - this.taskExecutor = taskExecutor; bibliographyFromPdfImporter = new BibliographyFromPdfImporter(preferencesService.getCitationKeyPatternPreferences()); if (this.linkedFile == null) { @@ -98,8 +93,6 @@ public void execute() { private void extractReferences() { stateManager.getActiveDatabase().ifPresent(databaseContext -> { - assert online == 
this.preferencesService.getGrobidPreferences().isGrobidEnabled(); - List selectedEntries; if (entry == null) { selectedEntries = stateManager.getSelectedEntries(); @@ -107,6 +100,7 @@ private void extractReferences() { selectedEntries = List.of(entry); } + boolean online = this.preferencesService.getGrobidPreferences().isGrobidEnabled(); Callable parserResultCallable; if (online) { Optional> parserResultCallableOnline = getParserResultCallableOnline(databaseContext, selectedEntries); diff --git a/src/main/java/org/jabref/gui/maintable/RightClickMenu.java b/src/main/java/org/jabref/gui/maintable/RightClickMenu.java index fdd2defec32..6e0f5d9a1e1 100644 --- a/src/main/java/org/jabref/gui/maintable/RightClickMenu.java +++ b/src/main/java/org/jabref/gui/maintable/RightClickMenu.java @@ -53,8 +53,10 @@ public static ContextMenu create(BibEntryTableViewModel entry, ActionFactory factory = new ActionFactory(); ContextMenu contextMenu = new ContextMenu(); - MenuItem extractFileReferencesOnline = factory.createMenuItem(StandardActions.EXTRACT_FILE_REFERENCES_ONLINE, new ExtractReferencesAction(true, dialogService, stateManager, preferencesService, taskExecutor)); - MenuItem extractFileReferencesOffline = factory.createMenuItem(StandardActions.EXTRACT_FILE_REFERENCES_OFFLINE, new ExtractReferencesAction(false, dialogService, stateManager, preferencesService, taskExecutor)); + ExtractReferencesAction extractReferencesAction = new ExtractReferencesAction(dialogService, stateManager, preferencesService); + // Two menu items required, because of menu item display. 
The action checks the preferences internally to decide what to do + MenuItem extractFileReferencesOnline = factory.createMenuItem(StandardActions.EXTRACT_FILE_REFERENCES_ONLINE, extractReferencesAction); + MenuItem extractFileReferencesOffline = factory.createMenuItem(StandardActions.EXTRACT_FILE_REFERENCES_OFFLINE, extractReferencesAction); contextMenu.getItems().addAll( factory.createMenuItem(StandardActions.COPY, new EditAction(StandardActions.COPY, () -> libraryTab, stateManager, undoManager)), diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 78759549575..b48283e06b4 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -35,6 +35,8 @@ *

* Currently, Springer, and IEEE formats are supported. *

+ * In case one wants to have a list of {@link BibEntry} matching the bibliography of a PDF, + * please see {@link BibliographyFromPdfImporter}. */ public class PdfContentImporter extends Importer { From 4d574b6e985719182a3aeb02be1a819fafe38215 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Thu, 18 Jul 2024 14:43:12 +0200 Subject: [PATCH 02/18] Add support to parse more than the last page of a PDF --- CHANGELOG.md | 1 + .../BibliographyFromPdfImporter.java | 23 +++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4d193ddab3..9a2f9eb6e81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ### Added - We added support for selecting and using CSL Styles in JabRef's OpenOffice/LibreOffice integration for inserting bibliographic and in-text citations into a document. [#2146](https://github.com/JabRef/jabref/issues/2146), [#8893](https://github.com/JabRef/jabref/issues/8893) +- When converting the references section of a paper (PDF file), mor than the last page is treated. - Added minimal support for [biblatex data annotation](https://mirrors.ctan.org/macros/latex/contrib/biblatex/doc/biblatex.pdf#subsection.3.7) fields in .layout files. [#11505](https://github.com/JabRef/jabref/issues/11505) - Added saving of selected options in the [Lookup -> Search for unlinked local files dialog](https://docs.jabref.org/collect/findunlinkedfiles#link-the-pdfs-to-your-bib-library). 
[#11439](https://github.com/JabRef/jabref/issues/11439) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java index f2605594e13..04d9d94200b 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java @@ -39,6 +39,7 @@ *

* Currently, IEEE two column format is supported. *

+ * To extract a {@link BibEntry} matching the PDF, see {@link PdfContentImporter}. */ public class BibliographyFromPdfImporter extends Importer { @@ -93,7 +94,7 @@ public ParserResult importDatabase(Path filePath) { List result; try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) { - String contents = getLastPageContents(document); + String contents = getReferencesPagesText(document); result = getEntriesFromPDFContent(contents); } catch (EncryptedPdfsNotSupportedException e) { return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); @@ -130,15 +131,27 @@ private List getEntriesFromPDFContent(String contents) { .toList(); } - private String getLastPageContents(PDDocument document) throws IOException { - PDFTextStripper stripper = new PDFTextStripper(); + /** + * Extracts the text from all pages containing references. It simply goes from the last page backwards until there is probably no reference anymore. + */ + private String getReferencesPagesText(PDDocument document) throws IOException { + return prependToResult("", document, new PDFTextStripper(), document.getNumberOfPages()); + } + + private String prependToResult(String currentText, PDDocument document, PDFTextStripper stripper, int pageNumber) throws IOException { + String pageContents = getPageContents(document, stripper, pageNumber); + String result = pageContents + currentText; + if (!pageContents.contains("References") && !pageContents.contains("REFERENCES") && (pageNumber > 0)) { + return prependToResult(result, document, stripper, pageNumber - 1); + } + return result; + } - int lastPage = document.getNumberOfPages(); + private static String getPageContents(PDDocument document, PDFTextStripper stripper, int lastPage) throws IOException { stripper.setStartPage(lastPage); stripper.setEndPage(lastPage); StringWriter writer = new StringWriter(); stripper.writeText(document, writer); - return writer.toString(); } From 
267884eab508c8cfb8150c16e39734538774a1a3 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Fri, 19 Jul 2024 14:46:44 +0200 Subject: [PATCH 03/18] First implementation of NewLibraryFromPdfAction --- CHANGELOG.md | 1 + .../jabref/gui/actions/StandardActions.java | 1 + .../java/org/jabref/gui/frame/MainMenu.java | 2 + .../maintable/ExtractReferencesAction.java | 2 - .../maintable/NewLibraryFromPdfAction.java | 98 +++++++++++++++++++ .../org/jabref/gui/util/UiTaskExecutor.java | 3 - .../jabref/logic/util/DelayTaskThrottler.java | 8 +- src/main/resources/l10n/JabRef_en.properties | 17 ++-- 8 files changed, 115 insertions(+), 17 deletions(-) create mode 100644 src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a2f9eb6e81..809cb15f2db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ### Added - We added support for selecting and using CSL Styles in JabRef's OpenOffice/LibreOffice integration for inserting bibliographic and in-text citations into a document. [#2146](https://github.com/JabRef/jabref/issues/2146), [#8893](https://github.com/JabRef/jabref/issues/8893) +- We added Tools > New library based on references in PDF file... to create a new library based on the references section in a PDF file. - When converting the references section of a paper (PDF file), mor than the last page is treated. - Added minimal support for [biblatex data annotation](https://mirrors.ctan.org/macros/latex/contrib/biblatex/doc/biblatex.pdf#subsection.3.7) fields in .layout files. [#11505](https://github.com/JabRef/jabref/issues/11505) - Added saving of selected options in the [Lookup -> Search for unlinked local files dialog](https://docs.jabref.org/collect/findunlinkedfiles#link-the-pdfs-to-your-bib-library). 
[#11439](https://github.com/JabRef/jabref/issues/11439) diff --git a/src/main/java/org/jabref/gui/actions/StandardActions.java b/src/main/java/org/jabref/gui/actions/StandardActions.java index 931e049d163..4bb5338f1ce 100644 --- a/src/main/java/org/jabref/gui/actions/StandardActions.java +++ b/src/main/java/org/jabref/gui/actions/StandardActions.java @@ -93,6 +93,7 @@ public enum StandardActions implements Action { PARSE_LATEX(Localization.lang("Search for citations in LaTeX files..."), IconTheme.JabRefIcons.LATEX_CITATIONS), NEW_SUB_LIBRARY_FROM_AUX(Localization.lang("New sublibrary based on AUX file") + "...", Localization.lang("New BibTeX sublibrary") + Localization.lang("This feature generates a new library based on which entries are needed in an existing LaTeX document."), IconTheme.JabRefIcons.NEW), + NEW_LIBRARY_FROM_PDF(Localization.lang("New library based on references in PDF file..."), Localization.lang("This feature generates a new library based on the list of references in a PDF file."), IconTheme.JabRefIcons.NEW), WRITE_METADATA_TO_PDF(Localization.lang("Write metadata to PDF files"), Localization.lang("Will write metadata to the PDFs linked from selected entries."), KeyBinding.WRITE_METADATA_TO_PDF), START_NEW_STUDY(Localization.lang("Start new systematic literature review")), diff --git a/src/main/java/org/jabref/gui/frame/MainMenu.java b/src/main/java/org/jabref/gui/frame/MainMenu.java index 3e4351759ab..c0414a642e7 100644 --- a/src/main/java/org/jabref/gui/frame/MainMenu.java +++ b/src/main/java/org/jabref/gui/frame/MainMenu.java @@ -51,6 +51,7 @@ import org.jabref.gui.journals.AbbreviateAction; import org.jabref.gui.libraryproperties.LibraryPropertiesAction; import org.jabref.gui.linkedfile.RedownloadMissingFilesAction; +import org.jabref.gui.maintable.NewLibraryFromPdfAction; import org.jabref.gui.mergeentries.MergeEntriesAction; import org.jabref.gui.preferences.ShowPreferencesAction; import org.jabref.gui.preview.CopyCitationAction; @@ -276,6 
+277,7 @@ private void createMenu() { tools.getItems().addAll( factory.createMenuItem(StandardActions.PARSE_LATEX, new ParseLatexAction(stateManager)), factory.createMenuItem(StandardActions.NEW_SUB_LIBRARY_FROM_AUX, new NewSubLibraryAction(frame, stateManager, dialogService)), + factory.createMenuItem(StandardActions.NEW_LIBRARY_FROM_PDF, new NewLibraryFromPdfAction(frame, stateManager, dialogService, preferencesService, taskExecutor)), new SeparatorMenuItem(), diff --git a/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java b/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java index 2f84e7f816e..3d243ee0e06 100644 --- a/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java +++ b/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java @@ -38,8 +38,6 @@ * * * The mode is selected by the preferences whether to use Grobid or not. - * - * The different modes should be implemented as sub classes. However, this was too complicated, thus we use variables at the constructor to parameterize this class. 
*/ public class ExtractReferencesAction extends SimpleCommand { private final int FILES_LIMIT = 10; diff --git a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java new file mode 100644 index 00000000000..58db1795f5e --- /dev/null +++ b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java @@ -0,0 +1,98 @@ +package org.jabref.gui.maintable; + +import java.nio.file.Path; +import java.util.concurrent.Callable; + +import javafx.application.Platform; + +import org.jabref.gui.DialogService; +import org.jabref.gui.LibraryTabContainer; +import org.jabref.gui.StateManager; +import org.jabref.gui.actions.SimpleCommand; +import org.jabref.gui.util.BackgroundTask; +import org.jabref.gui.util.FileDialogConfiguration; +import org.jabref.gui.util.TaskExecutor; +import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter; +import org.jabref.logic.importer.util.GrobidService; +import org.jabref.logic.l10n.Localization; +import org.jabref.logic.util.StandardFileType; +import org.jabref.preferences.PreferencesService; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Similar to {@link ExtractReferencesAction}. This action creates a new library, the other action "just" appends to the current library + * + *

    + *
  • Mode choice A: online or offline
  • + *
  • Mode choice B: complete entry or single file (the latter is not implemented)
  • + *
+ *

+ * The mode is selected via the preferences: Grobid is used if it is enabled there; otherwise, the offline {@link BibliographyFromPdfImporter} is used. + *

+ * The different modes should be implemented as sub classes. Moreover, there are synergies with {@link ExtractReferencesAction}. However, this was too complicated. + */ +public class NewLibraryFromPdfAction extends SimpleCommand { + private static final Logger LOGGER = LoggerFactory.getLogger(NewLibraryFromPdfAction .class); + + private final LibraryTabContainer libraryTabContainer; + private final StateManager stateManager; + private final DialogService dialogService; + private final PreferencesService preferencesService; + private final BibliographyFromPdfImporter bibliographyFromPdfImporter; + private final TaskExecutor taskExecutor; + + public NewLibraryFromPdfAction( + LibraryTabContainer libraryTabContainer, + StateManager stateManager, + DialogService dialogService, + PreferencesService preferencesService, + TaskExecutor taskExecutor) { + this.libraryTabContainer = libraryTabContainer; + this.stateManager = stateManager; + this.dialogService = dialogService; + this.preferencesService = preferencesService; + this.bibliographyFromPdfImporter = new BibliographyFromPdfImporter(preferencesService.getCitationKeyPatternPreferences()); + this.taskExecutor = taskExecutor; + } + + @Override + public void execute() { + final FileDialogConfiguration.Builder builder = new FileDialogConfiguration.Builder(); + builder.withDefaultExtension(StandardFileType.PDF); + // Sensible default for the directory to start browsing is the directory of the currently opened library. The pdf storage dir seems not to be feasible, because extracting references from a PDF itself can be done by the context menu of the respective entry. 
+ stateManager.getActiveDatabase() + .flatMap(db -> db.getDatabasePath()) + .ifPresent(path -> builder.withInitialDirectory(path.getParent())); + FileDialogConfiguration fileDialogConfiguration = builder.build(); + + LOGGER.trace("Opening file dialog with configuration: {}", fileDialogConfiguration); + + dialogService.showFileOpenDialog(fileDialogConfiguration).ifPresent(path -> { + LOGGER.trace("Selected file: {}", path); + Callable parserResultCallable = getParserResultCallable(path); + BackgroundTask.wrap(parserResultCallable) + .withInitialMessage(Localization.lang("Processing PDF(s)")) + .onFailure(failure -> Platform.runLater(() -> dialogService.showErrorDialogAndWait(failure))) + .onSuccess(result -> { + LOGGER.trace("Finished processing PDF(s): {}", result); + libraryTabContainer.addTab(result.getDatabaseContext(), true); + }) + .executeWith(taskExecutor); + }); + } + + private Callable getParserResultCallable(Path path) { + Callable parserResultCallable; + boolean online = this.preferencesService.getGrobidPreferences().isGrobidEnabled(); + if (online) { + parserResultCallable = () -> new ParserResult( + new GrobidService(this.preferencesService.getGrobidPreferences()).processReferences(path, preferencesService.getImportFormatPreferences())); + } else { + parserResultCallable = () -> bibliographyFromPdfImporter.importDatabase(path); + } + return parserResultCallable; + } +} diff --git a/src/main/java/org/jabref/gui/util/UiTaskExecutor.java b/src/main/java/org/jabref/gui/util/UiTaskExecutor.java index 02fa8210fa8..98ba40f2440 100644 --- a/src/main/java/org/jabref/gui/util/UiTaskExecutor.java +++ b/src/main/java/org/jabref/gui/util/UiTaskExecutor.java @@ -38,9 +38,6 @@ public class UiTaskExecutor implements TaskExecutor { private final ScheduledExecutorService scheduledExecutor = Executors.newScheduledThreadPool(2); private final WeakHashMap throttlers = new WeakHashMap<>(); - /** - * - */ public static V runInJavaFXThread(Callable callable) { if 
(Platform.isFxApplicationThread()) { try { diff --git a/src/main/java/org/jabref/logic/util/DelayTaskThrottler.java b/src/main/java/org/jabref/logic/util/DelayTaskThrottler.java index 5ecb5ed0a45..ef2c74f7aec 100644 --- a/src/main/java/org/jabref/logic/util/DelayTaskThrottler.java +++ b/src/main/java/org/jabref/logic/util/DelayTaskThrottler.java @@ -61,13 +61,17 @@ public ScheduledFuture scheduleTask(Callable command) { return scheduledTask; } - // Execute scheduled Runnable early + /** + * Execute scheduled Runnable early + */ public void execute(Runnable command) { delay = 0; schedule(command); } - // Cancel scheduled Runnable gracefully + /** + * Cancel scheduled Runnable gracefully + */ public void cancel() { scheduledTask.cancel(false); } diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index 9679586dc62..75f4b905355 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -111,10 +111,7 @@ Automatically\ create\ groups=Automatically create groups Automatically\ remove\ exact\ duplicates=Automatically remove exact duplicates -AUX\ file\ import=AUX file import - Available\ export\ formats=Available export formats - Available\ import\ formats=Available import formats %0\ source=%0 source @@ -407,8 +404,6 @@ Format\ string=Format string Formatter\ name=Formatter name -found\ in\ AUX\ file=found in AUX file - Fulltext\ for\ %0=Fulltext for %0 Fulltext\ for\ a\ new\ entry=Fulltext for a new entry @@ -528,7 +523,6 @@ Content\:\ %0=Content: %0 Language=Language Last\ modified=Last modified -LaTeX\ AUX\ file\:=LaTeX AUX file\: Link=Link Memory\ stick\ mode\ -\ Store\ preferences\ in\ 'jabref.xml'\ in\ the\ app\ folder.=Memory stick mode - Store preferences in 'jabref.xml' in the app folder. 
@@ -571,10 +565,6 @@ Name\ formatter=Name formatter Natbib\ style=Natbib style -nested\ AUX\ files=nested AUX files - -New\ BibTeX\ sublibrary=New BibTeX sublibrary - New\ group=New group Next\ entry=Next entry @@ -897,7 +887,14 @@ Statically\ group\ entries\ by\ manual\ assignment=Statically group entries by m Status=Status +AUX\ file\ import=AUX file import +LaTeX\ AUX\ file\:=LaTeX AUX file\: +found\ in\ AUX\ file=found in AUX file +nested\ AUX\ files=nested AUX files +New\ library\ based\ on\ references\ in\ PDF\ file...=New library based on references in PDF file... +This\ feature\ generates\ a\ new\ library\ based\ on\ the\ list\ of\ references\ in\ a\ PDF\ file.=This feature generates a new library based on the list of references in a PDF file. Sublibrary\ from\ AUX\ to\ BibTeX=Sublibrary from AUX to BibTeX +New\ BibTeX\ sublibrary=New BibTeX sublibrary Switches\ between\ full\ and\ abbreviated\ journal\ name\ if\ the\ journal\ name\ is\ known.=Switches between full and abbreviated journal name if the journal name is known. From a827f878fd96de394e2ea08273c7609005c6d3fe Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Fri, 19 Jul 2024 14:47:09 +0200 Subject: [PATCH 04/18] Fix typo in CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 809cb15f2db..b9feb1a92e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv - We added support for selecting and using CSL Styles in JabRef's OpenOffice/LibreOffice integration for inserting bibliographic and in-text citations into a document. [#2146](https://github.com/JabRef/jabref/issues/2146), [#8893](https://github.com/JabRef/jabref/issues/8893) - We added Tools > New library based on references in PDF file... to create a new library based on the references section in a PDF file. 
-- When converting the references section of a paper (PDF file), mor than the last page is treated. +- When converting the references section of a paper (PDF file), more than the last page is treated. - Added minimal support for [biblatex data annotation](https://mirrors.ctan.org/macros/latex/contrib/biblatex/doc/biblatex.pdf#subsection.3.7) fields in .layout files. [#11505](https://github.com/JabRef/jabref/issues/11505) - Added saving of selected options in the [Lookup -> Search for unlinked local files dialog](https://docs.jabref.org/collect/findunlinkedfiles#link-the-pdfs-to-your-bib-library). [#11439](https://github.com/JabRef/jabref/issues/11439) From 2f05e1675676083823455f3ef7de86dc71dd6811 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Fri, 19 Jul 2024 14:47:22 +0200 Subject: [PATCH 05/18] Extract method --- .../gui/maintable/ExtractReferencesAction.java | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java b/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java index 3d243ee0e06..99f1ebedaaa 100644 --- a/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java +++ b/src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java @@ -158,9 +158,21 @@ private void extractReferences(Iterator fileListIterator, ParserResult res result.getDatabase().insertEntries(bibliographyFromPdfImporter.importDatabase(fileListIterator.next()).getDatabase().getEntries()); } + String cites = getCites(result.getDatabase().getEntries(), currentEntry); + currentEntry.setField(StandardField.CITES, cites); + } + + /** + * Creates the field content for the "cites" field. The field contains the citation keys of the imported entries. 
+ * + * TODO: Move this part to logic somehow + * + * @param currentEntry used to create citation keys if the importer did not provide one from the imported entry + */ + private static String getCites(List entries, BibEntry currentEntry) { StringJoiner cites = new StringJoiner(","); int count = 0; - for (BibEntry importedEntry : result.getDatabase().getEntries()) { + for (BibEntry importedEntry : entries) { count++; Optional citationKey = importedEntry.getCitationKey(); String citationKeyToAdd; @@ -187,7 +199,7 @@ private void extractReferences(Iterator fileListIterator, ParserResult res } cites.add(citationKeyToAdd); } - currentEntry.setField(StandardField.CITES, cites.toString()); + return cites.toString(); } private Optional> getParserResultCallableOnline(BibDatabaseContext databaseContext, List selectedEntries) { From a0e2c11d332ffe6b6de095e2cc11a7ddf955baff Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Fri, 19 Jul 2024 15:19:42 +0200 Subject: [PATCH 06/18] Collect IEEE formatted pdf in folder pdfs/IEEE --- .../fileformat/BibliographyFromPdfImporterTest.java | 2 +- .../fileformat => pdfs/IEEE}/tua3i2refpage.pdf | Bin 2 files changed, 1 insertion(+), 1 deletion(-) rename src/test/resources/{org/jabref/logic/importer/fileformat => pdfs/IEEE}/tua3i2refpage.pdf (100%) diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java index e441d84e4cd..40130eb09a3 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java @@ -104,7 +104,7 @@ void setup() { @Test void tua3i2refpage() throws Exception { - Path file = Path.of(BibliographyFromPdfImporterTest.class.getResource("tua3i2refpage.pdf").toURI()); + Path file = 
Path.of(BibliographyFromPdfImporterTest.class.getResource("/pdfs/IEEE/tua3i2refpage.pdf").toURI()); ParserResult parserResult = bibliographyFromPdfImporter.importDatabase(file); BibEntry entry02 = new BibEntry(StandardEntryType.Article) .withCitationKey("Kondo2020") diff --git a/src/test/resources/org/jabref/logic/importer/fileformat/tua3i2refpage.pdf b/src/test/resources/pdfs/IEEE/tua3i2refpage.pdf similarity index 100% rename from src/test/resources/org/jabref/logic/importer/fileformat/tua3i2refpage.pdf rename to src/test/resources/pdfs/IEEE/tua3i2refpage.pdf From 989c8103291550da21e55bd92afc31807ec0c195 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 20 Jul 2024 14:04:45 +0200 Subject: [PATCH 07/18] Improve parsing --- .../AllowedToUseApacheCommonsLang3.java | 11 ++ .../BibliographyFromPdfImporter.java | 103 ++++++++++++------ .../BibliographyFromPdfImporterTest.java | 57 ++++++++++ 3 files changed, 136 insertions(+), 35 deletions(-) create mode 100644 src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java diff --git a/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java b/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java new file mode 100644 index 00000000000..95aa6c0949b --- /dev/null +++ b/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java @@ -0,0 +1,11 @@ +package org.jabref.architecture; + +/** + * Annotation to indicate that usage of ApacheCommonsLang3 is explicitly allowed. 
+ * The intention is to fully switch to Google Guava and only use Apache Commons Lang3 if there is no other possibility + */ +public @interface ApacheCommonsLang3Allowed { + + // The rationale + String value(); +} diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java index 04d9d94200b..0b8b0de3952 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java @@ -5,7 +5,6 @@ import java.io.StringWriter; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -29,6 +28,7 @@ import org.jabref.model.entry.types.StandardEntryType; import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.lang3.StringUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.slf4j.Logger; @@ -45,6 +45,7 @@ public class BibliographyFromPdfImporter extends Importer { private static final Logger LOGGER = LoggerFactory.getLogger(BibliographyFromPdfImporter.class); + private static final Pattern REFERENCES = Pattern.compile("References", Pattern.CASE_INSENSITIVE); private static final Pattern REFERENCE_PATTERN = Pattern.compile("\\[(\\d+)\\](.*?)(?=\\[|$)", Pattern.DOTALL); private static final Pattern YEAR_AT_END = Pattern.compile(", (\\d{4})\\.$"); private static final Pattern PAGES = Pattern.compile(", pp\\. (\\d+--?\\d+)\\.?(.*)"); @@ -53,7 +54,7 @@ public class BibliographyFromPdfImporter extends Importer { private static final Pattern MONTH_AND_YEAR = Pattern.compile(", ([A-Z][a-z]{2,7}\\.? \\d+),? ?(.*)"); private static final Pattern VOLUME = Pattern.compile(", vol\\. (\\d+)(.*)"); private static final Pattern NO = Pattern.compile(", no\\. 
(\\d+)(.*)"); - private static final Pattern AUTHORS_AND_TITLE_AT_BEGINNING = Pattern.compile("^([^“]+), “(.*?)”, "); + private static final Pattern AUTHORS_AND_TITLE_AT_BEGINNING = Pattern.compile("^([^“]+), “(.*?)(”,|,”) "); private static final Pattern TITLE = Pattern.compile("“(.*?)”, (.*)"); private final CitationKeyPatternPreferences citationKeyPatternPreferences; @@ -135,13 +136,34 @@ private List getEntriesFromPDFContent(String contents) { * Extracts the text from all pages containing references. It simply goes from the last page backwards until there is probably no reference anymore. */ private String getReferencesPagesText(PDDocument document) throws IOException { - return prependToResult("", document, new PDFTextStripper(), document.getNumberOfPages()); + int lastPage = document.getNumberOfPages(); + String lastPageContents = getPageContents(document, new PDFTextStripper(), lastPage); + String result = lastPageContents; + + if (!containsWordReferences(lastPageContents)) { + result = prependToResult(result, document, new PDFTextStripper(), lastPage); + } + + Matcher matcher = REFERENCES.matcher(result); + if (!matcher.hasMatch()) { + // Ensure that not too much is returned + LOGGER.warn("Could not found 'References'. 
Returning last page only."); + return lastPageContents; + } + + int start = matcher.start(); + return result.substring(start); + } + + private static boolean containsWordReferences(String result) { + Matcher matcher = REFERENCES.matcher(result); + return matcher.find(); } private String prependToResult(String currentText, PDDocument document, PDFTextStripper stripper, int pageNumber) throws IOException { String pageContents = getPageContents(document, stripper, pageNumber); String result = pageContents + currentText; - if (!pageContents.contains("References") && !pageContents.contains("REFERENCES") && (pageNumber > 0)) { + if (!containsWordReferences(pageContents) && (pageNumber > 0)) { return prependToResult(result, document, stripper, pageNumber - 1); } return result; @@ -185,11 +207,11 @@ BibEntry parseReference(String number, String reference) { reference = reference.substring(0, matcher.start()).trim(); } - reference = updateEntryAndReferenceIfMatches(reference, PAGES, result, StandardField.PAGES); + reference = updateEntryAndReferenceIfMatches(reference, PAGES, result, StandardField.PAGES).newReference; // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016 // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019 - reference = updateEntryAndReferenceIfMatches(reference, PAGE, result, StandardField.PAGES); + reference = updateEntryAndReferenceIfMatches(reference, PAGE, result, StandardField.PAGES).newReference; matcher = MONTH_RANGE_AND_YEAR.matcher(reference); if (matcher.find()) { @@ -220,9 +242,13 @@ BibEntry parseReference(String number, String reference) { // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57 // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. 
IPAC’19, Mel- bourne, Australia - reference = updateEntryAndReferenceIfMatches(reference, VOLUME, result, StandardField.VOLUME); + EntryUpdateResult entryUpdateResult = updateEntryAndReferenceIfMatches(reference, VOLUME, result, StandardField.VOLUME); + boolean volumeFound = entryUpdateResult.modified; + reference = entryUpdateResult.newReference; - reference = updateEntryAndReferenceIfMatches(reference, NO, result, StandardField.NUMBER); + entryUpdateResult = updateEntryAndReferenceIfMatches(reference, NO, result, StandardField.NUMBER); + boolean numberFound = entryUpdateResult.modified; + reference = entryUpdateResult.newReference; // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia @@ -239,17 +265,15 @@ BibEntry parseReference(String number, String reference) { } else { // No authors present // Example: “AF4.1.1 SRF Linac Engineering Design Report”, Internal note. - reference = updateEntryAndReferenceIfMatches(reference, TITLE, result, StandardField.TITLE); + reference = updateEntryAndReferenceIfMatches(reference, TITLE, result, StandardField.TITLE).newReference; } // Nucl. Fusion // in Proc. 
IPAC’19, Mel- bourne, Australia // presented at th 8th DITANET Topical Workshop on Beam Position Monitors, CERN, Geneva, Switzreland List stringsToRemove = List.of("presented at", "to be presented at"); - // need to use "iterator()" instead of "stream().foreach", because "reference" is modified inside the loop - Iterator iterator = stringsToRemove.iterator(); - while (iterator.hasNext()) { - String check = iterator.next(); + // need to use "for" loop instead of "stream().foreach", because "reference" is modified inside the loop + for (String check : stringsToRemove) { if (reference.startsWith(check)) { reference = reference.substring(check.length()).trim(); result.setType(StandardEntryType.InProceedings); @@ -257,12 +281,23 @@ BibEntry parseReference(String number, String reference) { } boolean startsWithInProc = reference.startsWith("in Proc."); - boolean conainsWorkshop = reference.contains("Workshop"); - if (startsWithInProc || conainsWorkshop) { + boolean containsWorkshop = reference.contains("Workshop"); + if (startsWithInProc || containsWorkshop || (!volumeFound && !numberFound)) { int beginIndex = startsWithInProc ? 3 : 0; - result.setField(StandardField.BOOKTITLE, reference.substring(beginIndex).replace("- ", "").trim()); + String bookTitle = reference.substring(beginIndex).replace("- ", "").trim(); + int lastDot = bookTitle.lastIndexOf('.'); + if (lastDot > 0) { + String textAfterDot = reference.substring(lastDot + 1).trim(); + // We use Apache Commons here, because it is fastest - see table at https://stackoverflow.com/a/35242882/873282 + if (StringUtils.countMatches(textAfterDot, ' ') <= 1) { + bookTitle = bookTitle.substring(0, lastDot).trim(); + reference = textAfterDot; + } + } else { + reference = ""; + } + result.setField(StandardField.BOOKTITLE, bookTitle); result.setType(StandardEntryType.InProceedings); - reference = ""; } // Nucl. 
Fusion @@ -276,42 +311,40 @@ BibEntry parseReference(String number, String reference) { } else { result.setField(StandardField.JOURNAL, reference.replace("- ", "")); } - reference = ""; } else { + LOGGER.trace("InProceedings fallback used for current state of handled string {}", reference); String toAdd = reference; result.setType(StandardEntryType.InProceedings); if (result.hasField(StandardField.BOOKTITLE)) { String oldTitle = result.getField(StandardField.BOOKTITLE).get(); - result.setField(StandardField.BOOKTITLE, oldTitle + toAdd); + result.setField(StandardField.BOOKTITLE, oldTitle + " " + toAdd); } else { result.setField(StandardField.BOOKTITLE, toAdd); } - reference = ""; - LOGGER.debug("InProceedings fallback used for current state of handled string {}", reference); } - if (reference.isEmpty()) { - result.setField(StandardField.COMMENT, originalReference); - } else { - result.setField(StandardField.COMMENT, "Unprocessed: " + reference + "\n\n" + originalReference); - } + result.setField(StandardField.COMMENT, originalReference); return result; } /** * @param pattern A pattern matching two groups: The first one to take, the second one to leave at the end of the string */ - private static String updateEntryAndReferenceIfMatches(String reference, Pattern pattern, BibEntry result, Field field) { + private static EntryUpdateResult updateEntryAndReferenceIfMatches(String reference, Pattern pattern, BibEntry result, Field field) { Matcher matcher; matcher = pattern.matcher(reference); - if (matcher.find()) { - result.setField(field, matcher.group(1).replace("- ", "")); - String suffix = matcher.group(2); - if (!suffix.isEmpty()) { - suffix = " " + suffix; - } - reference = reference.substring(0, matcher.start()).trim() + suffix; + if (!matcher.find()) { + return new EntryUpdateResult(false, reference); + } + result.setField(field, matcher.group(1).replace("- ", "")); + String suffix = matcher.group(2); + if (!suffix.isEmpty()) { + suffix = " " + suffix; } - 
return reference; + reference = reference.substring(0, matcher.start()).trim() + suffix; + return new EntryUpdateResult(true, reference); + } + + private static final record EntryUpdateResult(boolean modified, String newReference) { } } diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java index 40130eb09a3..ba128d739d8 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java @@ -221,6 +221,63 @@ void tua3i2refpage() throws Exception { parserResult.getDatabase().getEntries()); } + @Test + void ieeePaper() throws Exception { + Path file = Path.of(BibliographyFromPdfImporterTest.class.getResource("/pdfs/IEEE/ieee-paper.pdf").toURI()); + ParserResult parserResult = bibliographyFromPdfImporter.importDatabase(file); + BibEntry entry01 = new BibEntry(StandardEntryType.Article) + .withCitationKey("Kondo2020") + .withField(StandardField.AUTHOR, "M. O. Alver and T. Tennøy and J. A. Alfredsen and G. Øie") + .withField(StandardField.TITLE, "Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks") + .withField(StandardField.JOURNAL, "Aquacultural engineering") + .withField(StandardField.VOLUME, "36") + .withField(StandardField.NUMBER, "2") + .withField(StandardField.YEAR, "2007") + .withField(StandardField.PAGES, "115-121") + .withField(StandardField.COMMENT, "[1] M. O. Alver, T. Tennøy, J. A. Alfredsen, and G. Øie, “Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks,” Aquacultural engineering, vol. 36, no. 2, pp. 115–121, 2007."); + + BibEntry entry02 = new BibEntry(StandardEntryType.Article) + .withCitationKey("Devanz2017") + .withField(StandardField.AUTHOR, "M. O. 
Alver and others") + .withField(StandardField.TITLE, "Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates") + .withField(StandardField.BOOKTITLE, "Aquaculture") + .withField(StandardField.VOLUME, "268") + .withField(StandardField.NUMBER, "1") + .withField(StandardField.YEAR, "2007") + .withField(StandardField.PAGES, "216-226") + .withField(StandardField.COMMENT, "[2] M. O. Alver et al., “Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates,” Aquaculture, vol. 268, no. 1, pp. 216–226, 2007."); + + BibEntry entry03 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Branas2018") + .withField(StandardField.AUTHOR, "Oliver Kopp and others") + .withField(StandardField.TITLE, "BPMN4TOSCA: A domain-specific language to model management plans for composite applications") + .withField(StandardField.BOOKTITLE, "Business Process Model and Notation") + .withField(StandardField.SERIES, "LNCS") + .withField(StandardField.VOLUME, "125") + .withField(StandardField.YEAR, "2018") + .withField(StandardField.COMMENT, "[3] O. Kopp et al., “BPMN4TOSCA: A domain-specific language to model management plans for composite applications,” in Business Process Model and Notation, ser. LNCS, vol. 125. Springer, 2012."); + + BibEntry entry04 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Scantamburlo2023") + .withField(StandardField.AUTHOR, "O. Kopp and A. Armbruster and O. Zimmermann") + .withField(StandardField.TITLE, "Markdown architectural decision records: Format and tool support") + .withField(StandardField.BOOKTITLE, "ZEUS") + .withField(StandardField.YEAR, "2018") + .withField(StandardField.PUBLISHER, "CEUR-WS.org") + .withField(StandardField.COMMENT, "[4] O. Kopp, A. Armbruster, and O. Zimmermann, “Markdown architectural decision records: Format and tool support,” in ZEUS. 
CEUR-WS.org, 2018."); + + BibEntry entry05 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Franco2023") + .withField(StandardField.AUTHOR, "S. König and others") + .withField(StandardField.TITLE, "BPMN4Cars: A car-tailored workflow engine") + .withField(StandardField.BOOKTITLE, "INDIN") + .withField(StandardField.PUBLISHER, "IEEE") + .withField(StandardField.YEAR, "2023") + .withField(StandardField.COMMENT, "[5] S. König et al., “BPMN4Cars: A car-tailored workflow engine,” in INDIN. IEEE, 2023."); + + assertEquals(List.of(entry01, entry02, entry03, entry04, entry05), parserResult.getDatabase().getEntries()); + } + static Stream references() { return Stream.of( Arguments.of( From 27943808585ee56dcb1f1664f6b524c6f670867b Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 20 Jul 2024 14:05:27 +0200 Subject: [PATCH 08/18] Fix name of ApacheCommonsLang3Allowed to be consistent with the other annotation names --- .../architecture/AllowedToUseApacheCommonsLang3.java | 2 +- .../architecture/ApacheCommonsLang3Allowed.java | 11 ----------- .../bibtexfields/HtmlToUnicodeFormatter.java | 4 ++-- .../java/org/jabref/model/strings/StringUtil.java | 12 ++++++------ .../jabref/architecture/MainArchitectureTest.java | 2 +- 5 files changed, 10 insertions(+), 21 deletions(-) delete mode 100644 src/main/java/org/jabref/architecture/ApacheCommonsLang3Allowed.java diff --git a/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java b/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java index 95aa6c0949b..c50d89171d1 100644 --- a/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java +++ b/src/main/java/org/jabref/architecture/AllowedToUseApacheCommonsLang3.java @@ -4,7 +4,7 @@ * Annotation to indicate that usage of ApacheCommonsLang3 is explicitly allowed. 
* The intention is to fully switch to Google Guava and only use Apache Commons Lang3 if there is no other possibility */ -public @interface ApacheCommonsLang3Allowed { +public @interface AllowedToUseApacheCommonsLang3 { // The rationale String value(); diff --git a/src/main/java/org/jabref/architecture/ApacheCommonsLang3Allowed.java b/src/main/java/org/jabref/architecture/ApacheCommonsLang3Allowed.java deleted file mode 100644 index 95aa6c0949b..00000000000 --- a/src/main/java/org/jabref/architecture/ApacheCommonsLang3Allowed.java +++ /dev/null @@ -1,11 +0,0 @@ -package org.jabref.architecture; - -/** - * Annotation to indicate that usage of ApacheCommonsLang3 is explicitly allowed. - * The intention is to fully switch to Google Guava and only use Apache Commons Lang3 if there is no other possibility - */ -public @interface ApacheCommonsLang3Allowed { - - // The rationale - String value(); -} diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/HtmlToUnicodeFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/HtmlToUnicodeFormatter.java index 5a9f6bf74b4..6d5c970d0e5 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/HtmlToUnicodeFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/HtmlToUnicodeFormatter.java @@ -1,13 +1,13 @@ package org.jabref.logic.formatter.bibtexfields; -import org.jabref.architecture.ApacheCommonsLang3Allowed; +import org.jabref.architecture.AllowedToUseApacheCommonsLang3; import org.jabref.logic.cleanup.Formatter; import org.jabref.logic.l10n.Localization; import org.jabref.logic.layout.LayoutFormatter; import org.apache.commons.text.StringEscapeUtils; -@ApacheCommonsLang3Allowed("There is no equivalent in Google's Guava") +@AllowedToUseApacheCommonsLang3("There is no equivalent in Google's Guava") public class HtmlToUnicodeFormatter extends Formatter implements LayoutFormatter { @Override diff --git a/src/main/java/org/jabref/model/strings/StringUtil.java 
b/src/main/java/org/jabref/model/strings/StringUtil.java index c85521a5f71..2237526cd3e 100644 --- a/src/main/java/org/jabref/model/strings/StringUtil.java +++ b/src/main/java/org/jabref/model/strings/StringUtil.java @@ -11,13 +11,13 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.jabref.architecture.ApacheCommonsLang3Allowed; +import org.jabref.architecture.AllowedToUseApacheCommonsLang3; import org.jabref.logic.bibtex.FieldWriter; import com.google.common.base.CharMatcher; import org.apache.commons.lang3.StringUtils; -@ApacheCommonsLang3Allowed("There is no equivalent in Google's Guava") +@AllowedToUseApacheCommonsLang3("There is no equivalent in Google's Guava") public class StringUtil { // Non-letters which are used to denote accents in LaTeX-commands, e.g., in {\"{a}} @@ -672,7 +672,7 @@ public static String unquote(String toUnquote, char quoteChar) { return result.toString(); } - @ApacheCommonsLang3Allowed("No Guava equivalent existing - see https://stackoverflow.com/q/3322152/873282 for a list of other implementations") + @AllowedToUseApacheCommonsLang3("No Guava equivalent existing - see https://stackoverflow.com/q/3322152/873282 for a list of other implementations") public static String stripAccents(String searchQuery) { return StringUtils.stripAccents(searchQuery); } @@ -709,7 +709,7 @@ public static List getStringAsSentences(String text) { return Arrays.asList(splitTextPattern.split(text)); } - @ApacheCommonsLang3Allowed("No direct Guava equivalent existing - see https://stackoverflow.com/q/16560635/873282") + @AllowedToUseApacheCommonsLang3("No direct Guava equivalent existing - see https://stackoverflow.com/q/16560635/873282") public static boolean containsIgnoreCase(String text, String searchString) { return StringUtils.containsIgnoreCase(text, searchString); } @@ -746,12 +746,12 @@ public static boolean containsWhitespace(String s) { return s.chars().anyMatch(Character::isWhitespace); } - @ApacheCommonsLang3Allowed("No 
Guava equivalent existing - see https://stackoverflow.com/a/23825984") + @AllowedToUseApacheCommonsLang3("No Guava equivalent existing - see https://stackoverflow.com/a/23825984") public static String removeStringAtTheEnd(String string, String stringToBeRemoved) { return StringUtils.removeEndIgnoreCase(string, stringToBeRemoved); } - @ApacheCommonsLang3Allowed("No Guava equivalent existing") + @AllowedToUseApacheCommonsLang3("No Guava equivalent existing") public static boolean endsWithIgnoreCase(String string, String suffix) { return StringUtils.endsWithIgnoreCase(string, suffix); } diff --git a/src/test/java/org/jabref/architecture/MainArchitectureTest.java b/src/test/java/org/jabref/architecture/MainArchitectureTest.java index 5a4e5e64704..22310ef0c28 100644 --- a/src/test/java/org/jabref/architecture/MainArchitectureTest.java +++ b/src/test/java/org/jabref/architecture/MainArchitectureTest.java @@ -29,7 +29,7 @@ class MainArchitectureTest { @ArchTest public void doNotUseApacheCommonsLang3(JavaClasses classes) { - noClasses().that().areNotAnnotatedWith(ApacheCommonsLang3Allowed.class) + noClasses().that().areNotAnnotatedWith(AllowedToUseApacheCommonsLang3.class) .should().accessClassesThat().resideInAPackage("org.apache.commons.lang3") .check(classes); } From 6f28d7bb4f3b4fee037762898c39b3240ec8edf3 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 11:30:47 +0200 Subject: [PATCH 09/18] Change treating of citation keys and URLs --- .../maintable/NewLibraryFromPdfAction.java | 3 +- .../org/jabref/logic/cleanup/URLCleanup.java | 16 +- .../BibliographyFromPdfImporter.java | 82 ++++++-- .../BibliographyFromPdfImporterTest.java | 179 +++++++++--------- 4 files changed, 170 insertions(+), 110 deletions(-) diff --git a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java index 58db1795f5e..0716a9aa2a0 100644 --- 
a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java +++ b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java @@ -54,7 +54,8 @@ public NewLibraryFromPdfAction( this.stateManager = stateManager; this.dialogService = dialogService; this.preferencesService = preferencesService; - this.bibliographyFromPdfImporter = new BibliographyFromPdfImporter(preferencesService.getCitationKeyPatternPreferences()); + // Instruct the importer to keep the numbers (instead of generating keys) + this.bibliographyFromPdfImporter = new BibliographyFromPdfImporter(); this.taskExecutor = taskExecutor; } diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 13554eca215..5429f84853b 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -23,20 +23,20 @@ public class URLCleanup implements CleanupJob { * In order to be functional, we made the necessary adjustments regarding Java * features (mainly doubled backslashes). 
*/ - public static final String URL_REGEX = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]" + private static final String URL_REGEX = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]" + "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()" + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; + public static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE); - public static final String DATE_TERMS_REGEX = "accessed on|visited on|retrieved on|viewed on"; + private static final String DATE_TERMS_REGEX = "accessed on|visited on|retrieved on|viewed on"; private static final Field NOTE_FIELD = StandardField.NOTE; private static final Field URL_FIELD = StandardField.URL; private static final Field URLDATE_FIELD = StandardField.URLDATE; - final Pattern urlPattern = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE); - final Pattern dateTermsPattern = Pattern.compile(DATE_TERMS_REGEX, Pattern.CASE_INSENSITIVE); - final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE); + private static final Pattern DATE_TERMS_PATTERN = Pattern.compile(DATE_TERMS_REGEX, Pattern.CASE_INSENSITIVE); + private static final Pattern DATE_PATTERN = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE); private final NormalizeDateFormatter formatter = new NormalizeDateFormatter(); @Override @@ -45,9 +45,9 @@ public List cleanup(BibEntry entry) { String noteFieldValue = entry.getField(NOTE_FIELD).orElse(""); - final Matcher urlMatcher = urlPattern.matcher(noteFieldValue); - final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue); - final Matcher dateMatcher = datePattern.matcher(noteFieldValue); + final Matcher urlMatcher = URL_PATTERN.matcher(noteFieldValue); + final Matcher dateTermsMatcher = DATE_TERMS_PATTERN.matcher(noteFieldValue); + final Matcher dateMatcher = DATE_PATTERN.matcher(noteFieldValue); if (urlMatcher.find()) 
{ String url = urlMatcher.group(); diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java index 0b8b0de3952..8a12ddac9f7 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java @@ -11,8 +11,11 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jabref.architecture.AllowedToUseApacheCommonsLang3; import org.jabref.logic.citationkeypattern.CitationKeyGenerator; import org.jabref.logic.citationkeypattern.CitationKeyPatternPreferences; +import org.jabref.logic.cleanup.URLCleanup; +import org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter; import org.jabref.logic.importer.Importer; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.l10n.Localization; @@ -41,6 +44,7 @@ *

* To extract a {@link BibEntry} matching the PDF, see {@link PdfContentImporter}. */ +@AllowedToUseApacheCommonsLang3("Fastest method to count spaces in a string") public class BibliographyFromPdfImporter extends Importer { private static final Logger LOGGER = LoggerFactory.getLogger(BibliographyFromPdfImporter.class); @@ -54,10 +58,17 @@ public class BibliographyFromPdfImporter extends Importer { private static final Pattern MONTH_AND_YEAR = Pattern.compile(", ([A-Z][a-z]{2,7}\\.? \\d+),? ?(.*)"); private static final Pattern VOLUME = Pattern.compile(", vol\\. (\\d+)(.*)"); private static final Pattern NO = Pattern.compile(", no\\. (\\d+)(.*)"); + private static final Pattern PROCEEDINGS_INDICATION = Pattern.compile("^in (Proc\\. )?(.*)"); + private static final Pattern WORKSHOP = Pattern.compile("Workshop"); private static final Pattern AUTHORS_AND_TITLE_AT_BEGINNING = Pattern.compile("^([^“]+), “(.*?)(”,|,”) "); private static final Pattern TITLE = Pattern.compile("“(.*?)”, (.*)"); private final CitationKeyPatternPreferences citationKeyPatternPreferences; + private final NormalizeUnicodeFormatter normalizeUnicodeFormatter = new NormalizeUnicodeFormatter(); + + public BibliographyFromPdfImporter() { + this.citationKeyPatternPreferences = null; + } public BibliographyFromPdfImporter(CitationKeyPatternPreferences citationKeyPatternPreferences) { this.citationKeyPatternPreferences = citationKeyPatternPreferences; @@ -105,6 +116,10 @@ public ParserResult importDatabase(Path filePath) { ParserResult parserResult = new ParserResult(result); + if (citationKeyPatternPreferences == null) { + return parserResult; + } + // Generate citation keys for result CitationKeyGenerator citationKeyGenerator = new CitationKeyGenerator(parserResult.getDatabaseContext(), citationKeyPatternPreferences); parserResult.getDatabase().getEntries().forEach(citationKeyGenerator::generateAndSetKey); @@ -184,10 +199,26 @@ private static String getPageContents(PDDocument document, PDFTextStripper 
strip */ @VisibleForTesting BibEntry parseReference(String number, String reference) { + reference = normalizeUnicodeFormatter.format(reference); String originalReference = "[" + number + "] " + reference; - BibEntry result = new BibEntry(StandardEntryType.Article); - - reference = reference.replace(".-", "-"); + BibEntry result = new BibEntry(StandardEntryType.Article) + .withCitationKey(number); + + reference = reference + .replace(".-", "-") + // Remove "- " introduced by linebreaks in the PDF + .replace("- ", ""); + + // Move URL to URL field + Matcher urlPatternMatcher = URLCleanup.URL_PATTERN.matcher(reference); + if (urlPatternMatcher.find()) { + String url = urlPatternMatcher.group(); + result.setField(StandardField.URL, url); + reference = reference.replace(url, "").trim(); + if (reference.endsWith(",")) { + reference = reference.substring(0, reference.length() - 1); + } + } // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016, 2017. doi:10.1088/ 1741-4326/aa6a6a // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019, pp. 977-979. doi:10.18429/ JACoW-IPAC2019-MOPTS051 @@ -280,24 +311,45 @@ BibEntry parseReference(String number, String reference) { } } - boolean startsWithInProc = reference.startsWith("in Proc."); - boolean containsWorkshop = reference.contains("Workshop"); - if (startsWithInProc || containsWorkshop || (!volumeFound && !numberFound)) { - int beginIndex = startsWithInProc ? 
3 : 0; - String bookTitle = reference.substring(beginIndex).replace("- ", "").trim(); + Matcher proceedingsMatcher = PROCEEDINGS_INDICATION.matcher(reference); + Matcher workshopMatcher = WORKSHOP.matcher(reference); + if (proceedingsMatcher.matches() || workshopMatcher.matches() && (!volumeFound && !numberFound)) { + result.setType(StandardEntryType.InProceedings); + + String bookTitle; + if (proceedingsMatcher.matches()) { + String proc = proceedingsMatcher.group(1); + if (proc == null) { + bookTitle = proceedingsMatcher.group(2); + } else { + // We keep "Proc. " + bookTitle = proc + proceedingsMatcher.group(2); + } + } else { + bookTitle = reference; + } + reference = ""; + int lastDot = bookTitle.lastIndexOf('.'); if (lastDot > 0) { - String textAfterDot = reference.substring(lastDot + 1).trim(); + String textAfterDot = bookTitle.substring(lastDot + 1).trim(); // We use Apache Commons here, because it is fastest - see table at https://stackoverflow.com/a/35242882/873282 - if (StringUtils.countMatches(textAfterDot, ' ') <= 1) { + if (!textAfterDot.contains("http") && (StringUtils.countMatches(textAfterDot, ' ') <= 1)) { bookTitle = bookTitle.substring(0, lastDot).trim(); - reference = textAfterDot; + if (bookTitle.startsWith("in ")) { + bookTitle = bookTitle.substring(3); + } + result.setField(StandardField.PUBLISHER, textAfterDot); } - } else { - reference = ""; } + result.setField(StandardField.BOOKTITLE, bookTitle); - result.setType(StandardEntryType.InProceedings); + } + + if (reference.isEmpty()) { + // Early quit if everything was handled + result.setField(StandardField.COMMENT, originalReference); + return result; } // Nucl. 
Fusion @@ -345,6 +397,6 @@ private static EntryUpdateResult updateEntryAndReferenceIfMatches(String referen return new EntryUpdateResult(true, reference); } - private static final record EntryUpdateResult(boolean modified, String newReference) { + private record EntryUpdateResult(boolean modified, String newReference) { } } diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java index ba128d739d8..91d932213ee 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java @@ -4,8 +4,6 @@ import java.util.List; import java.util.stream.Stream; -import org.jabref.logic.citationkeypattern.CitationKeyPatternPreferences; -import org.jabref.logic.citationkeypattern.GlobalCitationKeyPatterns; import org.jabref.logic.importer.ParserResult; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.StandardField; @@ -17,12 +15,12 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; -import static org.jabref.logic.citationkeypattern.CitationKeyGenerator.DEFAULT_UNWANTED_CHARACTERS; import static org.junit.jupiter.api.Assertions.assertEquals; class BibliographyFromPdfImporterTest { private static final BibEntry KNASTER_2017 = new BibEntry(StandardEntryType.Article) + .withCitationKey("1") .withField(StandardField.AUTHOR, "J. Knaster and others") .withField(StandardField.TITLE, "Overview of the IFMIF/EVEDA project") .withField(StandardField.JOURNAL, "Nucl. Fusion") @@ -32,6 +30,7 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.DOI, "10.1088/1741-4326/aa6a6a") .withField(StandardField.COMMENT, "[1] J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016, 2017. 
doi:10.1088/ 1741-4326/aa6a6a"); private static final BibEntry SHIMOSAKI_2019 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("3") .withField(StandardField.AUTHOR, "Y. Shimosaki and others") .withField(StandardField.TITLE, "Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc") .withField(StandardField.BOOKTITLE, "Proc. IPAC’19, Melbourne, Australia") @@ -41,6 +40,7 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.DOI, "10.18429/JACoW-IPAC2019-MOPTS051") .withField(StandardField.COMMENT, "[3] Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019, pp. 977-979. doi:10.18429/ JACoW-IPAC2019-MOPTS051"); private static final BibEntry BELLAN_2021 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("6") .withField(StandardField.AUTHOR, "L. Bellan and others") .withField(StandardField.TITLE, "Acceleration of the high current deuteron beam through the IFMIF-EVEDA beam dynamics performances") .withField(StandardField.BOOKTITLE, "Proc. HB’21, Batavia, IL, USA") @@ -50,6 +50,7 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.DOI, "10.18429/JACoW-HB2021-WEDC2") .withField(StandardField.COMMENT, "[6] L. Bellan et al., “Acceleration of the high current deuteron beam through the IFMIF-EVEDA beam dynamics perfor- mances”, in Proc. HB’21, Batavia, IL, USA, Oct. 2021, pp. 197-202. doi:10.18429/JACoW-HB2021-WEDC2"); private static final BibEntry MASUDA_2022 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("7") .withField(StandardField.AUTHOR, "K. Masuda and others") .withField(StandardField.TITLE, "Commissioning of IFMIF Prototype Accelerator towards CW operation") .withField(StandardField.BOOKTITLE, "Proc. LINAC’22, Liverpool, UK") @@ -59,6 +60,7 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.DOI, "10.18429/JACoW-LINAC2022-TU2AA04") .withField(StandardField.COMMENT, "[7] K. 
Masuda et al., “Commissioning of IFMIF Prototype Ac- celerator towards CW operation”, in Proc. LINAC’22, Liv- erpool, UK, Aug.-Sep. 2022, pp. 319-323. doi:10.18429/ JACoW-LINAC2022-TU2AA04"); private static final BibEntry PODADERA_2012 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("11") .withField(StandardField.AUTHOR, "I. Podadera and J. M. Carmona and A. Ibarra and J. Molla") .withField(StandardField.TITLE, "Beam position monitor development for LIPAc") .withField(StandardField.BOOKTITLE, "th 8th DITANET Topical Workshop on Beam Position Monitors, CERN, Geneva, Switzreland") @@ -66,40 +68,78 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.YEAR, "2012") .withField(StandardField.COMMENT, "[11] I. Podadera, J. M. Carmona, A. Ibarra, and J. Molla, “Beam position monitor development for LIPAc”, presented at th 8th DITANET Topical Workshop on Beam Position Monitors, CERN, Geneva, Switzreland, Jan. 2012."); private static final BibEntry AKAGI_2023 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("15") .withField(StandardField.AUTHOR, "T. Akagi and others") .withField(StandardField.TITLE, "Achievement of high-current continuouswave deuteron injector for Linear IFMIF Prototype Accelerator (LIPAc)") - .withField(StandardField.BOOKTITLE, "IAEA FEC’23, London, UK, https://www.iaea.org/events/fec2023") + .withField(StandardField.BOOKTITLE, "IAEA FEC’23, London, UK") .withField(StandardField.MONTH, "#oct#") + .withField(StandardField.URL, "https://www.iaea.org/events/fec2023") .withField(StandardField.YEAR, "2023") .withField(StandardField.COMMENT, "[15] T. Akagi et al., “Achievement of high-current continuous- wave deuteron injector for Linear IFMIF Prototype Accelera- tor (LIPAc)”, to be presented at IAEA FEC’23, London, UK, Oct. 2023. 
https://www.iaea.org/events/fec2023"); private static final BibEntry INTERNAL_NOTE = new BibEntry(StandardEntryType.TechReport) + .withCitationKey("16") .withField(StandardField.TITLE, "AF4.1.1 SRF Linac Engineering Design Report") .withField(StandardField.NOTE, "Internal note") .withField(StandardField.COMMENT, "[16] “AF4.1.1 SRF Linac Engineering Design Report”, Internal note."); private static final BibEntry KWON_2023 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("14") .withField(StandardField.AUTHOR, "S. Kwon and others") .withField(StandardField.TITLE, "High beam current operation with beam di-agnostics at LIPAc") .withField(StandardField.BOOKTITLE, "HB’23, Geneva, Switzerland, paper FRC1I2, this conference") .withField(StandardField.MONTH, "#oct#") .withField(StandardField.YEAR, "2023") .withField(StandardField.COMMENT, "[14] S. Kwon et al., “High beam current operation with beam di-agnostics at LIPAc”, presented at HB’23, Geneva, Switzer- land, Oct. 2023, paper FRC1I2, this conference."); + private static final BibEntry ALVER2007 = new BibEntry(StandardEntryType.Article) + .withCitationKey("1") + .withField(StandardField.AUTHOR, "M. O. Alver and T. Tennøy and J. A. Alfredsen and G. Øie") + .withField(StandardField.TITLE, "Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks") + .withField(StandardField.JOURNAL, "Aquacultural engineering") + .withField(StandardField.VOLUME, "36") + .withField(StandardField.NUMBER, "2") + .withField(StandardField.YEAR, "2007") + .withField(StandardField.PAGES, "115-121") + .withField(StandardField.COMMENT, "[1] M. O. Alver, T. Tennøy, J. A. Alfredsen, and G. Øie, “Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks,” Aquacultural engineering, vol. 36, no. 2, pp. 115–121, 2007."); + private static final BibEntry ALVER2007A = new BibEntry(StandardEntryType.Article) + .withCitationKey("2") + .withField(StandardField.AUTHOR, "M. O. 
Alver and others") + .withField(StandardField.TITLE, "Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates") + .withField(StandardField.BOOKTITLE, "Aquaculture") + .withField(StandardField.VOLUME, "268") + .withField(StandardField.NUMBER, "1") + .withField(StandardField.YEAR, "2007") + .withField(StandardField.PAGES, "216-226") + .withField(StandardField.COMMENT, "[2] M. O. Alver et al., “Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates,” Aquaculture, vol. 268, no. 1, pp. 216–226, 2007."); + private static final BibEntry KOPP2012 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("3") + .withField(StandardField.AUTHOR, "Oliver Kopp and others") + .withField(StandardField.TITLE, "BPMN4TOSCA: A domain-specific language to model management plans for composite applications") + .withField(StandardField.BOOKTITLE, "Business Process Model and Notation") + .withField(StandardField.SERIES, "LNCS") + .withField(StandardField.VOLUME, "125") + .withField(StandardField.YEAR, "2018") + .withField(StandardField.COMMENT, "[3] O. Kopp et al., “BPMN4TOSCA: A domain-specific language to model management plans for composite applications,” in Business Process Model and Notation, ser. LNCS, vol. 125. Springer, 2012."); + private static final BibEntry KOPPP2018 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("4") + .withField(StandardField.AUTHOR, "O. Kopp and A. Armbruster and O. Zimmermann") + .withField(StandardField.TITLE, "Markdown architectural decision records: Format and tool support") + .withField(StandardField.BOOKTITLE, "ZEUS") + .withField(StandardField.YEAR, "2018") + .withField(StandardField.PUBLISHER, "CEUR-WS.org") + .withField(StandardField.COMMENT, "[4] O. Kopp, A. Armbruster, and O. Zimmermann, “Markdown architectural decision records: Format and tool support,” in ZEUS. 
CEUR-WS.org, 2018."); + private static final BibEntry KOENIG2023 = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("5") + .withField(StandardField.AUTHOR, "S. König and others") + .withField(StandardField.TITLE, "BPMN4Cars: A car-tailored workflow engine") + .withField(StandardField.BOOKTITLE, "INDIN") + .withField(StandardField.PUBLISHER, "IEEE") + .withField(StandardField.YEAR, "2023") + .withField(StandardField.COMMENT, "[5] S. König et al., “BPMN4Cars: A car-tailored workflow engine,” in INDIN. IEEE, 2023."); + private BibliographyFromPdfImporter bibliographyFromPdfImporter; @BeforeEach void setup() { - GlobalCitationKeyPatterns globalCitationKeyPattern = GlobalCitationKeyPatterns.fromPattern("[auth][year]"); - CitationKeyPatternPreferences citationKeyPatternPreferences = new CitationKeyPatternPreferences( - false, - false, - false, - CitationKeyPatternPreferences.KeySuffix.SECOND_WITH_A, - "", - "", - DEFAULT_UNWANTED_CHARACTERS, - globalCitationKeyPattern, - "", - ','); - bibliographyFromPdfImporter = new BibliographyFromPdfImporter(citationKeyPatternPreferences); + bibliographyFromPdfImporter = new BibliographyFromPdfImporter(); } @Test @@ -107,7 +147,7 @@ void tua3i2refpage() throws Exception { Path file = Path.of(BibliographyFromPdfImporterTest.class.getResource("/pdfs/IEEE/tua3i2refpage.pdf").toURI()); ParserResult parserResult = bibliographyFromPdfImporter.importDatabase(file); BibEntry entry02 = new BibEntry(StandardEntryType.Article) - .withCitationKey("Kondo2020") + .withCitationKey("2") .withField(StandardField.AUTHOR, "K. Kondo and others") .withField(StandardField.TITLE, "Validation of the Linear IFMIF Prototype Accelerator (LIPAc) in Rokkasho") .withField(StandardField.JOURNAL, "Fusion Eng. Des") // TODO: Final dot should be kept @@ -118,7 +158,7 @@ void tua3i2refpage() throws Exception { .withField(StandardField.COMMENT, "[2] K. Kondo et al., “Validation of the Linear IFMIF Prototype Accelerator (LIPAc) in Rokkasho”, Fusion Eng. 
Des., vol. 153, p. 111503, 2020. doi:10.1016/j.fusengdes.2020. 111503"); BibEntry entry04 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Devanz2017") + .withCitationKey("4") .withField(StandardField.AUTHOR, "G. Devanz and others") .withField(StandardField.TITLE, "Manufacturing and validation tests of IFMIF low-beta HWRs") .withField(StandardField.BOOKTITLE, "Proc. IPAC’17, Copenhagen, Denmark") @@ -129,7 +169,7 @@ void tua3i2refpage() throws Exception { .withField(StandardField.COMMENT, "[4] G. Devanz et al., “Manufacturing and validation tests of IFMIF low-beta HWRs”, in Proc. IPAC’17, Copen- hagen, Denmark, May 2017, pp. 942-944. doi:10.18429/ JACoW-IPAC2017-MOPVA039"); BibEntry entry05 = new BibEntry(StandardEntryType.Article) - .withCitationKey("Branas2018") + .withCitationKey("5") .withField(StandardField.AUTHOR, "B. Brañas and others") .withField(StandardField.TITLE, "The LIPAc Beam Dump") .withField(StandardField.JOURNAL, "Fusion Eng. Des") @@ -140,16 +180,16 @@ void tua3i2refpage() throws Exception { .withField(StandardField.COMMENT, "[5] B. Brañas et al., “The LIPAc Beam Dump”, Fusion Eng. Des., vol. 127, pp. 127-138, 2018. doi:10.1016/j.fusengdes. 2017.12.018"); BibEntry entry08 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Scantamburlo2023") + .withCitationKey("8") .withField(StandardField.AUTHOR, "F. Scantamburlo and others") .withField(StandardField.TITLE, "Linear IFMIF Prototype Accelera-tor (LIPAc) Radio Frequency Quadrupole’s (RFQ) RF couplers enhancement towards CW operation at nominal voltage") - .withField(StandardField.BOOKTITLE, "Proc. ISFNT’23, Las Palmas de Gran Canaria, Spain.") + .withField(StandardField.BOOKTITLE, "Proc. ISFNT’23, Las Palmas de Gran Canaria, Spain") .withField(StandardField.MONTH, "#sep#") .withField(StandardField.YEAR, "2023") .withField(StandardField.COMMENT, "[8] F. 
Scantamburlo et al., “Linear IFMIF Prototype Accelera-tor (LIPAc) Radio Frequency Quadrupole’s (RFQ) RF couplers enhancement towards CW operation at nominal voltage”, in Proc. ISFNT’23, Sep. 2023, Las Palmas de Gran Canaria, Spain."); BibEntry entry09 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Franco2023") + .withCitationKey("9") .withField(StandardField.AUTHOR, "A. De Franco and others") .withField(StandardField.BOOKTITLE, "Proc. IPAC’23, Venice, Italy") .withField(StandardField.TITLE, "RF conditioning towards continuous wave of the FRQ of the Linear IFMIF Prototype Accelerator") @@ -160,14 +200,14 @@ void tua3i2refpage() throws Exception { .withField(StandardField.COMMENT, "[9] A. De Franco et al., “RF conditioning towards continuous wave of the FRQ of the Linear IFMIF Prototype Accelerator”, in Proc. IPAC’23, Venice, Italy, May 2023, pp. 2345-2348. doi:10.18429/JACoW-IPAC2023-TUPM065"); BibEntry entry10 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Hirosawa") + .withCitationKey("10") .withField(StandardField.AUTHOR, "K. Hirosawa and others") - .withField(StandardField.BOOKTITLE, "Proc. PASJ’23, 2023, Japan.") + .withField(StandardField.BOOKTITLE, "Proc. PASJ’23, 2023, Japan") .withField(StandardField.TITLE, "High-Power RF tests of repaired circulator for LIPAc RFQ") .withField(StandardField.COMMENT, "[10] K. Hirosawa et al., “High-Power RF tests of repaired circu- lator for LIPAc RFQ”, in Proc. PASJ’23, 2023, Japan."); BibEntry entry12 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Podadera2019") + .withCitationKey("12") .withField(StandardField.AUTHOR, "I. Podadera and others") .withField(StandardField.TITLE, "Beam commissioning of beam position and phase monitors for LIPAc") .withField(StandardField.BOOKTITLE, "Proc. IBIC’19, Malmö, Sweden") @@ -178,7 +218,7 @@ void tua3i2refpage() throws Exception { .withField(StandardField.COMMENT, "[12] I. 
Podadera et al., “Beam commissioning of beam posi- tion and phase monitors for LIPAc”, in Proc. IBIC’19, Malmö, Sweden, Sep. 2019, pp. 534-538. doi:10.18429/ JACoW-IBIC2019-WEPP013"); BibEntry entry13 = new BibEntry(StandardEntryType.Article) - .withCitationKey("Kondo2021") + .withCitationKey("13") .withField(StandardField.AUTHOR, "K. Kondo and others") .withField(StandardField.TITLE, "Neutron production measurement in the 125 mA 5 MeV Deuteron beam commissioning of Linear IFMIF Prototype Accelerator (LIPAc) RFQ") .withField(StandardField.JOURNAL, "Nucl. Fusion") @@ -190,33 +230,34 @@ void tua3i2refpage() throws Exception { .withField(StandardField.COMMENT, "[13] K. Kondo et al., “Neutron production measurement in the 125 mA 5 MeV Deuteron beam commissioning of Linear IFMIF Prototype Accelerator (LIPAc) RFQ”, Nucl. Fusion, vol. 61, no. 1, p. 116002, 2021. doi:82310.1088/1741-4326/ ac233c"); BibEntry entry17 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Bellan2021a") + .withCitationKey("17") .withField(StandardField.AUTHOR, "L. Bellan and others") - .withField(StandardField.BOOKTITLE, "Proc. ICIS’21, TRIUMF, Vancouver, BC, Canada, https://indico.cern.ch/event/1027296/") + .withField(StandardField.BOOKTITLE, "Proc. ICIS’21, TRIUMF, Vancouver, BC, Canada") .withField(StandardField.COMMENT, "[17] L. Bellan et al., “Extraction and low energy beam transport models used for the IFMIF/EVEDA RFQ commissioning”, in Proc. ICIS’21, TRIUMF, Vancouver, BC, Canada, Sep. 2021. 
https://indico.cern.ch/event/1027296/") .withField(StandardField.MONTH, "#sep#") .withField(StandardField.TITLE, "Extraction and low energy beam transport models used for the IFMIF/EVEDA RFQ commissioning") + .withField(StandardField.URL, "https://indico.cern.ch/event/1027296/") .withField(StandardField.YEAR, "2021"); // We use the existing test entries, but add a citation key (which is added by the importer) // We need to clone to keep the static entries unmodified assertEquals(List.of( - ((BibEntry) KNASTER_2017.clone()).withCitationKey("Knaster2017"), + KNASTER_2017, entry02, - ((BibEntry) SHIMOSAKI_2019.clone()).withCitationKey("Shimosaki2019"), + SHIMOSAKI_2019, entry04, entry05, - ((BibEntry) BELLAN_2021.clone()).withCitationKey("Bellan2021"), - ((BibEntry) MASUDA_2022.clone()).withCitationKey("Masuda2022"), + BELLAN_2021, + MASUDA_2022, entry08, entry09, entry10, - ((BibEntry) PODADERA_2012.clone()).withCitationKey("Podadera2012"), + PODADERA_2012, entry12, entry13, - ((BibEntry) KWON_2023.clone()).withCitationKey("Kwon2023"), - ((BibEntry) AKAGI_2023.clone()).withCitationKey("Akagi2023"), - ((BibEntry) INTERNAL_NOTE.clone()), + KWON_2023, + AKAGI_2023, + INTERNAL_NOTE, entry17), parserResult.getDatabase().getEntries()); } @@ -225,57 +266,23 @@ void tua3i2refpage() throws Exception { void ieeePaper() throws Exception { Path file = Path.of(BibliographyFromPdfImporterTest.class.getResource("/pdfs/IEEE/ieee-paper.pdf").toURI()); ParserResult parserResult = bibliographyFromPdfImporter.importDatabase(file); - BibEntry entry01 = new BibEntry(StandardEntryType.Article) - .withCitationKey("Kondo2020") - .withField(StandardField.AUTHOR, "M. O. Alver and T. Tennøy and J. A. Alfredsen and G. 
Øie") - .withField(StandardField.TITLE, "Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks") - .withField(StandardField.JOURNAL, "Aquacultural engineering") - .withField(StandardField.VOLUME, "36") - .withField(StandardField.NUMBER, "2") - .withField(StandardField.YEAR, "2007") - .withField(StandardField.PAGES, "115-121") - .withField(StandardField.COMMENT, "[1] M. O. Alver, T. Tennøy, J. A. Alfredsen, and G. Øie, “Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks,” Aquacultural engineering, vol. 36, no. 2, pp. 115–121, 2007."); - - BibEntry entry02 = new BibEntry(StandardEntryType.Article) - .withCitationKey("Devanz2017") - .withField(StandardField.AUTHOR, "M. O. Alver and others") - .withField(StandardField.TITLE, "Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates") - .withField(StandardField.BOOKTITLE, "Aquaculture") - .withField(StandardField.VOLUME, "268") - .withField(StandardField.NUMBER, "1") - .withField(StandardField.YEAR, "2007") - .withField(StandardField.PAGES, "216-226") - .withField(StandardField.COMMENT, "[2] M. O. Alver et al., “Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates,” Aquaculture, vol. 268, no. 1, pp. 216–226, 2007."); - - BibEntry entry03 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Branas2018") - .withField(StandardField.AUTHOR, "Oliver Kopp and others") - .withField(StandardField.TITLE, "BPMN4TOSCA: A domain-specific language to model management plans for composite applications") - .withField(StandardField.BOOKTITLE, "Business Process Model and Notation") - .withField(StandardField.SERIES, "LNCS") - .withField(StandardField.VOLUME, "125") - .withField(StandardField.YEAR, "2018") - .withField(StandardField.COMMENT, "[3] O. 
Kopp et al., “BPMN4TOSCA: A domain-specific language to model management plans for composite applications,” in Business Process Model and Notation, ser. LNCS, vol. 125. Springer, 2012."); - - BibEntry entry04 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Scantamburlo2023") - .withField(StandardField.AUTHOR, "O. Kopp and A. Armbruster and O. Zimmermann") - .withField(StandardField.TITLE, "Markdown architectural decision records: Format and tool support") - .withField(StandardField.BOOKTITLE, "ZEUS") - .withField(StandardField.YEAR, "2018") - .withField(StandardField.PUBLISHER, "CEUR-WS.org") - .withField(StandardField.COMMENT, "[4] O. Kopp, A. Armbruster, and O. Zimmermann, “Markdown architectural decision records: Format and tool support,” in ZEUS. CEUR-WS.org, 2018."); + assertEquals(List.of(ALVER2007, ALVER2007A, KOPP2012, KOPPP2018, KOENIG2023), parserResult.getDatabase().getEntries()); + } - BibEntry entry05 = new BibEntry(StandardEntryType.InProceedings) - .withCitationKey("Franco2023") - .withField(StandardField.AUTHOR, "S. König and others") - .withField(StandardField.TITLE, "BPMN4Cars: A car-tailored workflow engine") - .withField(StandardField.BOOKTITLE, "INDIN") - .withField(StandardField.PUBLISHER, "IEEE") - .withField(StandardField.YEAR, "2023") - .withField(StandardField.COMMENT, "[5] S. König et al., “BPMN4Cars: A car-tailored workflow engine,” in INDIN. 
IEEE, 2023."); + static Stream referencesPlain() { + return Stream.of(KOENIG2023); + } - assertEquals(List.of(entry01, entry02, entry03, entry04, entry05), parserResult.getDatabase().getEntries()); + @ParameterizedTest + @MethodSource + void referencesPlain(BibEntry expectedEntry) { + String number = expectedEntry.getField(StandardField.COMMENT) + .map(comment -> comment.substring(1, 2)) + .get(); + String reference = expectedEntry.getField(StandardField.COMMENT) + .map(comment -> comment.substring(4)) + .get(); + assertEquals(expectedEntry, bibliographyFromPdfImporter.parseReference(number, reference)); } static Stream references() { From 5f2e8430f7be347fdb2aafc31ae15876ddb890fb Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 16:12:58 +0200 Subject: [PATCH 10/18] Add support for more cases --- .../BibliographyFromPdfImporter.java | 58 ++++++++++--------- .../BibliographyFromPdfImporterTest.java | 9 +-- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java index 8a12ddac9f7..31b1eef5f4c 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java @@ -54,6 +54,7 @@ public class BibliographyFromPdfImporter extends Importer { private static final Pattern YEAR_AT_END = Pattern.compile(", (\\d{4})\\.$"); private static final Pattern PAGES = Pattern.compile(", pp\\. (\\d+--?\\d+)\\.?(.*)"); private static final Pattern PAGE = Pattern.compile(", p\\. (\\d+)(.*)"); + private static final Pattern SERIES = Pattern.compile(", ser\\. ([^.,]+)(.*)"); private static final Pattern MONTH_RANGE_AND_YEAR = Pattern.compile(", ([A-Z][a-z]{2,7}\\.?)-[A-Z][a-z]{2,7}\\.? (\\d+)(.*)"); private static final Pattern MONTH_AND_YEAR = Pattern.compile(", ([A-Z][a-z]{2,7}\\.? 
\\d+),? ?(.*)"); private static final Pattern VOLUME = Pattern.compile(", vol\\. (\\d+)(.*)"); @@ -152,22 +153,18 @@ private List getEntriesFromPDFContent(String contents) { */ private String getReferencesPagesText(PDDocument document) throws IOException { int lastPage = document.getNumberOfPages(); - String lastPageContents = getPageContents(document, new PDFTextStripper(), lastPage); - String result = lastPageContents; - - if (!containsWordReferences(lastPageContents)) { - result = prependToResult(result, document, new PDFTextStripper(), lastPage); - } + String result = prependToResult("", document, new PDFTextStripper(), lastPage); + // Same matcher uses as in {@link containsWordReferences} Matcher matcher = REFERENCES.matcher(result); - if (!matcher.hasMatch()) { + if (!matcher.find()) { // Ensure that not too much is returned LOGGER.warn("Could not found 'References'. Returning last page only."); - return lastPageContents; + return getPageContents(document, new PDFTextStripper(), lastPage); } - int start = matcher.start(); - return result.substring(start); + int end = matcher.end(); + return result.substring(end); } private static boolean containsWordReferences(String result) { @@ -206,7 +203,10 @@ BibEntry parseReference(String number, String reference) { reference = reference .replace(".-", "-") - // Remove "- " introduced by linebreaks in the PDF + .replace("- ", "") + // Unicode en dash (used as page separator) + .replace("–", "-") + // Remove "- " introduced by linebreaks in the PDF .replace("- ", ""); // Move URL to URL field @@ -240,13 +240,15 @@ BibEntry parseReference(String number, String reference) { reference = updateEntryAndReferenceIfMatches(reference, PAGES, result, StandardField.PAGES).newReference; + reference = updateEntryAndReferenceIfMatches(reference, SERIES, result, StandardField.SERIES).newReference; + // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016 // Y. 
Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019 reference = updateEntryAndReferenceIfMatches(reference, PAGE, result, StandardField.PAGES).newReference; matcher = MONTH_RANGE_AND_YEAR.matcher(reference); if (matcher.find()) { - // strip out second month + // strip out second monthp reference = reference.substring(0, matcher.start()) + ", " + matcher.group(1) + " " + matcher.group(2) + matcher.group(3); } @@ -285,9 +287,7 @@ BibEntry parseReference(String number, String reference) { // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia matcher = AUTHORS_AND_TITLE_AT_BEGINNING.matcher(reference); if (matcher.find()) { - String authors = matcher.group(1) - .replace("- ", "") - .replaceAll("et al\\.?", "and others"); + String authors = matcher.group(1).replaceAll("et al\\.?", "and others"); result.setField(StandardField.AUTHOR, AuthorList.fixAuthorFirstNameFirst(authors)); result.setField(StandardField.TITLE, matcher.group(2) .replace("- ", "") @@ -313,11 +313,11 @@ BibEntry parseReference(String number, String reference) { Matcher proceedingsMatcher = PROCEEDINGS_INDICATION.matcher(reference); Matcher workshopMatcher = WORKSHOP.matcher(reference); - if (proceedingsMatcher.matches() || workshopMatcher.matches() && (!volumeFound && !numberFound)) { + if (proceedingsMatcher.find() || workshopMatcher.find() && (!volumeFound && !numberFound)) { result.setType(StandardEntryType.InProceedings); String bookTitle; - if (proceedingsMatcher.matches()) { + if (proceedingsMatcher.hasMatch()) { String proc = proceedingsMatcher.group(1); if (proc == null) { bookTitle = proceedingsMatcher.group(2); @@ -330,7 +330,10 @@ BibEntry parseReference(String number, String reference) { } reference = ""; - int lastDot = bookTitle.lastIndexOf('.'); + int lastDot = bookTitle.lastIndexOf(". 
"); + if (lastDot == -1) { + lastDot = bookTitle.lastIndexOf('.'); + } if (lastDot > 0) { String textAfterDot = bookTitle.substring(lastDot + 1).trim(); // We use Apache Commons here, because it is fastest - see table at https://stackoverflow.com/a/35242882/873282 @@ -353,25 +356,26 @@ BibEntry parseReference(String number, String reference) { } // Nucl. Fusion - reference = reference.trim() - .replace("- ", "") - .replaceAll("\\.$", ""); - if (!reference.contains(",") && !reference.isEmpty()) { + reference = reference.trim().replaceAll("\\.$", ""); + + if (volumeFound || numberFound) { + result.setField(StandardField.JOURNAL, reference); + } else if (!reference.contains(",") && !reference.isEmpty()) { if (reference.endsWith(" Note") || reference.endsWith(" note")) { result.setField(StandardField.NOTE, reference); result.setType(StandardEntryType.TechReport); } else { - result.setField(StandardField.JOURNAL, reference.replace("- ", "")); + LOGGER.debug("Falling back to journal even if no volume and no number was found. Reference: {}", reference); + result.setField(StandardField.JOURNAL, reference); } } else { - LOGGER.trace("InProceedings fallback used for current state of handled string {}", reference); - String toAdd = reference; + LOGGER.trace("InProceedings fallback used. 
Reference: {}", reference); result.setType(StandardEntryType.InProceedings); if (result.hasField(StandardField.BOOKTITLE)) { String oldTitle = result.getField(StandardField.BOOKTITLE).get(); - result.setField(StandardField.BOOKTITLE, oldTitle + " " + toAdd); + result.setField(StandardField.BOOKTITLE, oldTitle + " " + reference); } else { - result.setField(StandardField.BOOKTITLE, toAdd); + result.setField(StandardField.BOOKTITLE, reference); } } diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java index 91d932213ee..6e2098bfd2e 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java @@ -89,7 +89,7 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.MONTH, "#oct#") .withField(StandardField.YEAR, "2023") .withField(StandardField.COMMENT, "[14] S. Kwon et al., “High beam current operation with beam di-agnostics at LIPAc”, presented at HB’23, Geneva, Switzer- land, Oct. 2023, paper FRC1I2, this conference."); - private static final BibEntry ALVER2007 = new BibEntry(StandardEntryType.Article) + private static final BibEntry ALVER2007 = new BibEntry(StandardEntryType.Article) .withCitationKey("1") .withField(StandardField.AUTHOR, "M. O. Alver and T. Tennøy and J. A. Alfredsen and G. Øie") .withField(StandardField.TITLE, "Automatic measurement of rotifer brachionus plicatilis densities in first feeding tanks") @@ -103,7 +103,7 @@ class BibliographyFromPdfImporterTest { .withCitationKey("2") .withField(StandardField.AUTHOR, "M. O. 
Alver and others") .withField(StandardField.TITLE, "Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates") - .withField(StandardField.BOOKTITLE, "Aquaculture") + .withField(StandardField.JOURNAL, "Aquaculture") .withField(StandardField.VOLUME, "268") .withField(StandardField.NUMBER, "1") .withField(StandardField.YEAR, "2007") @@ -111,12 +111,13 @@ class BibliographyFromPdfImporterTest { .withField(StandardField.COMMENT, "[2] M. O. Alver et al., “Estimating larval density in cod (gadus morhua) first feeding tanks using measurements of feed density and larval growth rates,” Aquaculture, vol. 268, no. 1, pp. 216–226, 2007."); private static final BibEntry KOPP2012 = new BibEntry(StandardEntryType.InProceedings) .withCitationKey("3") - .withField(StandardField.AUTHOR, "Oliver Kopp and others") + .withField(StandardField.AUTHOR, "O. Kopp and others") .withField(StandardField.TITLE, "BPMN4TOSCA: A domain-specific language to model management plans for composite applications") .withField(StandardField.BOOKTITLE, "Business Process Model and Notation") .withField(StandardField.SERIES, "LNCS") .withField(StandardField.VOLUME, "125") - .withField(StandardField.YEAR, "2018") + .withField(StandardField.YEAR, "2012") + .withField(StandardField.PUBLISHER, "Springer") .withField(StandardField.COMMENT, "[3] O. Kopp et al., “BPMN4TOSCA: A domain-specific language to model management plans for composite applications,” in Business Process Model and Notation, ser. LNCS, vol. 125. 
Springer, 2012."); private static final BibEntry KOPPP2018 = new BibEntry(StandardEntryType.InProceedings) .withCitationKey("4") From ffab0f17cb0c2fdc49875686dfb792899d8f3a25 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 16:40:34 +0200 Subject: [PATCH 11/18] Show "online" and "offline" in menu --- .../org/jabref/gui/actions/StandardActions.java | 3 ++- .../java/org/jabref/gui/frame/MainMenu.java | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/gui/actions/StandardActions.java b/src/main/java/org/jabref/gui/actions/StandardActions.java index 4bb5338f1ce..92d1923e52e 100644 --- a/src/main/java/org/jabref/gui/actions/StandardActions.java +++ b/src/main/java/org/jabref/gui/actions/StandardActions.java @@ -93,7 +93,8 @@ public enum StandardActions implements Action { PARSE_LATEX(Localization.lang("Search for citations in LaTeX files..."), IconTheme.JabRefIcons.LATEX_CITATIONS), NEW_SUB_LIBRARY_FROM_AUX(Localization.lang("New sublibrary based on AUX file") + "...", Localization.lang("New BibTeX sublibrary") + Localization.lang("This feature generates a new library based on which entries are needed in an existing LaTeX document."), IconTheme.JabRefIcons.NEW), - NEW_LIBRARY_FROM_PDF(Localization.lang("New library based on references in PDF file..."), Localization.lang("This feature generates a new library based on the list of references in a PDF file."), IconTheme.JabRefIcons.NEW), + NEW_LIBRARY_FROM_PDF_ONLINE(Localization.lang("New library based on references in PDF file... (online)"), Localization.lang("This feature generates a new library based on the list of references in a PDF file. Thereby, it uses GROBID's functionality."), IconTheme.JabRefIcons.NEW), + NEW_LIBRARY_FROM_PDF_OFFLINE(Localization.lang("New library based on references in PDF file... (offline)"), Localization.lang("This feature generates a new library based on the list of references in a PDF file. 
Thereby, it uses JabRef's build-in functionality.."), IconTheme.JabRefIcons.NEW), WRITE_METADATA_TO_PDF(Localization.lang("Write metadata to PDF files"), Localization.lang("Will write metadata to the PDFs linked from selected entries."), KeyBinding.WRITE_METADATA_TO_PDF), START_NEW_STUDY(Localization.lang("Start new systematic literature review")), diff --git a/src/main/java/org/jabref/gui/frame/MainMenu.java b/src/main/java/org/jabref/gui/frame/MainMenu.java index c0414a642e7..dfaf80b69ae 100644 --- a/src/main/java/org/jabref/gui/frame/MainMenu.java +++ b/src/main/java/org/jabref/gui/frame/MainMenu.java @@ -81,6 +81,8 @@ import org.jabref.model.util.FileUpdateMonitor; import org.jabref.preferences.PreferencesService; +import com.tobiasdiez.easybind.EasyBind; + public class MainMenu extends MenuBar { private final JabRefFrame frame; private final FileHistoryMenu fileHistoryMenu; @@ -274,10 +276,17 @@ private void createMenu() { final MenuItem pushToApplicationMenuItem = factory.createMenuItem(pushToApplicationCommand.getAction(), pushToApplicationCommand); pushToApplicationCommand.registerReconfigurable(pushToApplicationMenuItem); + NewLibraryFromPdfAction newLibraryFromPdfAction = new NewLibraryFromPdfAction(frame, stateManager, dialogService, preferencesService, taskExecutor); + // Action used twice, because it distinguishes internally between online and offline + // We want the UI to show "online" and "offline" explicitly + MenuItem newLibraryFromPdfMenuItemOnline = factory.createMenuItem(StandardActions.NEW_LIBRARY_FROM_PDF_ONLINE, newLibraryFromPdfAction); + MenuItem newLibraryFromPdfMenuItemOffline = factory.createMenuItem(StandardActions.NEW_LIBRARY_FROM_PDF_OFFLINE, newLibraryFromPdfAction); + tools.getItems().addAll( factory.createMenuItem(StandardActions.PARSE_LATEX, new ParseLatexAction(stateManager)), factory.createMenuItem(StandardActions.NEW_SUB_LIBRARY_FROM_AUX, new NewSubLibraryAction(frame, stateManager, dialogService)), - 
factory.createMenuItem(StandardActions.NEW_LIBRARY_FROM_PDF, new NewLibraryFromPdfAction(frame, stateManager, dialogService, preferencesService, taskExecutor)), + newLibraryFromPdfMenuItemOnline, + newLibraryFromPdfMenuItemOffline, new SeparatorMenuItem(), @@ -305,6 +314,12 @@ private void createMenu() { factory.createMenuItem(StandardActions.REDOWNLOAD_MISSING_FILES, new RedownloadMissingFilesAction(stateManager, dialogService, preferencesService.getFilePreferences(), taskExecutor)) ); + + EasyBind.subscribe(preferencesService.getGrobidPreferences().grobidEnabledProperty(), enabled -> { + newLibraryFromPdfMenuItemOnline.setVisible(enabled); + newLibraryFromPdfMenuItemOffline.setVisible(!enabled); + }); + SidePaneType webSearchPane = SidePaneType.WEB_SEARCH; SidePaneType groupsPane = SidePaneType.GROUPS; SidePaneType openOfficePane = SidePaneType.OPEN_OFFICE; From a50616b70c97b399420fed12c5ae4b80902b4446 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 16:53:27 +0200 Subject: [PATCH 12/18] Merge test methods (and add one test) --- .../BibliographyFromPdfImporterTest.java | 77 +++++-------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java index 6e2098bfd2e..bc079eacc19 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java @@ -12,7 +12,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -240,8 +239,6 @@ void tua3i2refpage() throws Exception { 
.withField(StandardField.URL, "https://indico.cern.ch/event/1027296/") .withField(StandardField.YEAR, "2021"); - // We use the existing test entries, but add a citation key (which is added by the importer) - // We need to clone to keep the static entries unmodified assertEquals(List.of( KNASTER_2017, entry02, @@ -270,13 +267,30 @@ void ieeePaper() throws Exception { assertEquals(List.of(ALVER2007, ALVER2007A, KOPP2012, KOPPP2018, KOENIG2023), parserResult.getDatabase().getEntries()); } - static Stream referencesPlain() { - return Stream.of(KOENIG2023); + static Stream references() { + return Stream.of(KOENIG2023, + KNASTER_2017, + SHIMOSAKI_2019, + BELLAN_2021, + MASUDA_2022, + PODADERA_2012, + KWON_2023, + AKAGI_2023, + INTERNAL_NOTE, + new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("18") + .withField(StandardField.AUTHOR, "Z. Yao and D. S. Weld and W.-P. Chen and H. Sun") + .withField(StandardField.BOOKTITLE, "Proceedings of the 2018 World Wide Web Conference") + .withField(StandardField.COMMENT, "[18] Z. Yao, D. S. Weld, W.-P. Chen, and H. Sun, “Staqc: A systematically mined question-code dataset from stack overflow,” in Proceedings of the 2018 World Wide Web Conference, 2018, pp. 1693–1703.") + .withField(StandardField.TITLE, "Staqc: A systematically mined question-code dataset from stack overflow") + .withField(StandardField.PAGES, "1693-1703") + .withField(StandardField.YEAR, "2018") + ); } @ParameterizedTest @MethodSource - void referencesPlain(BibEntry expectedEntry) { + void references(BibEntry expectedEntry) { String number = expectedEntry.getField(StandardField.COMMENT) .map(comment -> comment.substring(1, 2)) .get(); @@ -285,55 +299,4 @@ void referencesPlain(BibEntry expectedEntry) { .get(); assertEquals(expectedEntry, bibliographyFromPdfImporter.parseReference(number, reference)); } - - static Stream references() { - return Stream.of( - Arguments.of( - KNASTER_2017, - "1", - "J. 
Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016, 2017. doi:10.1088/ 1741-4326/aa6a6a" - ), - Arguments.of( - SHIMOSAKI_2019, - "3", - "Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019, pp. 977-979. doi:10.18429/ JACoW-IPAC2019-MOPTS051" - ), - Arguments.of( - BELLAN_2021, - "6", - "L. Bellan et al., “Acceleration of the high current deuteron beam through the IFMIF-EVEDA beam dynamics perfor- mances”, in Proc. HB’21, Batavia, IL, USA, Oct. 2021, pp. 197-202. doi:10.18429/JACoW-HB2021-WEDC2" - ), - Arguments.of( - MASUDA_2022, - "7", - "K. Masuda et al., “Commissioning of IFMIF Prototype Ac- celerator towards CW operation”, in Proc. LINAC’22, Liv- erpool, UK, Aug.-Sep. 2022, pp. 319-323. doi:10.18429/ JACoW-LINAC2022-TU2AA04" - ), - Arguments.of( - PODADERA_2012, - "11", - "I. Podadera, J. M. Carmona, A. Ibarra, and J. Molla, “Beam position monitor development for LIPAc”, presented at th 8th DITANET Topical Workshop on Beam Position Monitors, CERN, Geneva, Switzreland, Jan. 2012." - ), - Arguments.of( - KWON_2023, - "14", - "S. Kwon et al., “High beam current operation with beam di-agnostics at LIPAc”, presented at HB’23, Geneva, Switzer- land, Oct. 2023, paper FRC1I2, this conference." - ), - Arguments.of( - AKAGI_2023, - "15", - "T. Akagi et al., “Achievement of high-current continuous- wave deuteron injector for Linear IFMIF Prototype Accelera- tor (LIPAc)”, to be presented at IAEA FEC’23, London, UK, Oct. 2023. https://www.iaea.org/events/fec2023" - ), - Arguments.of( - INTERNAL_NOTE, - "16", - "“AF4.1.1 SRF Linac Engineering Design Report”, Internal note." 
- ) - ); - } - - @ParameterizedTest - @MethodSource - void references(BibEntry expectedEntry, String number, String reference) { - assertEquals(expectedEntry, bibliographyFromPdfImporter.parseReference(number, reference)); - } } From 89c60f1d377085241747678b73280a1c6c0bab21 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 21:20:00 +0200 Subject: [PATCH 13/18] Cover more cases --- .../logic/importer/AuthorListParser.java | 55 ++++++++++--- .../BibliographyFromPdfImporter.java | 77 +++++++++++-------- .../logic/importer/AuthorListParserTest.java | 13 ++++ .../BibliographyFromPdfImporterTest.java | 21 +++-- .../jabref/model/entry/AuthorListTest.java | 14 ++++ 5 files changed, 125 insertions(+), 55 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/AuthorListParser.java b/src/main/java/org/jabref/logic/importer/AuthorListParser.java index 023135e87fa..c0269769571 100644 --- a/src/main/java/org/jabref/logic/importer/AuthorListParser.java +++ b/src/main/java/org/jabref/logic/importer/AuthorListParser.java @@ -37,7 +37,7 @@ public class AuthorListParser { private static final Set TEX_NAMES = Set.of( "aa", "ae", "l", "o", "oe", "i", "AA", "AE", "L", "O", "OE", "j"); - private static final Pattern STARTS_WITH_CAPITAL_LETTER_DOT = Pattern.compile("^[A-Z]\\. "); + private static final Pattern STARTS_WITH_CAPITAL_LETTER_DOT_OR_DASH = Pattern.compile("^[A-Z](\\.[ -]| ?-)"); /** * the raw bibtex author/editor field @@ -95,16 +95,14 @@ private static StringBuilder buildWithAffix(Collection indexArray, List return stringBuilder; } - /** - * Parses the String containing person names and returns a list of person information. 
- * - * @param listOfNames the String containing the person names to be parsed - * @return a parsed list of persons - */ - public AuthorList parse(@NonNull String listOfNames) { + private record SimpleNormalFormResult(String authors, boolean andOthersPresent) { + } + + private static SimpleNormalFormResult getSimpleNormalForm(String listOfNames) { + listOfNames = listOfNames.replace(" -", "-").trim(); + // Handling of "and others" // Remove it from the list; it will be added at the very end of this method as special Author.OTHERS - listOfNames = listOfNames.trim(); final String andOthersSuffix = " and others"; final boolean andOthersPresent; if (StringUtil.endsWithIgnoreCase(listOfNames, andOthersSuffix)) { @@ -114,7 +112,40 @@ public AuthorList parse(@NonNull String listOfNames) { andOthersPresent = false; } - listOfNames = checkNamesCommaSeparated(listOfNames); + return new SimpleNormalFormResult(checkNamesCommaSeparated(listOfNames), andOthersPresent); + } + + /** + * Tries to get a simple BibTeX author list of the given string. + * + * This is an intermediate step in {@link #parse}. Since parse does not work in all cases, + * this method can be used to get more valid BibTeX. + * + * @return Optional.empty if there was no normalization. + */ + public static Optional normalizeSimply(String listOfNames) { + SimpleNormalFormResult simpleNormalForm = getSimpleNormalForm(listOfNames); + String result = simpleNormalForm.authors; + if (simpleNormalForm.andOthersPresent) { + result += " and others"; + } + if (result.equals(listOfNames)) { + // No changes were done inside the method + return Optional.empty(); + } + return Optional.of(result); + } + + /** + * Parses the String containing person names and returns a list of person information. 
+ * + * @param listOfNames the String containing the person names to be parsed + * @return a parsed list of persons + */ + public AuthorList parse(@NonNull String listOfNames) { + SimpleNormalFormResult simpleNormalForm = getSimpleNormalForm(listOfNames); + listOfNames = simpleNormalForm.authors; + boolean andOthersPresent = simpleNormalForm.andOthersPresent; // Handle case names in order lastname, firstname and separated by "," // E.g., Ali Babar, M., Dingsøyr, T., Lago, P., van der Vliet, H. @@ -188,11 +219,11 @@ private static String checkNamesCommaSeparated(String listOfNames) { int commandAndPos = listOfNames.lastIndexOf(", and "); if (commandAndPos >= 0) { String lastContainedName = listOfNames.substring(commandAndPos + ", and ".length()); - Matcher matcher = STARTS_WITH_CAPITAL_LETTER_DOT.matcher(lastContainedName); + Matcher matcher = STARTS_WITH_CAPITAL_LETTER_DOT_OR_DASH.matcher(lastContainedName); if (matcher.find()) { String namesBeforeAndString = listOfNames.substring(0, commandAndPos); String[] namesBeforeAnd = namesBeforeAndString.split(", "); - if (Arrays.stream(namesBeforeAnd).allMatch(name -> STARTS_WITH_CAPITAL_LETTER_DOT.matcher(name).find())) { + if (Arrays.stream(namesBeforeAnd).allMatch(name -> STARTS_WITH_CAPITAL_LETTER_DOT_OR_DASH.matcher(name).find())) { // Format found listOfNames = Arrays.stream(namesBeforeAnd).collect(Collectors.joining(" and ", "", " and " + lastContainedName)); } diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java index 31b1eef5f4c..49d3ffa6291 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporter.java @@ -16,6 +16,7 @@ import org.jabref.logic.citationkeypattern.CitationKeyPatternPreferences; import org.jabref.logic.cleanup.URLCleanup; import 
org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter; +import org.jabref.logic.importer.AuthorListParser; import org.jabref.logic.importer.Importer; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.l10n.Localization; @@ -52,6 +53,7 @@ public class BibliographyFromPdfImporter extends Importer { private static final Pattern REFERENCES = Pattern.compile("References", Pattern.CASE_INSENSITIVE); private static final Pattern REFERENCE_PATTERN = Pattern.compile("\\[(\\d+)\\](.*?)(?=\\[|$)", Pattern.DOTALL); private static final Pattern YEAR_AT_END = Pattern.compile(", (\\d{4})\\.$"); + private static final Pattern YEAR = Pattern.compile(", (\\d{4})(.*)"); private static final Pattern PAGES = Pattern.compile(", pp\\. (\\d+--?\\d+)\\.?(.*)"); private static final Pattern PAGE = Pattern.compile(", p\\. (\\d+)(.*)"); private static final Pattern SERIES = Pattern.compile(", ser\\. ([^.,]+)(.*)"); @@ -128,7 +130,8 @@ public ParserResult importDatabase(Path filePath) { return parserResult; } - private record IntermediateData(String number, String reference) { + @VisibleForTesting + record IntermediateData(String number, String reference) { } /** @@ -136,16 +139,22 @@ private record IntermediateData(String number, String reference) { * Out: List<String> = ["[1] ...", "[2]...", "[3]..."] */ private List getEntriesFromPDFContent(String contents) { + List referencesStrings = getIntermediateData(contents); + + return referencesStrings.stream() + .map(data -> parseReference(data.number(), data.reference())) + .toList(); + } + + @VisibleForTesting + static List getIntermediateData(String contents) { List referencesStrings = new ArrayList<>(); Matcher matcher = REFERENCE_PATTERN.matcher(contents); while (matcher.find()) { String reference = matcher.group(2).replaceAll("\\r?\\n", " ").trim(); referencesStrings.add(new IntermediateData(matcher.group(1), reference)); } - - return referencesStrings.stream() - .map(data -> parseReference(data.number(), 
data.reference())) - .toList(); + return referencesStrings; } /** @@ -203,11 +212,10 @@ BibEntry parseReference(String number, String reference) { reference = reference .replace(".-", "-") - .replace("- ", "") // Unicode en dash (used as page separator) .replace("–", "-") - // Remove "- " introduced by linebreaks in the PDF - .replace("- ", ""); + // Remove "- " introduced by linebreaks in the PDF + .replaceAll("([^ ])- ", "$1"); // Move URL to URL field Matcher urlPatternMatcher = URLCleanup.URL_PATTERN.matcher(reference); @@ -230,23 +238,15 @@ BibEntry parseReference(String number, String reference) { reference = reference.substring(0, pos).trim(); } - // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016, 2017. - // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019, pp. 977-979 - Matcher matcher = YEAR_AT_END.matcher(reference); - if (matcher.find()) { - result.setField(StandardField.YEAR, matcher.group(1)); - reference = reference.substring(0, matcher.start()).trim(); - } - reference = updateEntryAndReferenceIfMatches(reference, PAGES, result, StandardField.PAGES).newReference; - reference = updateEntryAndReferenceIfMatches(reference, SERIES, result, StandardField.SERIES).newReference; - // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016 // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. 
IPAC’19, Mel- bourne, Australia, May 2019 reference = updateEntryAndReferenceIfMatches(reference, PAGE, result, StandardField.PAGES).newReference; - matcher = MONTH_RANGE_AND_YEAR.matcher(reference); + reference = updateEntryAndReferenceIfMatches(reference, SERIES, result, StandardField.SERIES).newReference; + + Matcher matcher = MONTH_RANGE_AND_YEAR.matcher(reference); if (matcher.find()) { // strip out second monthp reference = reference.substring(0, matcher.start()) + ", " + matcher.group(1) + " " + matcher.group(2) + matcher.group(3); @@ -273,6 +273,16 @@ BibEntry parseReference(String number, String reference) { } } + // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57, p. 102016, 2017. + // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. IPAC’19, Mel- bourne, Australia, May 2019, pp. 977-979 + matcher = YEAR_AT_END.matcher(reference); + if (matcher.find()) { + result.setField(StandardField.YEAR, matcher.group(1)); + reference = reference.substring(0, matcher.start()).trim(); + } + + reference = updateEntryAndReferenceIfMatches(reference, YEAR, result, StandardField.YEAR).newReference; + // J. Knaster et al., “Overview of the IFMIF/EVEDA project”, Nucl. Fusion, vol. 57 // Y. Shimosaki et al., “Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc”, in Proc. 
IPAC’19, Mel- bourne, Australia EntryUpdateResult entryUpdateResult = updateEntryAndReferenceIfMatches(reference, VOLUME, result, StandardField.VOLUME); @@ -288,10 +298,13 @@ BibEntry parseReference(String number, String reference) { matcher = AUTHORS_AND_TITLE_AT_BEGINNING.matcher(reference); if (matcher.find()) { String authors = matcher.group(1).replaceAll("et al\\.?", "and others"); - result.setField(StandardField.AUTHOR, AuthorList.fixAuthorFirstNameFirst(authors)); - result.setField(StandardField.TITLE, matcher.group(2) - .replace("- ", "") - .replaceAll("et al\\.?", "and others")); + + // Alternative: AuthorList.fixAuthorFirstNameFirst(authors) only + // However, this does not work with special cases. Thus, we do a simple transformation only. + String fixedAuthors = AuthorListParser.normalizeSimply(authors).orElseGet(() -> AuthorList.fixAuthorFirstNameFirst(authors)); + + result.setField(StandardField.AUTHOR, fixedAuthors); + result.setField(StandardField.TITLE, matcher.group(2).replaceAll("et al\\.?", "and others")); reference = reference.substring(matcher.end()).trim(); } else { // No authors present @@ -317,7 +330,9 @@ BibEntry parseReference(String number, String reference) { result.setType(StandardEntryType.InProceedings); String bookTitle; + int offset; if (proceedingsMatcher.hasMatch()) { + offset = proceedingsMatcher.start(2) - 3; // 3 is the length of "in " String proc = proceedingsMatcher.group(1); if (proc == null) { bookTitle = proceedingsMatcher.group(2); @@ -326,19 +341,20 @@ BibEntry parseReference(String number, String reference) { bookTitle = proc + proceedingsMatcher.group(2); } } else { + offset = 0; bookTitle = reference; } reference = ""; - int lastDot = bookTitle.lastIndexOf(". "); + int lastDot = bookTitle.substring(offset).lastIndexOf(". 
"); if (lastDot == -1) { - lastDot = bookTitle.lastIndexOf('.'); + lastDot = bookTitle.substring(offset).lastIndexOf('.'); } - if (lastDot > 0) { - String textAfterDot = bookTitle.substring(lastDot + 1).trim(); + if (lastDot > offset) { + String textAfterDot = bookTitle.substring(offset + lastDot + 1).trim(); // We use Apache Commons here, because it is fastest - see table at https://stackoverflow.com/a/35242882/873282 if (!textAfterDot.contains("http") && (StringUtils.countMatches(textAfterDot, ' ') <= 1)) { - bookTitle = bookTitle.substring(0, lastDot).trim(); + bookTitle = bookTitle.substring(0, offset + lastDot).trim(); if (bookTitle.startsWith("in ")) { bookTitle = bookTitle.substring(3); } @@ -392,11 +408,8 @@ private static EntryUpdateResult updateEntryAndReferenceIfMatches(String referen if (!matcher.find()) { return new EntryUpdateResult(false, reference); } - result.setField(field, matcher.group(1).replace("- ", "")); + result.setField(field, matcher.group(1)); String suffix = matcher.group(2); - if (!suffix.isEmpty()) { - suffix = " " + suffix; - } reference = reference.substring(0, matcher.start()).trim() + suffix; return new EntryUpdateResult(true, reference); } diff --git a/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java b/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java index 56611851ca7..fdb24cff12f 100644 --- a/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java +++ b/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java @@ -1,10 +1,12 @@ package org.jabref.logic.importer; +import java.util.Optional; import java.util.stream.Stream; import org.jabref.model.entry.Author; import org.jabref.model.entry.AuthorList; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -46,6 +48,17 @@ void parseSingleAuthorCorrectly(String authorsString, Author 
authorsParsed) { assertEquals(AuthorList.of(authorsParsed), parser.parse(authorsString)); } + @Test + public void dashedNamesWithoutSpaceNormalized() { + assertEquals(Optional.of("Z. Yao and D. S. Weld and W-P. Chen and H. Sun"), AuthorListParser.normalizeSimply("Z. Yao, D. S. Weld, W-P. Chen, and H. Sun")); + } + + @Test + public void dashedNamesWithSpaceNormalized() { + assertEquals(Optional.of("Z. Yao and D. S. Weld and W.-P. Chen and H. Sun"), AuthorListParser.normalizeSimply("Z. Yao, D. S. Weld, W.-P. Chen, and H. Sun")); + + } + private static Stream parseMultipleCorrectly() { return Stream.of( Arguments.of( diff --git a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java index bc079eacc19..661f30c04b2 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/BibliographyFromPdfImporterTest.java @@ -31,7 +31,7 @@ class BibliographyFromPdfImporterTest { private static final BibEntry SHIMOSAKI_2019 = new BibEntry(StandardEntryType.InProceedings) .withCitationKey("3") .withField(StandardField.AUTHOR, "Y. Shimosaki and others") - .withField(StandardField.TITLE, "Lattice design for 5 MeV – 125 mA CW RFQ operation in LIPAc") + .withField(StandardField.TITLE, "Lattice design for 5 MeV - 125 mA CW RFQ operation in LIPAc") .withField(StandardField.BOOKTITLE, "Proc. IPAC’19, Melbourne, Australia") .withField(StandardField.MONTH, "#may#") .withField(StandardField.YEAR, "2019") @@ -202,8 +202,9 @@ void tua3i2refpage() throws Exception { BibEntry entry10 = new BibEntry(StandardEntryType.InProceedings) .withCitationKey("10") .withField(StandardField.AUTHOR, "K. Hirosawa and others") - .withField(StandardField.BOOKTITLE, "Proc. PASJ’23, 2023, Japan") + .withField(StandardField.BOOKTITLE, "Proc. 
PASJ’23, Japan") .withField(StandardField.TITLE, "High-Power RF tests of repaired circulator for LIPAc RFQ") + .withField(StandardField.YEAR, "2023") .withField(StandardField.COMMENT, "[10] K. Hirosawa et al., “High-Power RF tests of repaired circu- lator for LIPAc RFQ”, in Proc. PASJ’23, 2023, Japan."); BibEntry entry12 = new BibEntry(StandardEntryType.InProceedings) @@ -268,7 +269,8 @@ void ieeePaper() throws Exception { } static Stream references() { - return Stream.of(KOENIG2023, + return Stream.of( + KOENIG2023, KNASTER_2017, SHIMOSAKI_2019, BELLAN_2021, @@ -279,7 +281,7 @@ static Stream references() { INTERNAL_NOTE, new BibEntry(StandardEntryType.InProceedings) .withCitationKey("18") - .withField(StandardField.AUTHOR, "Z. Yao and D. S. Weld and W.-P. Chen and H. Sun") + .withField(StandardField.AUTHOR, "Z. Yao and D. S. Weld and W-P. Chen and H. Sun") .withField(StandardField.BOOKTITLE, "Proceedings of the 2018 World Wide Web Conference") .withField(StandardField.COMMENT, "[18] Z. Yao, D. S. Weld, W.-P. Chen, and H. Sun, “Staqc: A systematically mined question-code dataset from stack overflow,” in Proceedings of the 2018 World Wide Web Conference, 2018, pp. 
1693–1703.") .withField(StandardField.TITLE, "Staqc: A systematically mined question-code dataset from stack overflow") @@ -291,12 +293,9 @@ static Stream references() { @ParameterizedTest @MethodSource void references(BibEntry expectedEntry) { - String number = expectedEntry.getField(StandardField.COMMENT) - .map(comment -> comment.substring(1, 2)) - .get(); - String reference = expectedEntry.getField(StandardField.COMMENT) - .map(comment -> comment.substring(4)) - .get(); - assertEquals(expectedEntry, bibliographyFromPdfImporter.parseReference(number, reference)); + List intermediateDataList = BibliographyFromPdfImporter.getIntermediateData(expectedEntry.getField(StandardField.COMMENT).get()); + assertEquals(1, intermediateDataList.size()); + BibliographyFromPdfImporter.IntermediateData intermediateData = intermediateDataList.getFirst(); + assertEquals(expectedEntry, bibliographyFromPdfImporter.parseReference(intermediateData.number(), intermediateData.reference())); } } diff --git a/src/test/java/org/jabref/model/entry/AuthorListTest.java b/src/test/java/org/jabref/model/entry/AuthorListTest.java index 57617913ddd..3e70d786ccb 100644 --- a/src/test/java/org/jabref/model/entry/AuthorListTest.java +++ b/src/test/java/org/jabref/model/entry/AuthorListTest.java @@ -4,6 +4,7 @@ import java.util.Collections; import java.util.Optional; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -1045,6 +1046,19 @@ public void parseNameWithHyphenInLastNameWhenLastNameGivenFirst() throws Excepti assertEquals(AuthorList.of(expected), AuthorList.parse("al-Ṣāliḥ, ʿAbdallāh")); } + @Test + @Disabled("Has issues with space character in W-P.") + public void parseWithDash() throws Exception { + assertEquals( + AuthorList.of( + new Author("Z.", "Z.", null, "Yao", null), + new Author("D. S.", "D. 
S.", null, "Weld", null), + new Author("W-P.", "W-P.", null, "Chen", null), + new Author("H.", "H.", null, "Sun", null) + ), + AuthorList.parse("Z. Yao, D. S. Weld, W.-P. Chen, and H. Sun")); + } + @Test public void parseNameWithBraces() throws Exception { Author expected = new Author("H{e}lene", "H.", null, "Fiaux", null); From 862c76f376e5e84b9dbd87cf0c04d79969f13455 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 21:39:53 +0200 Subject: [PATCH 14/18] Update CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9feb1a92e1..3d4218b6d53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,8 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ### Added - We added support for selecting and using CSL Styles in JabRef's OpenOffice/LibreOffice integration for inserting bibliographic and in-text citations into a document. [#2146](https://github.com/JabRef/jabref/issues/2146), [#8893](https://github.com/JabRef/jabref/issues/8893) -- We added Tools > New library based on references in PDF file... to create a new library based on the references section in a PDF file. -- When converting the references section of a paper (PDF file), more than the last page is treated. +- We added Tools > New library based on references in PDF file... to create a new library based on the references section in a PDF file. [#11522](https://github.com/JabRef/jabref/pull/11522) +- When converting the references section of a paper (PDF file), more than the last page is treated. [#11522](https://github.com/JabRef/jabref/pull/11522) - Added minimal support for [biblatex data annotation](https://mirrors.ctan.org/macros/latex/contrib/biblatex/doc/biblatex.pdf#subsection.3.7) fields in .layout files. 
[#11505](https://github.com/JabRef/jabref/issues/11505) - Added saving of selected options in the [Lookup -> Search for unlinked local files dialog](https://docs.jabref.org/collect/findunlinkedfiles#link-the-pdfs-to-your-bib-library). [#11439](https://github.com/JabRef/jabref/issues/11439) From 651b7fd3305e3642a85cb5ca21c5a69968a35c8e Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 22:01:47 +0200 Subject: [PATCH 15/18] Fix localization --- src/main/resources/l10n/JabRef_en.properties | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index 75f4b905355..22ee48ca568 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -891,8 +891,10 @@ AUX\ file\ import=AUX file import LaTeX\ AUX\ file\:=LaTeX AUX file\: found\ in\ AUX\ file=found in AUX file nested\ AUX\ files=nested AUX files -New\ library\ based\ on\ references\ in\ PDF\ file...=New library based on references in PDF file... -This\ feature\ generates\ a\ new\ library\ based\ on\ the\ list\ of\ references\ in\ a\ PDF\ file.=This feature generates a new library based on the list of references in a PDF file. +New\ library\ based\ on\ references\ in\ PDF\ file...\ (offline)=New library based on references in PDF file... (offline) +New\ library\ based\ on\ references\ in\ PDF\ file...\ (online)=New library based on references in PDF file... (online) +This\ feature\ generates\ a\ new\ library\ based\ on\ the\ list\ of\ references\ in\ a\ PDF\ file.\ Thereby,\ it\ uses\ GROBID's\ functionality.=This feature generates a new library based on the list of references in a PDF file. Thereby, it uses GROBID's functionality. 
+This\ feature\ generates\ a\ new\ library\ based\ on\ the\ list\ of\ references\ in\ a\ PDF\ file.\ Thereby,\ it\ uses\ JabRef's\ build-in\ functionality..=This feature generates a new library based on the list of references in a PDF file. Thereby, it uses JabRef's build-in functionality.. Sublibrary\ from\ AUX\ to\ BibTeX=Sublibrary from AUX to BibTeX New\ BibTeX\ sublibrary=New BibTeX sublibrary From def1aaa90b5e1d73196cbcc90f5b5ca8c1e9e58f Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 21 Jul 2024 22:12:39 +0200 Subject: [PATCH 16/18] Fix checkstyle --- .../java/org/jabref/logic/importer/AuthorListParserTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java b/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java index fdb24cff12f..c83ff72598a 100644 --- a/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java +++ b/src/test/java/org/jabref/logic/importer/AuthorListParserTest.java @@ -56,7 +56,6 @@ public void dashedNamesWithoutSpaceNormalized() { @Test public void dashedNamesWithSpaceNormalized() { assertEquals(Optional.of("Z. Yao and D. S. Weld and W.-P. Chen and H. Sun"), AuthorListParser.normalizeSimply("Z. Yao, D. S. Weld, W.-P. Chen, and H. 
Sun")); - } private static Stream parseMultipleCorrectly() { From 9cf5c71a9c81518c8941dfb15bb8afcbc0814c35 Mon Sep 17 00:00:00 2001 From: Siedlerchr Date: Wed, 24 Jul 2024 21:22:37 +0200 Subject: [PATCH 17/18] add both menu items --- src/main/java/org/jabref/gui/frame/MainMenu.java | 5 +---- .../org/jabref/gui/maintable/NewLibraryFromPdfAction.java | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/gui/frame/MainMenu.java b/src/main/java/org/jabref/gui/frame/MainMenu.java index dfaf80b69ae..a7b7aab0e6a 100644 --- a/src/main/java/org/jabref/gui/frame/MainMenu.java +++ b/src/main/java/org/jabref/gui/frame/MainMenu.java @@ -315,10 +315,7 @@ private void createMenu() { factory.createMenuItem(StandardActions.REDOWNLOAD_MISSING_FILES, new RedownloadMissingFilesAction(stateManager, dialogService, preferencesService.getFilePreferences(), taskExecutor)) ); - EasyBind.subscribe(preferencesService.getGrobidPreferences().grobidEnabledProperty(), enabled -> { - newLibraryFromPdfMenuItemOnline.setVisible(enabled); - newLibraryFromPdfMenuItemOffline.setVisible(!enabled); - }); + EasyBind.subscribe(preferencesService.getGrobidPreferences().grobidEnabledProperty(), newLibraryFromPdfMenuItemOnline::setVisible); SidePaneType webSearchPane = SidePaneType.WEB_SEARCH; SidePaneType groupsPane = SidePaneType.GROUPS; diff --git a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java index 0716a9aa2a0..e7108cb34a6 100644 --- a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java +++ b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java @@ -17,6 +17,7 @@ import org.jabref.logic.importer.util.GrobidService; import org.jabref.logic.l10n.Localization; import org.jabref.logic.util.StandardFileType; +import org.jabref.model.database.BibDatabaseContext; import org.jabref.preferences.PreferencesService; import org.slf4j.Logger; @@ -65,7 
+66,7 @@ public void execute() { builder.withDefaultExtension(StandardFileType.PDF); // Sensible default for the directory to start browsing is the directory of the currently opened library. The pdf storage dir seems not to be feasible, because extracting references from a PDF itself can be done by the context menu of the respective entry. stateManager.getActiveDatabase() - .flatMap(db -> db.getDatabasePath()) + .flatMap(BibDatabaseContext::getDatabasePath) .ifPresent(path -> builder.withInitialDirectory(path.getParent())); FileDialogConfiguration fileDialogConfiguration = builder.build(); @@ -76,7 +77,7 @@ public void execute() { Callable parserResultCallable = getParserResultCallable(path); BackgroundTask.wrap(parserResultCallable) .withInitialMessage(Localization.lang("Processing PDF(s)")) - .onFailure(failure -> Platform.runLater(() -> dialogService.showErrorDialogAndWait(failure))) + .onFailure(dialogService::showErrorDialogAndWait) .onSuccess(result -> { LOGGER.trace("Finished processing PDF(s): {}", result); libraryTabContainer.addTab(result.getDatabaseContext(), true); From 9535b75f30541554099fe56f42e9e57574397500 Mon Sep 17 00:00:00 2001 From: Siedlerchr Date: Wed, 24 Jul 2024 21:24:57 +0200 Subject: [PATCH 18/18] checkstyle --- .../java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java index e7108cb34a6..67d7c844d4a 100644 --- a/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java +++ b/src/main/java/org/jabref/gui/maintable/NewLibraryFromPdfAction.java @@ -3,8 +3,6 @@ import java.nio.file.Path; import java.util.concurrent.Callable; -import javafx.application.Platform; - import org.jabref.gui.DialogService; import org.jabref.gui.LibraryTabContainer; import org.jabref.gui.StateManager;