JabRef · koppor · Oct 30, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -26,11 +26,14 @@
 import org.jabref.model.entry.types.StandardEntryType;
 import org.jabref.model.strings.StringUtil;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Strings;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
 
+import static org.jabref.model.strings.StringUtil.isNullOrEmpty;
+
 /**
  * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry.
  * <p>
@@ -197,8 +200,8 @@ public ParserResult importDatabase(Path filePath) {
         List<BibEntry> result = new ArrayList<>(1);
         try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) {
             String firstPageContents = getFirstPageContents(document);
-            String title = extractTitleFromDocument(document);
-            Optional<BibEntry> entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE, title);
+            String titleByFontSize = extractTitleFromDocument(document);
+            Optional<BibEntry> entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE, titleByFontSize);
             entry.ifPresent(result::add);
         } catch (EncryptedPdfsNotSupportedException e) {
             return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
@@ -236,10 +239,25 @@ protected void writeString(String text, List<TextPosition> textPositions) {
             textPositionsList.addAll(textPositions);
         }
 
-        private boolean isUnwantedText(TextPosition previousTextPosition) {
-            return previousTextPosition != null
-                    && (previousTextPosition.getPageHeight() - previousTextPosition.getYDirAdj())
-                    < (previousTextPosition.getPageHeight() * 0.1);
+        // Reports whether current lies past both the horizontal and the vertical gap limit measured from previous.
+        private boolean isFarAway(TextPosition previous, TextPosition current) {
+            float horizontalGap = current.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj());
+            float verticalGap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir());
+            float verticalLimit = previous.getFontSizeInPt() * 1.5F;
+            return horizontalGap > 3.0F && verticalGap > verticalLimit;
+        }
+
+        // Heuristic filter: returns true for text positions that are unlikely to be part of the title.
+        private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition) {
+            if (textPosition == null) {
+                return false;
+            }
+            // The title is usually not in the bottom 10% of a page; this check needs no predecessor.
+            if ((textPosition.getPageHeight() - textPosition.getYDirAdj()) < (textPosition.getPageHeight() * 0.1)) {
+                return true;
+            }
+            // The characters of a title usually stay together; without a predecessor there is no gap to judge.
+            return previousTextPosition != null && isFarAway(previousTextPosition, textPosition);
         }
 
         private String findLargestFontText(List<TextPosition> textPositions) {
@@ -248,7 +266,7 @@ private String findLargestFontText(List<TextPosition> textPositions) {
             TextPosition previousTextPosition = null;
             for (TextPosition textPosition : textPositions) {
                 // Exclude unwanted text based on heuristics
-                if (isUnwantedText(textPosition)) {
+                if (isUnwantedText(previousTextPosition, textPosition)) {
                     continue;
                 }
                 float fontSize = textPosition.getFontSizeInPt();
@@ -277,25 +295,38 @@ private boolean isThereSpace(TextPosition previous, TextPosition current) {
             float Ygap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir());
             return Xgap > XspaceThreshold || Ygap > YspaceThreshold;
         }
-
-//        private boolean isNewLine(TextPosition previous, TextPosition current) {
-//            float verticalThreshold = previous.getFontSizeInPt() * 1f; // Adjust threshold as needed
-//            float gap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir());
-//            return gap < verticalThreshold && gap > -verticalThreshold;
-//        }
     }
 
-    // make this method package visible so we can test it
-    Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineSeparator, String titleByPosition) {
-        // idea: split[] contains the different lines
-        // blocks are separated by empty lines
-        // treat each block
-        //   or do special treatment at authors (which are not broken)
-        //   therefore, we do a line-based and not a block-based splitting
-        // i points to the current line
-        // curString (mostly) contains the current block
-        //   the different lines are joined into one and thereby separated by " "
-
+    /**
+     * Parses the first page content of a PDF document and extracts bibliographic information such as title, author,
+     * abstract, keywords, and other relevant metadata. This method processes the content line-by-line and uses
+     * custom parsing logic to identify and assemble information blocks from academic papers.
+     *
+     * <p> Idea: {@code lines} contains the different lines, blocks are separated by empty lines. Treat each block,
+     *       or do special treatment at authors (which are not broken).
+     *       Therefore, we do a line-based and not a block-based splitting. {@code i} points to the current line.
+     *       {@code curString} (mostly) contains the current block;
+     *       the different lines are joined into one and thereby separated by " ".
+     *
+     * <p> This method follows the structure typically found in academic paper PDFs:
+     * - First, it attempts to detect the title by font size, if available, or by text position.
+     * - Authors are then processed line-by-line until reaching the next section.
+     * - Abstract and keywords, if found, are extracted as they appear on the page.
+     * - Finally, conference details, DOI, and publication information are parsed from the lower blocks.
+     *
+     * <p> The parsing logic also identifies and categorizes entries based on keywords such as "Abstract" or "Keywords"
+     * and specific terms that denote sections. Additionally, this method can handle
+     * publisher-specific formats like Springer or IEEE, extracting data like series, volume, and conference titles.
+     *
+     * @param firstpageContents The raw content of the PDF's first page, which may contain metadata and main content.
+     * @param lineSeparator     The line separator used to format and unify line breaks in the text content.
+     * @param titleByFontSize   An optional title string determined by font size; if provided, this overrides the
+     *                          default title parsing.
+     * @return An {@link Optional} containing a {@link BibEntry} with the parsed bibliographic data if extraction
+     *         is successful. Otherwise, an empty {@link Optional}.
+     */
+    @VisibleForTesting
+    Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineSeparator, String titleByFontSize) {
         String firstpageContentsUnifiedLineBreaks = StringUtil.unifyLineBreaks(firstpageContents, lineSeparator);
 
         lines = firstpageContentsUnifiedLineBreaks.split(lineSeparator);
@@ -352,8 +383,11 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         // start: title
         fillCurStringWithNonEmptyLines();
         title = streamlineTitle(curString);
-        curString = "";
         // i points to the next non-empty line
+        curString = "";
+        if (!isNullOrEmpty(titleByFontSize)) {
+            title = titleByFontSize;
+        }
 
         // after title: authors
         author = null;
@@ -470,13 +504,6 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                     // IEEE has the conference things at the end
                     publisher = "IEEE";
 
-                    // year is extracted by extractYear
-                    // otherwise, we could it determine as follows:
-                    // String yearStr = curString.substring(curString.length()-4);
-                    // if (isYear(yearStr)) {
-                    //  year = yearStr;
-                    // }
-
                     if (conference == null) {
                         pos = curString.indexOf('$');
                         if (pos > 0) {
@@ -514,8 +541,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             entry.setField(StandardField.KEYWORDS, keywords);
         }
         if (title != null) {
-//            title = guessBetterTitleInMetaData(buildMetaData(author, editor, abstractT, keywords, title, conference, doi, series, volume, number, pages, year, publisher));
-            entry.setField(StandardField.TITLE, titleByPosition == null || titleByPosition.isEmpty() ? title : titleByPosition);
+            entry.setField(StandardField.TITLE, title);
         }
         if (conference != null) {
             entry.setField(StandardField.BOOKTITLE, conference);
@@ -563,7 +589,7 @@ private String getFirstPageContents(PDDocument document) throws IOException {
 
         stripper.setStartPage(1);
         stripper.setEndPage(1);
-//        stripper.setSortByPosition(true);
+        stripper.setSortByPosition(true);
         stripper.setParagraphEnd(System.lineSeparator());
         StringWriter writer = new StringWriter();
         stripper.writeText(document, writer);

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -36,7 +36,7 @@ void importTwiceWorksAsExpected() throws Exception {
 
         BibEntry expected = new BibEntry(StandardEntryType.InProceedings)
                 .withField(StandardField.AUTHOR, "1 ")
-                .withField(StandardField.TITLE, "Hello World 1")
+                .withField(StandardField.TITLE, "Hello World")
                 .withFiles(List.of(new LinkedFile("", file.toAbsolutePath(), "PDF")));
         assertEquals(List.of(expected), result);
 
@@ -139,16 +139,16 @@ void pdfTitleExtraction(String filePath, String expectedTitle) throws Exception
 
     private static Stream<Arguments> providePdfData() {
         return Stream.of(
-                Arguments.of("/pdfs/se2paper.pdf", "On How We Can Teach – Exploring New Ways in Professional Software Development for Students"),
+                Arguments.of("/pdfs/PdfContentImporter/se2paper - On How We Can Teach – Exploring New Ways in Professional Software Development for Students.pdf", "On How We Can Teach – Exploring New Ways in Professional Software Development for Students"),
                 Arguments.of("/pdfs/IEEE/ieee-paper.pdf", "JabRef Example for Reference Parsing"),
                 Arguments.of("/org/jabref/logic/importer/util/LNCS-minimal.pdf", "Paper Title"),
                 Arguments.of("/pdfs/example-scientificThesisTemplate.pdf", "Is Oil the future?"),
                 Arguments.of("/pdfs/thesis-example.pdf", "Thesis Title"),
-                Arguments.of("/pdfs/3597503.3639130.pdf", "Recovering Trace Links Between Software Documentation And Code"),
-                Arguments.of("/pdfs/peerj-cs-213.pdf", "On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis"),
-                Arguments.of("/pdfs/s10664-020-09875-y.pdf", "Pandemic programming"),
-                Arguments.of("/pdfs/s10664-023-10367-y.pdf", "Do RESTful API design rules have an impact on the understandability of Web APIs?"),
-                Arguments.of("/pdfs/Softw Pract Exp - 2022 - Fritzsch - Adopting microservices and DevOps in the cyber‐physical systems domain  A rapid review.pdf", "Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study")
+                Arguments.of("/pdfs/PdfContentImporter/3597503.3639130 - Recovering Trace Links Between Software Documentation And Code.pdf", "Recovering Trace Links Between Software Documentation And Code"),
+                Arguments.of("/pdfs/PdfContentImporter/peerj-cs-213 - On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis.pdf", "On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis"),
+                Arguments.of("/pdfs/PdfContentImporter/s10664-020-09875-y - Pandemic programming.pdf", "Pandemic programming"),
+                Arguments.of("/pdfs/PdfContentImporter/s10664-023-10367-y - Do RESTful API design rules have an impact on the understandability of Web APIs?.pdf", "Do RESTful API design rules have an impact on the understandability of Web APIs?"),
+                Arguments.of("/pdfs/PdfContentImporter/spe.3169 - Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study.pdf", "Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study")
         );
     }
 }