Skip to content

Commit

Permalink
Support Annotations Created by Foxit (#2878)
Browse files Browse the repository at this point in the history
* add some passing tests

* Add Squiggly as annotation type

* Fix inconsistent number types of PDAnnotations

* Refactor the marked text extraction

* ChangeLog

* Refactoring

* Correctly support StrikeOut Annotations

* Improve naming and understandability of Bounding boxes

* Remove link from supported FileAnnotationTypes

* Explicitly ignore widget and link

* fix potential NPE

* incorporate feedback
  • Loading branch information
LinusDietz authored and Siedlerchr committed Jun 1, 2017
1 parent 3257f7b commit f5386b2
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 72 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- We fixed the IEEE Xplore web search functionality [#2789](https://github.com/JabRef/jabref/issues/2789)
- We fixed an error in the CrossRef fetcher that occurred if one of the fetched entries had no title
- We fixed an issue that prevented new entries to be automatically assigned to the currently selected group [#2783](https://github.com/JabRef/jabref/issues/2783).
- We fixed a bug that only allowed parsing positive timezones from a FileAnnotation [#2839](https://github.com/JabRef/jabref/issues/22839)
- We fixed a bug that only allowed parsing positive timezones from a FileAnnotation [#2839](https://github.com/JabRef/jabref/issues/2839)
- We fixed a bug that did not allow the correct re-export of the MS-Office XML field `msbib-accessed` with a different date format [#2859](https://github.com/JabRef/jabref/issues/2859).

- We fixed some bugs that prevented the display of FileAnnotations that were created using the Foxit Reader. [#2839, comment](https://github.com/JabRef/jabref/issues/2839#issuecomment-302058227).
### Removed


Expand Down
74 changes: 18 additions & 56 deletions src/main/java/org/jabref/logic/pdf/PdfAnnotationImporter.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.jabref.logic.pdf;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
Expand All @@ -18,13 +17,10 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;

public class PdfAnnotationImporter implements AnnotationImporter {

Expand Down Expand Up @@ -53,8 +49,8 @@ public List<FileAnnotation> importAnnotations(final Path path) {
if (!isSupportedAnnotationType(annotation)) {
continue;
}
if (FileAnnotationType.UNDERLINE.toString().equals(annotation.getSubtype()) ||
FileAnnotationType.HIGHLIGHT.toString().equals(annotation.getSubtype())) {

if (FileAnnotationType.isMarkedFileAnnotationType(annotation.getSubtype())) {
annotationsList.add(createMarkedAnnotations(pageIndex, page, annotation));
} else {
FileAnnotation fileAnnotation = new FileAnnotation(annotation, pageIndex + 1);
Expand All @@ -71,6 +67,13 @@ public List<FileAnnotation> importAnnotations(final Path path) {
}

private boolean isSupportedAnnotationType(PDAnnotation annotation) {
if (annotation.getSubtype() == null) {
return false;
}
if ("Link".equals(annotation.getSubtype()) || "Widget".equals(annotation.getSubtype())) {
LOGGER.debug(annotation.getSubtype() + " is excluded from the supported file annotations");
return false;
}
try {
if (!Arrays.asList(FileAnnotationType.values()).contains(FileAnnotationType.valueOf(annotation.getSubtype()))) {
return false;
Expand All @@ -86,75 +89,34 @@ private FileAnnotation createMarkedAnnotations(int pageIndex, PDPage page, PDAnn
annotation.getDictionary().getString(COSName.T), FileAnnotation.extractModifiedTime(annotation.getModifiedDate()),
pageIndex + 1, annotation.getContents(), FileAnnotationType.valueOf(annotation.getSubtype().toUpperCase(Locale.ROOT)), Optional.empty());

try {
if (FileAnnotationType.HIGHLIGHT.toString().equals(annotation.getSubtype()) || FileAnnotationType.UNDERLINE.toString().equals(annotation.getSubtype())) {
annotation.setContents(extractMarkedText(page, annotation));
if (annotationBelongingToMarking.getAnnotationType().isLinkedFileAnnotationType()) {
try {
COSArray boundingBoxes = (COSArray) annotation.getDictionary().getDictionaryObject(COSName.getPDFName("QuadPoints"));
annotation.setContents(new TextExtractor(page, boundingBoxes).extractMarkedText());
} catch (IOException e) {
annotation.setContents("JabRef: Could not extract any marked text!");
}
} catch (IOException e) {
annotation.setContents("JabRef: Could not extract any marked text!");
}

//Marked text that has a sticky note on it should be linked to the sticky note
return new FileAnnotation(annotation, pageIndex + 1, annotationBelongingToMarking);
}

private String extractMarkedText(PDPage page, PDAnnotation annotation) throws IOException {
//highlighted or underlined text has to be extracted by the rectangle calculated from the marking
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
COSArray quadsArray = (COSArray) annotation.getDictionary().getDictionaryObject(COSName.getPDFName("QuadPoints"));
String markedText = "";
for (int j = 1,
k = 0;
j <= (quadsArray.size() / 8);
j++) {

COSFloat upperLeftX = (COSFloat) quadsArray.get(k);
COSFloat upperLeftY = (COSFloat) quadsArray.get(1 + k);
COSFloat upperRightX = (COSFloat) quadsArray.get(2 + k);
COSFloat upperRightY = (COSFloat) quadsArray.get(3 + k);
COSFloat lowerLeftX = (COSFloat) quadsArray.get(4 + k);
COSFloat lowerLeftY = (COSFloat) quadsArray.get(5 + k);

k += 8;

float ulx = upperLeftX.floatValue() - 1;
float uly = upperLeftY.floatValue();
float width = upperRightX.floatValue() - lowerLeftX.floatValue();
float height = upperRightY.floatValue() - lowerLeftY.floatValue();

PDRectangle pageSize = page.getMediaBox();
uly = pageSize.getHeight() - uly;

Rectangle2D.Float rectangle = new Rectangle2D.Float(ulx, uly, width, height);
stripperByArea.addRegion("markedRegion", rectangle);
stripperByArea.extractRegions(page);
String markedTextInLine = stripperByArea.getTextForRegion("markedRegion");

if (j > 1) {
markedText = markedText.concat(markedTextInLine);
} else {
markedText = markedTextInLine;
}
}

return markedText.trim();
}

private boolean validatePath(Path path) {
Objects.requireNonNull(path);

if (!path.toString().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
LOGGER.warn(String.format("File %s does not end with .pdf!", path));
LOGGER.warn(String.format("File '%s' does not end with .pdf!", path));
return false;
}

if (!Files.exists(path)) {
LOGGER.warn(String.format("File %s does not exist!", path));
LOGGER.warn(String.format("File '%s' does not exist!", path));
return false;
}

if (!Files.isRegularFile(path) || !Files.isReadable(path)) {
LOGGER.warn(String.format("File %s is not readable!", path));
LOGGER.warn(String.format("File '%s' is not readable!", path));
return false;
}

Expand Down
86 changes: 86 additions & 0 deletions src/main/java/org/jabref/logic/pdf/TextExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package org.jabref.logic.pdf;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.Objects;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFTextStripperByArea;

/**
* Extracts the text of marked annotations using bounding boxes.
*/
public final class TextExtractor {

private final COSArray boundingBoxes;
private final PDPage page;

/**
* @param page the page the annotation is on, must not be null
* @param boundingBoxes the raw annotation, must not be null
*/
public TextExtractor(PDPage page, COSArray boundingBoxes) {
this.page = Objects.requireNonNull(page);
this.boundingBoxes = Objects.requireNonNull(boundingBoxes);
}

/**
* Extracts the text of a marked annotation such as highlights, underlines, strikeouts etc.
*
* @return The text of the annotation
* @throws IOException If the PDFTextStripperByArea fails to initialize.
*/
public String extractMarkedText() throws IOException {
// Text has to be extracted by the rectangle calculated from the marking
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
String markedText = "";

// Iterates over the array of segments. Each segment consists of 8 points forming a bounding box.
int totalSegments = boundingBoxes.size() / 8;
for (int currentSegment = 1, segmentPointer = 0; currentSegment <= totalSegments; currentSegment++, segmentPointer += 8) {
try {
stripperByArea.addRegion("markedRegion", calculateSegmentBoundingBox(boundingBoxes, segmentPointer));
stripperByArea.extractRegions(page);

markedText = markedText.concat(stripperByArea.getTextForRegion("markedRegion"));
} catch (IllegalArgumentException e) {
throw new IOException("Cannot read annotation coordinates!", e);
}
}

return markedText.trim();
}

private Rectangle2D calculateSegmentBoundingBox(COSArray quadsArray, int segmentPointer) {
// Extract coordinate values
float upperLeftX = toFloat(quadsArray.get(segmentPointer));
float upperLeftY = toFloat(quadsArray.get(segmentPointer + 1));
float upperRightX = toFloat(quadsArray.get(segmentPointer + 2));
float upperRightY = toFloat(quadsArray.get(segmentPointer + 3));
float lowerLeftX = toFloat(quadsArray.get(segmentPointer + 4));
float lowerLeftY = toFloat(quadsArray.get(segmentPointer + 5));

// Post-processing of the raw coordinates.
PDRectangle pageSize = page.getMediaBox();
float ulx = upperLeftX - 1; // It is magic.
float uly = pageSize.getHeight() - upperLeftY;
float width = upperRightX - lowerLeftX;
float height = upperRightY - lowerLeftY;

return new Rectangle2D.Float(ulx, uly, width, height);
}

private float toFloat(Object cosNumber) {
if (cosNumber instanceof COSFloat) {
return ((COSFloat) cosNumber).floatValue();
}
if (cosNumber instanceof COSInteger) {
return ((COSInteger) cosNumber).floatValue();
}
throw new IllegalArgumentException("The number type of the annotation is not supported!");
}
}
48 changes: 34 additions & 14 deletions src/main/java/org/jabref/model/pdf/FileAnnotationType.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,28 @@
*/

public enum FileAnnotationType {
TEXT("Text"),
HIGHLIGHT("Highlight"),
UNDERLINE("Underline"),
POLYGON("Polygon"),
POPUP("Popup"),
LINE("Line"),
CIRCLE("Circle"),
FREETEXT("FreeText"),
STRIKEOUT("Strikeout"),
LINK("Link"),
INK("Ink"),
UNKNOWN("Unknown"),
NONE("None");
TEXT("Text", false),
HIGHLIGHT("Highlight", true),
SQUIGGLY("Squiggly", true),
UNDERLINE("Underline", true),
STRIKEOUT("StrikeOut", true),
POLYGON("Polygon", false),
POPUP("Popup", false),
LINE("Line", false),
CIRCLE("Circle", false),
FREETEXT("FreeText", false),
INK("Ink", false),
UNKNOWN("Unknown", false),
NONE("None", false);

private static final Log LOGGER = LogFactory.getLog(FileAnnotationType.class);

private final String name;
private final boolean linkedFileAnnotationType;

FileAnnotationType(String name) {
FileAnnotationType(String name, boolean linkedFileAnnotationType) {
this.name = name;
this.linkedFileAnnotationType = linkedFileAnnotationType;
}

/**
Expand All @@ -50,6 +52,24 @@ public static FileAnnotationType parse(PDAnnotation annotation) {
}
}

/**
* Determines if a String is a supported marked FileAnnotation type.
*
* @param annotationType a type descriptor
* @return true if annotationType is a supported marked FileAnnotation type
*/
public static boolean isMarkedFileAnnotationType(String annotationType) {
try {
return FileAnnotationType.valueOf(annotationType.toUpperCase(Locale.ROOT)).linkedFileAnnotationType;
} catch (IllegalArgumentException e) {
return false;
}
}

public boolean isLinkedFileAnnotationType() {
return linkedFileAnnotationType;
}

public String toString() {
return this.name;
}
Expand Down
48 changes: 48 additions & 0 deletions src/test/java/org/jabref/logic/pdf/PdfAnnotationImporterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@ public class PdfAnnotationImporterTest {

private final AnnotationImporter importer = new PdfAnnotationImporter();

@Test
public void invalidPath() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("/asdf/does/not/exist.pdf")));
}

@Test
public void invalidDirectory() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("src/test/resources/pdfs")));
}

@Test
public void invalidDocumentType() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("src/test/resources/pdfs/write-protected.docx")));
}

@Test
public void noAnnotationsWriteProtected() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("src/test/resources/pdfs/write-protected.pdf")));
Expand Down Expand Up @@ -54,6 +69,16 @@ public void popupNoteMinimal() {
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-popup.pdf")));
}

@Test
public void highlightMinimalFoxit() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("lynyus", LocalDateTime.of(2017, 5, 31, 15, 16, 1), 1,
"this is a foxit highlight", FileAnnotationType.HIGHLIGHT, Optional.empty());
final FileAnnotation expected = new FileAnnotation("lynyus", LocalDateTime.of(2017, 5, 31, 15, 16, 1), 1,
"Hello", FileAnnotationType.HIGHLIGHT, Optional.of(expectedLinkedAnnotation));
assertEquals(Collections.singletonList(expected),
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-foxithighlight.pdf")));
}

@Test
public void highlightNoNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("Linus Dietz", LocalDateTime.of(2017, 3, 12, 20, 28, 39), 1,
Expand All @@ -65,6 +90,29 @@ public void highlightNoNoteMinimal() {
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-highlight-no-note.pdf")));
}

@Test
public void squigglyWithNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 2, 40, 25), 1,
"Squiggly note", FileAnnotationType.SQUIGGLY, Optional.empty());
final FileAnnotation expected = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 2, 40, 25), 1,
"ello", FileAnnotationType.SQUIGGLY, Optional.of(expectedLinkedAnnotation));

assertEquals(Collections.singletonList(expected),
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-squiggly.pdf")));
}

@Test

public void strikeoutWithNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 13, 2, 3), 1,
"striked out", FileAnnotationType.STRIKEOUT, Optional.empty());
final FileAnnotation expected = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 13, 2, 3), 1,
"World", FileAnnotationType.STRIKEOUT, Optional.of(expectedLinkedAnnotation));

assertEquals(Collections.singletonList(expected),
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-strikeout.pdf")));
}

@Test
public void highlightWithNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("Linus Dietz", LocalDateTime.of(2017, 3, 12, 20, 32, 2), 1,
Expand Down
Binary file not shown.
Binary file added src/test/resources/pdfs/minimal-foxitnote.pdf
Binary file not shown.
Binary file added src/test/resources/pdfs/minimal-squiggly.pdf
Binary file not shown.
Binary file added src/test/resources/pdfs/minimal-strikeout.pdf
Binary file not shown.

0 comments on commit f5386b2

Please sign in to comment.