Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Annotations Created by Foxit #2878

Merged
merged 13 commits into from
Jun 1, 2017
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- We fixed the IEEE Xplore web search functionality [#2789](https://github.com/JabRef/jabref/issues/2789)
- We fixed an error in the CrossRef fetcher that occurred if one of the fetched entries had no title
- We fixed an issue that prevented new entries to be automatically assigned to the currently selected group [#2783](https://github.com/JabRef/jabref/issues/2783).
- We fixed a bug that only allowed parsing positive timezones from a FileAnnotation [#2839](https://github.com/JabRef/jabref/issues/22839)
- We fixed a bug that only allowed parsing positive timezones from a FileAnnotation [#2839](https://github.com/JabRef/jabref/issues/2839)
- We fixed a bug that did not allow the correct re-export of the MS-Office XML field `msbib-accessed` with a different date format [#2859](https://github.com/JabRef/jabref/issues/2859).

- We fixed some bugs that prevented the display of FileAnnotations that were created using the Foxit Reader. [#2839, comment](https://github.com/JabRef/jabref/issues/2839#issuecomment-302058227).
### Removed


Expand Down
71 changes: 15 additions & 56 deletions src/main/java/org/jabref/logic/pdf/PdfAnnotationImporter.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.jabref.logic.pdf;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
Expand All @@ -18,13 +17,10 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;

public class PdfAnnotationImporter implements AnnotationImporter {

Expand Down Expand Up @@ -53,8 +49,8 @@ public List<FileAnnotation> importAnnotations(final Path path) {
if (!isSupportedAnnotationType(annotation)) {
continue;
}
if (FileAnnotationType.UNDERLINE.toString().equals(annotation.getSubtype()) ||
FileAnnotationType.HIGHLIGHT.toString().equals(annotation.getSubtype())) {

if (FileAnnotationType.isMarkedFileAnnotationType(annotation.getSubtype())) {
annotationsList.add(createMarkedAnnotations(pageIndex, page, annotation));
} else {
FileAnnotation fileAnnotation = new FileAnnotation(annotation, pageIndex + 1);
Expand All @@ -71,6 +67,10 @@ public List<FileAnnotation> importAnnotations(final Path path) {
}

private boolean isSupportedAnnotationType(PDAnnotation annotation) {
if (annotation.getSubtype().equals("Link") || annotation.getSubtype().equals("Widget")) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know how likely it is, but just to be sure you should change it to "Link".equals... otherwise there might be a potenial NPE

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, should not be the case, but that pdfbox library is really not to be trusted....

LOGGER.debug(annotation.getSubtype() + " is excluded from the supported file annotations");
return false;
}
try {
if (!Arrays.asList(FileAnnotationType.values()).contains(FileAnnotationType.valueOf(annotation.getSubtype()))) {
return false;
Expand All @@ -86,75 +86,34 @@ private FileAnnotation createMarkedAnnotations(int pageIndex, PDPage page, PDAnn
annotation.getDictionary().getString(COSName.T), FileAnnotation.extractModifiedTime(annotation.getModifiedDate()),
pageIndex + 1, annotation.getContents(), FileAnnotationType.valueOf(annotation.getSubtype().toUpperCase(Locale.ROOT)), Optional.empty());

try {
if (FileAnnotationType.HIGHLIGHT.toString().equals(annotation.getSubtype()) || FileAnnotationType.UNDERLINE.toString().equals(annotation.getSubtype())) {
annotation.setContents(extractMarkedText(page, annotation));
if (annotationBelongingToMarking.getAnnotationType().isLinkedAnnotationType()) {
try {
COSArray boundingBoxes = (COSArray) annotation.getDictionary().getDictionaryObject(COSName.getPDFName("QuadPoints"));
annotation.setContents(new TextExtractor(page, boundingBoxes).extractMarkedText());
} catch (IOException e) {
annotation.setContents("JabRef: Could not extract any marked text!");
}
} catch (IOException e) {
annotation.setContents("JabRef: Could not extract any marked text!");
}

//Marked text that has a sticky note on it should be linked to the sticky note
return new FileAnnotation(annotation, pageIndex + 1, annotationBelongingToMarking);
}

private String extractMarkedText(PDPage page, PDAnnotation annotation) throws IOException {
//highlighted or underlined text has to be extracted by the rectangle calculated from the marking
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
COSArray quadsArray = (COSArray) annotation.getDictionary().getDictionaryObject(COSName.getPDFName("QuadPoints"));
String markedText = "";
for (int j = 1,
k = 0;
j <= (quadsArray.size() / 8);
j++) {

COSFloat upperLeftX = (COSFloat) quadsArray.get(k);
COSFloat upperLeftY = (COSFloat) quadsArray.get(1 + k);
COSFloat upperRightX = (COSFloat) quadsArray.get(2 + k);
COSFloat upperRightY = (COSFloat) quadsArray.get(3 + k);
COSFloat lowerLeftX = (COSFloat) quadsArray.get(4 + k);
COSFloat lowerLeftY = (COSFloat) quadsArray.get(5 + k);

k += 8;

float ulx = upperLeftX.floatValue() - 1;
float uly = upperLeftY.floatValue();
float width = upperRightX.floatValue() - lowerLeftX.floatValue();
float height = upperRightY.floatValue() - lowerLeftY.floatValue();

PDRectangle pageSize = page.getMediaBox();
uly = pageSize.getHeight() - uly;

Rectangle2D.Float rectangle = new Rectangle2D.Float(ulx, uly, width, height);
stripperByArea.addRegion("markedRegion", rectangle);
stripperByArea.extractRegions(page);
String markedTextInLine = stripperByArea.getTextForRegion("markedRegion");

if (j > 1) {
markedText = markedText.concat(markedTextInLine);
} else {
markedText = markedTextInLine;
}
}

return markedText.trim();
}

private boolean validatePath(Path path) {
Objects.requireNonNull(path);

if (!path.toString().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
LOGGER.warn(String.format("File %s does not end with .pdf!", path));
LOGGER.warn(String.format("File '%s' does not end with .pdf!", path));
return false;
}

if (!Files.exists(path)) {
LOGGER.warn(String.format("File %s does not exist!", path));
LOGGER.warn(String.format("File '%s' does not exist!", path));
return false;
}

if (!Files.isRegularFile(path) || !Files.isReadable(path)) {
LOGGER.warn(String.format("File %s is not readable!", path));
LOGGER.warn(String.format("File '%s' is not readable!", path));
return false;
}

Expand Down
86 changes: 86 additions & 0 deletions src/main/java/org/jabref/logic/pdf/TextExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package org.jabref.logic.pdf;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.Objects;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFTextStripperByArea;

/**
* Extracts the text of marked annotations using bounding boxes.
*/
public final class TextExtractor {

private final COSArray boundingBoxes;
private final PDPage page;

/**
* @param page the page the annotation is on, must not be null
* @param boundingBoxes the raw annotation, must not be null
*/
public TextExtractor(PDPage page, COSArray boundingBoxes) {
this.page = Objects.requireNonNull(page);
this.boundingBoxes = Objects.requireNonNull(boundingBoxes);
}

/**
* Extracts the text of a marked annotation such as highlights, underlines, strikeouts etc.
*
* @return The text of the annotation
* @throws IOException If the PDFTextStripperByArea fails to initialize.
*/
public String extractMarkedText() throws IOException {
// Text has to be extracted by the rectangle calculated from the marking
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
String markedText = "";

// Iterates over the array of segments. Each segment consists of 8 points forming a bounding box.
int totalSegments = boundingBoxes.size() / 8;
for (int currentSegment = 1, segmentPointer = 0; currentSegment <= totalSegments; currentSegment++, segmentPointer += 8) {
try {
stripperByArea.addRegion("markedRegion", calculateSegmentBoundingBox(boundingBoxes, segmentPointer));
stripperByArea.extractRegions(page);

markedText = markedText.concat(stripperByArea.getTextForRegion("markedRegion"));
} catch (IllegalArgumentException e) {
throw new IOException("Cannot read annotation coordinates!", e);
}
}

return markedText.trim();
}

private Rectangle2D calculateSegmentBoundingBox(COSArray quadsArray, int segmentPointer) {
// Extract coordinate values
float upperLeftX = toFloat(quadsArray.get(segmentPointer));
float upperLeftY = toFloat(quadsArray.get(segmentPointer + 1));
float upperRightX = toFloat(quadsArray.get(segmentPointer + 2));
float upperRightY = toFloat(quadsArray.get(segmentPointer + 3));
float lowerLeftX = toFloat(quadsArray.get(segmentPointer + 4));
float lowerLeftY = toFloat(quadsArray.get(segmentPointer + 5));

// Post-processing of the raw coordinates.
PDRectangle pageSize = page.getMediaBox();
float ulx = upperLeftX - 1; // It is magic.
float uly = pageSize.getHeight() - upperLeftY;
float width = upperRightX - lowerLeftX;
float height = upperRightY - lowerLeftY;

return new Rectangle2D.Float(ulx, uly, width, height);
}

private float toFloat(Object cosNumber) {
if (cosNumber instanceof COSFloat) {
return ((COSFloat) cosNumber).floatValue();
}
if (cosNumber instanceof COSInteger) {
return ((COSInteger) cosNumber).floatValue();
}
throw new IllegalArgumentException("The number type of the annotation is not supported!");
}
}
54 changes: 40 additions & 14 deletions src/main/java/org/jabref/model/pdf/FileAnnotationType.java
Original file line number Diff line number Diff line change
@@ -1,37 +1,43 @@
package org.jabref.model.pdf;

import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;

import static java.util.stream.Collectors.toList;

/**
* Our representation of the type of the FileAnnotation. This is needed as some FileAnnotationTypes require special
* handling (e.g., Highlight or Underline), because of the linked FileAnnotations.
*/

public enum FileAnnotationType {
TEXT("Text"),
HIGHLIGHT("Highlight"),
UNDERLINE("Underline"),
POLYGON("Polygon"),
POPUP("Popup"),
LINE("Line"),
CIRCLE("Circle"),
FREETEXT("FreeText"),
STRIKEOUT("Strikeout"),
LINK("Link"),
INK("Ink"),
UNKNOWN("Unknown"),
NONE("None");
TEXT("Text", false),
HIGHLIGHT("Highlight", true),
SQUIGGLY("Squiggly", true),
UNDERLINE("Underline", true),
STRIKEOUT("StrikeOut", true),
POLYGON("Polygon", false),
POPUP("Popup", false),
LINE("Line", false),
CIRCLE("Circle", false),
FREETEXT("FreeText", false),
INK("Ink", false),
UNKNOWN("Unknown", false),
NONE("None", false);

private static final Log LOGGER = LogFactory.getLog(FileAnnotationType.class);

private final String name;
private final boolean isLinkedAnnotationType;

FileAnnotationType(String name) {
FileAnnotationType(String name, boolean isLinkedAnnotationType) {
this.name = name;
this.isLinkedAnnotationType = isLinkedAnnotationType;
}

/**
Expand All @@ -50,6 +56,26 @@ public static FileAnnotationType parse(PDAnnotation annotation) {
}
}

/**
* Determines if a String is a supported marked FileAnnotation type.
*
* @param annotationType a type descriptor
* @return true if annotationType is a supported marked FileAnnotation type
*/
public static boolean isMarkedFileAnnotationType(String annotationType) {
for (FileAnnotationType type : Collections.unmodifiableList(Arrays.stream(FileAnnotationType.values())
.filter(FileAnnotationType::isLinkedAnnotationType).collect(toList()))) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An alternative and simpler solution would be to use Enum.valueOf... to parse the String. If it throws an Illegal Argument exception you simply return false...
Otherwise I would omit the loop and directly add a second filter Argument to the strea, and use anyMatch to return a boolean

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I don't understand why you wrap the Array in a list again

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea. fixed in b9ddbc4

if (type.toString().equals(annotationType)) {
return true;
}
}
return false;
}

public boolean isLinkedAnnotationType() {
return isLinkedAnnotationType;
}

public String toString() {
return this.name;
}
Expand Down
48 changes: 48 additions & 0 deletions src/test/java/org/jabref/logic/pdf/PdfAnnotationImporterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@ public class PdfAnnotationImporterTest {

private final AnnotationImporter importer = new PdfAnnotationImporter();

@Test
public void invalidPath() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("/asdf/does/not/exist.pdf")));
}

@Test
public void invalidDirectory() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("src/test/resources/pdfs")));
}

@Test
public void invalidDocumentType() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("src/test/resources/pdfs/write-protected.docx")));
}

@Test
public void noAnnotationsWriteProtected() {
assertEquals(Collections.emptyList(), importer.importAnnotations(Paths.get("src/test/resources/pdfs/write-protected.pdf")));
Expand Down Expand Up @@ -54,6 +69,16 @@ public void popupNoteMinimal() {
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-popup.pdf")));
}

@Test
public void highlightMinimalFoxit() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("lynyus", LocalDateTime.of(2017, 5, 31, 15, 16, 1), 1,
"this is a foxit highlight", FileAnnotationType.HIGHLIGHT, Optional.empty());
final FileAnnotation expected = new FileAnnotation("lynyus", LocalDateTime.of(2017, 5, 31, 15, 16, 1), 1,
"Hello", FileAnnotationType.HIGHLIGHT, Optional.of(expectedLinkedAnnotation));
assertEquals(Collections.singletonList(expected),
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-foxithighlight.pdf")));
}

@Test
public void highlightNoNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("Linus Dietz", LocalDateTime.of(2017, 3, 12, 20, 28, 39), 1,
Expand All @@ -65,6 +90,29 @@ public void highlightNoNoteMinimal() {
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-highlight-no-note.pdf")));
}

@Test
public void squigglyWithNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 2, 40, 25), 1,
"Squiggly note", FileAnnotationType.SQUIGGLY, Optional.empty());
final FileAnnotation expected = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 2, 40, 25), 1,
"ello", FileAnnotationType.SQUIGGLY, Optional.of(expectedLinkedAnnotation));

assertEquals(Collections.singletonList(expected),
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-squiggly.pdf")));
}

@Test

public void strikeoutWithNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 13, 2, 3), 1,
"striked out", FileAnnotationType.STRIKEOUT, Optional.empty());
final FileAnnotation expected = new FileAnnotation("lynyus", LocalDateTime.of(2017, 6, 1, 13, 2, 3), 1,
"World", FileAnnotationType.STRIKEOUT, Optional.of(expectedLinkedAnnotation));

assertEquals(Collections.singletonList(expected),
importer.importAnnotations(Paths.get("src/test/resources/pdfs/minimal-strikeout.pdf")));
}

@Test
public void highlightWithNoteMinimal() {
final FileAnnotation expectedLinkedAnnotation = new FileAnnotation("Linus Dietz", LocalDateTime.of(2017, 3, 12, 20, 32, 2), 1,
Expand Down
Binary file not shown.
Binary file added src/test/resources/pdfs/minimal-foxitnote.pdf
Binary file not shown.
Binary file added src/test/resources/pdfs/minimal-squiggly.pdf
Binary file not shown.
Binary file added src/test/resources/pdfs/minimal-strikeout.pdf
Binary file not shown.