-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support Annotations Created by Foxit #2878
Changes from 10 commits
68be206
d324d9b
1e0d91b
59f7f09
c05bbb0
2ad1888
af366ad
6eed4ea
0aab9cf
d2da4bf
5539a5e
92c97f7
b9ddbc4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package org.jabref.logic.pdf; | ||
|
||
import java.awt.geom.Rectangle2D; | ||
import java.io.IOException; | ||
import java.util.Objects; | ||
|
||
import org.apache.pdfbox.cos.COSArray; | ||
import org.apache.pdfbox.cos.COSFloat; | ||
import org.apache.pdfbox.cos.COSInteger; | ||
import org.apache.pdfbox.pdmodel.PDPage; | ||
import org.apache.pdfbox.pdmodel.common.PDRectangle; | ||
import org.apache.pdfbox.util.PDFTextStripperByArea; | ||
|
||
/** | ||
* Extracts the text of marked annotations using bounding boxes. | ||
*/ | ||
public final class TextExtractor { | ||
|
||
private final COSArray boundingBoxes; | ||
private final PDPage page; | ||
|
||
/** | ||
* @param page the page the annotation is on, must not be null | ||
* @param boundingBoxes the raw annotation, must not be null | ||
*/ | ||
public TextExtractor(PDPage page, COSArray boundingBoxes) { | ||
this.page = Objects.requireNonNull(page); | ||
this.boundingBoxes = Objects.requireNonNull(boundingBoxes); | ||
} | ||
|
||
/** | ||
* Extracts the text of a marked annotation such as highlights, underlines, strikeouts etc. | ||
* | ||
* @return The text of the annotation | ||
* @throws IOException If the PDFTextStripperByArea fails to initialize. | ||
*/ | ||
public String extractMarkedText() throws IOException { | ||
// Text has to be extracted by the rectangle calculated from the marking | ||
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea(); | ||
String markedText = ""; | ||
|
||
// Iterates over the array of segments. Each segment consists of 8 points forming a bounding box. | ||
int totalSegments = boundingBoxes.size() / 8; | ||
for (int currentSegment = 1, segmentPointer = 0; currentSegment <= totalSegments; currentSegment++, segmentPointer += 8) { | ||
try { | ||
stripperByArea.addRegion("markedRegion", calculateSegmentBoundingBox(boundingBoxes, segmentPointer)); | ||
stripperByArea.extractRegions(page); | ||
|
||
markedText = markedText.concat(stripperByArea.getTextForRegion("markedRegion")); | ||
} catch (IllegalArgumentException e) { | ||
throw new IOException("Cannot read annotation coordinates!", e); | ||
} | ||
} | ||
|
||
return markedText.trim(); | ||
} | ||
|
||
private Rectangle2D calculateSegmentBoundingBox(COSArray quadsArray, int segmentPointer) { | ||
// Extract coordinate values | ||
float upperLeftX = toFloat(quadsArray.get(segmentPointer)); | ||
float upperLeftY = toFloat(quadsArray.get(segmentPointer + 1)); | ||
float upperRightX = toFloat(quadsArray.get(segmentPointer + 2)); | ||
float upperRightY = toFloat(quadsArray.get(segmentPointer + 3)); | ||
float lowerLeftX = toFloat(quadsArray.get(segmentPointer + 4)); | ||
float lowerLeftY = toFloat(quadsArray.get(segmentPointer + 5)); | ||
|
||
// Post-processing of the raw coordinates. | ||
PDRectangle pageSize = page.getMediaBox(); | ||
float ulx = upperLeftX - 1; // It is magic. | ||
float uly = pageSize.getHeight() - upperLeftY; | ||
float width = upperRightX - lowerLeftX; | ||
float height = upperRightY - lowerLeftY; | ||
|
||
return new Rectangle2D.Float(ulx, uly, width, height); | ||
} | ||
|
||
private float toFloat(Object cosNumber) { | ||
if (cosNumber instanceof COSFloat) { | ||
return ((COSFloat) cosNumber).floatValue(); | ||
} | ||
if (cosNumber instanceof COSInteger) { | ||
return ((COSInteger) cosNumber).floatValue(); | ||
} | ||
throw new IllegalArgumentException("The number type of the annotation is not supported!"); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,43 @@ | ||
package org.jabref.model.pdf; | ||
|
||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.Locale; | ||
|
||
import org.apache.commons.logging.Log; | ||
import org.apache.commons.logging.LogFactory; | ||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; | ||
|
||
import static java.util.stream.Collectors.toList; | ||
|
||
/** | ||
* Our representation of the type of the FileAnnotation. This is needed as some FileAnnotationTypes require special | ||
* handling (e.g., Highlight or Underline), because of the linked FileAnnotations. | ||
*/ | ||
|
||
public enum FileAnnotationType { | ||
TEXT("Text"), | ||
HIGHLIGHT("Highlight"), | ||
UNDERLINE("Underline"), | ||
POLYGON("Polygon"), | ||
POPUP("Popup"), | ||
LINE("Line"), | ||
CIRCLE("Circle"), | ||
FREETEXT("FreeText"), | ||
STRIKEOUT("Strikeout"), | ||
LINK("Link"), | ||
INK("Ink"), | ||
UNKNOWN("Unknown"), | ||
NONE("None"); | ||
TEXT("Text", false), | ||
HIGHLIGHT("Highlight", true), | ||
SQUIGGLY("Squiggly", true), | ||
UNDERLINE("Underline", true), | ||
STRIKEOUT("StrikeOut", true), | ||
POLYGON("Polygon", false), | ||
POPUP("Popup", false), | ||
LINE("Line", false), | ||
CIRCLE("Circle", false), | ||
FREETEXT("FreeText", false), | ||
INK("Ink", false), | ||
UNKNOWN("Unknown", false), | ||
NONE("None", false); | ||
|
||
private static final Log LOGGER = LogFactory.getLog(FileAnnotationType.class); | ||
|
||
private final String name; | ||
private final boolean isLinkedAnnotationType; | ||
|
||
FileAnnotationType(String name) { | ||
FileAnnotationType(String name, boolean isLinkedAnnotationType) { | ||
this.name = name; | ||
this.isLinkedAnnotationType = isLinkedAnnotationType; | ||
} | ||
|
||
/** | ||
|
@@ -50,6 +56,26 @@ public static FileAnnotationType parse(PDAnnotation annotation) { | |
} | ||
} | ||
|
||
/** | ||
* Determines if a String is a supported marked FileAnnotation type. | ||
* | ||
* @param annotationType a type descriptor | ||
* @return true if annotationType is a supported marked FileAnnotation type | ||
*/ | ||
public static boolean isMarkedFileAnnotationType(String annotationType) { | ||
for (FileAnnotationType type : Collections.unmodifiableList(Arrays.stream(FileAnnotationType.values()) | ||
.filter(FileAnnotationType::isLinkedAnnotationType).collect(toList()))) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. An alternative and simpler solution would be to use Enum.valueOf(...) to parse the String. If it throws an IllegalArgumentException, you simply return false. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And I don't understand why you wrap the array in a list again. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good idea. Fixed in b9ddbc4. |
||
if (type.toString().equals(annotationType)) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
public boolean isLinkedAnnotationType() { | ||
return isLinkedAnnotationType; | ||
} | ||
|
||
public String toString() { | ||
return this.name; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know how likely it is, but just to be sure, you should change it to "Link".equals(...); otherwise there might be a potential NPE.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point, should not be the case, but that pdfbox library is really not to be trusted....