Skip to content

Commit

Permalink
Better escaping for PDF title; fixes #636
Browse files Browse the repository at this point in the history
  • Loading branch information
jbreiden authored and zdenop committed Apr 2, 2017
1 parent c38eeda commit 9038faf
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 20 deletions.
61 changes: 41 additions & 20 deletions api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ const int kBasicBufSize = 2048;
// Ratio of font size to nominal character width: if the font is
// 10 pts, the nominal character width is 5 pts. Used as a factor when
// computing the horizontal stretch (Tz) applied to each word.
const int kCharWidth = 2;

// Size, in bytes, of the scratch buffers that receive a single
// codepoint written the PDF way, i.e. as the hexadecimal digits of its
// UTF-16BE encoding, e.g. "0063" for the letter 'c'. The longest
// output CodepointToUtf16be produces is a surrogate pair: 8 hex digits
// plus the terminating NUL, so this value leaves headroom.
const int kMaxBytesPerCodepoint = 20;

/**********************************************************************
* PDF Renderer interface implementation
**********************************************************************/
Expand Down Expand Up @@ -304,6 +309,22 @@ void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
*line_y1 = *line_y2 = (y1 + y2) / 2;
}

// Writes the UTF-16BE encoding of a Unicode codepoint as uppercase
// hexadecimal digits: 4 digits for a codepoint in the Basic
// Multilingual Plane, 8 digits (high surrogate then low surrogate) for
// anything above it. The output is NUL-terminated and carries no
// surrounding '<' '>' delimiters; callers add those themselves.
//
// code:  the Unicode codepoint to convert.
// utf16: destination buffer. Must have room for at least 9 bytes
//        (8 hex digits plus the terminating NUL). Left untouched when
//        the codepoint is rejected.
//
// Returns true if the codepoint was written. Returns false, after
// logging via tprintf, if the codepoint is negative, lies in the
// surrogate range 0xD800-0xDFFF, or exceeds 0x10FFFF.
bool CodepointToUtf16be(int code, char *utf16) {
  // Largest possible output: "XXXXXXXX" plus NUL. The bound must be
  // spelled out explicitly: utf16 is a pointer parameter, so
  // sizeof(utf16) is the size of the pointer (4 or 8), not of the
  // caller's buffer, and using it truncates surrogate pairs.
  const int kUtf16HexBufSize = 9;

  if (code < 0 || (code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
    tprintf("Dropping invalid codepoint %d\n", code);
    return false;
  }
  if (code < 0x10000) {
    snprintf(utf16, kUtf16HexBufSize, "%04X", static_cast<unsigned>(code));
  } else {
    // Split the 20 bits above 0x10000 into two 10-bit halves and
    // offset them into the high and low surrogate ranges.
    int a = code - 0x010000;
    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
    int low_surrogate = (0x03FF & a) + 0xDC00;
    snprintf(utf16, kUtf16HexBufSize, "%04X%04X",
             static_cast<unsigned>(high_surrogate),
             static_cast<unsigned>(low_surrogate));
  }
  return true;
}

char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
double width, double height) {
STRING pdf_str("");
Expand Down Expand Up @@ -442,25 +463,13 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
if (grapheme && grapheme[0] != '\0') {
GenericVector<int> unicodes;
UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
char utf16[20];
char utf16[kMaxBytesPerCodepoint];
for (int i = 0; i < unicodes.length(); i++) {
int code = unicodes[i];
// Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
tprintf("Dropping invalid codepoint %d\n", code);
continue;
}
if (code < 0x10000) {
snprintf(utf16, sizeof(utf16), "<%04X>", code);
} else {
int a = code - 0x010000;
int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
int low_surrogate = (0x03FF & a) + 0xDC00;
snprintf(utf16, sizeof(utf16), "<%04X%04X>",
high_surrogate, low_surrogate);
if (CodepointToUtf16be(code, utf16)) {
pdf_word += utf16;
pdf_word_len++;
}
pdf_word += utf16;
pdf_word_len++;
}
}
delete []grapheme;
Expand All @@ -471,9 +480,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
pdf_str.add_str_double("", h_stretch);
pdf_str += " Tz"; // horizontal stretch
pdf_str += " [ ";
pdf_str += " [ <";
pdf_str += pdf_word; // UTF-16BE representation
pdf_str += " ] TJ"; // show the text
pdf_str += "> ] TJ"; // show the text
}
if (last_word_in_line) {
pdf_str += " \n";
Expand Down Expand Up @@ -960,15 +969,27 @@ bool TessPDFRenderer::EndDocumentHandler() {
offsets_.back() += pages_objsize; // manipulation #2

// INFO
STRING utf16_title = "FEFF"; // byte_order_marker
GenericVector<int> unicodes;
UNICHAR::UTF8ToUnicode(title(), &unicodes);
char utf16[kMaxBytesPerCodepoint];
for (int i = 0; i < unicodes.length(); i++) {
int code = unicodes[i];
if (CodepointToUtf16be(code, utf16)) {
utf16_title += utf16;
}
}

char* datestr = l_getFormattedDate();
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
"<<\n"
" /Producer (Tesseract %s)\n"
" /CreationDate (D:%s)\n"
" /Title (%s)"
" /Title <%s>\n"
">>\n"
"endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
"endobj\n",
obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str());
lept_free(datestr);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
Expand Down
1 change: 1 addition & 0 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class TESS_API TessResultRenderer {
/**
* Starts a new document with the given title.
* This clears the contents of the output data.
*
* @param title Document title; should use UTF-8 encoding.
*
* NOTE(review): the meaning of the bool result is not visible from this
* declaration; presumably true on success. Confirm against the
* implementation.
*/
bool BeginDocument(const char* title);

Expand Down

0 comments on commit 9038faf

Please sign in to comment.