Need better escaping for PDF title #636

jbreiden · 2017-01-04T00:09:29Z

We're going to get corrupt output if an open paren is passed as a title.
Proper escaping looks like this. The leading FEFF is boilerplate that
signifies the byte order; everything else is UTF-16BE. The title in this
case is "ru".

<< /CreationDate (D:20170103154208-08'00') /Producer (Tesseract 2000) /Title <FEFF00720075> >>

https://github.com/tesseract-ocr/tesseract/blob/master/api/pdfrenderer.cpp#L963

The text was updated successfully, but these errors were encountered:

amitdo · 2017-01-06T11:19:16Z

DanBloomberg/leptonica@792db025518a

Encode pdf title in escape 4-byte hex for safety.

jbreiden · 2017-02-21T02:10:43Z

Dan's Leptonica change is for a different code path and is not reusable, because it only works for ASCII. The Tesseract fix needs to use UTF-16BE, which is fortunately already used elsewhere in pdfrenderer.cpp.

jbreiden · 2017-03-30T17:18:03Z

Fix written, under review.

jbreiden · 2017-03-30T22:16:38Z

EDIT: I've revised the fix to remove a compiler warning.

--- tesseract/api/pdfrenderer.cpp	2017-03-30 16:10:23.000000000 -0700
+++ tesseract/api/pdfrenderer.cpp	2017-03-31 13:16:22.000000000 -0700
@@ -159,7 +159,7 @@
 
 OK there is a small problem there, if I use GID 0 then Acrobat gets
 upset about it and complains it cannot extract the font. If I set the
-CIDToGIDMap so that all the entries are 1 instead, its happy. Totally
+CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
 mad......
 
 */
@@ -169,10 +169,15 @@
 // Use for PDF object fragments. Must be large enough
 // to hold a colormap with 256 colors in the verbose
 // PDF representation.
-const int kBasicBufSize = 2048;
+static const int kBasicBufSize = 2048;
 
 // If the font is 10 pts, nominal character width is 5 pts
-const int kCharWidth = 2;
+static const int kCharWidth = 2;
+
+// Used for memory allocation. A codepoint must take no more than this
+// many bytes, when written in the PDF way. e.g. "<0063>" for the
+// letter 'c'
+static const int kMaxBytesPerCodepoint = 20;
 
 /**********************************************************************
  * PDF Renderer interface implementation
@@ -303,6 +308,23 @@
   if (rise < 2.0 && 2.0 < run)
     *line_y1 = *line_y2 = (y1 + y2) / 2;
 }
+
+bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
+  if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
+    tprintf("Dropping invalid codepoint %d\n", code);
+    return false;
+  }
+  if (code < 0x10000) {
+    snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);
+  } else {
+    int a = code - 0x010000;
+    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
+    int low_surrogate = (0x03FF & a) + 0xDC00;
+    snprintf(utf16, kMaxBytesPerCodepoint,
+             "%04X%04X", high_surrogate, low_surrogate);
+  }
+  return true;
+}
 
 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
                                          double width, double height) {
@@ -442,25 +464,13 @@
       if (grapheme && grapheme[0] != '\0') {
         GenericVector<int> unicodes;
         UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
-        char utf16[20];
+        char utf16[kMaxBytesPerCodepoint];
         for (int i = 0; i < unicodes.length(); i++) {
           int code = unicodes[i];
-          // Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
-          if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
-            tprintf("Dropping invalid codepoint %d\n", code);
-            continue;
+          if (CodepointToUtf16be(code, utf16)) {
+            pdf_word += utf16;
+            pdf_word_len++;
           }
-          if (code < 0x10000) {
-            snprintf(utf16, sizeof(utf16), "<%04X>", code);
-          } else {
-            int a = code - 0x010000;
-            int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
-            int low_surrogate = (0x03FF & a) + 0xDC00;
-            snprintf(utf16, sizeof(utf16), "<%04X%04X>",
-                     high_surrogate, low_surrogate);
-          }
-          pdf_word += utf16;
-          pdf_word_len++;
         }
       }
       delete []grapheme;
@@ -471,9 +481,9 @@
           kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
       pdf_str.add_str_double("", h_stretch);
       pdf_str += " Tz";          // horizontal stretch
-      pdf_str += " [ ";
+      pdf_str += " [ <";
       pdf_str += pdf_word;       // UTF-16BE representation
-      pdf_str += " ] TJ";        // show the text
+      pdf_str += "> ] TJ";       // show the text
     }
     if (last_word_in_line) {
       pdf_str += " \n";
@@ -960,15 +970,27 @@
   offsets_.back() += pages_objsize;    // manipulation #2
 
   // INFO
+  STRING utf16_title = "FEFF";  // byte_order_marker
+  GenericVector<int> unicodes;
+  UNICHAR::UTF8ToUnicode(title(), &unicodes);
+  char utf16[kMaxBytesPerCodepoint];
+  for (int i = 0; i < unicodes.length(); i++) {
+    int code = unicodes[i];
+    if (CodepointToUtf16be(code, utf16)) {
+      utf16_title += utf16;
+    }
+  }
+
   char* datestr = l_getFormattedDate();
   n = snprintf(buf, sizeof(buf),
                "%ld 0 obj\n"
                "<<\n"
                "  /Producer (Tesseract %s)\n"
                "  /CreationDate (D:%s)\n"
-               "  /Title (%s)"
+               "  /Title <%s>\n"
                ">>\n"
-               "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
+               "endobj\n",
+               obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str());
   lept_free(datestr);
   if (n >= sizeof(buf)) return false;
   AppendPDFObject(buf);
--- tesseract/api/renderer.h	2017-03-30 16:10:23.000000000 -0700
+++ tesseract/api/renderer.h	2017-03-31 10:34:40.000000000 -0700
@@ -57,6 +57,7 @@
     /**
      * Starts a new document with the given title.
      * This clears the contents of the output data.
+     * Title should use UTF-8 encoding.
      */
     bool BeginDocument(const char* title);

jbreiden added the PDF label Jan 4, 2017

jbreiden mentioned this issue Feb 21, 2017

new API method for setting producer for TessPDFRenderer #731

Closed

zdenop closed this as completed in 9038faf Apr 2, 2017

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Need better escaping for PDF title #636

Need better escaping for PDF title #636

jbreiden commented Jan 4, 2017

amitdo commented Jan 6, 2017

jbreiden commented Feb 21, 2017 •

edited

Loading

jbreiden commented Mar 30, 2017

jbreiden commented Mar 30, 2017 •

edited

Loading

Need better escaping for PDF title #636

Need better escaping for PDF title #636

Comments

jbreiden commented Jan 4, 2017

amitdo commented Jan 6, 2017

jbreiden commented Feb 21, 2017 • edited Loading

jbreiden commented Mar 30, 2017

jbreiden commented Mar 30, 2017 • edited Loading

jbreiden commented Feb 21, 2017 •

edited

Loading

jbreiden commented Mar 30, 2017 •

edited

Loading