Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Need better escaping for PDF title #636

Closed
jbreiden opened this issue Jan 4, 2017 · 4 comments
Closed

Need better escaping for PDF title #636

jbreiden opened this issue Jan 4, 2017 · 4 comments
Labels

Comments

@jbreiden
Copy link
Contributor

jbreiden commented Jan 4, 2017

We're going to get corrupt output if an open paren is passed as a title.
Proper escaping looks like this. The leading FEFF is boilerplate that
signifies the byte order; everything else is UTF-16BE. The title in this
case is "ru".

<< /CreationDate (D:20170103154208-08'00') /Producer (Tesseract 2000) /Title <FEFF00720075> >>

https://github.com/tesseract-ocr/tesseract/blob/master/api/pdfrenderer.cpp#L963

@jbreiden jbreiden added the PDF label Jan 4, 2017
@amitdo
Copy link
Collaborator

amitdo commented Jan 6, 2017

DanBloomberg/leptonica@792db025518a

Encode pdf title in escape 4-byte hex for safety.

@jbreiden
Copy link
Contributor Author

jbreiden commented Feb 21, 2017

Dan's Leptonica change is for a different code path and is not reusable, because it only works for ASCII. The Tesseract fix needs to use UTF-16BE, which is fortunately already used elsewhere in pdfrenderer.cpp.

@jbreiden
Copy link
Contributor Author

Fix written, under review.

@jbreiden
Copy link
Contributor Author

jbreiden commented Mar 30, 2017

EDIT: I've revised the fix to remove a compiler warning.

--- tesseract/api/pdfrenderer.cpp	2017-03-30 16:10:23.000000000 -0700
+++ tesseract/api/pdfrenderer.cpp	2017-03-31 13:16:22.000000000 -0700
@@ -159,7 +159,7 @@
 
 OK there is a small problem there, if I use GID 0 then Acrobat gets
 upset about it and complains it cannot extract the font. If I set the
-CIDToGIDMap so that all the entries are 1 instead, its happy. Totally
+CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
 mad......
 
 */
@@ -169,10 +169,15 @@
 // Use for PDF object fragments. Must be large enough
 // to hold a colormap with 256 colors in the verbose
 // PDF representation.
-const int kBasicBufSize = 2048;
+static const int kBasicBufSize = 2048;
 
 // If the font is 10 pts, nominal character width is 5 pts
-const int kCharWidth = 2;
+static const int kCharWidth = 2;
+
+// Used for memory allocation. A codepoint must take no more than this
+// many bytes, when written in the PDF way. e.g. "<0063>" for the
+// letter 'c'
+static const int kMaxBytesPerCodepoint = 20;
 
 /**********************************************************************
  * PDF Renderer interface implementation
@@ -303,6 +308,23 @@
   if (rise < 2.0 && 2.0 < run)
     *line_y1 = *line_y2 = (y1 + y2) / 2;
 }
+
+bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
+  if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
+    tprintf("Dropping invalid codepoint %d\n", code);
+    return false;
+  }
+  if (code < 0x10000) {
+    snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);
+  } else {
+    int a = code - 0x010000;
+    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
+    int low_surrogate = (0x03FF & a) + 0xDC00;
+    snprintf(utf16, kMaxBytesPerCodepoint,
+             "%04X%04X", high_surrogate, low_surrogate);
+  }
+  return true;
+}
 
 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
                                          double width, double height) {
@@ -442,25 +464,13 @@
       if (grapheme && grapheme[0] != '\0') {
         GenericVector<int> unicodes;
         UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
-        char utf16[20];
+        char utf16[kMaxBytesPerCodepoint];
         for (int i = 0; i < unicodes.length(); i++) {
           int code = unicodes[i];
-          // Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
-          if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
-            tprintf("Dropping invalid codepoint %d\n", code);
-            continue;
+          if (CodepointToUtf16be(code, utf16)) {
+            pdf_word += utf16;
+            pdf_word_len++;
           }
-          if (code < 0x10000) {
-            snprintf(utf16, sizeof(utf16), "<%04X>", code);
-          } else {
-            int a = code - 0x010000;
-            int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
-            int low_surrogate = (0x03FF & a) + 0xDC00;
-            snprintf(utf16, sizeof(utf16), "<%04X%04X>",
-                     high_surrogate, low_surrogate);
-          }
-          pdf_word += utf16;
-          pdf_word_len++;
         }
       }
       delete []grapheme;
@@ -471,9 +481,9 @@
           kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
       pdf_str.add_str_double("", h_stretch);
       pdf_str += " Tz";          // horizontal stretch
-      pdf_str += " [ ";
+      pdf_str += " [ <";
       pdf_str += pdf_word;       // UTF-16BE representation
-      pdf_str += " ] TJ";        // show the text
+      pdf_str += "> ] TJ";       // show the text
     }
     if (last_word_in_line) {
       pdf_str += " \n";
@@ -960,15 +970,27 @@
   offsets_.back() += pages_objsize;    // manipulation #2
 
   // INFO
+  STRING utf16_title = "FEFF";  // byte_order_marker
+  GenericVector<int> unicodes;
+  UNICHAR::UTF8ToUnicode(title(), &unicodes);
+  char utf16[kMaxBytesPerCodepoint];
+  for (int i = 0; i < unicodes.length(); i++) {
+    int code = unicodes[i];
+    if (CodepointToUtf16be(code, utf16)) {
+      utf16_title += utf16;
+    }
+  }
+
   char* datestr = l_getFormattedDate();
   n = snprintf(buf, sizeof(buf),
                "%ld 0 obj\n"
                "<<\n"
                "  /Producer (Tesseract %s)\n"
                "  /CreationDate (D:%s)\n"
-               "  /Title (%s)"
+               "  /Title <%s>\n"
                ">>\n"
-               "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
+               "endobj\n",
+               obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str());
   lept_free(datestr);
   if (n >= sizeof(buf)) return false;
   AppendPDFObject(buf);
--- tesseract/api/renderer.h	2017-03-30 16:10:23.000000000 -0700
+++ tesseract/api/renderer.h	2017-03-31 10:34:40.000000000 -0700
@@ -57,6 +57,7 @@
     /**
      * Starts a new document with the given title.
      * This clears the contents of the output data.
+     * Title should use UTF-8 encoding.
      */
     bool BeginDocument(const char* title);

@zdenop zdenop closed this as completed in 9038faf Apr 2, 2017
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

2 participants