diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index 5be926f728..2ba6279a58 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -413,13 +413,25 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != '\0') { - // TODO(jbreiden) Do a real UTF-16BE conversion - // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure - string_32 utf32; - CubeUtils::UTF8ToUTF32(grapheme, &utf32); + GenericVector unicodes; + UNICHAR::UTF8ToUnicode(grapheme, &unicodes); char utf16[20]; - for (int i = 0; i < static_cast(utf32.length()); i++) { - snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]); + for (int i = 0; i < unicodes.length(); i++) { + int code = unicodes[i]; + // Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16 + if ((code > 0xD7FF && code < 0xE0000) || code > 0x10FFFF) { + tprintf("Dropping invalid codepoint %d\n", code); + continue; + } + if (code < 0x10000) { + snprintf(utf16, sizeof(utf16), "<%04X>", code); + } else { + int a = code - 0x010000; + int high_surrogate = (0x03FF & (a >> 10)) + 0xD800; + int low_surrogate = (0x03FF & a) + 0xDC00; + snprintf(utf16, sizeof(utf16), "<%04X%04X>", + high_surrogate, low_surrogate); + } pdf_word += utf16; pdf_word_len++; }