Merge pull request #12292 from calixteman/encoding

Fix encoding issues when printing/saving a form with non-ascii characters
mozilla · Jan 7, 2021 · 5bde4b7 · 5bde4b7
2 parents 4be76c8 + 5642496
commit 5bde4b7
Show file tree

Hide file tree

Showing 6 changed files with 506 additions and 55 deletions.
diff --git a/src/core/annotation.js b/src/core/annotation.js
@@ -23,10 +23,12 @@ import {
   assert,
   escapeString,
   getModificationDate,
+  isAscii,
   isString,
   OPS,
   shadow,
   stringToPDFString,
+  stringToUTF16BEString,
   unreachable,
   Util,
   warn,
@@ -1222,7 +1224,7 @@ class WidgetAnnotation extends Annotation {
       appearance = newTransform.encryptString(appearance);
     }
 
-    dict.set("V", value);
+    dict.set("V", isAscii(value) ? value : stringToUTF16BEString(value));
     dict.set("AP", AP);
     dict.set("M", `D:${getModificationDate()}`);
 
@@ -1298,25 +1300,29 @@ class WidgetAnnotation extends Annotation {
     const defaultAppearance = this.data.defaultAppearance;
     const alignment = this.data.textAlignment;
 
-    if (this.data.comb) {
-      return this._getCombAppearance(
+    if (this.data.multiLine) {
+      return this._getMultilineAppearance(
         defaultAppearance,
         value,
+        font,
+        fontSize,
         totalWidth,
+        totalHeight,
+        alignment,
         hPadding,
         vPadding
       );
     }
 
-    if (this.data.multiLine) {
-      return this._getMultilineAppearance(
+    // TODO: need to handle chars which are not in the font.
+    const encodedString = font.encodeString(value).join("");
+
+    if (this.data.comb) {
+      return this._getCombAppearance(
         defaultAppearance,
-        value,
         font,
-        fontSize,
+        encodedString,
         totalWidth,
-        totalHeight,
-        alignment,
         hPadding,
         vPadding
       );
@@ -1327,13 +1333,15 @@ class WidgetAnnotation extends Annotation {
       return (
         "/Tx BMC q BT " +
         defaultAppearance +
-        ` 1 0 0 1 ${hPadding} ${vPadding} Tm (${escapeString(value)}) Tj` +
+        ` 1 0 0 1 ${hPadding} ${vPadding} Tm (${escapeString(
+          encodedString
+        )}) Tj` +
         " ET Q EMC"
       );
     }
 
     const renderedText = this._renderText(
-      value,
+      encodedString,
       font,
       fontSize,
       totalWidth,
@@ -1373,10 +1381,21 @@ class WidgetAnnotation extends Annotation {
 
   _computeFontSize(font, fontName, fontSize, height) {
     if (fontSize === null || fontSize === 0) {
-      const em = font.charsToGlyphs("M")[0].width / 1000;
-      // According to https://en.wikipedia.org/wiki/Em_(typography)
-      // an average cap height should be 70% of 1em
-      const capHeight = 0.7 * em;
+      let capHeight;
+      if (font.capHeight) {
+        capHeight = font.capHeight;
+      } else {
+        const glyphs = font.charsToGlyphs(font.encodeString("M").join(""));
+        if (glyphs.length === 1 && glyphs[0].width) {
+          const em = glyphs[0].width / 1000;
+          // According to https://en.wikipedia.org/wiki/Em_(typography)
+          // an average cap height should be 70% of 1em
+          capHeight = 0.7 * em;
+        } else {
+          capHeight = 0.7;
+        }
+      }
+
       // 1.5 * capHeight * fontSize seems to be a good value for lineHeight
       fontSize = Math.max(1, Math.floor(height / (1.5 * capHeight)));
 
@@ -1510,11 +1529,12 @@ class TextWidgetAnnotation extends WidgetAnnotation {
       this.data.maxLen !== null;
   }
 
-  _getCombAppearance(defaultAppearance, text, width, hPadding, vPadding) {
+  _getCombAppearance(defaultAppearance, font, text, width, hPadding, vPadding) {
     const combWidth = (width / this.data.maxLen).toFixed(2);
     const buf = [];
-    for (const character of text) {
-      buf.push(`(${escapeString(character)}) Tj`);
+    const positions = font.getCharPositions(text);
+    for (const [start, end] of positions) {
+      buf.push(`(${escapeString(text.substring(start, end))}) Tj`);
     }
 
     const renderedComb = buf.join(` ${combWidth} 0 Td `);
@@ -1568,49 +1588,61 @@ class TextWidgetAnnotation extends WidgetAnnotation {
   }
 
   _splitLine(line, font, fontSize, width) {
-    if (line.length <= 1) {
+    // TODO: need to handle chars which are not in the font.
+    line = font.encodeString(line).join("");
+
+    const glyphs = font.charsToGlyphs(line);
+
+    if (glyphs.length <= 1) {
       // Nothing to split
       return [line];
     }
 
+    const positions = font.getCharPositions(line);
     const scale = fontSize / 1000;
-    const whitespace = font.charsToGlyphs(" ")[0].width * scale;
     const chunks = [];
 
-    let lastSpacePos = -1,
+    let lastSpacePosInStringStart = -1,
+      lastSpacePosInStringEnd = -1,
+      lastSpacePos = -1,
       startChunk = 0,
       currentWidth = 0;
 
-    for (let i = 0, ii = line.length; i < ii; i++) {
-      const character = line.charAt(i);
-      if (character === " ") {
-        if (currentWidth + whitespace > width) {
+    for (let i = 0, ii = glyphs.length; i < ii; i++) {
+      const [start, end] = positions[i];
+      const glyph = glyphs[i];
+      const glyphWidth = glyph.width * scale;
+      if (glyph.unicode === " ") {
+        if (currentWidth + glyphWidth > width) {
           // We can break here
-          chunks.push(line.substring(startChunk, i));
-          startChunk = i;
-          currentWidth = whitespace;
+          chunks.push(line.substring(startChunk, start));
+          startChunk = start;
+          currentWidth = glyphWidth;
+          lastSpacePosInStringStart = -1;
           lastSpacePos = -1;
         } else {
-          currentWidth += whitespace;
+          currentWidth += glyphWidth;
+          lastSpacePosInStringStart = start;
+          lastSpacePosInStringEnd = end;
           lastSpacePos = i;
         }
       } else {
-        const charWidth = font.charsToGlyphs(character)[0].width * scale;
-        if (currentWidth + charWidth > width) {
+        if (currentWidth + glyphWidth > width) {
           // We must break to the last white position (if available)
-          if (lastSpacePos !== -1) {
-            chunks.push(line.substring(startChunk, lastSpacePos + 1));
-            startChunk = i = lastSpacePos + 1;
-            lastSpacePos = -1;
+          if (lastSpacePosInStringStart !== -1) {
+            chunks.push(line.substring(startChunk, lastSpacePosInStringEnd));
+            startChunk = lastSpacePosInStringEnd;
+            i = lastSpacePos + 1;
+            lastSpacePosInStringStart = -1;
             currentWidth = 0;
           } else {
             // Just break in the middle of the word
-            chunks.push(line.substring(startChunk, i));
-            startChunk = i;
-            currentWidth = charWidth;
+            chunks.push(line.substring(startChunk, start));
+            startChunk = start;
+            currentWidth = glyphWidth;
           }
         } else {
-          currentWidth += charWidth;
+          currentWidth += glyphWidth;
         }
       }
     }

diff --git a/src/core/cmap.js b/src/core/cmap.js
@@ -338,6 +338,22 @@ class CMap {
     out.length = 1;
   }
 
+  getCharCodeLength(charCode) {
+    const codespaceRanges = this.codespaceRanges;
+    for (let n = 0, nn = codespaceRanges.length; n < nn; n++) {
+      // Check each codespace range to see if it falls within.
+      const codespaceRange = codespaceRanges[n];
+      for (let k = 0, kk = codespaceRange.length; k < kk; ) {
+        const low = codespaceRange[k++];
+        const high = codespaceRange[k++];
+        if (charCode >= low && charCode <= high) {
+          return n + 1;
+        }
+      }
+    }
+    return 1;
+  }
+
   get length() {
     return this._map.length;
   }

diff --git a/src/core/fonts.js b/src/core/fonts.js
@@ -590,6 +590,7 @@ var Font = (function FontClosure() {
     this.defaultWidth = properties.defaultWidth;
     this.composite = properties.composite;
     this.cMap = properties.cMap;
+    this.capHeight = properties.capHeight / PDF_GLYPH_SPACE_UNITS;
     this.ascent = properties.ascent / PDF_GLYPH_SPACE_UNITS;
     this.descent = properties.descent / PDF_GLYPH_SPACE_UNITS;
     this.fontMatrix = properties.fontMatrix;
@@ -3351,9 +3352,93 @@ var Font = (function FontClosure() {
       return (charsCache[charsCacheKey] = glyphs);
     },
 
+    /**
+     * Chars can have different sizes (depends on the encoding).
+     * @param {String} a string encoded with font encoding.
+     * @returns {Array<Array<number>>} the positions of each char in the string.
+     */
+    getCharPositions(chars) {
+      // This function doesn't use a cache because
+      // it's called only when saving or printing.
+      const positions = [];
+
+      if (this.cMap) {
+        const c = Object.create(null);
+        let i = 0;
+        while (i < chars.length) {
+          this.cMap.readCharCode(chars, i, c);
+          const length = c.length;
+          positions.push([i, i + length]);
+          i += length;
+        }
+      } else {
+        for (let i = 0, ii = chars.length; i < ii; ++i) {
+          positions.push([i, i + 1]);
+        }
+      }
+
+      return positions;
+    },
+
     // Returns the values stored in `this.glyphCache` as an array.
     get glyphCacheValues() {
       return Object.values(this.glyphCache);
     },
+
+    /**
+     * Encode a js string using font encoding.
+     * The resulting array contains an encoded string at even positions
+     * (can be empty) and a non-encoded one at odd positions.
+     * @param {String} a js string.
+     * @returns {Array<String>} an array of encoded strings or non-encoded ones.
+     */
+    encodeString(str) {
+      const buffers = [];
+      const currentBuf = [];
+
+      // buffers will contain: encoded, non-encoded, encoded, ...
+      // currentBuf is pushed in buffers each time there is a change.
+      // So when buffers.length is odd then the last string is an encoded one
+      // and currentBuf contains non-encoded chars.
+      const hasCurrentBufErrors = () => buffers.length % 2 === 1;
+
+      for (let i = 0, ii = str.length; i < ii; i++) {
+        const unicode = str.codePointAt(i);
+        if (unicode > 0xd7ff && (unicode < 0xe000 || unicode > 0xfffd)) {
+          // unicode is represented by two uint16
+          i++;
+        }
+        if (this.toUnicode) {
+          const char = String.fromCodePoint(unicode);
+          const charCode = this.toUnicode.charCodeOf(char);
+          if (charCode !== -1) {
+            if (hasCurrentBufErrors()) {
+              buffers.push(currentBuf.join(""));
+              currentBuf.length = 0;
+            }
+            const charCodeLength = this.cMap
+              ? this.cMap.getCharCodeLength(charCode)
+              : 1;
+            for (let j = charCodeLength - 1; j >= 0; j--) {
+              currentBuf.push(
+                String.fromCharCode((charCode >> (8 * j)) & 0xff)
+              );
+            }
+            continue;
+          }
+        }
+
+        // unicode can't be encoded
+        if (!hasCurrentBufErrors()) {
+          buffers.push(currentBuf.join(""));
+          currentBuf.length = 0;
+        }
+        currentBuf.push(String.fromCodePoint(unicode));
+      }
+
+      buffers.push(currentBuf.join(""));
+
+      return buffers;
+    },
   };
 
   return Font;
@@ -3371,6 +3456,9 @@ var ErrorFont = (function ErrorFontClosure() {
     charsToGlyphs: function ErrorFont_charsToGlyphs() {
       return [];
     },
+    // Returns the input unchanged, wrapped in a single-element array.
+    encodeString: function ErrorFont_encodeString(chars) {
+      return [chars];
+    },
     exportData(extraProperties = false) {
       return { error: this.error };
     },

diff --git a/src/shared/util.js b/src/shared/util.js
@@ -842,6 +842,20 @@ function escapeString(str) {
   });
 }
 
+/**
+ * Check whether a string is made only of chars in the range 0x00-0x7F.
+ * The empty string also passes the check.
+ * @param {string} str - the string to test.
+ * @returns {boolean} true when every char is within 0x00-0x7F.
+ */
+function isAscii(str) {
+  return /^[\x00-\x7F]*$/.test(str);
+}
+
+function stringToUTF16BEString(str) {
+  const buf = ["\xFE\xFF"];
+  for (let i = 0, ii = str.length; i < ii; i++) {
+    const char = str.charCodeAt(i);
+    buf.push(String.fromCharCode((char >> 8) & 0xff));
+    buf.push(String.fromCharCode(char & 0xff));
+  }
+  return buf.join("");
+}
+
 // Passes `str` through `escape` and then `decodeURIComponent`, so the
 // percent-escaped char codes are decoded as UTF-8 byte sequences.
 function stringToUTF8String(str) {
   return decodeURIComponent(escape(str));
 }
@@ -1044,6 +1058,7 @@ export {
   getModificationDate,
   getVerbosityLevel,
   info,
+  isAscii,
   isArrayBuffer,
   isArrayEqual,
   isBool,
@@ -1061,6 +1076,7 @@ export {
   string32,
   stringToBytes,
   stringToPDFString,
+  stringToUTF16BEString,
   stringToUTF8String,
   utf8StringToString,
   warn,