Skip to content

Commit

Permalink
Merge pull request #12292 from calixteman/encoding
Browse files Browse the repository at this point in the history
Fix encoding issues when printing/saving a form with non-ascii characters
  • Loading branch information
timvandermeij authored Jan 7, 2021
2 parents 4be76c8 + 5642496 commit 5bde4b7
Show file tree
Hide file tree
Showing 6 changed files with 506 additions and 55 deletions.
110 changes: 71 additions & 39 deletions src/core/annotation.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ import {
assert,
escapeString,
getModificationDate,
isAscii,
isString,
OPS,
shadow,
stringToPDFString,
stringToUTF16BEString,
unreachable,
Util,
warn,
Expand Down Expand Up @@ -1222,7 +1224,7 @@ class WidgetAnnotation extends Annotation {
appearance = newTransform.encryptString(appearance);
}

dict.set("V", value);
dict.set("V", isAscii(value) ? value : stringToUTF16BEString(value));
dict.set("AP", AP);
dict.set("M", `D:${getModificationDate()}`);

Expand Down Expand Up @@ -1298,25 +1300,29 @@ class WidgetAnnotation extends Annotation {
const defaultAppearance = this.data.defaultAppearance;
const alignment = this.data.textAlignment;

if (this.data.comb) {
return this._getCombAppearance(
if (this.data.multiLine) {
return this._getMultilineAppearance(
defaultAppearance,
value,
font,
fontSize,
totalWidth,
totalHeight,
alignment,
hPadding,
vPadding
);
}

if (this.data.multiLine) {
return this._getMultilineAppearance(
// TODO: need to handle chars which are not in the font.
const encodedString = font.encodeString(value).join("");

if (this.data.comb) {
return this._getCombAppearance(
defaultAppearance,
value,
font,
fontSize,
encodedString,
totalWidth,
totalHeight,
alignment,
hPadding,
vPadding
);
Expand All @@ -1327,13 +1333,15 @@ class WidgetAnnotation extends Annotation {
return (
"/Tx BMC q BT " +
defaultAppearance +
` 1 0 0 1 ${hPadding} ${vPadding} Tm (${escapeString(value)}) Tj` +
` 1 0 0 1 ${hPadding} ${vPadding} Tm (${escapeString(
encodedString
)}) Tj` +
" ET Q EMC"
);
}

const renderedText = this._renderText(
value,
encodedString,
font,
fontSize,
totalWidth,
Expand Down Expand Up @@ -1373,10 +1381,21 @@ class WidgetAnnotation extends Annotation {

_computeFontSize(font, fontName, fontSize, height) {
if (fontSize === null || fontSize === 0) {
const em = font.charsToGlyphs("M")[0].width / 1000;
// According to https://en.wikipedia.org/wiki/Em_(typography)
// an average cap height should be 70% of 1em
const capHeight = 0.7 * em;
let capHeight;
if (font.capHeight) {
capHeight = font.capHeight;
} else {
const glyphs = font.charsToGlyphs(font.encodeString("M").join(""));
if (glyphs.length === 1 && glyphs[0].width) {
const em = glyphs[0].width / 1000;
// According to https://en.wikipedia.org/wiki/Em_(typography)
// an average cap height should be 70% of 1em
capHeight = 0.7 * em;
} else {
capHeight = 0.7;
}
}

// 1.5 * capHeight * fontSize seems to be a good value for lineHeight
fontSize = Math.max(1, Math.floor(height / (1.5 * capHeight)));

Expand Down Expand Up @@ -1510,11 +1529,12 @@ class TextWidgetAnnotation extends WidgetAnnotation {
this.data.maxLen !== null;
}

_getCombAppearance(defaultAppearance, text, width, hPadding, vPadding) {
_getCombAppearance(defaultAppearance, font, text, width, hPadding, vPadding) {
const combWidth = (width / this.data.maxLen).toFixed(2);
const buf = [];
for (const character of text) {
buf.push(`(${escapeString(character)}) Tj`);
const positions = font.getCharPositions(text);
for (const [start, end] of positions) {
buf.push(`(${escapeString(text.substring(start, end))}) Tj`);
}

const renderedComb = buf.join(` ${combWidth} 0 Td `);
Expand Down Expand Up @@ -1568,49 +1588,61 @@ class TextWidgetAnnotation extends WidgetAnnotation {
}

_splitLine(line, font, fontSize, width) {
if (line.length <= 1) {
// TODO: need to handle chars which are not in the font.
line = font.encodeString(line).join("");

const glyphs = font.charsToGlyphs(line);

if (glyphs.length <= 1) {
// Nothing to split
return [line];
}

const positions = font.getCharPositions(line);
const scale = fontSize / 1000;
const whitespace = font.charsToGlyphs(" ")[0].width * scale;
const chunks = [];

let lastSpacePos = -1,
let lastSpacePosInStringStart = -1,
lastSpacePosInStringEnd = -1,
lastSpacePos = -1,
startChunk = 0,
currentWidth = 0;

for (let i = 0, ii = line.length; i < ii; i++) {
const character = line.charAt(i);
if (character === " ") {
if (currentWidth + whitespace > width) {
for (let i = 0, ii = glyphs.length; i < ii; i++) {
const [start, end] = positions[i];
const glyph = glyphs[i];
const glyphWidth = glyph.width * scale;
if (glyph.unicode === " ") {
if (currentWidth + glyphWidth > width) {
// We can break here
chunks.push(line.substring(startChunk, i));
startChunk = i;
currentWidth = whitespace;
chunks.push(line.substring(startChunk, start));
startChunk = start;
currentWidth = glyphWidth;
lastSpacePosInStringStart = -1;
lastSpacePos = -1;
} else {
currentWidth += whitespace;
currentWidth += glyphWidth;
lastSpacePosInStringStart = start;
lastSpacePosInStringEnd = end;
lastSpacePos = i;
}
} else {
const charWidth = font.charsToGlyphs(character)[0].width * scale;
if (currentWidth + charWidth > width) {
if (currentWidth + glyphWidth > width) {
// We must break to the last white position (if available)
if (lastSpacePos !== -1) {
chunks.push(line.substring(startChunk, lastSpacePos + 1));
startChunk = i = lastSpacePos + 1;
lastSpacePos = -1;
if (lastSpacePosInStringStart !== -1) {
chunks.push(line.substring(startChunk, lastSpacePosInStringEnd));
startChunk = lastSpacePosInStringEnd;
i = lastSpacePos + 1;
lastSpacePosInStringStart = -1;
currentWidth = 0;
} else {
// Just break in the middle of the word
chunks.push(line.substring(startChunk, i));
startChunk = i;
currentWidth = charWidth;
chunks.push(line.substring(startChunk, start));
startChunk = start;
currentWidth = glyphWidth;
}
} else {
currentWidth += charWidth;
currentWidth += glyphWidth;
}
}
}
Expand Down
16 changes: 16 additions & 0 deletions src/core/cmap.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,22 @@ class CMap {
out.length = 1;
}

getCharCodeLength(charCode) {
const codespaceRanges = this.codespaceRanges;
for (let n = 0, nn = codespaceRanges.length; n < nn; n++) {
// Check each codespace range to see if it falls within.
const codespaceRange = codespaceRanges[n];
for (let k = 0, kk = codespaceRange.length; k < kk; ) {
const low = codespaceRange[k++];
const high = codespaceRange[k++];
if (charCode >= low && charCode <= high) {
return n + 1;
}
}
}
return 1;
}

get length() {
return this._map.length;
}
Expand Down
88 changes: 88 additions & 0 deletions src/core/fonts.js
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ var Font = (function FontClosure() {
this.defaultWidth = properties.defaultWidth;
this.composite = properties.composite;
this.cMap = properties.cMap;
this.capHeight = properties.capHeight / PDF_GLYPH_SPACE_UNITS;
this.ascent = properties.ascent / PDF_GLYPH_SPACE_UNITS;
this.descent = properties.descent / PDF_GLYPH_SPACE_UNITS;
this.fontMatrix = properties.fontMatrix;
Expand Down Expand Up @@ -3351,9 +3352,93 @@ var Font = (function FontClosure() {
return (charsCache[charsCacheKey] = glyphs);
},

/**
* Chars can have different sizes (depends on the encoding).
* @param {String} a string encoded with font encoding.
* @returns {Array<Array<number>>} the positions of each char in the string.
*/
getCharPositions(chars) {
// This function doesn't use a cache because
// it's called only when saving or printing.
const positions = [];

if (this.cMap) {
const c = Object.create(null);
let i = 0;
while (i < chars.length) {
this.cMap.readCharCode(chars, i, c);
const length = c.length;
positions.push([i, i + length]);
i += length;
}
} else {
for (let i = 0, ii = chars.length; i < ii; ++i) {
positions.push([i, i + 1]);
}
}

return positions;
},

get glyphCacheValues() {
return Object.values(this.glyphCache);
},

/**
* Encode a js string using font encoding.
* The resulting array contains an encoded string at even positions
* (can be empty) and a non-encoded one at odd positions.
* @param {String} a js string.
* @returns {Array<String>} an array of encoded strings or non-encoded ones.
*/
encodeString(str) {
const buffers = [];
const currentBuf = [];

// buffers will contain: encoded, non-encoded, encoded, ...
// currentBuf is pushed in buffers each time there is a change.
// So when buffers.length is odd then the last string is an encoded one
// and currentBuf contains non-encoded chars.
const hasCurrentBufErrors = () => buffers.length % 2 === 1;

for (let i = 0, ii = str.length; i < ii; i++) {
const unicode = str.codePointAt(i);
if (unicode > 0xd7ff && (unicode < 0xe000 || unicode > 0xfffd)) {
// unicode is represented by two uint16
i++;
}
if (this.toUnicode) {
const char = String.fromCodePoint(unicode);
const charCode = this.toUnicode.charCodeOf(char);
if (charCode !== -1) {
if (hasCurrentBufErrors()) {
buffers.push(currentBuf.join(""));
currentBuf.length = 0;
}
const charCodeLength = this.cMap
? this.cMap.getCharCodeLength(charCode)
: 1;
for (let j = charCodeLength - 1; j >= 0; j--) {
currentBuf.push(
String.fromCharCode((charCode >> (8 * j)) & 0xff)
);
}
continue;
}
}

// unicode can't be encoded
if (!hasCurrentBufErrors()) {
buffers.push(currentBuf.join(""));
currentBuf.length = 0;
}
currentBuf.push(String.fromCodePoint(unicode));
}

buffers.push(currentBuf.join(""));

return buffers;
},
};

return Font;
Expand All @@ -3371,6 +3456,9 @@ var ErrorFont = (function ErrorFontClosure() {
// Stub: always yields an empty glyph list.
charsToGlyphs: function ErrorFont_charsToGlyphs() {
return [];
},
// Stub: hands back the input untouched as the only element of the array.
// NOTE(review): Font.encodeString documents index 0 as an *encoded* string;
// here index 0 holds the raw input. Visible callers only join("") the
// result — confirm no caller relies on the even/odd layout.
encodeString: function ErrorFont_encodeString(chars) {
return [chars];
},
// Only the error is exported; `extraProperties` is not read in this body.
exportData(extraProperties = false) {
return { error: this.error };
},
Expand Down
16 changes: 16 additions & 0 deletions src/shared/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,20 @@ function escapeString(str) {
});
}

/**
 * Tell whether a string is made only of chars in the range 0x00-0x7F.
 * @param {string} str - the string to check.
 * @returns {boolean} true when every char is ASCII (an empty string
 *   qualifies), false otherwise.
 */
function isAscii(str) {
  const asciiOnly = /^[\x00-\x7F]*$/;
  return asciiOnly.test(str);
}

/**
 * Serialize a js string as UTF-16BE, one output char per byte.
 * The result starts with the byte order mark 0xFE 0xFF, followed by the
 * high byte then the low byte of every UTF-16 code unit of the input.
 * @param {string} str - the string to convert.
 * @returns {string} a string whose chars are all in the range 0x00-0xFF.
 */
function stringToUTF16BEString(str) {
  const pieces = ["\xFE\xFF"];
  const unitCount = str.length;
  for (let pos = 0; pos < unitCount; pos++) {
    const unit = str.charCodeAt(pos);
    const highByte = (unit >> 8) & 0xff;
    const lowByte = unit & 0xff;
    pieces.push(String.fromCharCode(highByte, lowByte));
  }
  return pieces.join("");
}

/**
 * Decode a string holding UTF-8 bytes (one char per byte) into a regular
 * js string, by percent-escaping it and decoding the escapes as UTF-8.
 * @param {string} str - the byte string to decode.
 * @returns {string} the decoded string.
 * @throws {URIError} when the escaped input isn't valid for
 *   decodeURIComponent (e.g. malformed UTF-8 sequences).
 */
function stringToUTF8String(str) {
  const percentEncoded = escape(str);
  return decodeURIComponent(percentEncoded);
}
Expand Down Expand Up @@ -1044,6 +1058,7 @@ export {
getModificationDate,
getVerbosityLevel,
info,
isAscii,
isArrayBuffer,
isArrayEqual,
isBool,
Expand All @@ -1061,6 +1076,7 @@ export {
string32,
stringToBytes,
stringToPDFString,
stringToUTF16BEString,
stringToUTF8String,
utf8StringToString,
warn,
Expand Down
Loading

0 comments on commit 5bde4b7

Please sign in to comment.