Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix encoding issues when printing/saving a form with non-ascii characters #12292

Merged
merged 1 commit into from
Jan 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 71 additions & 39 deletions src/core/annotation.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ import {
assert,
escapeString,
getModificationDate,
isAscii,
isString,
OPS,
shadow,
stringToPDFString,
stringToUTF16BEString,
unreachable,
Util,
warn,
Expand Down Expand Up @@ -1222,7 +1224,7 @@ class WidgetAnnotation extends Annotation {
appearance = newTransform.encryptString(appearance);
}

dict.set("V", value);
dict.set("V", isAscii(value) ? value : stringToUTF16BEString(value));
dict.set("AP", AP);
dict.set("M", `D:${getModificationDate()}`);

Expand Down Expand Up @@ -1298,25 +1300,29 @@ class WidgetAnnotation extends Annotation {
const defaultAppearance = this.data.defaultAppearance;
const alignment = this.data.textAlignment;

if (this.data.comb) {
return this._getCombAppearance(
if (this.data.multiLine) {
return this._getMultilineAppearance(
defaultAppearance,
value,
font,
fontSize,
totalWidth,
totalHeight,
alignment,
hPadding,
vPadding
);
}

if (this.data.multiLine) {
return this._getMultilineAppearance(
// TODO: need to handle chars which are not in the font.
const encodedString = font.encodeString(value).join("");

if (this.data.comb) {
return this._getCombAppearance(
defaultAppearance,
value,
font,
fontSize,
encodedString,
totalWidth,
totalHeight,
alignment,
hPadding,
vPadding
);
Expand All @@ -1327,13 +1333,15 @@ class WidgetAnnotation extends Annotation {
return (
"/Tx BMC q BT " +
defaultAppearance +
` 1 0 0 1 ${hPadding} ${vPadding} Tm (${escapeString(value)}) Tj` +
` 1 0 0 1 ${hPadding} ${vPadding} Tm (${escapeString(
encodedString
)}) Tj` +
" ET Q EMC"
);
}

const renderedText = this._renderText(
value,
encodedString,
font,
fontSize,
totalWidth,
Expand Down Expand Up @@ -1373,10 +1381,21 @@ class WidgetAnnotation extends Annotation {

_computeFontSize(font, fontName, fontSize, height) {
if (fontSize === null || fontSize === 0) {
const em = font.charsToGlyphs("M")[0].width / 1000;
// According to https://en.wikipedia.org/wiki/Em_(typography)
// an average cap height should be 70% of 1em
const capHeight = 0.7 * em;
let capHeight;
if (font.capHeight) {
capHeight = font.capHeight;
} else {
const glyphs = font.charsToGlyphs(font.encodeString("M").join(""));
if (glyphs.length === 1 && glyphs[0].width) {
const em = glyphs[0].width / 1000;
// According to https://en.wikipedia.org/wiki/Em_(typography)
// an average cap height should be 70% of 1em
capHeight = 0.7 * em;
} else {
capHeight = 0.7;
}
}

// 1.5 * capHeight * fontSize seems to be a good value for lineHeight
fontSize = Math.max(1, Math.floor(height / (1.5 * capHeight)));

Expand Down Expand Up @@ -1510,11 +1529,12 @@ class TextWidgetAnnotation extends WidgetAnnotation {
this.data.maxLen !== null;
}

_getCombAppearance(defaultAppearance, text, width, hPadding, vPadding) {
_getCombAppearance(defaultAppearance, font, text, width, hPadding, vPadding) {
const combWidth = (width / this.data.maxLen).toFixed(2);
const buf = [];
for (const character of text) {
buf.push(`(${escapeString(character)}) Tj`);
const positions = font.getCharPositions(text);
for (const [start, end] of positions) {
buf.push(`(${escapeString(text.substring(start, end))}) Tj`);
}

const renderedComb = buf.join(` ${combWidth} 0 Td `);
Expand Down Expand Up @@ -1568,49 +1588,61 @@ class TextWidgetAnnotation extends WidgetAnnotation {
}

_splitLine(line, font, fontSize, width) {
if (line.length <= 1) {
// TODO: need to handle chars which are not in the font.
line = font.encodeString(line).join("");

const glyphs = font.charsToGlyphs(line);

if (glyphs.length <= 1) {
// Nothing to split
return [line];
}

const positions = font.getCharPositions(line);
const scale = fontSize / 1000;
const whitespace = font.charsToGlyphs(" ")[0].width * scale;
const chunks = [];

let lastSpacePos = -1,
let lastSpacePosInStringStart = -1,
lastSpacePosInStringEnd = -1,
lastSpacePos = -1,
startChunk = 0,
currentWidth = 0;

for (let i = 0, ii = line.length; i < ii; i++) {
const character = line.charAt(i);
if (character === " ") {
if (currentWidth + whitespace > width) {
for (let i = 0, ii = glyphs.length; i < ii; i++) {
const [start, end] = positions[i];
const glyph = glyphs[i];
const glyphWidth = glyph.width * scale;
if (glyph.unicode === " ") {
if (currentWidth + glyphWidth > width) {
// We can break here
chunks.push(line.substring(startChunk, i));
startChunk = i;
currentWidth = whitespace;
chunks.push(line.substring(startChunk, start));
startChunk = start;
currentWidth = glyphWidth;
lastSpacePosInStringStart = -1;
lastSpacePos = -1;
} else {
currentWidth += whitespace;
currentWidth += glyphWidth;
lastSpacePosInStringStart = start;
lastSpacePosInStringEnd = end;
lastSpacePos = i;
}
} else {
const charWidth = font.charsToGlyphs(character)[0].width * scale;
if (currentWidth + charWidth > width) {
if (currentWidth + glyphWidth > width) {
// We must break to the last white position (if available)
if (lastSpacePos !== -1) {
chunks.push(line.substring(startChunk, lastSpacePos + 1));
startChunk = i = lastSpacePos + 1;
lastSpacePos = -1;
if (lastSpacePosInStringStart !== -1) {
chunks.push(line.substring(startChunk, lastSpacePosInStringEnd));
startChunk = lastSpacePosInStringEnd;
i = lastSpacePos + 1;
lastSpacePosInStringStart = -1;
currentWidth = 0;
} else {
// Just break in the middle of the word
chunks.push(line.substring(startChunk, i));
startChunk = i;
currentWidth = charWidth;
chunks.push(line.substring(startChunk, start));
startChunk = start;
currentWidth = glyphWidth;
}
} else {
currentWidth += charWidth;
currentWidth += glyphWidth;
}
}
}
Expand Down
16 changes: 16 additions & 0 deletions src/core/cmap.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,22 @@ class CMap {
out.length = 1;
}

getCharCodeLength(charCode) {
const codespaceRanges = this.codespaceRanges;
for (let n = 0, nn = codespaceRanges.length; n < nn; n++) {
// Check each codespace range to see if it falls within.
const codespaceRange = codespaceRanges[n];
for (let k = 0, kk = codespaceRange.length; k < kk; ) {
const low = codespaceRange[k++];
const high = codespaceRange[k++];
if (charCode >= low && charCode <= high) {
return n + 1;
}
}
}
return 1;
}

get length() {
return this._map.length;
}
Expand Down
88 changes: 88 additions & 0 deletions src/core/fonts.js
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ var Font = (function FontClosure() {
this.defaultWidth = properties.defaultWidth;
this.composite = properties.composite;
this.cMap = properties.cMap;
this.capHeight = properties.capHeight / PDF_GLYPH_SPACE_UNITS;
this.ascent = properties.ascent / PDF_GLYPH_SPACE_UNITS;
this.descent = properties.descent / PDF_GLYPH_SPACE_UNITS;
this.fontMatrix = properties.fontMatrix;
Expand Down Expand Up @@ -3351,9 +3352,93 @@ var Font = (function FontClosure() {
return (charsCache[charsCacheKey] = glyphs);
},

/**
* Chars can have different sizes (depends on the encoding).
* @param {String} a string encoded with font encoding.
* @returns {Array<Array<number>>} the positions of each char in the string.
*/
getCharPositions(chars) {
// This function doesn't use a cache because
// it's called only when saving or printing.
const positions = [];

if (this.cMap) {
const c = Object.create(null);
let i = 0;
while (i < chars.length) {
this.cMap.readCharCode(chars, i, c);
const length = c.length;
positions.push([i, i + length]);
i += length;
}
} else {
for (let i = 0, ii = chars.length; i < ii; ++i) {
positions.push([i, i + 1]);
}
}

return positions;
},

get glyphCacheValues() {
return Object.values(this.glyphCache);
},

/**
* Encode a js string using font encoding.
* The resulting array contains an encoded string at even positions
* (can be empty) and a non-encoded one at odd positions.
* @param {String} a js string.
* @returns {Array<String>} an array of encoded strings or non-encoded ones.
*/
encodeString(str) {
const buffers = [];
const currentBuf = [];

// buffers will contain: encoded, non-encoded, encoded, ...
// currentBuf is pushed in buffers each time there is a change.
// So when buffers.length is odd then the last string is an encoded one
// and currentBuf contains non-encoded chars.
const hasCurrentBufErrors = () => buffers.length % 2 === 1;
calixteman marked this conversation as resolved.
Show resolved Hide resolved

for (let i = 0, ii = str.length; i < ii; i++) {
const unicode = str.codePointAt(i);
if (unicode > 0xd7ff && (unicode < 0xe000 || unicode > 0xfffd)) {
// unicode is represented by two uint16
i++;
}
if (this.toUnicode) {
const char = String.fromCodePoint(unicode);
const charCode = this.toUnicode.charCodeOf(char);
if (charCode !== -1) {
if (hasCurrentBufErrors()) {
buffers.push(currentBuf.join(""));
currentBuf.length = 0;
}
const charCodeLength = this.cMap
? this.cMap.getCharCodeLength(charCode)
: 1;
for (let j = charCodeLength - 1; j >= 0; j--) {
currentBuf.push(
String.fromCharCode((charCode >> (8 * j)) & 0xff)
);
}
continue;
}
}

// unicode can't be encoded
if (!hasCurrentBufErrors()) {
buffers.push(currentBuf.join(""));
currentBuf.length = 0;
}
currentBuf.push(String.fromCodePoint(unicode));
}

buffers.push(currentBuf.join(""));

return buffers;
},
};

return Font;
Expand All @@ -3371,6 +3456,9 @@ var ErrorFont = (function ErrorFontClosure() {
charsToGlyphs: function ErrorFont_charsToGlyphs() {
return [];
},
encodeString: function ErrorFont_encodeString(chars) {
return [chars];
},
exportData(extraProperties = false) {
return { error: this.error };
},
Expand Down
16 changes: 16 additions & 0 deletions src/shared/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,20 @@ function escapeString(str) {
});
}

/**
 * Tell whether a string consists solely of 7-bit ASCII characters.
 * @param {string} str - the string to inspect.
 * @returns {boolean} true when every UTF-16 code unit is in the range
 *   0x00-0x7F (the empty string qualifies), false otherwise.
 */
function isAscii(str) {
  // One code unit outside the ASCII range is enough to disqualify.
  const hasNonAscii = /[^\x00-\x7F]/.test(str);
  return !hasNonAscii;
}

/**
 * Serialize a js string as UTF-16BE bytes, one output char per byte,
 * prefixed with the byte order mark 0xFE 0xFF.
 * @param {string} str - the string to convert.
 * @returns {string} a string whose char codes are all <= 0xFF: the BOM
 *   followed by the high and low byte of each UTF-16 code unit of `str`.
 */
function stringToUTF16BEString(str) {
  const pieces = ["\xFE\xFF"];
  for (let pos = 0; pos < str.length; pos++) {
    const codeUnit = str.charCodeAt(pos);
    const highByte = (codeUnit >> 8) & 0xff;
    const lowByte = codeUnit & 0xff;
    pieces.push(String.fromCharCode(highByte), String.fromCharCode(lowByte));
  }
  return pieces.join("");
}

/**
 * Decode a string whose characters each hold one UTF-8 byte into the
 * text those bytes represent.
 * @param {string} str - a byte string (char codes 0x00-0xFF).
 * @returns {string} the decoded js string.
 */
function stringToUTF8String(str) {
  // escape() turns each byte-valued char into a %XX sequence, which
  // decodeURIComponent() then interprets as UTF-8.
  const percentEncoded = escape(str);
  return decodeURIComponent(percentEncoded);
}
Expand Down Expand Up @@ -1044,6 +1058,7 @@ export {
getModificationDate,
getVerbosityLevel,
info,
isAscii,
isArrayBuffer,
isArrayEqual,
isBool,
Expand All @@ -1061,6 +1076,7 @@ export {
string32,
stringToBytes,
stringToPDFString,
stringToUTF16BEString,
stringToUTF8String,
utf8StringToString,
warn,
Expand Down
Loading