diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 642e6aa47d7d0..2eb9847c91fbf 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3178,10 +3178,10 @@ class PartialEvaluator { } /** - * @returns {ToUnicodeMap} + * @returns {Array} * @private */ - _buildSimpleFontToUnicode(properties, forceGlyphs = false) { + _simpleFontToUnicode(properties, forceGlyphs = false) { assert(!properties.composite, "Must be a simple font."); const toUnicode = []; @@ -3242,7 +3242,7 @@ class PartialEvaluator { Number.isNaN(code) && Number.isInteger(parseInt(codeStr, 16)) ) { - return this._buildSimpleFontToUnicode( + return this._simpleFontToUnicode( properties, /* forceGlyphs */ true ); @@ -3275,7 +3275,7 @@ class PartialEvaluator { } toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]); } - return new ToUnicodeMap(toUnicode); + return toUnicode; } /** @@ -3284,7 +3284,7 @@ class PartialEvaluator { * @returns {Promise} A Promise that is resolved with a * {ToUnicodeMap|IdentityToUnicodeMap} object. */ - buildToUnicode(properties) { + async buildToUnicode(properties) { properties.hasIncludedToUnicodeMap = !!properties.toUnicode && properties.toUnicode.length > 0; @@ -3294,11 +3294,9 @@ class PartialEvaluator { // text-extraction. For simple fonts, containing encoding information, // use a fallback ToUnicode map to improve this (fixes issue8229.pdf). if (!properties.composite && properties.hasEncoding) { - properties.fallbackToUnicode = - this._buildSimpleFontToUnicode(properties); + properties.fallbackToUnicode = this._simpleFontToUnicode(properties); } - - return Promise.resolve(properties.toUnicode); + return properties.toUnicode; } // According to the spec if the font is a simple font we should only map @@ -3307,7 +3305,7 @@ class PartialEvaluator { // in pratice it seems better to always try to create a toUnicode map // based of the default encoding. if (!properties.composite /* is simple font */) { - return Promise.resolve(this._buildSimpleFontToUnicode(properties)); + return new ToUnicodeMap(this._simpleFontToUnicode(properties)); } // If the font is a composite font that uses one of the predefined CMaps @@ -3330,42 +3328,37 @@ class PartialEvaluator { // b) Obtain the registry and ordering of the character collection used // by the font’s CMap (for example, Adobe and Japan1) from its // CIDSystemInfo dictionary. - const registry = properties.cidSystemInfo.registry; - const ordering = properties.cidSystemInfo.ordering; + const { registry, ordering } = properties.cidSystemInfo; // c) Construct a second CMap name by concatenating the registry and // ordering obtained in step (b) in the format registry–ordering–UCS2 // (for example, Adobe–Japan1–UCS2). - const ucs2CMapName = Name.get(registry + "-" + ordering + "-UCS2"); + const ucs2CMapName = Name.get(`${registry}-${ordering}-UCS2`); // d) Obtain the CMap with the name constructed in step (c) (available // from the ASN Web site; see the Bibliography). - return CMapFactory.create({ + const ucs2CMap = await CMapFactory.create({ encoding: ucs2CMapName, fetchBuiltInCMap: this._fetchBuiltInCMapBound, useCMap: null, - }).then(function (ucs2CMap) { - const cMap = properties.cMap; - const toUnicode = []; - cMap.forEach(function (charcode, cid) { - if (cid > 0xffff) { - throw new FormatError("Max size of CID is 65,535"); - } - // e) Map the CID obtained in step (a) according to the CMap - // obtained in step (d), producing a Unicode value. - const ucs2 = ucs2CMap.lookup(cid); - if (ucs2) { - toUnicode[charcode] = String.fromCharCode( - (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1) - ); - } - }); - return new ToUnicodeMap(toUnicode); }); + const toUnicode = []; + properties.cMap.forEach(function (charcode, cid) { + if (cid > 0xffff) { + throw new FormatError("Max size of CID is 65,535"); + } + // e) Map the CID obtained in step (a) according to the CMap + // obtained in step (d), producing a Unicode value. + const ucs2 = ucs2CMap.lookup(cid); + if (ucs2) { + toUnicode[charcode] = String.fromCharCode( + (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1) + ); + } + }); + return new ToUnicodeMap(toUnicode); } // The viewer's choice, just use an identity map. - return Promise.resolve( - new IdentityToUnicodeMap(properties.firstChar, properties.lastChar) - ); + return new IdentityToUnicodeMap(properties.firstChar, properties.lastChar); } readToUnicode(cmapObj) { diff --git a/src/core/fonts.js b/src/core/fonts.js index 8b8ec434d3377..370035ffe5cdb 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -135,9 +135,6 @@ function adjustToUnicode(properties, builtInEncoding) { if (properties.isInternalFont) { return; } - if (properties.hasIncludedToUnicodeMap) { - return; // The font dictionary has a `ToUnicode` entry. - } if (builtInEncoding === properties.defaultEncoding) { return; // No point in trying to adjust `toUnicode` if the encodings match. } @@ -147,11 +144,17 @@ function adjustToUnicode(properties, builtInEncoding) { const toUnicode = [], glyphsUnicodeMap = getGlyphsUnicode(); for (const charCode in builtInEncoding) { - if ( - properties.hasEncoding && - properties.differences[charCode] !== undefined - ) { - continue; // The font dictionary has an `Encoding`/`Differences` entry. + if (properties.hasIncludedToUnicodeMap) { + if (properties.toUnicode.has(charCode)) { + continue; // The font dictionary has a `ToUnicode` entry. + } + } else { + if ( + properties.hasEncoding && + properties.differences[charCode] !== undefined + ) { + continue; // The font dictionary has an `Encoding`/`Differences` entry. + } } const glyphName = builtInEncoding[charCode]; const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); @@ -159,7 +162,32 @@ function adjustToUnicode(properties, builtInEncoding) { toUnicode[charCode] = String.fromCharCode(unicode); } } - properties.toUnicode.amend(toUnicode); + if (toUnicode.length > 0) { + properties.toUnicode.amend(toUnicode); + } +} + +/** + * NOTE: This function should only be called at the *end* of font-parsing, + * after e.g. `adjustToUnicode` has run, to prevent any issues. + */ +function amendFallbackToUnicode(properties) { + if (!properties.fallbackToUnicode) { + return; + } + if (properties.toUnicode instanceof IdentityToUnicodeMap) { + return; + } + const toUnicode = []; + for (const charCode in properties.fallbackToUnicode) { + if (properties.toUnicode.has(charCode)) { + continue; // The font dictionary has a `ToUnicode` entry. + } + toUnicode[charCode] = properties.fallbackToUnicode[charCode]; + } + if (toUnicode.length > 0) { + properties.toUnicode.amend(toUnicode); + } } class Glyph { @@ -849,8 +877,6 @@ class Font { this.defaultEncoding = properties.defaultEncoding; this.toUnicode = properties.toUnicode; - this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap(); - this.toFontChar = []; if (properties.type === "Type3") { @@ -936,6 +962,7 @@ class Font { return; } + amendFallbackToUnicode(properties); this.data = data; this.fontType = getFontType(type, subtype, properties.isStandardFont); @@ -1094,6 +1121,8 @@ class Font { } this.toFontChar = map; } + + amendFallbackToUnicode(properties); this.loadedName = fontName.split("-")[0]; this.fontType = getFontType(type, subtype, properties.isStandardFont); } @@ -2545,12 +2574,9 @@ class Font { const glyphsUnicodeMap = getGlyphsUnicode(); for (let charCode = 0; charCode < 256; charCode++) { let glyphName; - if (this.differences && charCode in this.differences) { + if (this.differences[charCode] !== undefined) { glyphName = this.differences[charCode]; - } else if ( - charCode in baseEncoding && - baseEncoding[charCode] !== "" - ) { + } else if (baseEncoding[charCode] !== "") { glyphName = baseEncoding[charCode]; } else { glyphName = StandardEncoding[charCode]; @@ -2955,15 +2981,12 @@ class Font { width = isNum(width) ? width : this.defaultWidth; const vmetric = this.vmetrics && this.vmetrics[widthCode]; - let unicode = - this.toUnicode.get(charcode) || - this.fallbackToUnicode.get(charcode) || - charcode; + let unicode = this.toUnicode.get(charcode) || charcode; if (typeof unicode === "number") { unicode = String.fromCharCode(unicode); } - let isInFont = charcode in this.toFontChar; + let isInFont = this.toFontChar[charcode] !== undefined; // First try the toFontChar map, if it's not there then try falling // back to the char code. fontCharCode = this.toFontChar[charcode] || charcode;