From 90eb579713152a965dc627b15f5ea8b3a1050727 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sat, 11 Jul 2020 13:52:11 +0200 Subject: [PATCH 1/3] Add local caching of "simple" Graphics State (ExtGState) data in `PartialEvaluator.getOperatorList` (issue 2813) This patch will help pathological cases the most, with issue 2813 being a particularily problematic example. While there's only *four* `/ExtGState` resources, there's a total `29062` of `setGState` operators. Even though parsing of a single `/ExtGState` resource is quite fast, having to re-parse them thousands of times does add up quite significantly. For simplicity we'll only cache "simple" `/ExtGState` resource, since e.g. the general `SMask` case cannot be easily cached (without re-factoring other code, which may have undesirable effects on general parsing). By caching "simple" `/ExtGState` resource, we thus improve performance by: - Not having to fetch/validate/parse the same `/ExtGState` data over and over. - Handling of repeated `setGState` operators becomes *synchronous* during the `OperatorList` building, instead of having to defer to the event-loop/microtask-queue since the `/ExtGState` parsing is done asynchronously. --- Obviously I had intended to include (standard) benchmark results with this patch, but for reasons I don't understand the test run-time (even with `master`) of the document in issue 2813 is *a lot* slower than in the development viewer (making normal benchmarking infeasible). However, testing this manually in the development viewer (using `pdfBug=Stats`) shows a *reduction* of `~10 %` in the rendering time of the PDF document in issue 2813. --- src/core/evaluator.js | 97 +++++++++++++++++++++++++++++-------- src/core/image_utils.js | 22 +++++++++ test/unit/evaluator_spec.js | 44 +++++++++++------ 3 files changed, 127 insertions(+), 36 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 3a1296f1dade6..7db443e2714f9 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -75,7 +75,11 @@ import { import { getTilingPatternIR, Pattern } from "./pattern.js"; import { isPDFFunction, PDFFunctionFactory } from "./function.js"; import { Lexer, Parser } from "./parser.js"; -import { LocalColorSpaceCache, LocalImageCache } from "./image_utils.js"; +import { + LocalColorSpaceCache, + LocalGStateCache, + LocalImageCache, +} from "./image_utils.js"; import { bidi } from "./bidi.js"; import { ColorSpace } from "./colorspace.js"; import { DecodeStream } from "./stream.js"; @@ -828,14 +832,18 @@ class PartialEvaluator { throw reason; } - setGState( + setGState({ resources, gState, operatorList, + cacheKey, task, stateManager, - localColorSpaceCache - ) { + localGStateCache, + localColorSpaceCache, + }) { + const gStateRef = gState.objId; + let isSimpleGState = true; // This array holds the converted/processed state data. var gStateObj = []; var gStateKeys = gState.getKeys(); @@ -881,6 +889,8 @@ class PartialEvaluator { break; } if (isDict(value)) { + isSimpleGState = false; + promise = promise.then(() => { return this.handleSMask( value, @@ -925,6 +935,10 @@ class PartialEvaluator { if (gStateObj.length > 0) { operatorList.addOp(OPS.setGState, [gStateObj]); } + + if (isSimpleGState) { + localGStateCache.set(cacheKey, gStateRef, gStateObj); + } }); } @@ -1245,6 +1259,7 @@ class PartialEvaluator { let parsingText = false; const localImageCache = new LocalImageCache(); const localColorSpaceCache = new LocalColorSpaceCache(); + const localGStateCache = new LocalGStateCache(); var xobjs = resources.get("XObject") || Dict.empty; var patterns = resources.get("Pattern") || Dict.empty; @@ -1274,7 +1289,8 @@ class PartialEvaluator { operation = {}, i, ii, - cs; + cs, + name; while (!(stop = timeSlotManager.check())) { // The arguments parsed by read() are used beyond this loop, so we // cannot reuse the same array on each iteration. Therefore we pass @@ -1290,7 +1306,7 @@ class PartialEvaluator { switch (fn | 0) { case OPS.paintXObject: // eagerly compile XForm objects - var name = args[0].name; + name = args[0].name; if (name) { const localImage = localImageCache.getByName(name); if (localImage) { @@ -1653,23 +1669,64 @@ class PartialEvaluator { fn = OPS.shadingFill; break; case OPS.setGState: - var dictName = args[0]; - var extGState = resources.get("ExtGState"); - - if (!isDict(extGState) || !extGState.has(dictName.name)) { - break; + name = args[0].name; + if (name) { + const localGStateObj = localGStateCache.getByName(name); + if (localGStateObj) { + if (localGStateObj.length > 0) { + operatorList.addOp(OPS.setGState, [localGStateObj]); + } + args = null; + continue; + } } - var gState = extGState.get(dictName.name); next( - self.setGState( - resources, - gState, - operatorList, - task, - stateManager, - localColorSpaceCache - ) + new Promise(function (resolveGState, rejectGState) { + if (!name) { + throw new FormatError("GState must be referred to by name."); + } + + const extGState = resources.get("ExtGState"); + if (!(extGState instanceof Dict)) { + throw new FormatError("ExtGState should be a dictionary."); + } + + const gState = extGState.get(name); + // TODO: Attempt to lookup cached GStates by reference as well, + // if and only if there are PDF documents where doing so + // would significantly improve performance. + if (!(gState instanceof Dict)) { + throw new FormatError("GState should be a dictionary."); + } + + self + .setGState({ + resources, + gState, + operatorList, + cacheKey: name, + task, + stateManager, + localGStateCache, + localColorSpaceCache, + }) + .then(resolveGState, rejectGState); + }).catch(function (reason) { + if (reason instanceof AbortException) { + return; + } + if (self.options.ignoreErrors) { + // Error(s) in the ExtGState -- sending unsupported feature + // notification and allow parsing/rendering to continue. + self.handler.send("UnsupportedFeature", { + featureId: UNSUPPORTED_FEATURES.errorExtGState, + }); + warn(`getOperatorList - ignoring ExtGState: "${reason}".`); + return; + } + throw reason; + }) ); return; case OPS.moveTo: diff --git a/src/core/image_utils.js b/src/core/image_utils.js index b7498d4383ddf..3e1f899981cf2 100644 --- a/src/core/image_utils.js +++ b/src/core/image_utils.js @@ -113,6 +113,27 @@ class LocalFunctionCache extends BaseLocalCache { } } +class LocalGStateCache extends BaseLocalCache { + set(name, ref = null, data) { + if (!name) { + throw new Error('LocalGStateCache.set - expected "name" argument.'); + } + if (ref) { + if (this._imageCache.has(ref)) { + return; + } + this._nameRefMap.set(name, ref); + this._imageCache.put(ref, data); + return; + } + // name + if (this._imageMap.has(name)) { + return; + } + this._imageMap.set(name, data); + } +} + class GlobalImageCache { static get NUM_PAGES_THRESHOLD() { return shadow(this, "NUM_PAGES_THRESHOLD", 2); @@ -210,5 +231,6 @@ export { LocalImageCache, LocalColorSpaceCache, LocalFunctionCache, + LocalGStateCache, GlobalImageCache, }; diff --git a/test/unit/evaluator_spec.js b/test/unit/evaluator_spec.js index bdcbc55f0c0ef..a3fd5e826e1b5 100644 --- a/test/unit/evaluator_spec.js +++ b/test/unit/evaluator_spec.js @@ -242,23 +242,35 @@ describe("evaluator", function () { ); }); it("should execute if nested commands", function (done) { + const gState = new Dict(); + gState.set("LW", 2); + gState.set("CA", 0.5); + + const extGState = new Dict(); + extGState.set("GS2", gState); + + const resources = new ResourcesMock(); + resources.ExtGState = extGState; + var stream = new StringStream("/F2 /GS2 gs 5.711 Tf"); - runOperatorListCheck( - partialEvaluator, - stream, - new ResourcesMock(), - function (result) { - expect(result.fnArray.length).toEqual(3); - expect(result.fnArray[0]).toEqual(OPS.setGState); - expect(result.fnArray[1]).toEqual(OPS.dependency); - expect(result.fnArray[2]).toEqual(OPS.setFont); - expect(result.argsArray.length).toEqual(3); - expect(result.argsArray[0].length).toEqual(1); - expect(result.argsArray[1].length).toEqual(1); - expect(result.argsArray[2].length).toEqual(2); - done(); - } - ); + runOperatorListCheck(partialEvaluator, stream, resources, function ( + result + ) { + expect(result.fnArray.length).toEqual(3); + expect(result.fnArray[0]).toEqual(OPS.setGState); + expect(result.fnArray[1]).toEqual(OPS.dependency); + expect(result.fnArray[2]).toEqual(OPS.setFont); + expect(result.argsArray.length).toEqual(3); + expect(result.argsArray[0]).toEqual([ + [ + ["LW", 2], + ["CA", 0.5], + ], + ]); + expect(result.argsArray[1]).toEqual(["g_font_error"]); + expect(result.argsArray[2]).toEqual(["g_font_error", 5.711]); + done(); + }); }); it("should skip if too few arguments", function (done) { var stream = new StringStream("5 d0"); From 981ff41b5f2e432bf39458ed8b5959575ed60986 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sat, 11 Jul 2020 14:05:53 +0200 Subject: [PATCH 2/3] Add local caching of non-font Graphics State (ExtGState) data in `PartialEvaluator.getTextContent` It turns out that `getTextContent` suffers from *similar* problems with repeated GStates as `getOperatorList`; please see the previous patch. While only `/ExtGState` resources containing Fonts will actually be *parsed* by `PartialEvaluator.getTextContent`, we're still forced to fetch/validate repeated `/ExtGState` resources even though *most* of them won't affect the textContent (since they mostly contain purely graphical state). With these changes we also no longer need to immediately reset the current text-state when encountering a `setGState` operator, which may thus improve text-selection in some cases. --- src/core/evaluator.js | 69 ++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 7db443e2714f9..5739f372f5f87 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1848,6 +1848,7 @@ class PartialEvaluator { // The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd. var xobjs = null; const emptyXObjectCache = new LocalImageCache(); + const emptyGStateCache = new LocalGStateCache(); var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager); @@ -2420,25 +2421,59 @@ class PartialEvaluator { ); return; case OPS.setGState: - flushTextContentItem(); - var dictName = args[0]; - var extGState = resources.get("ExtGState"); - - if (!isDict(extGState) || !isName(dictName)) { - break; - } - var gState = extGState.get(dictName.name); - if (!isDict(gState)) { + name = args[0].name; + if (name && emptyGStateCache.getByName(name)) { break; } - var gStateFont = gState.get("Font"); - if (gStateFont) { - textState.fontName = null; - textState.fontSize = gStateFont[1]; - next(handleSetFont(null, gStateFont[0])); - return; - } - break; + + next( + new Promise(function (resolveGState, rejectGState) { + if (!name) { + throw new FormatError("GState must be referred to by name."); + } + + const extGState = resources.get("ExtGState"); + if (!(extGState instanceof Dict)) { + throw new FormatError("ExtGState should be a dictionary."); + } + + const gState = extGState.get(name); + // TODO: Attempt to lookup cached GStates by reference as well, + // if and only if there are PDF documents where doing so + // would significantly improve performance. + if (!(gState instanceof Dict)) { + throw new FormatError("GState should be a dictionary."); + } + + const gStateFont = gState.get("Font"); + if (!gStateFont) { + emptyGStateCache.set(name, gState.objId, true); + + resolveGState(); + return; + } + flushTextContentItem(); + + textState.fontName = null; + textState.fontSize = gStateFont[1]; + handleSetFont(null, gStateFont[0]).then( + resolveGState, + rejectGState + ); + }).catch(function (reason) { + if (reason instanceof AbortException) { + return; + } + if (self.options.ignoreErrors) { + // Error(s) in the ExtGState -- allow text-extraction to + // continue. + warn(`getTextContent - ignoring ExtGState: "${reason}".`); + return; + } + throw reason; + }) + ); + return; } // switch if (textContent.items.length >= sink.desiredSize) { // Wait for ready, if we reach highWaterMark. From 03547b5633f51ebe7687be0db8b7a0df432559b2 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 15 Jul 2020 14:25:24 +0200 Subject: [PATCH 3/3] Change `PartialEvaluator.setGState` to an `async` method Since this method calls `Dict.get` to fetch data, there could thus be `Error`s thrown in corrupt PDF documents when attempting to resolve an indirect object. To ensure that this won't ever become a problem, we change the method to be `async` such that a rejected Promise would be returned and general OperatorList parsing won't break. --- src/core/evaluator.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 5739f372f5f87..96bc5ff5bcd83 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -832,7 +832,7 @@ class PartialEvaluator { throw reason; } - setGState({ + async setGState({ resources, gState, operatorList,