From fbb25ff4e20b74dd72e2473ec4bfa39d2b3d2e4f Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 25 Jul 2018 16:53:47 +0200 Subject: [PATCH 1/2] Move `getPage`, on the worker side, from `Catalog` and into `PDFDocument` instead Addresses an existing TODO, and avoids having to pass in a `pageFactory` when creating `Catalog` instances. --- src/core/document.js | 38 +++++++++++++++++++++----------------- src/core/obj.js | 17 +---------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index 675e9c1a6d5ea..5fdf83d074504 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -355,6 +355,7 @@ var PDFDocument = (function PDFDocumentClosure() { xref: this.xref, isEvalSupported: evaluatorOptions.isEvalSupported, }); + this._pagePromises = []; } function find(stream, needle, limit, backwards) { @@ -520,21 +521,7 @@ var PDFDocument = (function PDFDocumentClosure() { }, setup: function PDFDocument_setup(recoveryMode) { this.xref.parse(recoveryMode); - var pageFactory = { - createPage: (pageIndex, dict, ref, fontCache, builtInCMapCache) => { - return new Page({ - pdfManager: this.pdfManager, - xref: this.xref, - pageIndex, - pageDict: dict, - ref, - fontCache, - builtInCMapCache, - pdfFunctionFactory: this.pdfFunctionFactory, - }); - }, - }; - this.catalog = new Catalog(this.pdfManager, this.xref, pageFactory); + this.catalog = new Catalog(this.pdfManager, this.xref); }, get numPages() { var linearization = this.linearization; @@ -599,8 +586,25 @@ var PDFDocument = (function PDFDocumentClosure() { return shadow(this, 'fingerprint', fileID); }, - getPage: function PDFDocument_getPage(pageIndex) { - return this.catalog.getPage(pageIndex); + getPage(pageIndex) { + if (this._pagePromises[pageIndex] !== undefined) { + return this._pagePromises[pageIndex]; + } + const catalog = this.catalog; + + return this._pagePromises[pageIndex] = + catalog.getPageDict(pageIndex).then(([pageDict, ref]) => { + return new Page({ + pdfManager: this.pdfManager, + xref: this.xref, + pageIndex, + pageDict, + ref, + fontCache: catalog.fontCache, + builtInCMapCache: catalog.builtInCMapCache, + pdfFunctionFactory: this.pdfFunctionFactory, + }); + }); }, cleanup: function PDFDocument_cleanup() { diff --git a/src/core/obj.js b/src/core/obj.js index 823a8d138cb61..f819f84c09629 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -29,7 +29,7 @@ import { CipherTransformFactory } from './crypto'; import { ColorSpace } from './colorspace'; var Catalog = (function CatalogClosure() { - function Catalog(pdfManager, xref, pageFactory) { + function Catalog(pdfManager, xref) { this.pdfManager = pdfManager; this.xref = xref; this.catDict = xref.getCatalogObj(); @@ -40,9 +40,6 @@ var Catalog = (function CatalogClosure() { this.fontCache = new RefSetCache(); this.builtInCMapCache = Object.create(null); this.pageKidsCountCache = new RefSetCache(); - // TODO refactor to move getPage() to the PDFDocument. - this.pageFactory = pageFactory; - this.pagePromises = []; } Catalog.prototype = { @@ -453,18 +450,6 @@ var Catalog = (function CatalogClosure() { }); }, - getPage: function Catalog_getPage(pageIndex) { - if (!(pageIndex in this.pagePromises)) { - this.pagePromises[pageIndex] = this.getPageDict(pageIndex).then( - ([dict, ref]) => { - return this.pageFactory.createPage(pageIndex, dict, ref, - this.fontCache, - this.builtInCMapCache); - }); - } - return this.pagePromises[pageIndex]; - }, - getPageDict: function Catalog_getPageDict(pageIndex) { var capability = createPromiseCapability(); var nodesToVisit = [this.catDict.getRaw('Pages')]; From ec3728b54018d3e74086c67d10c2b3d53bc8c699 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 25 Jul 2018 20:50:25 +0200 Subject: [PATCH 2/2] Use the `Linearization` dictionary, if it exists, when fetching the first Page Since PDF.js already supports range requests and streaming, not to mention chunked rendering, attempting to use the `Linearization` dictionary in `PDFDocument.getPage` probably isn't going to improve performance in any noticeable way. Nonetheless, when `Linearization` data is available, it will allow looking up the first Page *directly* without having to descend into the `Pages` tree to find the correct object. --- src/core/document.js | 61 +++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index 5fdf83d074504..6367d89b47408 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -13,12 +13,13 @@ * limitations under the License. */ -import { Catalog, ObjectLoader, XRef } from './obj'; -import { Dict, isDict, isName, isStream } from './primitives'; import { - getInheritableProperty, info, isArrayBuffer, isNum, isSpace, isString, - MissingDataException, OPS, shadow, stringToBytes, stringToPDFString, Util + assert, FormatError, getInheritableProperty, info, isArrayBuffer, isNum, + isSpace, isString, MissingDataException, OPS, shadow, stringToBytes, + stringToPDFString, Util } from '../shared/util'; +import { Catalog, ObjectLoader, XRef } from './obj'; +import { Dict, isDict, isName, isStream, Ref } from './primitives'; import { NullStream, Stream, StreamsSequenceStream } from './stream'; import { AnnotationFactory } from './annotation'; import { calculateMD5 } from './crypto'; @@ -586,25 +587,49 @@ var PDFDocument = (function PDFDocumentClosure() { return shadow(this, 'fingerprint', fileID); }, + _getLinearizationPage(pageIndex) { + const { catalog, linearization, } = this; + assert(linearization && linearization.pageFirst === pageIndex); + + const ref = new Ref(linearization.objectNumberFirst, 0); + return this.xref.fetchAsync(ref).then((obj) => { + // Ensure that the object that was found is actually a Page dictionary. + if (isDict(obj, 'Page') || + (isDict(obj) && !obj.has('Type') && obj.has('Contents'))) { + if (ref && !catalog.pageKidsCountCache.has(ref)) { + catalog.pageKidsCountCache.put(ref, 1); // Cache the Page reference. + } + return [obj, ref]; + } + throw new FormatError('The Linearization dictionary doesn\'t point ' + + 'to a valid Page dictionary.'); + }).catch((reason) => { + info(reason); + return catalog.getPageDict(pageIndex); + }); + }, + getPage(pageIndex) { if (this._pagePromises[pageIndex] !== undefined) { return this._pagePromises[pageIndex]; } - const catalog = this.catalog; - - return this._pagePromises[pageIndex] = - catalog.getPageDict(pageIndex).then(([pageDict, ref]) => { - return new Page({ - pdfManager: this.pdfManager, - xref: this.xref, - pageIndex, - pageDict, - ref, - fontCache: catalog.fontCache, - builtInCMapCache: catalog.builtInCMapCache, - pdfFunctionFactory: this.pdfFunctionFactory, - }); + const { catalog, linearization, } = this; + + const promise = (linearization && linearization.pageFirst === pageIndex) ? + this._getLinearizationPage(pageIndex) : catalog.getPageDict(pageIndex); + + return this._pagePromises[pageIndex] = promise.then(([pageDict, ref]) => { + return new Page({ + pdfManager: this.pdfManager, + xref: this.xref, + pageIndex, + pageDict, + ref, + fontCache: catalog.fontCache, + builtInCMapCache: catalog.builtInCMapCache, + pdfFunctionFactory: this.pdfFunctionFactory, }); + }); }, cleanup: function PDFDocument_cleanup() {