From 43d5512f5c9f370d39dfae946311d83e23ff0136 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Fri, 8 Jan 2021 18:40:09 +0100 Subject: [PATCH] [api-minor] Change the "dc:creator" Metadata field to an Array - add scripting support for doc.info.authors - doc.info.metadata is the raw string with xml code --- src/core/writer.js | 4 +- src/display/metadata.js | 62 ++++++++++++++++++++++------- src/scripting_api/doc.js | 15 +++---- src/shared/xml_parser.js | 6 ++- test/integration/scripting_spec.js | 25 ++++++++++++ test/pdfs/.gitignore | 1 + test/pdfs/js-authors.pdf | Bin 0 -> 7078 bytes test/unit/metadata_spec.js | 4 +- test/unit/xml_spec.js | 10 +++-- web/app.js | 3 +- 10 files changed, 97 insertions(+), 33 deletions(-) create mode 100644 test/pdfs/js-authors.pdf diff --git a/src/core/writer.js b/src/core/writer.js index 16c58fe97bdc0..4d61ad73a8e8f 100644 --- a/src/core/writer.js +++ b/src/core/writer.js @@ -130,9 +130,7 @@ function updateXFA(datasetsRef, newRefs, xref) { } const datasets = xref.fetchIfRef(datasetsRef); const str = bytesToString(datasets.getBytes()); - const xml = new SimpleXMLParser(/* hasAttributes */ true).parseFromString( - str - ); + const xml = new SimpleXMLParser({ hasAttributes: true }).parseFromString(str); for (const { xfa } of newRefs) { if (!xfa) { diff --git a/src/display/metadata.js b/src/display/metadata.js index 963eac9264552..c34bd4907f031 100644 --- a/src/display/metadata.js +++ b/src/display/metadata.js @@ -24,7 +24,7 @@ class Metadata { data = this._repair(data); // Convert the string to an XML document. - const parser = new SimpleXMLParser(); + const parser = new SimpleXMLParser({ lowerCaseName: true }); const xmlDocument = parser.parseFromString(data); this._metadataMap = new Map(); @@ -32,6 +32,7 @@ class Metadata { if (xmlDocument) { this._parse(xmlDocument); } + this._data = data; } _repair(data) { @@ -79,40 +80,71 @@ class Metadata { }); } + _getSequence(entry) { + const name = entry.nodeName; + if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") { + return null; + } + + return entry.childNodes.filter(node => node.nodeName === "rdf:li"); + } + + _getCreators(entry) { + if (entry.nodeName !== "dc:creator") { + return false; + } + if (!entry.hasChildNodes()) { + return true; + } + + // Child must be a Bag (unordered array) or a Seq. + const seqNode = entry.childNodes[0]; + const authors = this._getSequence(seqNode) || []; + this._metadataMap.set( + entry.nodeName, + authors.map(node => node.textContent.trim()) + ); + + return true; + } + _parse(xmlDocument) { let rdf = xmlDocument.documentElement; - if (rdf.nodeName.toLowerCase() !== "rdf:rdf") { + if (rdf.nodeName !== "rdf:rdf") { // Wrapped in rdf = rdf.firstChild; - while (rdf && rdf.nodeName.toLowerCase() !== "rdf:rdf") { + while (rdf && rdf.nodeName !== "rdf:rdf") { rdf = rdf.nextSibling; } } - const nodeName = rdf ? rdf.nodeName.toLowerCase() : null; - if (!rdf || nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) { + if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) { return; } - const children = rdf.childNodes; - for (let i = 0, ii = children.length; i < ii; i++) { - const desc = children[i]; - if (desc.nodeName.toLowerCase() !== "rdf:description") { + for (const desc of rdf.childNodes) { + if (desc.nodeName !== "rdf:description") { continue; } - for (let j = 0, jj = desc.childNodes.length; j < jj; j++) { - if (desc.childNodes[j].nodeName.toLowerCase() !== "#text") { - const entry = desc.childNodes[j]; - const name = entry.nodeName.toLowerCase(); - - this._metadataMap.set(name, entry.textContent.trim()); + for (const entry of desc.childNodes) { + const name = entry.nodeName; + if (name === "#text") { + continue; } + if (this._getCreators(entry)) { + continue; + } + this._metadataMap.set(name, entry.textContent.trim()); } } } + getRaw() { + return this._data; + } + get(name) { return this._metadataMap.has(name) ? this._metadataMap.get(name) : null; } diff --git a/src/scripting_api/doc.js b/src/scripting_api/doc.js index 063c4cf7e7816..6797e693570df 100644 --- a/src/scripting_api/doc.js +++ b/src/scripting_api/doc.js @@ -42,7 +42,7 @@ class Doc extends PDFObject { this._dirty = false; this._disclosed = false; this._media = undefined; - this._metadata = data.metadata; + this._metadata = data.metadata || ""; this._noautocomplete = undefined; this._nocache = undefined; this._spellDictionaryOrder = []; @@ -74,12 +74,13 @@ class Doc extends PDFObject { // and they're are read-only. this._info = new Proxy( { - title: this.title, - author: this.author, - subject: this.subject, - keywords: this.keywords, - creator: this.creator, - producer: this.producer, + title: this._title, + author: this._author, + authors: data.authors || [this._author], + subject: this._subject, + keywords: this._keywords, + creator: this._creator, + producer: this._producer, creationdate: this._creationDate, moddate: this._modDate, trapped: data.Trapped || "Unknown", diff --git a/src/shared/xml_parser.js b/src/shared/xml_parser.js index 2ba25f83a7472..582280b6a68bc 100644 --- a/src/shared/xml_parser.js +++ b/src/shared/xml_parser.js @@ -427,12 +427,13 @@ class SimpleDOMNode { } class SimpleXMLParser extends XMLParserBase { - constructor(hasAttributes = false) { + constructor({ hasAttributes = false, lowerCaseName = false }) { super(); this._currentFragment = null; this._stack = null; this._errorCode = XMLParserErrorCode.NoError; this._hasAttributes = hasAttributes; + this._lowerCaseName = lowerCaseName; } parseFromString(data) { @@ -476,6 +477,9 @@ class SimpleXMLParser extends XMLParserBase { } onBeginElement(name, attributes, isEmpty) { + if (this._lowerCaseName) { + name = name.toLowerCase(); + } const node = new SimpleDOMNode(name); node.childNodes = []; if (this._hasAttributes) { diff --git a/test/integration/scripting_spec.js b/test/integration/scripting_spec.js index 01f6e7ea5d47c..1b6d94bfff65c 100644 --- a/test/integration/scripting_spec.js +++ b/test/integration/scripting_spec.js @@ -451,4 +451,29 @@ describe("Interaction", () => { ); }); }); + + describe("in js-authors.pdf", () => { + let pages; + + beforeAll(async () => { + pages = await loadAndWait("js-authors.pdf", "#\\32 5R"); + }); + + afterAll(async () => { + await closePages(pages); + }); + + it("must print authors in a text field", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const text = await actAndWaitForInput(page, "#\\32 5R", async () => { + await page.click("[data-annotation-id='26R']"); + }); + expect(text) + .withContext(`In ${browserName}`) + .toEqual("author1::author2::author3::author4::author5"); + }) + ); + }); + }); }); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 6e52ba6968159..1676f297bd899 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -324,6 +324,7 @@ !tensor-allflags-withfunction.pdf !issue10084_reduced.pdf !issue4246.pdf +!js-authors.pdf !issue4461.pdf !issue4573.pdf !issue4722.pdf diff --git a/test/pdfs/js-authors.pdf b/test/pdfs/js-authors.pdf new file mode 100644 index 0000000000000000000000000000000000000000..840a73c7ce71455b5bce636e0e2e5f2ae8b52676 GIT binary patch literal 7078 zcmeHMc~}$I76+A}QAF_*w^toekQ9bsK!5^8+1e{GBYy1Q;Fu((AG;G>KQ`YDQBP1oC?e_% zM}&h(6pG3|JC2o>XQj|_b(USHoU<;>X{NHSHEoMXapIv4G3UBnEH0WYS*3^Yt=%(8QE5JeDJMbH)l zAga?>#$UJR>9Cs*z=DYe>nWCurXb7?9B?byLTOV;IuizK9#)eDG~g+mYSYjRf0;3z zOit3wBDJ)O76?xyz$APqo6XQngaI!h3(3~AXR}cV1$hBN#Y{sw+XN)C+0kmYV`MN( zYQ4P)67a+V5v10!LuEEPhq72Xka`w7M$Mkg_Cy%53n>qzRvIi;8WLii5J?0G z$Vd`L3?PJ5I?okC&Nbr{mGxWUfIq|5Z`>vSB&4#wskz_ax|QG7-v8!49q@(kvS}$F zj*j;BZa%x`PCyvC@zWuTjucS$?pF8uF?QNJ;=swJ%ftg4%%L~D{Za?|$KWHE6^y;X z@zbJ1lN1pHPZF!Q92y&to?aabe?O+B=h}=R4?<)71KaSuyRg+etG0c8 zpHi;Y>&&}674QvVxgi1a3}O| za9d(vHFV`^rp%gt+%{TylEB!FSx+8cA{RuId{D=q8w;o0IZ*r6XQEQylf;5|OQvF* zt{@|=8y7!XpsxSSx}#|4KY6*vM9%u+MDqT+Lk*HY=rVIy{?OkW%1rE-X}dWCs<`UO zn@_(km!9@EhL0S88B{*y`^v8m9zJulC{aJGu_ANk+QyDeW!DyRR7>D2!xZzR+o7eL zx&NspRGgI>>7vH7JxkXP`ljF`L-wKY`28P@zMeDTqvm62g710j9tKvkQ7yv|XODa>?U?8B> z#v5jCPTtn{NtJD@`IE)I2U@JV&z#cx&lJmw4ZCuEHf&*SxmA9L*nI8N#oG@y>9%_7 zt_dfaw`D0NX3eO+5*y#p>&l7p%cC+j?k!xnpP2IAo*#$q=T!E=ANoG{`8}<7VEmLz zdSvOQ^NXNxZqP5E?71A)wx=?3*p3pPgWC@YOUv_nkB2MbdW>EtnmDS8Tp#vP@Rx@U z7Bqgm+IGt}058W^Uq8D4RL+cpDMOB);AE`G87?0_yl2V`?yq>}@HhK3&S^E|RMUM% zT|5VuAJyI8C!cX(*NEDy$gLf3A2Oiv2bW~dFrB_T_+2K?H|75uQ!=xE$2ZD9gvc}tm2fWKm5^DPo z%PiBBXi8eL)GK8Bw%oYc@1);Jm9HuBt+KT{-<}ml_5Pbdcp9@N=dbdA` zJU;)|k;+;tZ#RMRU@Sm4N zuyiE3IvLrk_KF_kGh)QN+kZ434e*iAJ9xh0M!@DZ3!~pXmK3ycSW3pbRd;5-S-QG> z-J8a2-=^<-H?d3m#D-7uX~CC%cWgpZbFVPlav|O`exgF@ZLn@$ zS-4#y+1qivk<;1@-M_+OsYE{pN{PFs_NI+7Jpvl6W{8)E@Z7ba9hh# zuRe05(!Vh`d#N#5t&wf$oBYodO&qGA)D`gorq7r#zRQBEJ%qu0^SjR|Ux@R&K1@?p z2@an-iv-tnN4!yBCbYRE4Qa@1gDEoT&XLc9AcHP4C|#gJRAxDuV~8!Xkf}w9YHd-j zR-y}fBZ?hSAT7u<=Yf&~D#$aMtkQzWAOh5@q!c_e&2SK87ol?_gJetxG*y)h$tep7 zzF8;;A@B>Fd~Fe0T)H27$QXkot0To6jP){Saf=6ibCle4!lJM z<2yzk4l6thtlS|fb6Eol$RQUmN7EvlqFYh zr1BjB+vkHp2+RubSgN9Q27RFdRBKUS8c{9+{={lTBE`f~TnwJ22oeGRMbZVe-G&@g z`e0Y$!3HixCGIxdB>!TgliQxGJmZP8Shmal#g2Uh%A%$yW0ZrsPAni${6PviQ(%Hx z3_}nR?ug4>!_AmZE7gOFkf5Uov)O3S5)9#BFe4GLtE;=bmcfCtxIxUI%y2ZC$nM_S z*$*^K+Kn@T zEuu+FcG{KjMIydHz~`c*Uc$vS5}YeWF+EoxAcP2CED}q^nohfJx^8yCTuT9IbT%z% zc?m9c6L&M?T&Pmu9#~{xp1U`AkeowWtblA$Ak(;{I~$$y4&dE#p`AC5R*-2yl83vp z{4Za$v7ofdQ+6xa_Nok#!A9yVfl3OFbL+vtn% zg|Nv;Q`^M^&ML6SXR}VXEM>OKzGHWM5cut7Z*by&8|i!1gE!0-!;3ceUEZLW&~Y2X zoaB0PC(YqbIJfQ$+!)&{wRl6v#hA+F!R76Ryyd}_!+B{+ma>>t%nD?cDMO0O4zmJV zSge-hDHHNmaoXc5*{kYe+T)f7v%O>5*}RKvC99~owg_AV+zQ#yajc!QUYW6eYR$?X zvnFv=iNPnr$B%!@tEU(D0s6Iv(A `; - const root = new SimpleXMLParser(true).parseFromString(xml) - .documentElement; + const root = new SimpleXMLParser({ hasAttributes: true }).parseFromString( + xml + ).documentElement; function getAttr(path) { return root.searchNode(parseXFAPath(path), 0).attributes[0].value; } @@ -96,8 +97,9 @@ describe("XML", function () { `; - const root = new SimpleXMLParser(true).parseFromString(xml) - .documentElement; + const root = new SimpleXMLParser({ hasAttributes: true }).parseFromString( + xml + ).documentElement; const buffer = []; root.dump(buffer); diff --git a/web/app.js b/web/app.js index f2c049f377169..d6a365e173313 100644 --- a/web/app.js +++ b/web/app.js @@ -1655,7 +1655,8 @@ const PDFViewerApplication = { baseURL: this.baseUrl, filesize: this._contentLength, filename: this._docFilename, - metadata: this.metadata, + metadata: this.metadata?.getRaw(), + authors: this.metadata?.get("dc:creator"), numPages: pdfDocument.numPages, URL: this.url, actions: docActions,