diff --git a/lib/saxes.js b/lib/saxes.js index 082ac41a..aa5bca16 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -803,7 +803,7 @@ class SaxesParser { * @private * * @param {number[]} chars An array of codepoints. Encountering a character in - * the array ends the capture. + * the array ends the capture. (``chars`` may safely contain ``NL``.) * * @return {number} The character code that made the capture end, or ``EOC`` * if we hit the end of the chunk. The return value cannot be NL_LIKE: NL is @@ -813,22 +813,17 @@ class SaxesParser { let { i: start } = this; const { chunk } = this; while (true) { - let c = this.getCode(); - switch (c) { - case NL_LIKE: - this.text += `${chunk.slice(start, this.prevI)}\n`; - start = this.i; - c = NL; - break; - case EOC: - this.text += chunk.slice(start); - return EOC; - default: + const c = this.getCode(); + const isNLLike = c === NL_LIKE; + const final = isNLLike ? NL : c; + if (final === EOC || chars.includes(final)) { + this.text += chunk.slice(start, this.prevI); + return final; } - if (chars.includes(c)) { - this.text += chunk.slice(start, this.prevI); - return c; + if (isNLLike) { + this.text += `${chunk.slice(start, this.prevI)}\n`; + start = this.i; } } } @@ -838,7 +833,8 @@ class SaxesParser { * * @private * - * @param {number} char The codepoint that ends the capture. + * @param {number} char The codepoint that ends the capture. **NOTE ``char`` + * MAY NOT CONTAIN ``NL``.** Passing ``NL`` will result in buggy behavior. * * @return {boolean} ``true`` if we ran into the character. Otherwise, we ran * into the end of the current chunk. @@ -1486,7 +1482,7 @@ class SaxesParser { /** @private */ sPIRest() { const c = this.captureWhileNameCheck("piTarget"); - if ((c === QUESTION || isS(c))) { + if (c === QUESTION || isS(c)) { this.piIsXMLDecl = this.piTarget === "xml"; if (this.piIsXMLDecl && !this.xmlDeclPossible) { this.fail("an XML declaration must be at the start of the document."); diff --git a/test/eol-handling.js b/test/eol-handling.js index 27a385b1..0565886a 100644 --- a/test/eol-handling.js +++ b/test/eol-handling.js @@ -58,6 +58,242 @@ describe("eol handling", () => { }); }); + // The comprehensive test is meant to have EOL in all possible contexts. + describe("comprehensive", () => { + const nl = `\ + + + +" +> +]> + + + +abc + + + + + + +`; + const xmlDeclEnd = nl.indexOf("?>"); + + const expect = [ + ["text", "\n"], + ["doctype", ` +root +[ + + +" +> +]`], + ["text", "\n"], + ["comment", "\n"], + ["text", "\n"], + ["processinginstruction", { target: "fnord", body: "" }], + ["text", "\n"], + ["opentagstart", { name: "moo", attributes: {} }], + ["opentag", { + name: "moo", + attributes: { + a: "12\n 3", + b: "\nz\n", + }, + isSelfClosing: false, + }], + ["text", ` +abc +\r + +`], + ["comment", "\n"], + ["text", "\n"], + ["processinginstruction", { target: "fnord", body: "" }], + ["text", "\n"], + ["opentagstart", { name: "abc", attributes: {} }], + ["opentag", { + name: "abc", + attributes: { + a: "bc", + }, + isSelfClosing: true, + }], + ["closetag", { + name: "abc", + attributes: { + a: "bc", + }, + isSelfClosing: true, + }], + ["text", "\n"], + ["closetag", { + name: "moo", + attributes: { + a: "12\n 3", + b: "\nz\n", + }, + isSelfClosing: false, + }], + ["text", "\n"], + ]; + + describe("nl", () => { + test({ + name: "one chunk", + xml: nl, + expect, + }); + + test({ + name: "char-by-char", + expect, + fn(parser) { + for (const x of nl) { + parser.write(x); + } + parser.close(); + }, + }); + }); + + describe("cr", () => { + const cr = nl.replace(/\n/g, "\r"); + + test({ + name: "one chunk", + xml: cr, + expect, + }); + + test({ + name: "char-by-char", + expect, + fn(parser) { + for (const x of cr) { + parser.write(x); + } + parser.close(); + }, + }); + }); + + describe("crnl", () => { + const crnl = nl.replace(/\n/g, "\r\n"); + + test({ + name: "one chunk", + xml: crnl, + expect, + }); + + test({ + name: "char-by-char", + expect, + fn(parser) { + for (const x of crnl) { + parser.write(x); + } + parser.close(); + }, + }); + }); + + describe("nel", () => { + // We have to switch the EOL characters after the XML declaration. + const nel = nl.slice(0, xmlDeclEnd).replace("1.0", "1.1") + + nl.slice(xmlDeclEnd).replace(/\n/g, "\u0085"); + + test({ + name: "one chunk", + xml: nel, + expect, + }); + + test({ + name: "char-by-char", + expect, + fn(parser) { + for (const x of nel) { + parser.write(x); + } + parser.close(); + }, + }); + }); + + describe("ls", () => { + // We have to switch the EOL characters after the XML declaration. + const ls = nl.slice(0, xmlDeclEnd).replace("1.0", "1.1") + + nl.slice(xmlDeclEnd).replace(/\n/g, "\u2028"); + + test({ + name: "one chunk", + xml: ls, + expect, + }); + + test({ + name: "char-by-char", + expect, + fn(parser) { + for (const x of ls) { + parser.write(x); + } + parser.close(); + }, + }); + }); + }); + describe("bad start", () => { const xml = ` `;