Skip to content

Commit

Permalink
perf: improve the check for ]]> in character data
Browse files Browse the repository at this point in the history
XML disallows the sequence of characters ]]> in character data. The previous way
in which we checked for the presence of ]]> was rather expensive. (It probably
triggered repeated flattenings of the `this.text` string.) The code was reworked
to dramatically reduce the cost of that check.
  • Loading branch information
lddubeau committed Jun 11, 2019
1 parent bd00485 commit 21df9b5
Show file tree
Hide file tree
Showing 3 changed files with 31,388 additions and 20 deletions.
49 changes: 29 additions & 20 deletions lib/saxes.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ function isQuote(c) {

const QUOTES = [DQUOTE, SQUOTE];

const TEXT_TERMINATOR = [LESS, AMP];
const DOCTYPE_TERMINATOR = [...QUOTES, OPEN_BRACKET, GREATER];
const DOCTYPE_DTD_TERMINATOR = [...QUOTES, CLOSE_BRACKET];
const XML_DECL_NAME_TERMINATOR = [EQUAL, QUESTION, ...S_LIST];
Expand Down Expand Up @@ -333,10 +332,6 @@ class SaxesParser {
this.requiredSeparator = undefined;
this.entityReturnState = undefined;
this.badEntityName = false;
// This records the index before which we don't have to check for the
// presence of ]]]>. The text before that index has been checked already,
// and should not be checked twice.
this.textCheckedBefore = 0;
const xmlnsOpt = this.xmlnsOpt = !!this.opt.xmlns;

if (xmlnsOpt) {
Expand Down Expand Up @@ -786,18 +781,44 @@ class SaxesParser {
this.reportedTextBeforeRoot = true;
}
this.text = String.fromCodePoint(c);
this.textCheckedBefore = 0;
this.state = S_TEXT;
this.xmlDeclPossible = false;
}
}

/** @private */
sText() {
const c = this.captureTo(TEXT_TERMINATOR, "text");
// This is essentially a specialized version of captureTo which is optimized
// for performing the ]]> check. A previous version of this code, checked
// ``this.text`` for the presence of ]]>. It simplified the code but was
// very costly when character data contained a lot of entities to be parsed.
const { chunk, limit, i: start } = this;
let c;
while (this.i < limit) {
const code = this.getCode();
if (code === LESS || code === AMP) {
c = code;
break;
}
}

// This is faster than adding codepoints one by one.
const slice = chunk.substring(start,
c === undefined ? undefined :
(this.i - (c <= 0xFFFF ? 1 : 2)));

// We test for the presence of ]]>, which is not allowed in CharData. We
// have to take into account edge conditions.
if (slice.includes("]]>") ||
(slice[0] === ">" && this.text.endsWith("]]")) ||
(slice.startsWith("]>") && this.text.endsWith("]"))) {
this.fail("the string \"]]>\" is disallowed in char data.");
}

this.text += slice;

if ((!this.sawRoot || this.closedRoot) &&
(/\S/.test(this.text) || c === AMP)) {
(/\S/.test(slice) || c === AMP)) {
// We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
// to avoid reporting errors for every single character that is out of
// place.
Expand All @@ -812,15 +833,6 @@ class SaxesParser {
}
}

if (this.text.includes("]]>", this.textCheckedBefore)) {
this.fail("the string \"]]>\" is disallowed in char data.");
}

// We have to go back two spaces so that we can catch the case where on a
// previous write call, the text buffer ended on ``]]`` and we started
// with ``>`` this time around.
this.textCheckedBefore = this.text.length - 2;

switch (c) {
case LESS:
this.state = S_OPEN_WAKA;
Expand Down Expand Up @@ -1568,15 +1580,13 @@ class SaxesParser {
else {
this.text += this.parseEntity(this.entity);
}
this.textCheckedBefore = this.text.length;
this.entity = "";
this.state = this.entityReturnState;
this.badEntityName = false;
}
else if (c) {
this.fail("disallowed character in entity name.");
this.text += `&${this.entity}${String.fromCodePoint(c)}`;
this.textCheckedBefore = this.text.length;
this.entity = "";
this.state = this.entityReturnState;
this.badEntityName = false;
Expand Down Expand Up @@ -1623,7 +1633,6 @@ class SaxesParser {
closeText() {
this.ontext(this.text);
this.text = "";
this.textCheckedBefore = 0;
}

/**
Expand Down
12 changes: 12 additions & 0 deletions test/entities.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
"use strict";

const { execFile: _execFile } = require("child_process");
const util = require("util");

const execFile = util.promisify(_execFile);

require(".").test({
name: "entities",
xml: "<r>&amp; &lt; &gt; ></r>",
Expand All @@ -10,3 +15,10 @@ require(".").test({
["closetag", { name: "r", attributes: {}, isSelfClosing: false }],
],
});

// This test mainly exists to check parsing speed of a file with a lot of
// entities.
it("mass entities", async () => {
await execFile("node", ["./examples/null-parser.js",
"test/files/entities.xml"]);
});
Loading

0 comments on commit 21df9b5

Please sign in to comment.