From 732325ee4dc03cac24ed76845f5e7fc0bcd07afa Mon Sep 17 00:00:00 2001 From: Louis-Dominique Dubeau Date: Thu, 20 Jun 2019 08:08:52 -0400 Subject: [PATCH] perf: split sText into two specialized loops The fact of the matter is that the test for non-space characters matters only outside of the root element, which is a minority of cases. The test for ]]> matters only inside the root. (Outside the root, it raises the non-space text error.) The specialized code improves performance. --- lib/saxes.js | 96 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 18 deletions(-) diff --git a/lib/saxes.js b/lib/saxes.js index 212e1e94..638f3016 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -802,6 +802,28 @@ class SaxesParser { /** @private */ sText() { + // + // We did try a version of saxes where the S_TEXT state was split in two + // states: one for text inside the root element, and one for text + // outside. This was avoiding having to test this.tags.length to decide what + // implementation to actually use. + // + // Peformance testing on gigabyte-size files did not show any advantage to + // using the two states solution instead of the current one. Conversely, it + // made the code a bit more complicated elsewhere. For instance, a comment + // can appear before the root element so when a comment ended it was + // necessary to determine whether to return to the S_TEXT state or to the + // new text-outside-root state. + // + if (this.tags.length !== 0) { + this.handleTextInRoot(); + } + else { + this.handleTextOutsideRoot(); + } + } + + handleTextInRoot() { // This is essentially a specialized version of captureTo which is optimized // for performing the ]]> check. A previous version of this code, checked // ``this.text`` for the presence of ]]>. It simplified the code but was @@ -812,7 +834,6 @@ class SaxesParser { // const { chunk, limit, i: start } = this; let { forbiddenState } = this; - let nonSpace = false; let c; // eslint-disable-next-line no-labels, no-restricted-syntax scanLoop: @@ -830,7 +851,6 @@ class SaxesParser { this.entityReturnState = S_TEXT; c = code; forbiddenState = FORBIDDEN_START; - nonSpace = true; // eslint-disable-next-line no-labels break scanLoop; case CLOSE_BRACKET: @@ -844,44 +864,84 @@ class SaxesParser { case FORBIDDEN_BRACKET_BRACKET: break; default: - forbiddenState = FORBIDDEN_START; + throw new Error("impossible state"); } - nonSpace = true; break; case GREATER: if (forbiddenState === FORBIDDEN_BRACKET_BRACKET) { this.fail("the string \"]]>\" is disallowed in char data."); } forbiddenState = FORBIDDEN_START; - nonSpace = true; break; default: forbiddenState = FORBIDDEN_START; + } + } + this.forbiddenState = forbiddenState; + + // This is faster than adding codepoints one by one. + this.text += chunk.substring(start, + c === undefined ? undefined : + (this.i - (c <= 0xFFFF ? 1 : 2))); + } + + handleTextOutsideRoot() { + // This is essentially a specialized version of captureTo which is optimized + // for performing the ]]> check. A previous version of this code, checked + // ``this.text`` for the presence of ]]>. It simplified the code but was + // very costly when character data contained a lot of entities to be parsed. + // + // Since we are using a specialized loop, we also keep track of the presence + // of non-space characters in the text since these are errors when appearing + // outside the document root element. + // + const { chunk, limit, i: start } = this; + let nonSpace = false; + let c; + // eslint-disable-next-line no-labels, no-restricted-syntax + outRootLoop: + while (this.i < limit) { + const code = this.getCode(); + switch (code) { + case LESS: + this.state = S_OPEN_WAKA; + c = code; + // eslint-disable-next-line no-labels + break outRootLoop; + case AMP: + this.state = S_ENTITY; + this.entityReturnState = S_TEXT; + c = code; + nonSpace = true; + // eslint-disable-next-line no-labels + break outRootLoop; + default: if (!isS(code)) { nonSpace = true; } } } - this.forbiddenState = forbiddenState; // This is faster than adding codepoints one by one. this.text += chunk.substring(start, c === undefined ? undefined : (this.i - (c <= 0xFFFF ? 1 : 2))); - if (nonSpace && (!this.sawRoot || this.closedRoot)) { - // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags - // to avoid reporting errors for every single character that is out of - // place. - if (!this.sawRoot && !this.reportedTextBeforeRoot) { - this.fail("text data outside of root node."); - this.reportedTextBeforeRoot = true; - } + if (!nonSpace) { + return; + } - if (this.closedRoot && !this.reportedTextAfterRoot) { - this.fail("text data outside of root node."); - this.reportedTextAfterRoot = true; - } + // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags + // to avoid reporting errors for every single character that is out of + // place. + if (!this.sawRoot && !this.reportedTextBeforeRoot) { + this.fail("text data outside of root node."); + this.reportedTextBeforeRoot = true; + } + + if (this.closedRoot && !this.reportedTextAfterRoot) { + this.fail("text data outside of root node."); + this.reportedTextAfterRoot = true; } }