Skip to content

Commit

Permalink
perf: split sText into two specialized loops
Browse files Browse the repository at this point in the history
The fact of the matter is that the test for non-space characters matters only
outside of the root element, which is a minority of cases. The test for ]]>
matters only inside the root. (Outside the root, it raises the non-space text
error.) The specialized code improves performance.
  • Loading branch information
lddubeau committed Jun 20, 2019
1 parent 28845a0 commit 732325e
Showing 1 changed file with 78 additions and 18 deletions.
96 changes: 78 additions & 18 deletions lib/saxes.js
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,28 @@ class SaxesParser {

/** @private */
sText() {
//
// We did try a version of saxes where the S_TEXT state was split in two
// states: one for text inside the root element, and one for text
// outside. This was avoiding having to test this.tags.length to decide what
// implementation to actually use.
//
// Peformance testing on gigabyte-size files did not show any advantage to
// using the two states solution instead of the current one. Conversely, it
// made the code a bit more complicated elsewhere. For instance, a comment
// can appear before the root element so when a comment ended it was
// necessary to determine whether to return to the S_TEXT state or to the
// new text-outside-root state.
//
if (this.tags.length !== 0) {
this.handleTextInRoot();
}
else {
this.handleTextOutsideRoot();
}
}

handleTextInRoot() {
// This is essentially a specialized version of captureTo which is optimized
// for performing the ]]> check. A previous version of this code, checked
// ``this.text`` for the presence of ]]>. It simplified the code but was
Expand All @@ -812,7 +834,6 @@ class SaxesParser {
//
const { chunk, limit, i: start } = this;
let { forbiddenState } = this;
let nonSpace = false;
let c;
// eslint-disable-next-line no-labels, no-restricted-syntax
scanLoop:
Expand All @@ -830,7 +851,6 @@ class SaxesParser {
this.entityReturnState = S_TEXT;
c = code;
forbiddenState = FORBIDDEN_START;
nonSpace = true;
// eslint-disable-next-line no-labels
break scanLoop;
case CLOSE_BRACKET:
Expand All @@ -844,44 +864,84 @@ class SaxesParser {
case FORBIDDEN_BRACKET_BRACKET:
break;
default:
forbiddenState = FORBIDDEN_START;
throw new Error("impossible state");
}
nonSpace = true;
break;
case GREATER:
if (forbiddenState === FORBIDDEN_BRACKET_BRACKET) {
this.fail("the string \"]]>\" is disallowed in char data.");
}
forbiddenState = FORBIDDEN_START;
nonSpace = true;
break;
default:
forbiddenState = FORBIDDEN_START;
}
}
this.forbiddenState = forbiddenState;

// This is faster than adding codepoints one by one.
this.text += chunk.substring(start,
c === undefined ? undefined :
(this.i - (c <= 0xFFFF ? 1 : 2)));
}

handleTextOutsideRoot() {
// This is essentially a specialized version of captureTo which is optimized
// for performing the ]]> check. A previous version of this code, checked
// ``this.text`` for the presence of ]]>. It simplified the code but was
// very costly when character data contained a lot of entities to be parsed.
//
// Since we are using a specialized loop, we also keep track of the presence
// of non-space characters in the text since these are errors when appearing
// outside the document root element.
//
const { chunk, limit, i: start } = this;
let nonSpace = false;
let c;
// eslint-disable-next-line no-labels, no-restricted-syntax
outRootLoop:
while (this.i < limit) {
const code = this.getCode();
switch (code) {
case LESS:
this.state = S_OPEN_WAKA;
c = code;
// eslint-disable-next-line no-labels
break outRootLoop;
case AMP:
this.state = S_ENTITY;
this.entityReturnState = S_TEXT;
c = code;
nonSpace = true;
// eslint-disable-next-line no-labels
break outRootLoop;
default:
if (!isS(code)) {
nonSpace = true;
}
}
}
this.forbiddenState = forbiddenState;

// This is faster than adding codepoints one by one.
this.text += chunk.substring(start,
c === undefined ? undefined :
(this.i - (c <= 0xFFFF ? 1 : 2)));

if (nonSpace && (!this.sawRoot || this.closedRoot)) {
// We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
// to avoid reporting errors for every single character that is out of
// place.
if (!this.sawRoot && !this.reportedTextBeforeRoot) {
this.fail("text data outside of root node.");
this.reportedTextBeforeRoot = true;
}
if (!nonSpace) {
return;
}

if (this.closedRoot && !this.reportedTextAfterRoot) {
this.fail("text data outside of root node.");
this.reportedTextAfterRoot = true;
}
// We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
// to avoid reporting errors for every single character that is out of
// place.
if (!this.sawRoot && !this.reportedTextBeforeRoot) {
this.fail("text data outside of root node.");
this.reportedTextBeforeRoot = true;
}

if (this.closedRoot && !this.reportedTextAfterRoot) {
this.fail("text data outside of root node.");
this.reportedTextAfterRoot = true;
}
}

Expand Down

0 comments on commit 732325e

Please sign in to comment.