perf: split sText into two specialized loops

The fact of the matter is that the test for non-space characters matters only outside of the root element, which is a minority of cases. The test for ]]> matters only inside the root. (Outside the root, it raises the non-space text error.) The specialized code improves performance.
lddubeau · Jun 20, 2019 · 732325e · 732325e
1 parent 28845a0
commit 732325e
Showing 1 changed file with 78 additions and 18 deletions.
diff --git a/lib/saxes.js b/lib/saxes.js
@@ -802,6 +802,28 @@ class SaxesParser {
 
   /** @private */
   sText() {
+    //
+    // We did try a version of saxes where the S_TEXT state was split in two
+    // states: one for text inside the root element, and one for text
+    // outside. This was avoiding having to test this.tags.length to decide what
+    // implementation to actually use.
+    //
+    // Peformance testing on gigabyte-size files did not show any advantage to
+    // using the two states solution instead of the current one. Conversely, it
+    // made the code a bit more complicated elsewhere. For instance, a comment
+    // can appear before the root element so when a comment ended it was
+    // necessary to determine whether to return to the S_TEXT state or to the
+    // new text-outside-root state.
+    //
+    if (this.tags.length !== 0) {
+      this.handleTextInRoot();
+    }
+    else {
+      this.handleTextOutsideRoot();
+    }
+  }
+
+  handleTextInRoot() {
     // This is essentially a specialized version of captureTo which is optimized
     // for performing the ]]> check. A previous version of this code, checked
     // ``this.text`` for the presence of ]]>. It simplified the code but was
@@ -812,7 +834,6 @@ class SaxesParser {
     //
     const { chunk, limit, i: start } = this;
     let { forbiddenState } = this;
-    let nonSpace = false;
     let c;
     // eslint-disable-next-line no-labels, no-restricted-syntax
     scanLoop:
@@ -830,7 +851,6 @@ class SaxesParser {
         this.entityReturnState = S_TEXT;
         c = code;
         forbiddenState = FORBIDDEN_START;
-        nonSpace = true;
         // eslint-disable-next-line no-labels
         break scanLoop;
       case CLOSE_BRACKET:
@@ -844,44 +864,84 @@ class SaxesParser {
         case FORBIDDEN_BRACKET_BRACKET:
           break;
         default:
-          forbiddenState = FORBIDDEN_START;
+          throw new Error("impossible state");
         }
-        nonSpace = true;
         break;
       case GREATER:
         if (forbiddenState === FORBIDDEN_BRACKET_BRACKET) {
           this.fail("the string \"]]>\" is disallowed in char data.");
         }
         forbiddenState = FORBIDDEN_START;
-        nonSpace = true;
         break;
       default:
         forbiddenState = FORBIDDEN_START;
+      }
+    }
+    this.forbiddenState = forbiddenState;
+
+    // This is faster than adding codepoints one by one.
+    this.text += chunk.substring(start,
+                                 c === undefined ? undefined :
+                                 (this.i - (c <= 0xFFFF ? 1 : 2)));
+  }
+
+  handleTextOutsideRoot() {
+    // This is essentially a specialized version of captureTo which is optimized
+    // for performing the ]]> check. A previous version of this code, checked
+    // ``this.text`` for the presence of ]]>. It simplified the code but was
+    // very costly when character data contained a lot of entities to be parsed.
+    //
+    // Since we are using a specialized loop, we also keep track of the presence
+    // of non-space characters in the text since these are errors when appearing
+    // outside the document root element.
+    //
+    const { chunk, limit, i: start } = this;
+    let nonSpace = false;
+    let c;
+    // eslint-disable-next-line no-labels, no-restricted-syntax
+    outRootLoop:
+    while (this.i < limit) {
+      const code = this.getCode();
+      switch (code) {
+      case LESS:
+        this.state = S_OPEN_WAKA;
+        c = code;
+        // eslint-disable-next-line no-labels
+        break outRootLoop;
+      case AMP:
+        this.state = S_ENTITY;
+        this.entityReturnState = S_TEXT;
+        c = code;
+        nonSpace = true;
+        // eslint-disable-next-line no-labels
+        break outRootLoop;
+      default:
         if (!isS(code)) {
           nonSpace = true;
         }
       }
     }
-    this.forbiddenState = forbiddenState;
 
     // This is faster than adding codepoints one by one.
     this.text += chunk.substring(start,
                                  c === undefined ? undefined :
                                  (this.i - (c <= 0xFFFF ? 1 : 2)));
 
-    if (nonSpace && (!this.sawRoot || this.closedRoot)) {
-      // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
-      // to avoid reporting errors for every single character that is out of
-      // place.
-      if (!this.sawRoot && !this.reportedTextBeforeRoot) {
-        this.fail("text data outside of root node.");
-        this.reportedTextBeforeRoot = true;
-      }
+    if (!nonSpace) {
+      return;
+    }
 
-      if (this.closedRoot && !this.reportedTextAfterRoot) {
-        this.fail("text data outside of root node.");
-        this.reportedTextAfterRoot = true;
-      }
+    // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
+    // to avoid reporting errors for every single character that is out of
+    // place.
+    if (!this.sawRoot && !this.reportedTextBeforeRoot) {
+      this.fail("text data outside of root node.");
+      this.reportedTextBeforeRoot = true;
+    }
+
+    if (this.closedRoot && !this.reportedTextAfterRoot) {
+      this.fail("text data outside of root node.");
+      this.reportedTextAfterRoot = true;
     }
   }