perf: minimine concatenation by adding the capability to unget codes

There was a recurring pattern in the code whereby a buffer (e.g. ``this.name``) would be filled first by adding a single character to it, and then by concatenating a slice from the current chunk. Internally, Node would create a cons string from the two parts, which would then be eventually concatenated behind the scenes to a final string. The new code adds ``this.prevI`` which allows backing up the reading of codes ("ungetting" codes). So in the scenario described above, instead of filling the buffer with a single character concatenated with a slice, the buffer is filled with a slice that begins one character earlier than it would otherwise have. This skips the creation of a cons string and the concatenation behind the scenes. It also simplifies a few other pieces of code that need the index of the code just read from the chunk.
rubensworks · Sep 12, 2019 · 27fa8b9 · 27fa8b9
1 parent fff5edb
commit 27fa8b9
Showing 1 changed file with 34 additions and 26 deletions.
diff --git a/lib/saxes.js b/lib/saxes.js
@@ -334,6 +334,16 @@ class SaxesParser {
     this.chunk = "";
     this.chunkPosition = 0;
     this.i = 0;
+    //
+    // We use prevI to allow "ungetting" the previously read code point. Note
+    // however, that it is not safe to unget everything and anything. In
+    // particular ungetting EOL characters will screw positioning up.
+    //
+    // Practically, you must not unget a code which has any side effect beyond
+    // updating ``this.i`` and ``this.prevI``. Only EOL codes have such side
+    // effects.
+    //
+    this.prevI = 0;
     this.trailingCR = false;
     this.originalNL = true;
     this.forbiddenState = FORBIDDEN_START;
@@ -618,6 +628,7 @@ class SaxesParser {
    */
   getCode10() {
     const { chunk, i } = this;
+    this.prevI = i;
     // Using charCodeAt and handling the surrogates ourselves is faster
     // than using codePointAt.
     const code = chunk.charCodeAt(i);
@@ -640,10 +651,6 @@ class SaxesParser {
           // A \r\n sequence is converted to \n so we have to skip over the next
           // character. We already know it has a size of 1 so ++ is fine here.
           this.i = i + 2;
-          this.skipTwo = true;
-        }
-        else {
-          this.skipTwo = false;
         }
         // Otherwise, a \r is just converted to \n, so we don't have to skip
         // ahead.
@@ -700,6 +707,7 @@ class SaxesParser {
    */
   getCode11() {
     const { chunk, i } = this;
+    this.prevI = i;
     // Using charCodeAt and handling the surrogates ourselves is faster
     // than using codePointAt.
     const code = chunk.charCodeAt(i);
@@ -725,10 +733,6 @@ class SaxesParser {
           // A CR NL or CR NEL sequence is converted to NL so we have to skip over
           // the next character. We already know it has a size of 1.
           this.i = i + 2;
-          this.skipTwo = true;
-        }
-        else {
-          this.skipTwo = false;
         }
         // Otherwise, a CR is just converted to NL, no skip.
       }
@@ -791,9 +795,8 @@ class SaxesParser {
       return start;
     }
 
-    const { i } = this;
-    this[buffer] += `${chunk.slice(start, i - (this.skipTwo ? 2 : 1))}\n`;
-    return i;
+    this[buffer] += `${chunk.slice(start, this.prevI)}\n`;
+    return this.i;
   }
 
   /**
@@ -824,7 +827,7 @@ class SaxesParser {
       }
 
       if (chars.includes(c)) {
-        this[buffer] += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
+        this[buffer] += chunk.slice(start, this.prevI);
         return c;
       }
     }
@@ -856,7 +859,7 @@ class SaxesParser {
       }
 
       if (c === char) {
-        this[buffer] += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
+        this[buffer] += chunk.slice(start, this.prevI);
         return true;
       }
     }
@@ -882,7 +885,7 @@ class SaxesParser {
 
       // NL is not a name char so we don't have to test specifically for it.
       if (!isNameChar(c)) {
-        this.name += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
+        this.name += chunk.slice(start, this.prevI);
         return c;
       }
     }
@@ -911,7 +914,7 @@ class SaxesParser {
       // NL cannot satisfy this.nameCheck so we don't have to test
       // specifically for it.
       if (!this.nameCheck(c)) {
-        this[buffer] += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
+        this[buffer] += chunk.slice(start, this.prevI);
         return c;
       }
     }
@@ -985,7 +988,7 @@ class SaxesParser {
         this.fail("text data outside of root node.");
         this.reportedTextBeforeRoot = true;
       }
-      this.text = String.fromCodePoint(c);
+      this.i = this.prevI;
       this.state = S_TEXT;
       this.xmlDeclPossible = false;
     }
@@ -1032,14 +1035,14 @@ class SaxesParser {
       switch (this.getCode()) {
       case LESS:
         this.state = S_OPEN_WAKA;
-        this.text += chunk.slice(start, this.i - 1);
+        this.text += chunk.slice(start, this.prevI);
         forbiddenState = FORBIDDEN_START;
         // eslint-disable-next-line no-labels
         break scanLoop;
       case AMP:
         this.state = S_ENTITY;
         this.entityReturnState = S_TEXT;
-        this.text += chunk.slice(start, this.i - 1);
+        this.text += chunk.slice(start, this.prevI);
         forbiddenState = FORBIDDEN_START;
         // eslint-disable-next-line no-labels
         break scanLoop;
@@ -1094,13 +1097,13 @@ class SaxesParser {
       switch (code) {
       case LESS:
         this.state = S_OPEN_WAKA;
-        this.text += chunk.slice(start, this.i - 1);
+        this.text += chunk.slice(start, this.prevI);
         // eslint-disable-next-line no-labels
         break outRootLoop;
       case AMP:
         this.state = S_ENTITY;
         this.entityReturnState = S_TEXT;
-        this.text += chunk.slice(start, this.i - 1);
+        this.text += chunk.slice(start, this.prevI);
         nonSpace = true;
         // eslint-disable-next-line no-labels
         break outRootLoop;
@@ -1139,11 +1142,15 @@ class SaxesParser {
 
   /** @private */
   sOpenWaka() {
+    // Reminder: a state handler is called with at least one character
+    // available in the current chunk. So the first call to get code inside of
+    // a state handler cannot return ``undefined``. That's why we don't test
+    // for it.
     const c = this.getCode();
     // either a /, ?, !, or text is coming next.
     if (isNameStartChar(c)) {
       this.state = S_OPEN_TAG;
-      this.name = String.fromCodePoint(c);
+      this.i = this.prevI;
       this.xmlDeclPossible = false;
     }
     else {
@@ -1683,6 +1690,7 @@ class SaxesParser {
       name: this.name,
       attributes: Object.create(null),
     };
+    this.name = "";
 
     if (this.xmlnsOpt) {
       tag.ns = Object.create(null);
@@ -1731,7 +1739,7 @@ class SaxesParser {
       return;
     }
     if (isNameStartChar(c)) {
-      this.name = String.fromCodePoint(c);
+      this.i = this.prevI;
       this.state = S_ATTRIB_NAME;
     }
     else if (c === GREATER) {
@@ -1823,7 +1831,7 @@ class SaxesParser {
         this.openTag();
       }
       else if (isNameStartChar(c)) {
-        this.name = String.fromCodePoint(c);
+        this.i = this.prevI;
         this.state = S_ATTRIB_NAME;
       }
       else {
@@ -1843,7 +1851,7 @@ class SaxesParser {
     else if (!isS(c)) {
       this.fail("unquoted attribute value.");
       this.state = S_ATTRIB_VALUE_UNQUOTED;
-      this.text = String.fromCodePoint(c);
+      this.i = this.prevI;
     }
   }
 
@@ -1865,7 +1873,7 @@ class SaxesParser {
         start = this.handleEOL("text", chunk, start);
       }
       else if (code === q || code === AMP || code === LESS) {
-        const slice = chunk.slice(start, this.i - 1);
+        const slice = chunk.slice(start, this.prevI);
         switch (code) {
         case q:
           this.pushAttrib(this.name, this.text + slice);
@@ -1901,7 +1909,7 @@ class SaxesParser {
     }
     else if (isNameStartChar(c)) {
       this.fail("no whitespace between attributes.");
-      this.name = String.fromCodePoint(c);
+      this.i = this.prevI;
       this.state = S_ATTRIB_NAME;
     }
     else {