fix: handling of end of line characters

BREAKING CHANGE: previous versions of saxes did not consistently convert end of line characters to NL (0xA) in the data reported by event handlers. This has been fixed. If your code relied on the old (incorrect) behavior then you'll have to update it.
rubensworks · Sep 12, 2019 · f13247a · f13247a
1 parent bc212a9
commit f13247a
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 39 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -79,3 +79,38 @@ You should run it before you make your change and after. Make sure you run it
 when your CPU is relatively idle, and run it multiple times to get a valid
 result. If the code takes 10 times longer with your change, you know you have a
 problem. Address it before submitting your PR.
+
+In general, we want both elegance and performance. However, in those cases where
+one must be sacrificed for the other, **we are willing to sacrifice elegance in
+favor of performance.** A good example of this is how saxes handles the
+normalization of end-of-line (EOL) characters to newlines (NL). In September
+2019 I (lddubeau) discovered that saxes was not doing the normalization
+correctly. I came up with two fixes:
+
+1. One fix modified the ``write`` method to split chunks on all EOL characters
+   except NL, and would normalize those characters to NL early on. It would also
+   adjust positioning data to handle skipping ``\r`` in the ``\r\n`` sequence,
+   etc. It performed excellently with files containing only ``\n`` but it
+   performed terribly with files that needed normalization. **However**, a file
+   with ``\n`` as EOL would process half as fast if its EOL characters were
+   replaced with ``\r\n`` prior to parsing. The performance for files using
+   anything else than ``\n`` for EOL was atrocious.
+
+   This approach was the more elegant of the two approaches mentioned here
+   because it made all the logic normalizing EOL characters localized to one
+   spot in the code. It was also the least error-prone approach, for the same
+   reason. Adding a new ``captureSomething`` method would not run the risk of
+   forgetting to handle EOL characters properly.
+
+2. Another fix took the approach of recording state while running ``getCode``
+   and using this state in all places where substrings are extracted from chunks
+   to handle the presence of EOL characters. This fix performs about as fast as
+   the other fix described above for files that contain ``\n`` as EOL, and maybe
+   10-20% slower when a file contains ``\r\n``.
+
+   This approach is definitely not as elegant as the first. It spreads the logic
+   for handling EOL into multiple locations, and there's a risk of forgetting to
+   add the proper logic when adding a new ``captureSomething`` method.
+
+Of the two approaches, the 2nd one was the one selected for posterity. Though
+the first method was more elegant, its performance was unacceptable.
diff --git a/README.md b/README.md
@@ -238,6 +238,15 @@ namespaces is onerous.
 Note that you can use `additionalNamespaces` and `resolvePrefix` together if you
 want. `additionalNamespaces` applies before `resolvePrefix`.
 
+### Performance Tips
+
+* saxes works faster on files that use newlines (``\u000A``) as end of line
+  markers than files that use other end of line markers (like ``\r`` or
+  ``\r\n``). The XML specification requires that conformant applications behave
+  as if all characters that are to be treated as end of line characters are
+  converted to ``\u000A`` prior to parsing. The optimal code path for saxes is a
+  file in which all end of line characters are already ``\u000A``.
+
 ## FAQ
 
 Q. Why has saxes dropped support for limiting the size of data chunks passed to

diff --git a/lib/saxes.js b/lib/saxes.js
@@ -335,6 +335,7 @@ class SaxesParser {
     this.chunkPosition = 0;
     this.i = 0;
     this.trailingCR = false;
+    this.originalNL = true;
     this.forbiddenState = FORBIDDEN_START;
     /**
      * A map of entity name to expansion.
@@ -562,20 +563,22 @@ class SaxesParser {
     // isn't. (There may be Node-specific code that would perform faster than
     // ``Array.from`` but don't want to be dependent on Node.)
 
-    let limit = chunk.length;
-
     if (this.trailingCR) {
       // The previous chunk had a trailing cr. We need to handle it now.
       chunk = `\r${chunk}`;
+      this.trailingCR = false;
     }
 
-    if (!end && chunk[limit - 1] === CR) {
+    let limit = chunk.length;
+    if (!end && chunk[limit - 1] === "\r") {
       // The chunk ends with a trailing CR. We cannot know how to handle it
       // until we get the next chunk or the end of the stream. So save it for
       // later.
       limit--;
+      chunk = chunk.substring(0, limit);
       this.trailingCR = true;
     }
+
     this.chunk = chunk;
     this.i = 0;
     while (this.i < limit) {
@@ -596,6 +599,13 @@ class SaxesParser {
     return this.write(null);
   }
 
+  /** @private */
+  newline(originalNL) {
+    this.originalNL = originalNL;
+    this.line++;
+    this.positionAtNewLine = this.position;
+  }
+
   /**
    * Get a single code point out of the current chunk. This updates the current
    * position if we do position tracking.
@@ -621,21 +631,25 @@ class SaxesParser {
       }
 
       switch (code) {
+      case NL:
+        this.newline(true);
+        return NL;
       case CR:
         // We may get NaN if we read past the end of the chunk, which is fine.
         if (chunk.charCodeAt(i + 1) === NL) {
           // A \r\n sequence is converted to \n so we have to skip over the next
           // character. We already know it has a size of 1 so ++ is fine here.
           this.i = i + 2;
+          this.skipTwo = true;
+        }
+        else {
+          this.skipTwo = false;
         }
         // Otherwise, a \r is just converted to \n, so we don't have to skip
         // ahead.
 
         // In either case, \r becomes \n.
-        /* yes, fall through */
-      case NL:
-        this.line++;
-        this.positionAtNewLine = this.position;
+        this.newline(false);
         return NL;
       default:
         // If we get here, then code < SPACE and it is not NL CR or TAB.
@@ -694,11 +708,15 @@ class SaxesParser {
     // read this.i again, which is a bit faster.
     this.i = i + 1;
     if (code < 0xD800) {
-      if ((code > 0x1F && code < 0x7F) || code > 0x9F || code === TAB) {
+      if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) ||
+          code === TAB) {
         return code;
       }
 
       switch (code) {
+      case NL: // 0xA
+        this.newline(true);
+        return NL;
       case CR: { // 0xD
         // We may get NaN if we read past the end of the chunk, which is
         // fine.
@@ -707,15 +725,17 @@ class SaxesParser {
           // A CR NL or CR NEL sequence is converted to NL so we have to skip over
           // the next character. We already know it has a size of 1.
           this.i = i + 2;
+          this.skipTwo = true;
+        }
+        else {
+          this.skipTwo = false;
         }
         // Otherwise, a CR is just converted to NL, no skip.
       }
         /* yes, fall through */
-      case NL: // 0xA
       case NEL: // 0x85
       case LS: // Ox2028
-        this.line++;
-        this.positionAtNewLine = this.position;
+        this.newline(false);
         return NL;
       default:
         this.fail("disallowed character.");
@@ -763,6 +783,19 @@ class SaxesParser {
    * ``false`` otherwise.
    */
 
+  /**
+   * @private
+   */
+  handleEOL(buffer, chunk, start) {
+    if (this.originalNL) {
+      return start;
+    }
+
+    const { i } = this;
+    this[buffer] += `${chunk.substring(start, i - (this.skipTwo ? 2 : 1))}\n`;
+    return i;
+  }
+
   /**
    * Capture characters into a buffer until encountering one of a set of
    * characters.
@@ -778,18 +811,22 @@ class SaxesParser {
    * ``undefined`` if we hit the end of the chunk.
    */
   captureTo(chars, buffer) {
-    const { chunk, i: start } = this;
+    let { i: start } = this;
+    const { chunk } = this;
     while (true) {
       const c = this.getCode();
-      if (chars.includes(c)) {
-        this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2));
-        return c;
+      if (c === NL) {
+        start = this.handleEOL(buffer, chunk, start);
       }
-
-      if (c === undefined) {
+      else if (c === undefined) {
         this[buffer] += chunk.substring(start);
         return undefined;
       }
+
+      if (chars.includes(c)) {
+        this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2));
+        return c;
+      }
     }
   }
 
@@ -806,18 +843,22 @@ class SaxesParser {
    * into the end of the current chunk.
    */
   captureToChar(char, buffer) {
-    const { chunk, i: start } = this;
+    let { i: start } = this;
+    const { chunk } = this;
     while (true) {
       const c = this.getCode();
-      if (c === char) {
-        this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2));
-        return true;
+      if (c === NL) {
+        start = this.handleEOL(buffer, chunk, start);
       }
-
-      if (c === undefined) {
+      else if (c === undefined) {
         this[buffer] += chunk.substring(start);
         return false;
       }
+
+      if (c === char) {
+        this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2));
+        return true;
+      }
     }
   }
 
@@ -839,6 +880,7 @@ class SaxesParser {
         return undefined;
       }
 
+      // NL is not a name char so we don't have to test specifically for it.
       if (!isNameChar(c)) {
         this.name += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2));
         return c;
@@ -866,6 +908,8 @@ class SaxesParser {
         return undefined;
       }
 
+      // NL cannot satisfy this.nameCheck so we don't have to test
+      // specifically for it.
       if (!this.nameCheck(c)) {
         this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2));
         return c;
@@ -980,17 +1024,12 @@ class SaxesParser {
     // Since we are using a specialized loop, we also keep track of the presence
     // of ]]> in text data. The sequence ]]> is forbidden to appear as-is.
     //
-    const { chunk, i: start } = this;
-    let { forbiddenState } = this;
+    let { i: start, forbiddenState } = this;
+    const { chunk } = this;
     // eslint-disable-next-line no-labels, no-restricted-syntax
     scanLoop:
     while (true) {
-      const code = this.getCode();
-      if (code === undefined) {
-        this.text += chunk.substring(start);
-        break;
-      }
-      switch (code) {
+      switch (this.getCode()) {
       case LESS:
         this.state = S_OPEN_WAKA;
         this.text += chunk.substring(start, this.i - 1);
@@ -1024,6 +1063,14 @@ class SaxesParser {
         }
         forbiddenState = FORBIDDEN_START;
         break;
+      case NL:
+        start = this.handleEOL("text", chunk, start);
+        forbiddenState = FORBIDDEN_START;
+        break;
+      case undefined:
+        this.text += chunk.substring(start);
+        // eslint-disable-next-line no-labels
+        break scanLoop;
       default:
         forbiddenState = FORBIDDEN_START;
       }
@@ -1037,16 +1084,13 @@ class SaxesParser {
     // for a specialized task. We keep track of the presence of non-space
     // characters in the text since these are errors when appearing outside the
     // document root element.
-    const { chunk, i: start } = this;
+    let { i: start } = this;
+    const { chunk } = this;
     let nonSpace = false;
     // eslint-disable-next-line no-labels, no-restricted-syntax
     outRootLoop:
     while (true) {
       const code = this.getCode();
-      if (code === undefined) {
-        this.text += chunk.substring(start);
-        break;
-      }
       switch (code) {
       case LESS:
         this.state = S_OPEN_WAKA;
@@ -1060,6 +1104,14 @@ class SaxesParser {
         nonSpace = true;
         // eslint-disable-next-line no-labels
         break outRootLoop;
+      case NL:
+        start = this.handleEOL("text", chunk, start);
+        // eslint-disable-next-line no-labels
+        break;
+      case undefined:
+        this.text += chunk.substring(start);
+        // eslint-disable-next-line no-labels
+        break outRootLoop;
       default:
         if (!isS(code)) {
           nonSpace = true;
@@ -1109,7 +1161,7 @@ class SaxesParser {
         this.state = S_PI_FIRST_CHAR;
         break;
       default:
-        this.fail("disallowed character in tag name.");
+        this.fail("disallowed character in tag name");
         this.state = S_TEXT;
         this.xmlDeclPossible = false;
       }
@@ -1800,14 +1852,19 @@ class SaxesParser {
     // We deliberately do not use captureTo here. The specialized code we use
     // here is faster than using captureTo.
     const { q } = this;
-    const { chunk, i: start } = this;
+    let { i: start } = this;
+    const { chunk } = this;
     while (true) {
       const code = this.getCode();
       if (code === undefined) {
         this.text += chunk.substring(start);
         return;
       }
-      if (code === q || code === AMP || code === LESS) {
+
+      if (code === NL) {
+        start = this.handleEOL("text", chunk, start);
+      }
+      else if (code === q || code === AMP || code === LESS) {
         const slice = chunk.substring(start, this.i - 1);
         switch (code) {
         case q: