Skip to content

Commit

Permalink
perf: minimine concatenation by adding the capability to unget codes
Browse files Browse the repository at this point in the history
There was a recurring pattern in the code whereby a buffer (e.g. ``this.name``)
would be filled first by adding a single character to it, and then by
concatenating a slice from the current chunk. Internally, Node would create a
cons string from the two parts, which would then be eventually concatenated
behind the scenes to a final string.

The new code adds ``this.prevI`` which allows backing up the reading of
codes ("ungetting" codes). So in the scenario described above, instead of
filling the buffer with a single character concatenated with a slice, the buffer
is filled with a slice that begins one character earlier than it would otherwise
have. This skips the creation of a cons string and the concatenation behind the
scenes.

It also simplifies a few other pieces of code that need the index of the code
just read from the chunk.
  • Loading branch information
lddubeau committed Sep 12, 2019
1 parent fff5edb commit 27fa8b9
Showing 1 changed file with 34 additions and 26 deletions.
60 changes: 34 additions & 26 deletions lib/saxes.js
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,16 @@ class SaxesParser {
this.chunk = "";
this.chunkPosition = 0;
this.i = 0;
//
// We use prevI to allow "ungetting" the previously read code point. Note
// however, that it is not safe to unget everything and anything. In
// particular ungetting EOL characters will screw positioning up.
//
// Practically, you must not unget a code which has any side effect beyond
// updating ``this.i`` and ``this.prevI``. Only EOL codes have such side
// effects.
//
this.prevI = 0;
this.trailingCR = false;
this.originalNL = true;
this.forbiddenState = FORBIDDEN_START;
Expand Down Expand Up @@ -618,6 +628,7 @@ class SaxesParser {
*/
getCode10() {
const { chunk, i } = this;
this.prevI = i;
// Using charCodeAt and handling the surrogates ourselves is faster
// than using codePointAt.
const code = chunk.charCodeAt(i);
Expand All @@ -640,10 +651,6 @@ class SaxesParser {
// A \r\n sequence is converted to \n so we have to skip over the next
// character. We already know it has a size of 1 so ++ is fine here.
this.i = i + 2;
this.skipTwo = true;
}
else {
this.skipTwo = false;
}
// Otherwise, a \r is just converted to \n, so we don't have to skip
// ahead.
Expand Down Expand Up @@ -700,6 +707,7 @@ class SaxesParser {
*/
getCode11() {
const { chunk, i } = this;
this.prevI = i;
// Using charCodeAt and handling the surrogates ourselves is faster
// than using codePointAt.
const code = chunk.charCodeAt(i);
Expand All @@ -725,10 +733,6 @@ class SaxesParser {
// A CR NL or CR NEL sequence is converted to NL so we have to skip over
// the next character. We already know it has a size of 1.
this.i = i + 2;
this.skipTwo = true;
}
else {
this.skipTwo = false;
}
// Otherwise, a CR is just converted to NL, no skip.
}
Expand Down Expand Up @@ -791,9 +795,8 @@ class SaxesParser {
return start;
}

const { i } = this;
this[buffer] += `${chunk.slice(start, i - (this.skipTwo ? 2 : 1))}\n`;
return i;
this[buffer] += `${chunk.slice(start, this.prevI)}\n`;
return this.i;
}

/**
Expand Down Expand Up @@ -824,7 +827,7 @@ class SaxesParser {
}

if (chars.includes(c)) {
this[buffer] += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
this[buffer] += chunk.slice(start, this.prevI);
return c;
}
}
Expand Down Expand Up @@ -856,7 +859,7 @@ class SaxesParser {
}

if (c === char) {
this[buffer] += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
this[buffer] += chunk.slice(start, this.prevI);
return true;
}
}
Expand All @@ -882,7 +885,7 @@ class SaxesParser {

// NL is not a name char so we don't have to test specifically for it.
if (!isNameChar(c)) {
this.name += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
this.name += chunk.slice(start, this.prevI);
return c;
}
}
Expand Down Expand Up @@ -911,7 +914,7 @@ class SaxesParser {
// NL cannot satisfy this.nameCheck so we don't have to test
// specifically for it.
if (!this.nameCheck(c)) {
this[buffer] += chunk.slice(start, this.i - (c <= 0xFFFF ? 1 : 2));
this[buffer] += chunk.slice(start, this.prevI);
return c;
}
}
Expand Down Expand Up @@ -985,7 +988,7 @@ class SaxesParser {
this.fail("text data outside of root node.");
this.reportedTextBeforeRoot = true;
}
this.text = String.fromCodePoint(c);
this.i = this.prevI;
this.state = S_TEXT;
this.xmlDeclPossible = false;
}
Expand Down Expand Up @@ -1032,14 +1035,14 @@ class SaxesParser {
switch (this.getCode()) {
case LESS:
this.state = S_OPEN_WAKA;
this.text += chunk.slice(start, this.i - 1);
this.text += chunk.slice(start, this.prevI);
forbiddenState = FORBIDDEN_START;
// eslint-disable-next-line no-labels
break scanLoop;
case AMP:
this.state = S_ENTITY;
this.entityReturnState = S_TEXT;
this.text += chunk.slice(start, this.i - 1);
this.text += chunk.slice(start, this.prevI);
forbiddenState = FORBIDDEN_START;
// eslint-disable-next-line no-labels
break scanLoop;
Expand Down Expand Up @@ -1094,13 +1097,13 @@ class SaxesParser {
switch (code) {
case LESS:
this.state = S_OPEN_WAKA;
this.text += chunk.slice(start, this.i - 1);
this.text += chunk.slice(start, this.prevI);
// eslint-disable-next-line no-labels
break outRootLoop;
case AMP:
this.state = S_ENTITY;
this.entityReturnState = S_TEXT;
this.text += chunk.slice(start, this.i - 1);
this.text += chunk.slice(start, this.prevI);
nonSpace = true;
// eslint-disable-next-line no-labels
break outRootLoop;
Expand Down Expand Up @@ -1139,11 +1142,15 @@ class SaxesParser {

/** @private */
sOpenWaka() {
// Reminder: a state handler is called with at least one character
// available in the current chunk. So the first call to get code inside of
// a state handler cannot return ``undefined``. That's why we don't test
// for it.
const c = this.getCode();
// either a /, ?, !, or text is coming next.
if (isNameStartChar(c)) {
this.state = S_OPEN_TAG;
this.name = String.fromCodePoint(c);
this.i = this.prevI;
this.xmlDeclPossible = false;
}
else {
Expand Down Expand Up @@ -1683,6 +1690,7 @@ class SaxesParser {
name: this.name,
attributes: Object.create(null),
};
this.name = "";

if (this.xmlnsOpt) {
tag.ns = Object.create(null);
Expand Down Expand Up @@ -1731,7 +1739,7 @@ class SaxesParser {
return;
}
if (isNameStartChar(c)) {
this.name = String.fromCodePoint(c);
this.i = this.prevI;
this.state = S_ATTRIB_NAME;
}
else if (c === GREATER) {
Expand Down Expand Up @@ -1823,7 +1831,7 @@ class SaxesParser {
this.openTag();
}
else if (isNameStartChar(c)) {
this.name = String.fromCodePoint(c);
this.i = this.prevI;
this.state = S_ATTRIB_NAME;
}
else {
Expand All @@ -1843,7 +1851,7 @@ class SaxesParser {
else if (!isS(c)) {
this.fail("unquoted attribute value.");
this.state = S_ATTRIB_VALUE_UNQUOTED;
this.text = String.fromCodePoint(c);
this.i = this.prevI;
}
}

Expand All @@ -1865,7 +1873,7 @@ class SaxesParser {
start = this.handleEOL("text", chunk, start);
}
else if (code === q || code === AMP || code === LESS) {
const slice = chunk.slice(start, this.i - 1);
const slice = chunk.slice(start, this.prevI);
switch (code) {
case q:
this.pushAttrib(this.name, this.text + slice);
Expand Down Expand Up @@ -1901,7 +1909,7 @@ class SaxesParser {
}
else if (isNameStartChar(c)) {
this.fail("no whitespace between attributes.");
this.name = String.fromCodePoint(c);
this.i = this.prevI;
this.state = S_ATTRIB_NAME;
}
else {
Expand Down

0 comments on commit 27fa8b9

Please sign in to comment.