Skip to content

Commit

Permalink
fix: handle column computation over characters in the astral plane
Browse files Browse the repository at this point in the history
The column numbers were reported as JavaScript string indexes. This is wrong if
any astral plane character is present in the line. This commit fixes the issue.

BREAKING CHANGE: The fix to column number reporting changes the meaning of the
``column`` field. If you need the old behavior of ``column`` you can use the new
``columnIndex`` field which behaves like the old ``column`` and may be useful in
some contexts. Ultimately you should decide whether your application needs to
know column numbers by Unicode character count or by JavaScript index. (And you
need to know the difference between the two. You can see [this
page](https://mathiasbynens.be/notes/javascript-unicode) for a detailed
discussion of the Unicode problem in JavaScript. Note that the numbers put in
the error messages that ``fail`` produce are still based on the ``column`` field
and thus use the new meaning of ``column``. If you want error message that use
``columnIndex`` you may override the ``fail`` method.
  • Loading branch information
lddubeau committed Jan 10, 2020
1 parent c68ceed commit cefc8f7
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 14 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,12 @@ Settings supported:
is `false`.

* `position` - Boolean. If `false`, then don't track line/col/position. Unset is
treated as `true`. Default is unset.
treated as `true`. Default is unset. Currently, setting this to `false` only
results in a cosmetic change: the errors reported do not contain position
information. sax-js would literally turn off the position-computing logic if
this flag was set to false. The notion was that it would optimize
execution. In saxes at least it turns out that continually testing this flag
causes a cost that offsets the benefits of turning off this logic.

* `fileName` - String. Set a file name for error reporting. This is useful only
when tracking positions. You may leave it unset, in which case the file name
Expand Down Expand Up @@ -164,8 +169,10 @@ done processing the buffer, which is signaled by the `end` event.

The parser has the following properties:

`line`, `column`, `position` - Indications of the position in the XML document
where the parser currently is looking.
`line`, `column`, `columnIndex`, `position` - Indications of the position in the
XML document where the parser currently is looking. The `columnIndex` property
counts columns as if indexing into a JavaScript string, whereas the `column`
property counts Unicode characters.

`closed` - Boolean indicating whether or not the parser can be written to. If
it's `true`, then wait for the `ready` event to write again.
Expand Down
66 changes: 56 additions & 10 deletions src/saxes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -505,9 +505,24 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
*/
xmlDecl!: XMLDecl;

/** The line number the parser is currently looking at. */
/**
* The line number of the next character to be read by the parser. This field
* is one-based. (The first line is numbered 1.)
*/
line!: number;

/**
* The column number of the next character to be read by the parser. *
* This field is zero-based. (The first column is 0.)
*
* This field counts columns by *Unicode character*. Note that this *can*
* be different from the index of the character in a JavaScript string due
* to how JavaScript handles astral plane characters.
*
* See [[columnIndex]] for a number that corresponds to the JavaScript index.
*/
column!: number;

/**
* A map of entity name to expansion.
*/
Expand Down Expand Up @@ -664,18 +679,37 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
};

this.line = 1;
this.column = 0;

this.ENTITIES = Object.create(XML_ENTITIES);

this.onready();
}

/** The stream position the parser is currently looking at. */
/**
* The stream position the parser is currently looking at. This field is
* zero-based.
*
* This field is not based on counting Unicode characters but is to be
* interpreted as a plain index into a JavaScript string.
*/
get position(): number {
return this.chunkPosition + this.i;
}

get column(): number {
/**
* The column number of the next character to be read by the parser. *
* This field is zero-based. (The first column in a line is 0.)
*
* This field reports the index at which the next character would be in the
* line if the line were represented as a JavaScript string. Note that this
* *can* be different to a count based on the number of *Unicode characters*
* due to how JavaScript handles astral plane characters.
*
* See [[column]] for a number that corresponds to a count of Unicode
* characters.
*/
get columnIndex(): number {
return this.position - this.positionAtNewLine;
}

Expand Down Expand Up @@ -898,6 +932,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
// than using codePointAt.
const code = chunk.charCodeAt(i);

this.column++;
if (code < 0xD800) {
if (code >= SPACE || code === TAB) {
return code;
Expand All @@ -906,6 +941,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
switch (code) {
case NL:
this.line++;
this.column = 0;
this.positionAtNewLine = this.position;
return NL;
case CR:
Expand All @@ -921,6 +957,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {

// In either case, \r becomes \n.
this.line++;
this.column = 0;
this.positionAtNewLine = this.position;
return NL_LIKE;
default:
Expand Down Expand Up @@ -978,6 +1015,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
// than using codePointAt.
const code = chunk.charCodeAt(i);

this.column++;
if (code < 0xD800) {
if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) ||
code === TAB) {
Expand All @@ -987,6 +1025,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
switch (code) {
case NL: // 0xA
this.line++;
this.column = 0;
this.positionAtNewLine = this.position;
return NL;
case CR: { // 0xD
Expand All @@ -1004,6 +1043,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
case NEL: // 0x85
case LS: // Ox2028
this.line++;
this.column = 0;
this.positionAtNewLine = this.position;
return NL_LIKE;
default:
Expand Down Expand Up @@ -1045,6 +1085,11 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
return c === NL_LIKE ? NL : c;
}

private unget(): void {
this.i = this.prevI;
this.column--;
}

/**
* Capture characters into a buffer until encountering one of a set of
* characters.
Expand Down Expand Up @@ -1181,6 +1226,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
// If the initial character is 0xFEFF, ignore it.
if (this.chunk.charCodeAt(0) === 0xFEFF) {
this.i++;
this.column++;
}

// This initial loop is a specialized version of skipSpaces. We need to know
Expand Down Expand Up @@ -1216,7 +1262,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
case EOC:
break;
default:
this.i = this.prevI;
this.unget();
this.state = S_TEXT;
this.xmlDeclPossible = false;
}
Expand Down Expand Up @@ -1416,7 +1462,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
// either a /, ?, !, or text is coming next.
if (isNameStartChar(c)) {
this.state = S_OPEN_TAG;
this.i = this.prevI;
this.unget();
this.xmlDeclPossible = false;
}
else {
Expand Down Expand Up @@ -1816,7 +1862,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {

if (!isS(c)) {
this.fail("whitespace required.");
this.i = this.prevI;
this.unget();
}

this.state = S_XML_DECL_NAME_START;
Expand Down Expand Up @@ -1899,7 +1945,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
return;
}
if (isNameStartChar(c)) {
this.i = this.prevI;
this.unget();
this.state = S_ATTRIB_NAME;
}
else if (c === GREATER) {
Expand Down Expand Up @@ -1950,7 +1996,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
this.openTag();
}
else if (isNameStartChar(c)) {
this.i = this.prevI;
this.unget();
this.state = S_ATTRIB_NAME;
}
else {
Expand All @@ -1969,7 +2015,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
else if (!isS(c)) {
this.fail("unquoted attribute value.");
this.state = S_ATTRIB_VALUE_UNQUOTED;
this.i = this.prevI;
this.unget();
}
}

Expand Down Expand Up @@ -2025,7 +2071,7 @@ export class SaxesParser<O extends SaxesOptions = SaxesOptions> {
}
else if (isNameStartChar(c)) {
this.fail("no whitespace between attributes.");
this.i = this.prevI;
this.unget();
this.state = S_ATTRIB_NAME;
}
else {
Expand Down
7 changes: 7 additions & 0 deletions test/parser-position.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ describe("parser position", () => {
});
});

testPosition("astral plane", ["<a>💩</a>"], [
["opentagstart", { position: 3, line: 1, column: 3 }],
["opentag", { position: 3, line: 1, column: 3 }],
["text", { position: 6, line: 1, column: 5 }],
["closetag", { position: 9, line: 1, column: 8 }],
]);

test({
name: "empty document",
xml: "",
Expand Down
2 changes: 1 addition & 1 deletion test/trailing-non-whitespace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ test({
attributes: {},
isSelfClosing: false,
}],
["error", "1:37: text data outside of root node."],
["error", "1:36: text data outside of root node."],
["text", " to monkey land"],
["end", undefined],
["ready", undefined],
Expand Down

0 comments on commit cefc8f7

Please sign in to comment.