Skip to content

Commit

Permalink
fix: use xmlchars for checking names
Browse files Browse the repository at this point in the history
This plugs some holes in name checking.
  • Loading branch information
lddubeau committed Jul 5, 2018
1 parent a95c554 commit 2c939fe
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 41 deletions.
86 changes: 46 additions & 40 deletions lib/saxes.js
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
// this really needs to be replaced with character classes.
// XML allows all manner of ridiculous numbers and digits.
// this really needs to be replaced with character classes. XML allows all
// manner of ridiculous numbers and digits.

"use strict";

const { XML_1_0: { ED5 } } = require("xmlchars");

const {
regexes: {
NAME_START_CHAR,
NAME_CHAR,
},
} = ED5;

const CDATA = "[CDATA[";
const DOCTYPE = "DOCTYPE";
const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/";

// http://www.w3.org/TR/REC-xml/#NT-NameStartChar
// This implementation works on strings, a single character at a time
// as such, it cannot ever support astral-plane characters (10000-EFFFF)
// without a significant breaking change to either this parser, or the
// JavaScript language. Implementation of an emoji-capable xml parser
// is left as an exercise for the reader.
const nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;

const nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;
// We have to make our own regexp because the way we record character references
// as entities. So we have to allow for "#" at the start.
const ENTITY_START_CHAR =
new RegExp(`^[${ED5.fragments.NAME_START_CHAR}#]$`, "u");

const entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
const entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;
// This implementation works on strings, a single character at a time as such,
// it cannot ever support astral-plane characters (10000-EFFFF) without a
// significant breaking change to either this parser, or the JavaScript
// language. Implementation of an emoji-capable xml parser is left as an
// exercise for the reader.

const rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE };

Expand Down Expand Up @@ -85,9 +92,8 @@ exports.EVENTS = [
];

const buffers = [
"comment", "openWakaBang", "textNode", "tagName", "doctype",
"procInstName", "procInstBody", "entity", "attribName",
"attribValue", "cdata",
"comment", "openWakaBang", "textNode", "tagName", "doctype", "procInstName",
"procInstBody", "entity", "attribName", "attribValue", "cdata",
];

let Stream;
Expand Down Expand Up @@ -283,9 +289,9 @@ class SAXParser {
}
else {
if (!isWhitespace(c)) {
// We use the reportedTextBeforeRoot and
// reportedTextAfterRoot flags to avoid reporting errors
// for every single character that is out of place.
// We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
// to avoid reporting errors for every single character that is out
// of place.
if (!this.sawRoot && !this.reportedTextBeforeRoot) {
this.fail("Text data outside of root node.");
this.reportedTextBeforeRoot = true;
Expand Down Expand Up @@ -314,7 +320,7 @@ class SAXParser {
else if (isWhitespace(c)) {
// wait for it...
}
else if (isMatch(nameStart, c)) {
else if (isMatch(NAME_START_CHAR, c)) {
this.state = S_OPEN_TAG;
this.tagName = c;
}
Expand Down Expand Up @@ -360,8 +366,8 @@ class SAXParser {
}
else {
this.openWakaBang += c;
// 7 happens to be the maximum length of the string that can
// possibly match one of the cases above.
// 7 happens to be the maximum length of the string that can possibly
// match one of the cases above.
if (this.openWakaBang.length >= 7) {
this.fail("Incorrect syntax");
}
Expand Down Expand Up @@ -494,13 +500,12 @@ class SAXParser {
this.state = S_PROC_INST_BODY;
}
else {
// When namespaces are used, colons are not valid in pi names.
// https://www.w3.org/XML/xml-names-19990114-errata.html
// NE08
if (!(isMatch(this.procInstName.length ? nameBody : nameStart, c) &&
// When namespaces are used, colons are not valid in entity names.
// https://www.w3.org/XML/xml-names-19990114-errata.html
// NE08
if (!(isMatch(this.procInstName.length ?
NAME_CHAR : NAME_START_CHAR, c) &&
// When namespaces are used, colons are not valid in entity
// names.
// https://www.w3.org/XML/xml-names-19990114-errata.html
// NE08
(!this.opt.xmlns || c !== ":"))) {
this.fail("Invalid characer in processing instruction name.");
}
Expand Down Expand Up @@ -536,7 +541,7 @@ class SAXParser {
continue;

case S_OPEN_TAG:
if (isMatch(nameBody, c)) {
if (isMatch(NAME_CHAR, c)) {
this.tagName += c;
}
else {
Expand Down Expand Up @@ -578,7 +583,7 @@ class SAXParser {
else if (c === "/") {
this.state = S_OPEN_TAG_SLASH;
}
else if (isMatch(nameStart, c)) {
else if (isMatch(NAME_START_CHAR, c)) {
this.attribName = c;
this.attribValue = "";
this.state = S_ATTRIB_NAME;
Expand All @@ -601,7 +606,7 @@ class SAXParser {
else if (isWhitespace(c)) {
this.state = S_ATTRIB_NAME_SAW_WHITE;
}
else if (isMatch(nameBody, c)) {
else if (isMatch(NAME_CHAR, c)) {
this.attribName += c;
}
else {
Expand All @@ -628,7 +633,7 @@ class SAXParser {
if (c === ">") {
this.openTag();
}
else if (isMatch(nameStart, c)) {
else if (isMatch(NAME_START_CHAR, c)) {
this.attribName = c;
this.state = S_ATTRIB_NAME;
}
Expand Down Expand Up @@ -680,7 +685,7 @@ class SAXParser {
else if (c === "/") {
this.state = S_OPEN_TAG_SLASH;
}
else if (isMatch(nameStart, c)) {
else if (isMatch(NAME_START_CHAR, c)) {
this.fail("No whitespace between attributes");
this.attribName = c;
this.attribValue = "";
Expand Down Expand Up @@ -716,7 +721,7 @@ class SAXParser {
if (isWhitespace(c)) {
continue;
}
else if (notMatch(nameStart, c)) {
else if (notMatch(NAME_START_CHAR, c)) {
this.fail("Invalid tagname in closing tag.");
}
else {
Expand All @@ -726,7 +731,7 @@ class SAXParser {
else if (c === ">") {
this.closeTag();
}
else if (isMatch(nameBody, c)) {
else if (isMatch(NAME_CHAR, c)) {
this.tagName += c;
}
else {
Expand Down Expand Up @@ -778,8 +783,10 @@ class SAXParser {
this.entity = "";
this.state = returnState;
}
else if (isMatch(this.entity.length ? entityBody : entityStart, c) &&
// When namespaces are used, colons are not valid in entity names.
else if (isMatch(this.entity.length ?
NAME_CHAR : ENTITY_START_CHAR, c) &&
// When namespaces are used, colons are not valid in entity
// names.
// https://www.w3.org/XML/xml-names-19990114-errata.html
// NE08
(!this.opt.xmlns || c !== ":")) {
Expand Down Expand Up @@ -1100,8 +1107,7 @@ class SAXStream extends Stream {
this._decoder = null;

for (const ev of streamWraps) {
// Override the no-op defaults with handlers that emit on the
// stream.
// Override the no-op defaults with handlers that emit on the stream.
this._parser[`on${ev}`] = (...args) => {
this.emit(ev, ...args);
};
Expand Down
5 changes: 5 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,8 @@
"eslint-config-lddubeau-base": "^2.1.0",
"husky": "^0.14.3",
"mocha": "^5.2.0"
},
"dependencies": {
"xmlchars": "^1.0.0"
}
}
2 changes: 1 addition & 1 deletion test/xml-internal-entities.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const entitiesToTest = {
u_score: true,
"d-ash": true,
"d.ot": true,
"all:_#-.": true,
"all:_#-.": [5, "#"],
};

let xmlStart = "<a test=\"&amp;\" ";
Expand Down

0 comments on commit 2c939fe

Please sign in to comment.