From 2c939feee504e071cadc1528594f4088bb7c440f Mon Sep 17 00:00:00 2001 From: Louis-Dominique Dubeau Date: Thu, 5 Jul 2018 14:00:34 -0400 Subject: [PATCH] fix: use xmlchars for checking names This plugs some holes in name checking. --- lib/saxes.js | 86 +++++++++++++++++++---------------- package-lock.json | 5 ++ package.json | 3 ++ test/xml-internal-entities.js | 2 +- 4 files changed, 55 insertions(+), 41 deletions(-) diff --git a/lib/saxes.js b/lib/saxes.js index 40aa4603..8cc7d51e 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -1,25 +1,32 @@ -// this really needs to be replaced with character classes. -// XML allows all manner of ridiculous numbers and digits. +// this really needs to be replaced with character classes. XML allows all +// manner of ridiculous numbers and digits. "use strict"; +const { XML_1_0: { ED5 } } = require("xmlchars"); + +const { + regexes: { + NAME_START_CHAR, + NAME_CHAR, + }, +} = ED5; + const CDATA = "[CDATA["; const DOCTYPE = "DOCTYPE"; const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"; const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"; -// http://www.w3.org/TR/REC-xml/#NT-NameStartChar -// This implementation works on strings, a single character at a time -// as such, it cannot ever support astral-plane characters (10000-EFFFF) -// without a significant breaking change to either this parser, or the -// JavaScript language. Implementation of an emoji-capable xml parser -// is left as an exercise for the reader. -const nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; - -const nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/; +// We have to make our own regexp because the way we record character references +// as entities. So we have to allow for "#" at the start. +const ENTITY_START_CHAR = + new RegExp(`^[${ED5.fragments.NAME_START_CHAR}#]$`, "u"); -const entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; -const entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/; +// This implementation works on strings, a single character at a time as such, +// it cannot ever support astral-plane characters (10000-EFFFF) without a +// significant breaking change to either this parser, or the JavaScript +// language. Implementation of an emoji-capable xml parser is left as an +// exercise for the reader. const rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE }; @@ -85,9 +92,8 @@ exports.EVENTS = [ ]; const buffers = [ - "comment", "openWakaBang", "textNode", "tagName", "doctype", - "procInstName", "procInstBody", "entity", "attribName", - "attribValue", "cdata", + "comment", "openWakaBang", "textNode", "tagName", "doctype", "procInstName", + "procInstBody", "entity", "attribName", "attribValue", "cdata", ]; let Stream; @@ -283,9 +289,9 @@ class SAXParser { } else { if (!isWhitespace(c)) { - // We use the reportedTextBeforeRoot and - // reportedTextAfterRoot flags to avoid reporting errors - // for every single character that is out of place. + // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags + // to avoid reporting errors for every single character that is out + // of place. if (!this.sawRoot && !this.reportedTextBeforeRoot) { this.fail("Text data outside of root node."); this.reportedTextBeforeRoot = true; @@ -314,7 +320,7 @@ class SAXParser { else if (isWhitespace(c)) { // wait for it... } - else if (isMatch(nameStart, c)) { + else if (isMatch(NAME_START_CHAR, c)) { this.state = S_OPEN_TAG; this.tagName = c; } @@ -360,8 +366,8 @@ class SAXParser { } else { this.openWakaBang += c; - // 7 happens to be the maximum length of the string that can - // possibly match one of the cases above. + // 7 happens to be the maximum length of the string that can possibly + // match one of the cases above. if (this.openWakaBang.length >= 7) { this.fail("Incorrect syntax"); } @@ -494,13 +500,12 @@ class SAXParser { this.state = S_PROC_INST_BODY; } else { - // When namespaces are used, colons are not valid in pi names. - // https://www.w3.org/XML/xml-names-19990114-errata.html - // NE08 - if (!(isMatch(this.procInstName.length ? nameBody : nameStart, c) && - // When namespaces are used, colons are not valid in entity names. - // https://www.w3.org/XML/xml-names-19990114-errata.html - // NE08 + if (!(isMatch(this.procInstName.length ? + NAME_CHAR : NAME_START_CHAR, c) && + // When namespaces are used, colons are not valid in entity + // names. + // https://www.w3.org/XML/xml-names-19990114-errata.html + // NE08 (!this.opt.xmlns || c !== ":"))) { this.fail("Invalid characer in processing instruction name."); } @@ -536,7 +541,7 @@ class SAXParser { continue; case S_OPEN_TAG: - if (isMatch(nameBody, c)) { + if (isMatch(NAME_CHAR, c)) { this.tagName += c; } else { @@ -578,7 +583,7 @@ class SAXParser { else if (c === "/") { this.state = S_OPEN_TAG_SLASH; } - else if (isMatch(nameStart, c)) { + else if (isMatch(NAME_START_CHAR, c)) { this.attribName = c; this.attribValue = ""; this.state = S_ATTRIB_NAME; @@ -601,7 +606,7 @@ class SAXParser { else if (isWhitespace(c)) { this.state = S_ATTRIB_NAME_SAW_WHITE; } - else if (isMatch(nameBody, c)) { + else if (isMatch(NAME_CHAR, c)) { this.attribName += c; } else { @@ -628,7 +633,7 @@ class SAXParser { if (c === ">") { this.openTag(); } - else if (isMatch(nameStart, c)) { + else if (isMatch(NAME_START_CHAR, c)) { this.attribName = c; this.state = S_ATTRIB_NAME; } @@ -680,7 +685,7 @@ class SAXParser { else if (c === "/") { this.state = S_OPEN_TAG_SLASH; } - else if (isMatch(nameStart, c)) { + else if (isMatch(NAME_START_CHAR, c)) { this.fail("No whitespace between attributes"); this.attribName = c; this.attribValue = ""; @@ -716,7 +721,7 @@ class SAXParser { if (isWhitespace(c)) { continue; } - else if (notMatch(nameStart, c)) { + else if (notMatch(NAME_START_CHAR, c)) { this.fail("Invalid tagname in closing tag."); } else { @@ -726,7 +731,7 @@ class SAXParser { else if (c === ">") { this.closeTag(); } - else if (isMatch(nameBody, c)) { + else if (isMatch(NAME_CHAR, c)) { this.tagName += c; } else { @@ -778,8 +783,10 @@ class SAXParser { this.entity = ""; this.state = returnState; } - else if (isMatch(this.entity.length ? entityBody : entityStart, c) && - // When namespaces are used, colons are not valid in entity names. + else if (isMatch(this.entity.length ? + NAME_CHAR : ENTITY_START_CHAR, c) && + // When namespaces are used, colons are not valid in entity + // names. // https://www.w3.org/XML/xml-names-19990114-errata.html // NE08 (!this.opt.xmlns || c !== ":")) { @@ -1100,8 +1107,7 @@ class SAXStream extends Stream { this._decoder = null; for (const ev of streamWraps) { - // Override the no-op defaults with handlers that emit on the - // stream. + // Override the no-op defaults with handlers that emit on the stream. this._parser[`on${ev}`] = (...args) => { this.emit(ev, ...args); }; diff --git a/package-lock.json b/package-lock.json index 847f0595..392c8bfb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2842,6 +2842,11 @@ "mkdirp": "^0.5.1" } }, + "xmlchars": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-1.0.0.tgz", + "integrity": "sha512-zNReY0hmka9/q+OiTW/3ScE0fa46LRt3wH2yk1NnenSH0GC9+s/C6k6hARRQOlJiBbSDkz9p7feyteXoGZcQvQ==" + }, "yallist": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", diff --git a/package.json b/package.json index 4484f523..de0cbef1 100644 --- a/package.json +++ b/package.json @@ -27,5 +27,8 @@ "eslint-config-lddubeau-base": "^2.1.0", "husky": "^0.14.3", "mocha": "^5.2.0" + }, + "dependencies": { + "xmlchars": "^1.0.0" } } diff --git a/test/xml-internal-entities.js b/test/xml-internal-entities.js index d6087480..0ee47a16 100644 --- a/test/xml-internal-entities.js +++ b/test/xml-internal-entities.js @@ -14,7 +14,7 @@ const entitiesToTest = { u_score: true, "d-ash": true, "d.ot": true, - "all:_#-.": true, + "all:_#-.": [5, "#"], }; let xmlStart = "