diff --git a/README.md b/README.md index 798a67a..df8ef2a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ `minify-xml` is a lightweight and fast XML minifier for NodeJS with a command line. -Existing XML minifiers commonly only remove comments and whitespace between tags. This minifier also includes minification of tags, e.g. by collapsing the whitespace between multiple attributes. Additionally the minifier is able to remove any unused namespace declarations. `minify-xml` is based on regular expressions and thus executes blazingly fast. +Existing XML minifiers, such as `pretty-data`, often do a pretty (*phun intended*) bad job of minifying XML, usually only removing comments and whitespace between tags. `minify-xml`, on the other hand, also includes minification of tags, e.g. by collapsing the whitespace between multiple attributes, and further minifications, such as the removal of unused namespace declarations. `minify-xml` is based on regular expressions and thus executes blazingly fast. ## Installation @@ -20,10 +20,10 @@ const xml = ` With the default options all comments will be removed and whitespace in tags, like spaces between attributes, will be collapsed / removed --> - + - + any valid element content is left unaffected (strangely enough = " ... " and even > are valid characters in XML, only < must always be encoded) @@ -35,7 +35,7 @@ console.log(minifyXML(code)); This outputs the minified XML: ```xml - + any valid element content is left unaffected (strangely enough = " ... " and even > are valid characters in XML, only < must always be encoded) @@ -55,7 +55,7 @@ require("minify-xml").minify(``, { ... }); - `collapseWhitespaceInTags` (default: `true`): Collapse whitespace in tags like ``. -- `removeUnusedNamespaces` (default: `true`): Removes any namespaces from tags, which are not used anywhere in the document, like ``. 
+- `removeUnusedNamespaces` (default: `true`): Removes any namespaces from tags, which are not used anywhere in the document, like ``. Notice the word *anywhere* here: the minifier does not consider the structure of the XML document, thus namespaces which might be only used in a certain sub-tree of elements might not be removed, even though they are not used in that sub-tree. ## CLI diff --git a/index.js b/index.js index 2196a1d..05b0cb1 100644 --- a/index.js +++ b/index.js @@ -3,16 +3,16 @@ function escapeRegExp(string) { } function findAllMatches(string, regexp, group) { var match, matches = []; - while ((match = regexp.exec(string))) { + while ((match = regexp.exec(string))) { if (match[group]) { matches.push(typeof group === 'number' ? match[group] : match); - } return matches; + } } return matches; } -// note: this funky looking positive backward reference regular expression is necessary to match contents inside of tags <...>. -// this is due to that literally any character except <&" is allowed to be put next to everywhere in XML. as even > is a allowed +// note: this funky looking positive lookbehind regular expression is necessary to match contents inside of tags <...>. this +// is due to that literally any characters except <&" are allowed to be put next to everywhere in XML. as even > is an allowed // character, simply checking for (?<=<[^>]*) would not do the trick if e.g. > is used inside of a tag attribute. 
-const emptyRegexp = new RegExp(), inTagPattern = /(?<=<[^=\s>]+(?:\s+[^=\s>]+\s*=\s*(?:"[^"]*"|'[^']*'))*\1)/; -function replaceInTag(xml, regexp, lookbehind, replacement) { +const emptyRegexp = new RegExp(), inTagPattern = /(?<=<[^\s>]+(?:\s+[^=\s>]+\s*=\s*(?:"[^"]*"|'[^']*'))*\1)/; +function replaceInTags(xml, regexp, lookbehind, replacement) { if (!replacement) { replacement = lookbehind; lookbehind = emptyRegexp; @@ -29,16 +29,16 @@ const defaultOptions = { }; module.exports = { - minify: function(xml, userOptions) { - // mix in the user options - const options = { + minify: function(xml, options) { + // apply the default options + options = { ...defaultOptions, - ...(userOptions || {}) + ...(options || {}) }; // remove XML comments if (options.removeComments) { - xml = xml.replace(//g, String()); + xml = xml.replace(//g, String()); } // remove whitespace between tags @@ -48,20 +48,23 @@ module.exports = { // remove / collapse multiple whitespace in tags if (options.collapseWhitespaceInTags) { - xml = replaceInTag(xml, /\s*=\s*/, /\s+[^=\s>]+/, "="); // remove leading / tailing whitespace around = "..." - xml = replaceInTag(xml, /\s+/, " "); // collapse whitespace between attributes - xml = replaceInTag(xml, /\s*(?=\/>)/, String()); // remove whitespace before closing > /> of tags + xml = replaceInTags(xml, /\s*=\s*/, /\s+[^=\s>]+/, "="); // remove leading / tailing whitespace around = "..." 
+ xml = replaceInTags(xml, /\s+/, " "); // collapse whitespace between attributes + xml = replaceInTags(xml, /\s*(?=\/>)/, String()); // remove whitespace before closing > /> of tags } - // remove namespace declarations which are not used anywhere in the document + // remove namespace declarations which are not used anywhere in the document (limitation: the approach taken here will not consider the structure of the XML document + // thus namespaces which might be only used in a certain sub-tree of elements might not be removed, even though they are not used in that sub-tree) if (options.removeUnusedNamespaces) { // the search for all xml namespaces could result in some "fake" namespaces (e.g. if a xmlns:... string is found inside the content of an element), as we do not // limit the search to the inside of tags. this however comes with no major drawback as we the replace only inside of tags and thus it simplifies the search - var all = findAllMatches(xml, /\sxmlns:([^\s\/]+)=/g, 1), used = findAllMatches(xml, /<([^\s\/]+):/g, 1), - unused = all.filter(ns => !used.includes(ns)); + var all = findAllMatches(xml, /\sxmlns:([^\s\/]+)=/g, 1), used = [ + ...findAllMatches(xml, /<([^\s\/]+):/g, 1), // look for all tags with namespaces + ...findAllMatches(xml, /<[^\s>]+(?:\s+(?:([^=\s>]+):[^=\s>]+)\s*=\s*(?:"[^"]*"|'[^']*'))*/g, 1) // look for all attributes with namespaces + ], unused = all.filter(ns => !used.includes(ns)); - if (used.length) { - xml = replaceInTag(xml, new RegExp(`\\s+xmlns:(?:${ unused.map(escapeRegExp).join("|") })=(?:"[^"]*"|'[^']*')`), String()); + if (unused.length) { + xml = replaceInTags(xml, new RegExp(`\\s+xmlns:(?:${ unused.map(escapeRegExp).join("|") })=(?:"[^"]*"|'[^']*')`), String()); } } diff --git a/package.json b/package.json index 0d6d738..2323734 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "minify-xml", - "version": "2.0.0", + "version": "2.0.1", "description": "Fast XML minifier / compressor / uglifier with a 
command-line", "keywords": [ "XML", diff --git a/test.js b/test.js index b82303a..4e92e97 100644 --- a/test.js +++ b/test.js @@ -5,10 +5,10 @@ const xml = ` With the default options all comments will be removed and whitespace in tags, like spaces between attributes, will be collapsed / removed --> - + - + any valid element content is left unaffected (strangely enough = " ... " and even > are valid characters in XML, only < must always be encoded)