diff --git a/packages/concerto-util/index.js b/packages/concerto-util/index.js index 7b943341cb..02fb6192f9 100644 --- a/packages/concerto-util/index.js +++ b/packages/concerto-util/index.js @@ -49,6 +49,9 @@ const TypedStack = require('./lib/typedstack'); // Label const Label = require('./lib/label'); +// Identifiers +const Identifiers = require('./lib/identifiers'); + module.exports = { BaseException, BaseFileException, @@ -63,5 +66,6 @@ module.exports = { ModelWriter, Logger, TypedStack, - Label -}; \ No newline at end of file + Label, + Identifiers +}; diff --git a/packages/concerto-util/lib/identifiers.js b/packages/concerto-util/lib/identifiers.js new file mode 100644 index 0000000000..51f5e1c88a --- /dev/null +++ b/packages/concerto-util/lib/identifiers.js @@ -0,0 +1,74 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// Conforms to Concerto Spec for identifiers
const ID_REGEX = /^(\p{Lu}|\p{Ll}|\p{Lt}|\p{Lm}|\p{Lo}|\p{Nl}|\$|_|\\u[0-9A-Fa-f]{4})(?:\p{Lu}|\p{Ll}|\p{Lt}|\p{Lm}|\p{Lo}|\p{Nl}|\$|_|\\u[0-9A-Fa-f]{4}|\p{Mn}|\p{Mc}|\p{Nd}|\p{Pc}|\u200C|\u200D)*$/u;

/**
 * Function that attempts to normalize arbitrary strings
 * into valid Concerto identifiers
 *
 * `null` and `undefined` are stringified to 'null' and 'undefined'. Any other
 * non-string input is rejected.
 *
 * @param {string} identifier - the input value
 * @param {number} [truncateLength] - Length (in UTF-16 code units) at which to
 * truncate the identifier. Values that are not greater than zero disable truncation.
 * @returns {string} - An identifier that meets the Concerto specification
 * @throws {Error} if the input is neither a string, null nor undefined
 * @throws {Error} if the normalized value still fails ID_REGEX (for example, an empty string)
 */
function normalizeIdentifier(identifier, truncateLength = -1) {
    // Escapes every code point of the captured text as `_<lowercase hex code point>`
    const replacer = (_match, group1) => {
        let escapedChar = '';
        // Loop through characters with multiple code points
        for (const codePoint of group1) {
            escapedChar += `_${codePoint.codePointAt(0).toString(16)}`;
        }
        return escapedChar;
    };

    // Stringify null & undefined values
    let result = identifier ?? String(identifier);

    if (typeof result !== 'string') {
        throw new Error(`Unsupported identifier type, '${typeof result}'.`);
    }

    result = result
        // 1. If the identifier begins with a character that is only valid *after*
        //    the first position (digit, combining mark, connector punctuation),
        //    add a leading underscore. A leading `_` is itself connector
        //    punctuation but is already a valid start, so it is left alone.
        .replace(/^(?!_)[\p{Nd}\p{Mn}\p{Mc}\p{Pc}]/u, '_$&')

        // 2. Substitute common separators (including U+2010 HYPHEN and
        //    U+2212 MINUS SIGN), zero-width joiners and whitespace
        .replace(/[-\u2010\u2212@#:;><|/\\\u200c\u200d]/g, '_')
        .replace(/\s/g, '_')

        // 3a. Replace Invalid Characters (anything outside the identifier-part set)
        .replace(/(?!\p{Lu}|\p{Ll}|\p{Lt}|\p{Lm}|\p{Lo}|\p{Nl}|\$|_|\p{Mn}|\p{Mc}|\p{Nd}|\p{Pc}|\u200C|\u200D|\\u[0-9A-Fa-f]{4})(.)/gu, replacer)

        // 3b. Escape Surrogate Pairs. This regex has no `u` flag, so it sees
        //     individual UTF-16 code units: any surrogate that survived 3a is
        //     escaped one half at a time. Afterwards no surrogates remain, so
        //     the truncation below can never split a pair.
        .replace(/([\uD800-\uDFFF])/g, replacer);

    // 4. Optionally truncate the identifier
    if (truncateLength > 0) {
        result = result.substring(0, truncateLength);
    }

    // Check validity
    if (!ID_REGEX.test(result)) {
        throw new Error(`Unexpected error. Not able to escape identifier '${result}'.`);
    }
    return result;
}
Unicode 11.0 + ['A'], // Letter, uppercase + ['ĦĔĽĻŎ'], // Letter, uppercase + ['Dž'], // Letter, titlecase + ['ᾩ'], // Letter, titlecase + ['〱〱〱〱'], // Letter, modifier + ['जावास्क्रिप्ट'], // Letter, other + ['Ⅶ'], // Number, letter + ['$class'], // leading $ + ['_class'], // leading _ + ['\u03C9'], // Escaped Unicode Code Point, ᾧ + ['abc'], // Letter, lowercase + ['a123'], // Number, digit + ['foo$bar'], // $ separator + ['foo_bar'], // _ separator + ['αβγδεζηθ'], // Letter, lowercase + ['foo\u03C9bar'], // Escaped Unicode Code Point, fooᾧbar + ['foo\u03c9bar'], // Escaped Unicode Code Point lowercase, fooᾧbar + ['foo‿bar'], // Punctuation, connector + ['पः'], // Mark, combining character + ['CharlesⅢ'], // Number, letter + ['true'], // reserved words + ['false'], + ['null'], + ['while'], + ['for'], + ['nully'], // leading reserved word + ['こんにちは世界'], // Japanese + ['foo‌bar', 'foo_bar'], // unescaped zero-width non-joiner + ['foo‍bar', 'foo_bar'], // unescaped zero-width joiner + + // Bad Identifiers + ['123', '_123'], + ['1st', '_1st'], + ['foo bar', 'foo_bar'], + ['foo\u0020bar', 'foo_bar'], // Escaped Unicode, space + ['foo\x3Dbar', 'foo_3dbar'], // Escaped Hex Sequence, foo=bar + ['foo\x3Dbar', 'foo_3dbar'], // Escaped Hex Sequence, foo=bar + ['‍foo', '_foo'], // leading unescaped zero-width joiner + ['foo-bar', 'foo_bar'], + ['foo‐bar', 'foo_bar'], // U+2010 HYPHEN' + ['foo−bar', 'foo_bar'], // U+2212 MINUS + ['foo|bar', 'foo_bar'], + ['foo@bar', 'foo_bar'], + ['foo#bar', 'foo_bar'], + ['foo/bar', 'foo_bar'], + ['foo>bar', 'foo_bar'], + ['\x3D', '_3d'], // Escaped Hex Sequence, = + ['😄', '_1f604'], // Surrogate pair, Emoji + ['\u{1F604}', '_1f604'], // Escaped surrogate pair, Emoji + ['𐴓𐴠𐴑𐴤𐴝', '_d803_dd13_d803_dd20_d803_dd11'], // Surrogate pairs, Hanifi Rohingya RTL + [null, 'null'], + [undefined, 'undefined'], + ]; + ids.forEach(([id, expectedValue]) => { + it(`'${id}' should equal '${expectedValue ?? 
id}'`, function() { + normalizeIdentifier(id, 30).should.equal(expectedValue ?? id); + }); + }); + + it('should throw for empty string', () => { + (() => normalizeIdentifier('')).should.throw(/Unexpected error/); + }); + + it('should not normalize non string identifiers', () => { + (() => normalizeIdentifier({ a: 1 })).should.throw(/Unsupported identifier type/); + (() => normalizeIdentifier(Symbol.for('a'))).should.throw(/Unsupported identifier type/); + (() => normalizeIdentifier(false)).should.throw(/Unsupported identifier type/); + (() => normalizeIdentifier(true)).should.throw(/Unsupported identifier type/); + (() => normalizeIdentifier(1)).should.throw(/Unsupported identifier type/); + (() => normalizeIdentifier(1.112345678987654)).should.throw(/Unsupported identifier type/); + (() => normalizeIdentifier(3.1e2)).should.throw(/Unsupported identifier type/); + }); + + it('should truncate identifiers', () => { + normalizeIdentifier('a', 2).should.equal('a'); + normalizeIdentifier('aaa', 2).should.equal('aa'); + normalizeIdentifier('aaa', 0).should.equal('aaa'); + normalizeIdentifier('aaa', -1).should.equal('aaa'); + normalizeIdentifier('$a', 1).should.equal('$'); + normalizeIdentifier('😄', 2).should.equal('_1'); + normalizeIdentifier('𐴓', 2).should.equal('_d'); // surrogate pair character + }); + }); +}); diff --git a/packages/concerto-util/types/index.d.ts b/packages/concerto-util/types/index.d.ts index 97e76f29f1..3d3d7a0248 100644 --- a/packages/concerto-util/types/index.d.ts +++ b/packages/concerto-util/types/index.d.ts @@ -12,4 +12,5 @@ import ModelWriter = require("./lib/modelwriter"); import Logger = require("./lib/logger"); import TypedStack = require("./lib/typedstack"); import Label = require("./lib/label"); -export { BaseException, BaseFileException, FileDownloader, CompositeFileLoader, DefaultFileLoader, GitHubFileLoader, HTTPFileLoader, Writer, FileWriter, InMemoryWriter, ModelWriter, Logger, TypedStack, Label }; +import Identifiers = 
require("./lib/identifiers"); +export { BaseException, BaseFileException, FileDownloader, CompositeFileLoader, DefaultFileLoader, GitHubFileLoader, HTTPFileLoader, Writer, FileWriter, InMemoryWriter, ModelWriter, Logger, TypedStack, Label, Identifiers }; diff --git a/packages/concerto-util/types/lib/identifiers.d.ts b/packages/concerto-util/types/lib/identifiers.d.ts new file mode 100644 index 0000000000..b6bf8140e7 --- /dev/null +++ b/packages/concerto-util/types/lib/identifiers.d.ts @@ -0,0 +1,10 @@ +/** + * Function that attempts to normalize arbitrary strings + * into valid Concerto identifiers + * + * @param {string} identifier - the input value + * @param {number} [truncateLength] - Length at which to truncate the identifier + * @returns {string} - An identifier that meets the Concerto specification + */ +export function normalizeIdentifier(identifier: string, truncateLength?: number): string; +export const ID_REGEX: RegExp;