Generate Unicode character property data at build time

Added a build script that fetches the Indic_Syllabic_Category property file from the latest Unicode Character Database and, based on it, generates a function that decides whether two adjacent grapheme clusters can be combined.
1ec5 · Aug 21, 2024 · 4013b21 · 4013b21
1 parent 8966a01
commit 4013b21
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 24 deletions.
diff --git a/build/generate-unicode-data.ts b/build/generate-unicode-data.ts
@@ -0,0 +1,43 @@
+import * as fs from 'fs';
+
+// Use https://www.unicode.org/Public/draft/UCD/ucd instead if the next Unicode version is finalized but not yet published.
+const ucdBaseUrl = 'https://www.unicode.org/Public/UCD/latest/ucd';
+
+async function getPropertyData(property: string, value: string): Promise<{[_: string]: string}> {
+    const indicSyllabicCategoryUrl = `${ucdBaseUrl}/${property.replaceAll('_', '')}.txt`;
+    const response = await fetch(indicSyllabicCategoryUrl);
+    if (!response.ok) {
+        throw new Error(`Unable to fetch latest Unicode character database file for ${property}: ${response.status}`);
+    }
+
+    const table = await response.text();
+    const header = table.match(/^# \w+-(\d+\.\d+\.\d+)\.txt\n# Date: (\d\d\d\d-\d\d-\d\d)/);
+    const tableRegExp = new RegExp(`^([0-9A-Z]{4,6}(?:..[0-9A-Z]{4,6})?)(?= *; ${value})`, 'gm');
+    const characterClass = table
+        .match(tableRegExp)
+        .map(record => record
+             .split('..')
+             .map(codePoint => (codePoint.length > 4) ? `\\u{${codePoint}}` : `\\u${codePoint}`)
+             .join('-'))
+        .join('')
+    return {
+        version: header && header[1],
+        date: header && header[2],
+        characterClass,
+    };
+}
+
+const indicSyllabicCategory = await getPropertyData('Indic_Syllabic_Category', 'Invisible_Stacker');
+
+fs.writeFileSync('src/data/unicode_properties.ts',
+    `// This file is generated. Edit build/generate-unicode-data.ts, then run \`npm run generate-unicode-data\`.
+
+/**
+ * Returns whether two grapheme clusters detected by \`Intl.Segmenter\` can be combined to prevent an invisible combining mark from appearing unexpectedly. 
+ */
+export function canCombineGraphemes(former: string, latter: string): boolean {
+    // Indic_Syllabic_Category=Invisible_Stacker as of Unicode ${indicSyllabicCategory.version}, published ${indicSyllabicCategory.date}.
+    const invisibleStackersRegExp = /[${indicSyllabicCategory.characterClass}]$/u;
+    return invisibleStackersRegExp.test(former) || /^\\p{gc=Mc}/u.test(latter);
+}
+`);
diff --git a/package.json b/package.json
@@ -140,6 +140,7 @@
   },
   "scripts": {
     "generate-dist-package": "node --no-warnings --loader ts-node/esm build/generate-dist-package.js",
+    "generate-unicode-data": "node --no-warnings --loader ts-node/esm build/generate-unicode-data.ts",
     "generate-shaders": "node --no-warnings --loader ts-node/esm build/generate-shaders.ts",
     "generate-struct-arrays": "node --no-warnings --loader ts-node/esm build/generate-struct-arrays.ts",
     "generate-style-code": "node --no-warnings --loader ts-node/esm build/generate-style-code.ts",

diff --git a/src/data/unicode_properties.ts b/src/data/unicode_properties.ts
@@ -0,0 +1,10 @@
+// This file is generated. Edit build/generate-unicode-data.ts, then run `npm run generate-unicode-data`.
+
+/**
+ * Returns whether two grapheme clusters detected by `Intl.Segmenter` can be combined to prevent an invisible combining mark from appearing unexpectedly. 
+ */
+export function canCombineGraphemes(former: string, latter: string): boolean {
+    // Indic_Syllabic_Category=Invisible_Stacker as of Unicode 16.0.0, published 2024-04-30.
+    const invisibleStackersRegExp = /[\u1039\u17D2\u1A60\u1BAB\uAAF6\u{10A3F}\u{11133}\u{113D0}\u{1193E}\u{11A47}\u{11A99}\u{11D45}\u{11D97}\u{11F42}]$/u;
+    return invisibleStackersRegExp.test(former) || /^\p{gc=Mc}/u.test(latter);
+}
diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts
@@ -1,30 +1,10 @@
 /* eslint-disable new-cap */
 
 import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block';
+import {canCombineGraphemes} from '../data/unicode_properties';
 
 const segmenter = new Intl.Segmenter();
 
-// Indic_Syllabic_Category=Invisible_Stacker as of Unicode 16.0.0.
-// https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
-const invisibleStackerCodePoints = [
-	0x1039, // MYANMAR SIGN VIRAMA
-	0x17D2, // KHMER SIGN COENG
-	0x1A60, // TAI THAM SIGN SAKOT
-	0x1BAB, // SUNDANESE SIGN VIRAMA
-	0xAAF6, // MEETEI MAYEK VIRAMA
-	0x10A3F, // KHAROSHTHI VIRAMA
-	0x11133, // CHAKMA VIRAMA
-	0x113D0, // TULU-TIGALARI CONJOINER
-	0x1193E, // DIVES AKURU VIRAMA
-	0x11A47, // ZANABAZAR SQUARE SUBJOINER
-	0x11A99, // SOYOMBO SUBJOINER
-	0x11D45, // MASARAM GONDI VIRAMA
-	0x11D97, // GUNJALA GONDI VIRAMA
-	0x11F42, // KAWI CONJOINER
-];
-
-const invisibleStackersRegExp = new RegExp('[' + invisibleStackerCodePoints.map(cp => `\\u{${cp.toString(16)}}`) + ']$', 'u');
-
 export function splitByGraphemeCluster(text: string) {
     const segments = segmenter.segment(text)[Symbol.iterator]();
     let segment = segments.next();
@@ -35,9 +15,7 @@ export function splitByGraphemeCluster(text: string) {
     const baseSegments = [];
     while (!segment.done) {
         const baseSegment = segment;
-        while (!nextSegment.done &&
-			   (/^\p{gc=Mc}/u.test(nextSegment.value.segment) ||
-				invisibleStackersRegExp.test(baseSegment.value.segment))) {
+        while (!nextSegment.done && canCombineGraphemes(baseSegment.value.segment, nextSegment.value.segment)) {
             baseSegment.value.segment += nextSegment.value.segment;
             segment = segments.next();
             nextSegment = nextSegments.next();