Skip to content

Commit

Permalink
Generate Unicode character property data at build time
Browse files Browse the repository at this point in the history
Added a script that fetches the Indic syllabic category property file from the latest Unicode Character Database and, based on it, generates a function that decides whether two graphemes can be combined.
  • Loading branch information
1ec5 committed Aug 21, 2024
1 parent 8966a01 commit 4013b21
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 24 deletions.
43 changes: 43 additions & 0 deletions build/generate-unicode-data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import * as fs from 'fs';

// Base URL of the Unicode Character Database (UCD) directory from which property files are downloaded.
// Use https://www.unicode.org/Public/draft/UCD/ucd instead if the next Unicode version is finalized and awaiting publication.
const ucdBaseUrl = 'https://www.unicode.org/Public/UCD/latest/ucd';

/**
 * Downloads a Unicode Character Database property file and builds the body of a regular expression
 * character class that covers every code point assigned the given property value.
 *
 * @param property Property name with underscores, e.g. `Indic_Syllabic_Category`. The underscores are
 * removed to form the file name (`IndicSyllabicCategory.txt`).
 * @param value Property value to select, e.g. `Invisible_Stacker`. It is matched as a whole value, so
 * `Consonant` does not also select `Consonant_Dead`.
 * @returns An object with `version` and `date` parsed from the file header (`'unknown'` when the header
 * is not in the expected format) and `characterClass`, a string of `\uXXXX` / `\u{XXXXX}` escapes
 * (ranges joined with `-`) meant to be placed between `[` and `]` in a `u`-flagged regular expression.
 * @throws Error if the file cannot be fetched or no code point has the requested value.
 */
async function getPropertyData(property: string, value: string): Promise<{[_: string]: string}> {
    const propertyUrl = `${ucdBaseUrl}/${property.replace(/_/g, '')}.txt`;
    const response = await fetch(propertyUrl);
    if (!response.ok) {
        throw new Error(`Unable to fetch latest Unicode character database file for ${property}: ${response.status}`);
    }

    const table = await response.text();
    // The first two lines look like "# IndicSyllabicCategory-16.0.0.txt" and "# Date: 2024-04-30, ...".
    const header = table.match(/^# \w+-(\d+\.\d+\.\d+)\.txt\r?\n# Date: (\d\d\d\d-\d\d-\d\d)/);

    // Escape the value so it is matched literally, and require that it is not followed by another
    // word character so that a value which is a prefix of a longer value is not selected by mistake.
    const escapedValue = value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const tableRegExp = new RegExp(
        `^([0-9A-F]{4,6}(?:\\.\\.[0-9A-F]{4,6})?)(?=[ \\t]*;[ \\t]*${escapedValue}(?![\\w-]))`,
        'gm');

    // With the global flag, match() returns only the matched code points or ranges, or null if none.
    const records = table.match(tableRegExp);
    if (!records) {
        throw new Error(`No code points found with ${property}=${value} in ${propertyUrl}`);
    }

    const characterClass = records
        .map(record => record
            .split('..')
            // Code points beyond the Basic Multilingual Plane need the braced escape syntax.
            .map(codePoint => (codePoint.length > 4) ? `\\u{${codePoint}}` : `\\u${codePoint}`)
            .join('-'))
        .join('');

    return {
        version: header?.[1] ?? 'unknown',
        date: header?.[2] ?? 'unknown',
        characterClass,
    };
}

// Look up every code point whose Indic_Syllabic_Category is Invisible_Stacker.
const indicSyllabicCategory = await getPropertyData('Indic_Syllabic_Category', 'Invisible_Stacker');

// Destination of the generated module, relative to the working directory the script is run from.
const generatedFilePath = 'src/data/unicode_properties.ts';

// Source text of the generated module; the fetched version, date and character class are spliced in.
const generatedFileContents = `// This file is generated. Edit build/generate-unicode-data.ts, then run \`npm run generate-unicode-data\`.
/**
* Returns whether two grapheme clusters detected by \`Intl.Segmenter\` can be combined to prevent an invisible combining mark from appearing unexpectedly.
*/
export function canCombineGraphemes(former: string, latter: string): boolean {
// Indic_Syllabic_Category=Invisible_Stacker as of Unicode ${indicSyllabicCategory.version}, published ${indicSyllabicCategory.date}.
const invisibleStackersRegExp = /[${indicSyllabicCategory.characterClass}]$/u;
return invisibleStackersRegExp.test(former) || /^\\p{gc=Mc}/u.test(latter);
}
`;

fs.writeFileSync(generatedFilePath, generatedFileContents);
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@
},
"scripts": {
"generate-dist-package": "node --no-warnings --loader ts-node/esm build/generate-dist-package.js",
"generate-unicode-data": "node --no-warnings --loader ts-node/esm build/generate-unicode-data.ts",
"generate-shaders": "node --no-warnings --loader ts-node/esm build/generate-shaders.ts",
"generate-struct-arrays": "node --no-warnings --loader ts-node/esm build/generate-struct-arrays.ts",
"generate-style-code": "node --no-warnings --loader ts-node/esm build/generate-style-code.ts",
Expand Down
10 changes: 10 additions & 0 deletions src/data/unicode_properties.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// This file is generated. Edit build/generate-unicode-data.ts, then run `npm run generate-unicode-data`.

/**
 * Decides whether two adjacent grapheme clusters reported by `Intl.Segmenter` should be merged into one,
 * so that an invisible combining mark does not appear unexpectedly.
 *
 * @param former The earlier cluster; it qualifies when its last character is an invisible stacker.
 * @param latter The later cluster; it qualifies when its first character is a spacing combining mark.
 * @returns `true` if either condition holds, otherwise `false`.
 */
export function canCombineGraphemes(former: string, latter: string): boolean {
    // Indic_Syllabic_Category=Invisible_Stacker as of Unicode 16.0.0, published 2024-04-30.
    if (/[\u1039\u17D2\u1A60\u1BAB\uAAF6\u{10A3F}\u{11133}\u{113D0}\u{1193E}\u{11A47}\u{11A99}\u{11D45}\u{11D97}\u{11F42}]$/u.test(former)) {
        return true;
    }
    // General_Category=Mc (Spacing_Mark) at the start of the following cluster.
    return /^\p{gc=Mc}/u.test(latter);
}
26 changes: 2 additions & 24 deletions src/util/script_detection.ts
Original file line number Diff line number Diff line change
@@ -1,30 +1,10 @@
/* eslint-disable new-cap */

import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block';
import {canCombineGraphemes} from '../data/unicode_properties';

// Shared Intl.Segmenter constructed with no arguments (runtime defaults); splitByGraphemeCluster() below iterates the segments it produces.
const segmenter = new Intl.Segmenter();

// Indic_Syllabic_Category=Invisible_Stacker as of Unicode 16.0.0.
// https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
const invisibleStackerCodePoints = [
    0x1039, // MYANMAR SIGN VIRAMA
    0x17D2, // KHMER SIGN COENG
    0x1A60, // TAI THAM SIGN SAKOT
    0x1BAB, // SUNDANESE SIGN VIRAMA
    0xAAF6, // MEETEI MAYEK VIRAMA
    0x10A3F, // KHAROSHTHI VIRAMA
    0x11133, // CHAKMA VIRAMA
    0x113D0, // TULU-TIGALARI CONJOINER
    0x1193E, // DIVES AKURU VIRAMA
    0x11A47, // ZANABAZAR SQUARE SUBJOINER
    0x11A99, // SOYOMBO SUBJOINER
    0x11D45, // MASARAM GONDI VIRAMA
    0x11D97, // GUNJALA GONDI VIRAMA
    0x11F42, // KAWI CONJOINER
];

// Matches a string whose last character is one of the invisible stackers above.
// The escapes are joined with an empty separator: concatenating the array directly would stringify it
// with commas, which would add "," to the character class and make any text ending in a comma match.
const invisibleStackersRegExp = new RegExp(
    `[${invisibleStackerCodePoints.map(cp => `\\u{${cp.toString(16)}}`).join('')}]$`,
    'u');

export function splitByGraphemeCluster(text: string) {
const segments = segmenter.segment(text)[Symbol.iterator]();
let segment = segments.next();
Expand All @@ -35,9 +15,7 @@ export function splitByGraphemeCluster(text: string) {
const baseSegments = [];
while (!segment.done) {
const baseSegment = segment;
while (!nextSegment.done &&
(/^\p{gc=Mc}/u.test(nextSegment.value.segment) ||
invisibleStackersRegExp.test(baseSegment.value.segment))) {
while (!nextSegment.done && canCombineGraphemes(baseSegment.value.segment, nextSegment.value.segment)) {
baseSegment.value.segment += nextSegment.value.segment;
segment = segments.next();
nextSegment = nextSegments.next();
Expand Down

0 comments on commit 4013b21

Please sign in to comment.