forked from maplibre/maplibre-gl-js
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Generate Unicode character property data at build time
Added a script that fetches the property file for Indic syllabic categories (Indic_Syllabic_Category) from the latest Unicode Character Database and generates, based on it, a function that decides whether two graphemes can be combined.
- Loading branch information
Showing
4 changed files
with
56 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import * as fs from 'fs'; | ||
|
||
// Base URL of the published Unicode Character Database (UCD) files.
// Or https://www.unicode.org/Public/draft/UCD/ucd if the next Unicode version is finalized and awaiting publication.
const ucdBaseUrl = 'https://www.unicode.org/Public/UCD/latest/ucd';

/**
 * Downloads the UCD data file for a property and collects every code point (or code point range)
 * assigned the given value into the body of a regular expression character class.
 *
 * @param property - UCD property name with underscores, e.g. `Indic_Syllabic_Category`. The
 *     underscores are removed to form the file name (`IndicSyllabicCategory.txt`).
 * @param value - Property value to look for, e.g. `Invisible_Stacker`. It is matched literally.
 * @returns An object with `version` and `date` taken from the file's two header lines (or
 *     `'unknown'` when the header is not in the expected shape) and `characterClass`, a string of
 *     `\uXXXX` / `\u{XXXXX}` escapes and `a-b` ranges intended to be placed between `[` and `]`.
 * @throws Error if the file cannot be fetched or contains no record with the requested value.
 */
async function getPropertyData(property: string, value: string): Promise<{[_: string]: string}> {
    const propertyFileUrl = `${ucdBaseUrl}/${property.replace(/_/g, '')}.txt`;
    const response = await fetch(propertyFileUrl);
    if (!response.ok) {
        throw new Error(`Unable to fetch latest Unicode character database file for ${property}: ${response.status}`);
    }

    const table = await response.text();
    // The first two lines are expected to look like "# Name-16.0.0.txt" and "# Date: 2024-04-30, ...".
    const header = table.match(/^# \w+-(\d+\.\d+\.\d+)\.txt\n# Date: (\d\d\d\d-\d\d-\d\d)/);

    // Escape the value so it is matched literally, and require that it is not merely a prefix of a
    // longer value name. The ".." range separator is escaped so it cannot match arbitrary characters.
    const escapedValue = value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const tableRegExp = new RegExp(`^([0-9A-F]{4,6}(?:\\.\\.[0-9A-F]{4,6})?)(?= *; ${escapedValue}(?!\\w))`, 'gm');

    // String.prototype.match returns null rather than an empty array when nothing matches.
    const records = table.match(tableRegExp);
    if (!records) {
        throw new Error(`No code points found for ${property}=${value} in ${propertyFileUrl}`);
    }

    const characterClass = records
        .map(record => record
            .split('..')
            // Code points beyond the BMP need the braced escape form, which requires the `u` flag.
            .map(codePoint => (codePoint.length > 4) ? `\\u{${codePoint}}` : `\\u${codePoint}`)
            .join('-'))
        .join('');

    return {
        version: header?.[1] ?? 'unknown',
        date: header?.[2] ?? 'unknown',
        characterClass,
    };
}
|
||
// Look up every code point whose Indic_Syllabic_Category is Invisible_Stacker.
const indicSyllabicCategory = await getPropertyData('Indic_Syllabic_Category', 'Invisible_Stacker');

// Write the generated module. Backticks and backslashes that must survive into the output are
// escaped here; the `${...}` placeholders are filled in from the fetched property data.
// A blank line follows the "generated" header so the output matches the committed
// src/data/unicode_properties.ts.
fs.writeFileSync('src/data/unicode_properties.ts',
    `// This file is generated. Edit build/generate-unicode-data.ts, then run \`npm run generate-unicode-data\`.

/**
 * Returns whether two grapheme clusters detected by \`Intl.Segmenter\` can be combined to prevent an invisible combining mark from appearing unexpectedly.
 */
export function canCombineGraphemes(former: string, latter: string): boolean {
    // Indic_Syllabic_Category=Invisible_Stacker as of Unicode ${indicSyllabicCategory.version}, published ${indicSyllabicCategory.date}.
    const invisibleStackersRegExp = /[${indicSyllabicCategory.characterClass}]$/u;
    return invisibleStackersRegExp.test(former) || /^\\p{gc=Mc}/u.test(latter);
}
`);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
// This file is generated. Edit build/generate-unicode-data.ts, then run `npm run generate-unicode-data`. | ||
|
||
/**
 * Tells whether two adjacent grapheme clusters produced by `Intl.Segmenter` can be combined so that an
 * invisible combining mark does not appear unexpectedly.
 *
 * @param former - The earlier cluster; it qualifies when its last code point is an invisible stacker.
 * @param latter - The later cluster; it qualifies when its first code point has General_Category Mc.
 * @returns `true` if either condition holds, otherwise `false`.
 */
export function canCombineGraphemes(former: string, latter: string): boolean {
    // Indic_Syllabic_Category=Invisible_Stacker as of Unicode 16.0.0, published 2024-04-30.
    if (/[\u1039\u17D2\u1A60\u1BAB\uAAF6\u{10A3F}\u{11133}\u{113D0}\u{1193E}\u{11A47}\u{11A99}\u{11D45}\u{11D97}\u{11F42}]$/u.test(former)) {
        return true;
    }

    // Otherwise the pair is combinable only when the latter cluster opens with a spacing combining mark.
    const startsWithSpacingMark = /^\p{gc=Mc}/u.test(latter);
    return startsWithSpacingMark;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters