From 42d68479f10bcdfb4c3252f237da9043663d3c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Minh=20Nguy=E1=BB=85n?= Date: Thu, 15 Aug 2024 05:21:14 -0700 Subject: [PATCH] Simplify script detection using regular expressions (#4560) * Test script detection functions * Simplified script detection functions using regular expressions * Automatically exclude unsupported script codes from regular expressions * Consolidated ideographic breaking and vertical orientation logic * Prefer local glyph rendering for all CJKV characters Simplified the logic for preferring local glyph rendering to consider the script, which requires less maintenance than a hard-coded list of blocks. --------- Co-authored-by: Harel M --- CHANGELOG.md | 2 +- src/render/glyph_manager.test.ts | 19 +++- src/render/glyph_manager.ts | 14 +-- src/style/style.ts | 4 +- src/ui/map.ts | 4 +- src/util/is_char_in_unicode_block.ts | 46 ++++---- src/util/script_detection.test.ts | 131 ++++++++++++++++++++++- src/util/script_detection.ts | 151 ++++++++++++++++++--------- 8 files changed, 284 insertions(+), 87 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aadf54ddd9..b3da31c264 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ - Fix right-to-left layout of labels that contain characters in the Arabic Extended-B code block. ([#4536](https://github.com/maplibre/maplibre-gl-js/pull/4536)) - Fix 3D map freezing when camera is adjusted against map bounds. ([#4537](https://github.com/maplibre/maplibre-gl-js/issues/4537)) - Fix `getStyle()` to return a clone so the object cannot be internally changed ([#4488](https://github.com/maplibre/maplibre-gl-js/issues/4488)) - +- Prefer local glyph rendering for all CJKV characters, not just those in the CJK Unified Ideographs, Hiragana, Katakana, and Hangul Syllables blocks. 
([#4560](https://github.com/maplibre/maplibre-gl-js/pull/4560)) - - _...Add new stuff here..._ ## 4.5.2 diff --git a/src/render/glyph_manager.test.ts b/src/render/glyph_manager.test.ts index 60eb97f61e..05d07f2fed 100644 --- a/src/render/glyph_manager.test.ts +++ b/src/render/glyph_manager.test.ts @@ -93,7 +93,7 @@ describe('GlyphManager', () => { test('GlyphManager generates CJK PBF locally', async () => { const manager = createGlyphManager('sans-serif'); - // character 平 + // Chinese character píng 平 const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x5e73]}); expect(returnedGlyphs['Arial Unicode MS'][0x5e73].metrics.advance).toBe(0.5); }); @@ -114,6 +114,23 @@ describe('GlyphManager', () => { expect(returnedGlyphs['Arial Unicode MS'][0x3066].metrics.advance).toBe(0.5); }); + test('GlyphManager consistently generates CJKV text locally', async () => { + const manager = createGlyphManager('sans-serif'); + + // Space + expect(manager._doesCharSupportLocalGlyph(0x0020)).toBe(false); + // Chinese character píng 平 + expect(manager._doesCharSupportLocalGlyph(0x5e73)).toBe(true); + // Chinese character biáng 𰻞 + expect(manager._doesCharSupportLocalGlyph(0x30EDE)).toBe(true); + // Katakana letter te テ + expect(manager._doesCharSupportLocalGlyph(0x30c6)).toBe(true); + // Hiragana letter te て + expect(manager._doesCharSupportLocalGlyph(0x3066)).toBe(true); + // Hangul letter a 아 + expect(manager._doesCharSupportLocalGlyph(0xC544)).toBe(true); + }); + test('GlyphManager caches locally generated glyphs', async () => { const manager = createGlyphManager('sans-serif'); diff --git a/src/render/glyph_manager.ts b/src/render/glyph_manager.ts index 3898fe5503..0efe9f066f 100644 --- a/src/render/glyph_manager.ts +++ b/src/render/glyph_manager.ts @@ -1,7 +1,6 @@ import {loadGlyphRange} from '../style/load_glyph_range'; import TinySDF from '@mapbox/tiny-sdf'; -import {unicodeBlockLookup} from '../util/is_char_in_unicode_block'; import {AlphaImage} from 
'../util/image'; import type {StyleGlyph} from '../style/style_glyph'; @@ -120,13 +119,14 @@ export class GlyphManager { } _doesCharSupportLocalGlyph(id: number): boolean { - /* eslint-disable new-cap */ + // The CJK Unified Ideographs blocks and Hangul Syllables blocks are + // spread across many glyph PBFs and are typically accessed very + // randomly. Preferring local rendering for these blocks reduces + // wasteful bandwidth consumption. For visual consistency within CJKV + // text, also include any other CJKV or siniform ideograph or hangul, + // hiragana, or katakana character. return !!this.localIdeographFontFamily && - (unicodeBlockLookup['CJK Unified Ideographs'](id) || - unicodeBlockLookup['Hangul Syllables'](id) || - unicodeBlockLookup['Hiragana'](id) || - unicodeBlockLookup['Katakana'](id)); - /* eslint-enable new-cap */ + /\p{Ideo}|\p{sc=Hang}|\p{sc=Hira}|\p{sc=Kana}/u.test(String.fromCodePoint(id)); } _tinySDF(entry: Entry, stack: string, id: number): StyleGlyph { diff --git a/src/style/style.ts b/src/style/style.ts index 6ad6acef22..dd500ff05d 100644 --- a/src/style/style.ts +++ b/src/style/style.ts @@ -91,8 +91,8 @@ export type StyleOptions = { validate?: boolean; /** * Defines a CSS - * font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges. - * In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold). + * font-family for locally overriding generation of Chinese, Japanese, and Korean characters. + * For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold). * Set to `false`, to enable font settings from the map's style for these glyph ranges. * Forces a full update. 
*/ diff --git a/src/ui/map.ts b/src/ui/map.ts index 463dd46e13..b7cb3379e0 100644 --- a/src/ui/map.ts +++ b/src/ui/map.ts @@ -289,8 +289,8 @@ export type MapOptions = { fitBoundsOptions?: FitBoundsOptions; /** * Defines a CSS - * font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges. - * In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold). + * font-family for locally overriding generation of Chinese, Japanese, and Korean characters. + * For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold). * Set to `false`, to enable font settings from the map's style for these glyph ranges. * The purpose of this option is to avoid bandwidth-intensive glyph server requests. (See [Use locally generated ideographs](https://maplibre.org/maplibre-gl-js/docs/examples/local-ideographs).) 
* @defaultValue 'sans-serif' diff --git a/src/util/is_char_in_unicode_block.ts b/src/util/is_char_in_unicode_block.ts index 469fd2f3b6..da108b0a6a 100644 --- a/src/util/is_char_in_unicode_block.ts +++ b/src/util/is_char_in_unicode_block.ts @@ -16,16 +16,16 @@ export const unicodeBlockLookup: UnicodeBlockLookup = { // 'Cyrillic Supplement': (char) => char >= 0x0500 && char <= 0x052F, // 'Armenian': (char) => char >= 0x0530 && char <= 0x058F, //'Hebrew': (char) => char >= 0x0590 && char <= 0x05FF, - 'Arabic': (char) => char >= 0x0600 && char <= 0x06FF, + // 'Arabic': (char) => char >= 0x0600 && char <= 0x06FF, //'Syriac': (char) => char >= 0x0700 && char <= 0x074F, - 'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F, + // 'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F, // 'Thaana': (char) => char >= 0x0780 && char <= 0x07BF, // 'NKo': (char) => char >= 0x07C0 && char <= 0x07FF, // 'Samaritan': (char) => char >= 0x0800 && char <= 0x083F, // 'Mandaic': (char) => char >= 0x0840 && char <= 0x085F, // 'Syriac Supplement': (char) => char >= 0x0860 && char <= 0x086F, - 'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F, - 'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF, + // 'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F, + // 'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF, // 'Devanagari': (char) => char >= 0x0900 && char <= 0x097F, // 'Bengali': (char) => char >= 0x0980 && char <= 0x09FF, // 'Gurmukhi': (char) => char >= 0x0A00 && char <= 0x0A7F, @@ -45,7 +45,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = { // 'Ethiopic': (char) => char >= 0x1200 && char <= 0x137F, // 'Ethiopic Supplement': (char) => char >= 0x1380 && char <= 0x139F, // 'Cherokee': (char) => char >= 0x13A0 && char <= 0x13FF, - 'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F, + // 'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F, // 
'Ogham': (char) => char >= 0x1680 && char <= 0x169F, // 'Runic': (char) => char >= 0x16A0 && char <= 0x16FF, // 'Tagalog': (char) => char >= 0x1700 && char <= 0x171F, @@ -54,7 +54,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = { // 'Tagbanwa': (char) => char >= 0x1760 && char <= 0x177F, 'Khmer': (char) => char >= 0x1780 && char <= 0x17FF, // 'Mongolian': (char) => char >= 0x1800 && char <= 0x18AF, - 'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF, + // 'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF, // 'Limbu': (char) => char >= 0x1900 && char <= 0x194F, // 'Tai Le': (char) => char >= 0x1950 && char <= 0x197F, // 'New Tai Lue': (char) => char >= 0x1980 && char <= 0x19DF, @@ -108,25 +108,25 @@ export const unicodeBlockLookup: UnicodeBlockLookup = { // 'Ethiopic Extended': (char) => char >= 0x2D80 && char <= 0x2DDF, // 'Cyrillic Extended-A': (char) => char >= 0x2DE0 && char <= 0x2DFF, // 'Supplemental Punctuation': (char) => char >= 0x2E00 && char <= 0x2E7F, - 'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF, - 'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF, + // 'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF, + // 'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF, 'Ideographic Description Characters': (char) => char >= 0x2FF0 && char <= 0x2FFF, 'CJK Symbols and Punctuation': (char) => char >= 0x3000 && char <= 0x303F, - 'Hiragana': (char) => char >= 0x3040 && char <= 0x309F, + // 'Hiragana': (char) => char >= 0x3040 && char <= 0x309F, 'Katakana': (char) => char >= 0x30A0 && char <= 0x30FF, - 'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F, - 'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F, + // 'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F, + // 'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F, 'Kanbun': (char) => char >= 0x3190 && 
char <= 0x319F, - 'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF, + // 'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF, 'CJK Strokes': (char) => char >= 0x31C0 && char <= 0x31EF, - 'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF, + // 'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF, 'Enclosed CJK Letters and Months': (char) => char >= 0x3200 && char <= 0x32FF, 'CJK Compatibility': (char) => char >= 0x3300 && char <= 0x33FF, - 'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF, + // 'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF, 'Yijing Hexagram Symbols': (char) => char >= 0x4DC0 && char <= 0x4DFF, - 'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF, - 'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F, - 'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF, + // 'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF, + // 'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F, + // 'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF, // 'Lisu': (char) => char >= 0xA4D0 && char <= 0xA4FF, // 'Vai': (char) => char >= 0xA500 && char <= 0xA63F, // 'Cyrillic Extended-B': (char) => char >= 0xA640 && char <= 0xA69F, @@ -140,7 +140,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = { // 'Devanagari Extended': (char) => char >= 0xA8E0 && char <= 0xA8FF, // 'Kayah Li': (char) => char >= 0xA900 && char <= 0xA92F, // 'Rejang': (char) => char >= 0xA930 && char <= 0xA95F, - 'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F, + // 'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F, // 'Javanese': (char) => char >= 0xA980 && char <= 0xA9DF, // 'Myanmar Extended-B': (char) => char >= 0xA9E0 && char <= 0xA9FF, // 'Cham': (char) => char >= 0xAA00 && char <= 0xAA5F, @@ -151,21 +151,21 @@ export const unicodeBlockLookup: 
UnicodeBlockLookup = { // 'Latin Extended-E': (char) => char >= 0xAB30 && char <= 0xAB6F, // 'Cherokee Supplement': (char) => char >= 0xAB70 && char <= 0xABBF, // 'Meetei Mayek': (char) => char >= 0xABC0 && char <= 0xABFF, - 'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF, - 'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF, + // 'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF, + // 'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF, // 'High Surrogates': (char) => char >= 0xD800 && char <= 0xDB7F, // 'High Private Use Surrogates': (char) => char >= 0xDB80 && char <= 0xDBFF, // 'Low Surrogates': (char) => char >= 0xDC00 && char <= 0xDFFF, 'Private Use Area': (char) => char >= 0xE000 && char <= 0xF8FF, - 'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF, + // 'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF, // 'Alphabetic Presentation Forms': (char) => char >= 0xFB00 && char <= 0xFB4F, - 'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF, + // 'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF, // 'Variation Selectors': (char) => char >= 0xFE00 && char <= 0xFE0F, 'Vertical Forms': (char) => char >= 0xFE10 && char <= 0xFE1F, // 'Combining Half Marks': (char) => char >= 0xFE20 && char <= 0xFE2F, 'CJK Compatibility Forms': (char) => char >= 0xFE30 && char <= 0xFE4F, 'Small Form Variants': (char) => char >= 0xFE50 && char <= 0xFE6F, - 'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF, + // 'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF, 'Halfwidth and Fullwidth Forms': (char) => char >= 0xFF00 && char <= 0xFFEF // 'Specials': (char) => char >= 0xFFF0 && char <= 0xFFFF, // 'Linear B Syllabary': (char) => char >= 0x10000 && char <= 0x1007F, diff --git a/src/util/script_detection.test.ts b/src/util/script_detection.test.ts index 825dc0b697..7e3f5a350d 100644 
--- a/src/util/script_detection.test.ts +++ b/src/util/script_detection.test.ts @@ -1,9 +1,138 @@ -import {charInComplexShapingScript} from './script_detection'; +import {charAllowsIdeographicBreaking, charAllowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, charInRTLScript} from './script_detection'; + +describe('charAllowsIdeographicBreaking', () => { + test('disallows ideographic breaking of Latin text', () => { + expect(charAllowsIdeographicBreaking('A'.codePointAt(0))).toBe(false); + }); + + test('allows ideographic breaking of ideographic punctuation', () => { + expect(charAllowsIdeographicBreaking('〈'.codePointAt(0))).toBe(true); + }); + + test('allows ideographic breaking of Bopomofo text', () => { + expect(charAllowsIdeographicBreaking('ㄎ'.codePointAt(0))).toBe(true); + }); + + test('allows ideographic breaking of Chinese and Vietnamese text', () => { + expect(charAllowsIdeographicBreaking('市'.codePointAt(0))).toBe(true); + expect(charAllowsIdeographicBreaking('𡔖'.codePointAt(0))).toBe(true); + }); + + test('disallows ideographic breaking of Korean text', () => { + expect(charAllowsIdeographicBreaking('아'.codePointAt(0))).toBe(false); + }); + + test('allows ideographic breaking of Japanese text', () => { + expect(charAllowsIdeographicBreaking('あ'.codePointAt(0))).toBe(true); + expect(charAllowsIdeographicBreaking('カ'.codePointAt(0))).toBe(true); + }); + + test('allows ideographic breaking of Yi text', () => { + expect(charAllowsIdeographicBreaking('ꉆ'.codePointAt(0))).toBe(true); + }); +}); + +describe('charAllowsLetterSpacing', () => { + test('allows letter spacing of Latin text', () => { + expect(charAllowsLetterSpacing('A'.codePointAt(0))).toBe(true); + }); + + test('disallows letter spacing of Arabic text', () => { + // Arabic + expect(charAllowsLetterSpacing('۳'.codePointAt(0))).toBe(false); + // Arabic Supplement + expect(charAllowsLetterSpacing('ݣ'.codePointAt(0))).toBe(false); + // Arabic Extended-A + 
expect(charAllowsLetterSpacing('ࢳ'.codePointAt(0))).toBe(false); + // Arabic Extended-B + expect(charAllowsLetterSpacing('࢐'.codePointAt(0))).toBe(false); + // Arabic Presentation Forms-A + expect(charAllowsLetterSpacing('ﰤ'.codePointAt(0))).toBe(false); + // Arabic Presentation Forms-B + expect(charAllowsLetterSpacing('ﺽ'.codePointAt(0))).toBe(false); + }); +}); + +describe('charHasUprightVerticalOrientation', () => { + test('rotates Latin text sideways', () => { + expect(charHasUprightVerticalOrientation('A'.codePointAt(0))).toBe(false); + }); + + test('keeps Bopomofo text upright', () => { + expect(charHasUprightVerticalOrientation('ㄎ'.codePointAt(0))).toBe(true); + }); + + test('keeps Canadian Aboriginal text upright', () => { + expect(charHasUprightVerticalOrientation('ᐃ'.codePointAt(0))).toBe(true); + }); + + test('keeps Chinese and Vietnamese text upright', () => { + expect(charHasUprightVerticalOrientation('市'.codePointAt(0))).toBe(true); + expect(charHasUprightVerticalOrientation('𡔖'.codePointAt(0))).toBe(true); + }); + + test('keeps Korean text upright', () => { + expect(charHasUprightVerticalOrientation('아'.codePointAt(0))).toBe(true); + }); + + test('keeps Japanese text upright', () => { + expect(charHasUprightVerticalOrientation('あ'.codePointAt(0))).toBe(true); + expect(charHasUprightVerticalOrientation('カ'.codePointAt(0))).toBe(true); + }); + + test('keeps Yi text upright', () => { + expect(charHasUprightVerticalOrientation('ꉆ'.codePointAt(0))).toBe(true); + }); +}); describe('charInComplexShapingScript', () => { test('recognizes that Arabic text needs complex shaping', () => { + // Non-Arabic expect(charInComplexShapingScript('3'.codePointAt(0))).toBe(false); + // Arabic expect(charInComplexShapingScript('۳'.codePointAt(0))).toBe(true); + // Arabic Supplement + expect(charInComplexShapingScript('ݣ'.codePointAt(0))).toBe(true); + // Arabic Extended-A + expect(charInComplexShapingScript('ࢳ'.codePointAt(0))).toBe(true); + // Arabic Extended-B 
expect(charInComplexShapingScript('࢐'.codePointAt(0))).toBe(true); + // Arabic Presentation Forms-A + expect(charInComplexShapingScript('ﰤ'.codePointAt(0))).toBe(true); + // Arabic Presentation Forms-B + expect(charInComplexShapingScript('ﺽ'.codePointAt(0))).toBe(true); + }); +}); + +describe('charInRTLScript', () => { + test('does not identify direction-neutral text as right-to-left', () => { + expect(charInRTLScript('3'.codePointAt(0))).toBe(false); + }); + + test('identifies Arabic text as right-to-left', () => { + // Arabic + expect(charInRTLScript('۳'.codePointAt(0))).toBe(true); + // Arabic Supplement + expect(charInRTLScript('ݣ'.codePointAt(0))).toBe(true); + // Arabic Extended-A + expect(charInRTLScript('ࢳ'.codePointAt(0))).toBe(true); + // Arabic Extended-B + expect(charInRTLScript('࢐'.codePointAt(0))).toBe(true); + // Arabic Presentation Forms-A + expect(charInRTLScript('ﰤ'.codePointAt(0))).toBe(true); + // Arabic Presentation Forms-B + expect(charInRTLScript('ﺽ'.codePointAt(0))).toBe(true); + }); + + test('identifies Hebrew text as right-to-left', () => { + // Hebrew + expect(charInRTLScript('ה'.codePointAt(0))).toBe(true); + // Alphabetic Presentation Forms + expect(charInRTLScript('ﬡ'.codePointAt(0))).toBe(true); + }); + + test('identifies Thaana text as right-to-left', () => { + // Thaana + expect(charInRTLScript('ޘ'.codePointAt(0))).toBe(true); }); }); diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts index f0e051ee4f..4043e4e043 100644 --- a/src/util/script_detection.ts +++ b/src/util/script_detection.ts @@ -23,43 +23,71 @@ export function allowsLetterSpacing(chars: string) { return true; } -export function charAllowsLetterSpacing(char: number) { - if (isChar['Arabic'](char)) return false; - if (isChar['Arabic Supplement'](char)) return false; - if (isChar['Arabic Extended-A'](char)) return false; - if (isChar['Arabic Extended-B'](char)) return false; - if (isChar['Arabic Presentation Forms-A'](char)) return false; - if 
(isChar['Arabic Presentation Forms-B'](char)) return false; +/** + * Returns a regular expression matching the given script codes, excluding any + * code that the execution environment lacks support for in regular expressions. + */ +function sanitizedRegExpFromScriptCodes(scriptCodes: Array<string>): RegExp { + const supportedPropertyEscapes = scriptCodes.map(code => { + try { + return new RegExp(`\\p{sc=${code}}`, 'u').source; + } catch (e) { + return null; + } + }).filter(pe => pe); + return new RegExp(supportedPropertyEscapes.join('|'), 'u'); +} - return true; +/** + * ISO 15924 script codes of scripts that disallow letter spacing as of Unicode + * 16.0.0. + * + * In general, cursive scripts are incompatible with letter spacing. + */ +const cursiveScriptCodes = [ + 'Arab', // Arabic + 'Dupl', // Duployan + 'Mong', // Mongolian + 'Ougr', // Old Uyghur + 'Syrc', // Syriac +]; + +const cursiveScriptRegExp = sanitizedRegExpFromScriptCodes(cursiveScriptCodes); + +export function charAllowsLetterSpacing(char: number) { + return !cursiveScriptRegExp.test(String.fromCodePoint(char)); } +/** + * ISO 15924 script codes of scripts that allow ideographic line breaking beyond + * the CJKV scripts that are considered ideographic in Unicode 16.0.0. + */ +const ideographicBreakingScriptCodes = [ + 'Bopo', // Bopomofo + 'Hani', // Han + 'Hira', // Hiragana + 'Kana', // Katakana + 'Kits', // Khitan Small Script + 'Nshu', // Nushu + 'Tang', // Tangut + 'Yiii', // Yi +]; + +const ideographicBreakingRegExp = sanitizedRegExpFromScriptCodes(ideographicBreakingScriptCodes); + export function charAllowsIdeographicBreaking(char: number) { // Return early for characters outside all ideographic ranges. 
if (char < 0x2E80) return false; - if (isChar['Bopomofo Extended'](char)) return true; - if (isChar['Bopomofo'](char)) return true; if (isChar['CJK Compatibility Forms'](char)) return true; - if (isChar['CJK Compatibility Ideographs'](char)) return true; if (isChar['CJK Compatibility'](char)) return true; - if (isChar['CJK Radicals Supplement'](char)) return true; if (isChar['CJK Strokes'](char)) return true; if (isChar['CJK Symbols and Punctuation'](char)) return true; - if (isChar['CJK Unified Ideographs Extension A'](char)) return true; - if (isChar['CJK Unified Ideographs'](char)) return true; if (isChar['Enclosed CJK Letters and Months'](char)) return true; if (isChar['Halfwidth and Fullwidth Forms'](char)) return true; - if (isChar['Hiragana'](char)) return true; if (isChar['Ideographic Description Characters'](char)) return true; - if (isChar['Kangxi Radicals'](char)) return true; - if (isChar['Katakana Phonetic Extensions'](char)) return true; - if (isChar['Katakana'](char)) return true; if (isChar['Vertical Forms'](char)) return true; - if (isChar['Yi Radicals'](char)) return true; - if (isChar['Yi Syllables'](char)) return true; - - return false; + return ideographicBreakingRegExp.test(String.fromCodePoint(char)); } // The following logic comes from @@ -93,16 +121,12 @@ export function charHasUprightVerticalOrientation(char: number) { // upright in vertical writing mode. 
if (char < 0x1100) return false; - if (isChar['Bopomofo Extended'](char)) return true; - if (isChar['Bopomofo'](char)) return true; if (isChar['CJK Compatibility Forms'](char)) { if (!((char >= 0xFE49 /* dashed overline */ && char <= 0xFE4F) /* wavy low line */)) { return true; } } - if (isChar['CJK Compatibility Ideographs'](char)) return true; if (isChar['CJK Compatibility'](char)) return true; - if (isChar['CJK Radicals Supplement'](char)) return true; if (isChar['CJK Strokes'](char)) return true; if (isChar['CJK Symbols and Punctuation'](char)) { if (!((char >= 0x3008 /* left angle bracket */ && char <= 0x3011) /* right black lenticular bracket */) && @@ -111,19 +135,9 @@ export function charHasUprightVerticalOrientation(char: number) { return true; } } - if (isChar['CJK Unified Ideographs Extension A'](char)) return true; - if (isChar['CJK Unified Ideographs'](char)) return true; if (isChar['Enclosed CJK Letters and Months'](char)) return true; - if (isChar['Hangul Compatibility Jamo'](char)) return true; - if (isChar['Hangul Jamo Extended-A'](char)) return true; - if (isChar['Hangul Jamo Extended-B'](char)) return true; - if (isChar['Hangul Jamo'](char)) return true; - if (isChar['Hangul Syllables'](char)) return true; - if (isChar['Hiragana'](char)) return true; if (isChar['Ideographic Description Characters'](char)) return true; if (isChar['Kanbun'](char)) return true; - if (isChar['Kangxi Radicals'](char)) return true; - if (isChar['Katakana Phonetic Extensions'](char)) return true; if (isChar['Katakana'](char)) { if (char !== 0x30FC /* katakana-hiragana prolonged sound mark */) { return true; @@ -149,12 +163,12 @@ export function charHasUprightVerticalOrientation(char: number) { return true; } } - if (isChar['Unified Canadian Aboriginal Syllabics'](char)) return true; - if (isChar['Unified Canadian Aboriginal Syllabics Extended'](char)) return true; if (isChar['Vertical Forms'](char)) return true; if (isChar['Yijing Hexagram Symbols'](char)) return true; 
- if (isChar['Yi Syllables'](char)) return true; - if (isChar['Yi Radicals'](char)) return true; + + if (/* Canadian Aboriginal */ /\p{sc=Cans}/u.test(String.fromCodePoint(char))) return true; + if (/* Hangul */ /\p{sc=Hang}/u.test(String.fromCodePoint(char))) return true; + if (ideographicBreakingRegExp.test(String.fromCodePoint(char))) return true; return false; } @@ -266,19 +280,56 @@ export function charHasRotatedVerticalOrientation(char: number) { } export function charInComplexShapingScript(char: number) { - return isChar['Arabic'](char) || - isChar['Arabic Supplement'](char) || - isChar['Arabic Extended-A'](char) || - isChar['Arabic Extended-B'](char) || - isChar['Arabic Presentation Forms-A'](char) || - isChar['Arabic Presentation Forms-B'](char); + return /\p{sc=Arab}/u.test(String.fromCodePoint(char)); } +/** + * ISO 15924 script codes of scripts that are primarily written horizontally + * right-to-left according to Unicode 16.0.0. + */ +const rtlScriptCodes = [ + 'Adlm', // Adlam + 'Arab', // Arabic + 'Armi', // Imperial Aramaic + 'Avst', // Avestan + 'Chrs', // Chorasmian + 'Cprt', // Cypriot + 'Egyp', // Egyptian Hieroglyphs + 'Elym', // Elymaic + 'Gara', // Garay + 'Hatr', // Hatran + 'Hebr', // Hebrew + 'Hung', // Old Hungarian + 'Khar', // Kharoshthi + 'Lydi', // Lydian + 'Mand', // Mandaic + 'Mani', // Manichaean + 'Mend', // Mende Kikakui + 'Merc', // Meroitic Cursive + 'Mero', // Meroitic Hieroglyphs + 'Narb', // Old North Arabian + 'Nbat', // Nabataean + 'Nkoo', // NKo + 'Orkh', // Old Turkic + 'Palm', // Palmyrene + 'Phli', // Inscriptional Pahlavi + 'Phlp', // Psalter Pahlavi + 'Phnx', // Phoenician + 'Prti', // Inscriptional Parthian + 'Rohg', // Hanifi Rohingya + 'Samr', // Samaritan + 'Sarb', // Old South Arabian + 'Sogo', // Old Sogdian + 'Syrc', // Syriac + 'Thaa', // Thaana + 'Todr', // Todhri + 'Yezi', // Yezidi +]; + +const rtlScriptRegExp = sanitizedRegExpFromScriptCodes(rtlScriptCodes); + export function charInRTLScript(char: number) 
{ - // Main blocks for Hebrew, Arabic, Thaana and other RTL scripts - return (char >= 0x0590 && char <= 0x08FF) || - isChar['Arabic Presentation Forms-A'](char) || - isChar['Arabic Presentation Forms-B'](char); + return rtlScriptRegExp.test(String.fromCodePoint(char)); } export function charInSupportedScript(char: number, canRenderRTL: boolean) {