From 42d68479f10bcdfb4c3252f237da9043663d3c24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Minh=20Nguy=E1=BB=85n?= <mxn@1ec5.org>
Date: Thu, 15 Aug 2024 05:21:14 -0700
Subject: [PATCH] Simplify script detection using regular expressions (#4560)

* Test script detection functions

* Simplified script detection functions using regular expressions

* Automatically exclude unsupported script codes from regular expressions

* Consolidated ideographic breaking and vertical orientation logic

* Prefer local glyph rendering for all CJKV characters

Simplified the logic for preferring local glyph rendering to consider the script, which requires less maintenance than a hard-coded list of blocks.

---------

Co-authored-by: Harel M <harel.mazor@gmail.com>
---
 CHANGELOG.md                         |   2 +-
 src/render/glyph_manager.test.ts     |  19 +++-
 src/render/glyph_manager.ts          |  14 +--
 src/style/style.ts                   |   4 +-
 src/ui/map.ts                        |   4 +-
 src/util/is_char_in_unicode_block.ts |  46 ++++----
 src/util/script_detection.test.ts    | 131 ++++++++++++++++++++++-
 src/util/script_detection.ts         | 151 ++++++++++++++++++---------
 8 files changed, 284 insertions(+), 87 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aadf54ddd9..b3da31c264 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,7 @@
 - Fix right-to-left layout of labels that contain characters in the Arabic Extended-B code block. ([#4536](https://github.com/maplibre/maplibre-gl-js/pull/4536))
 - Fix 3D map freezing when camera is adjusted against map bounds. ([#4537](https://github.com/maplibre/maplibre-gl-js/issues/4537))
 - Fix `getStyle()` to return a clone so the object cannot be internally changed ([#4488](https://github.com/maplibre/maplibre-gl-js/issues/4488))
-
+- Prefer local glyph rendering for all CJKV characters, not just those in the CJK Unified Ideographs, Hiragana, Katakana, and Hangul Syllables blocks. ([#4560](https://github.com/maplibre/maplibre-gl-js/pull/4560))
 - - _...Add new stuff here..._
 
 ## 4.5.2
diff --git a/src/render/glyph_manager.test.ts b/src/render/glyph_manager.test.ts
index 60eb97f61e..05d07f2fed 100644
--- a/src/render/glyph_manager.test.ts
+++ b/src/render/glyph_manager.test.ts
@@ -93,7 +93,7 @@ describe('GlyphManager', () => {
     test('GlyphManager generates CJK PBF locally', async () => {
         const manager = createGlyphManager('sans-serif');
 
-        // character 平
+        // Chinese character píng 平
         const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x5e73]});
         expect(returnedGlyphs['Arial Unicode MS'][0x5e73].metrics.advance).toBe(0.5);
     });
@@ -114,6 +114,23 @@ describe('GlyphManager', () => {
         expect(returnedGlyphs['Arial Unicode MS'][0x3066].metrics.advance).toBe(0.5);
     });
 
+    test('GlyphManager consistently generates CJKV text locally', async () => {
+        const manager = createGlyphManager('sans-serif');
+
+        // Space
+        expect(manager._doesCharSupportLocalGlyph(0x0020)).toBe(false);
+        // Chinese character píng 平
+        expect(manager._doesCharSupportLocalGlyph(0x5e73)).toBe(true);
+        // Chinese character biáng 𰻞
+        expect(manager._doesCharSupportLocalGlyph(0x30EDE)).toBe(true);
+        // Katakana letter te テ
+        expect(manager._doesCharSupportLocalGlyph(0x30c6)).toBe(true);
+        // Hiragana letter te て
+        expect(manager._doesCharSupportLocalGlyph(0x3066)).toBe(true);
+        // Hangul letter a 아
+        expect(manager._doesCharSupportLocalGlyph(0xC544)).toBe(true);
+    });
+
     test('GlyphManager caches locally generated glyphs', async () => {
 
         const manager = createGlyphManager('sans-serif');
diff --git a/src/render/glyph_manager.ts b/src/render/glyph_manager.ts
index 3898fe5503..0efe9f066f 100644
--- a/src/render/glyph_manager.ts
+++ b/src/render/glyph_manager.ts
@@ -1,7 +1,6 @@
 import {loadGlyphRange} from '../style/load_glyph_range';
 
 import TinySDF from '@mapbox/tiny-sdf';
-import {unicodeBlockLookup} from '../util/is_char_in_unicode_block';
 import {AlphaImage} from '../util/image';
 
 import type {StyleGlyph} from '../style/style_glyph';
@@ -120,13 +119,14 @@ export class GlyphManager {
     }
 
     _doesCharSupportLocalGlyph(id: number): boolean {
-        /* eslint-disable new-cap */
+        // The CJK Unified Ideographs blocks and Hangul Syllables blocks are
+        // spread across many glyph PBFs and are typically accessed very
+        // randomly. Preferring local rendering for these blocks reduces
+        // wasteful bandwidth consumption. For visual consistency within CJKV
+        // text, also include any other CJKV or siniform ideograph or hangul,
+        // hiragana, or katakana character.
         return !!this.localIdeographFontFamily &&
-            (unicodeBlockLookup['CJK Unified Ideographs'](id) ||
-            unicodeBlockLookup['Hangul Syllables'](id) ||
-            unicodeBlockLookup['Hiragana'](id) ||
-            unicodeBlockLookup['Katakana'](id));
-        /* eslint-enable new-cap */
+            /\p{Ideo}|\p{sc=Hang}|\p{sc=Hira}|\p{sc=Kana}/u.test(String.fromCodePoint(id));
     }
 
     _tinySDF(entry: Entry, stack: string, id: number): StyleGlyph {
diff --git a/src/style/style.ts b/src/style/style.ts
index 6ad6acef22..dd500ff05d 100644
--- a/src/style/style.ts
+++ b/src/style/style.ts
@@ -91,8 +91,8 @@ export type StyleOptions = {
     validate?: boolean;
     /**
      * Defines a CSS
-     * font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges.
-     * In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
+     * font-family for locally overriding generation of Chinese, Japanese, and Korean characters.
+     * For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
      * Set to `false`, to enable font settings from the map's style for these glyph ranges.
      * Forces a full update.
      */
diff --git a/src/ui/map.ts b/src/ui/map.ts
index 463dd46e13..b7cb3379e0 100644
--- a/src/ui/map.ts
+++ b/src/ui/map.ts
@@ -289,8 +289,8 @@ export type MapOptions = {
     fitBoundsOptions?: FitBoundsOptions;
     /**
      * Defines a CSS
-     * font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges.
-     * In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
+     * font-family for locally overriding generation of Chinese, Japanese, and Korean characters.
+     * For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
      * Set to `false`, to enable font settings from the map's style for these glyph ranges.
      * The purpose of this option is to avoid bandwidth-intensive glyph server requests. (See [Use locally generated ideographs](https://maplibre.org/maplibre-gl-js/docs/examples/local-ideographs).)
      * @defaultValue 'sans-serif'
diff --git a/src/util/is_char_in_unicode_block.ts b/src/util/is_char_in_unicode_block.ts
index 469fd2f3b6..da108b0a6a 100644
--- a/src/util/is_char_in_unicode_block.ts
+++ b/src/util/is_char_in_unicode_block.ts
@@ -16,16 +16,16 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Cyrillic Supplement': (char) => char >= 0x0500 && char <= 0x052F,
     // 'Armenian': (char) => char >= 0x0530 && char <= 0x058F,
     //'Hebrew': (char) => char >= 0x0590 && char <= 0x05FF,
-    'Arabic': (char) => char >= 0x0600 && char <= 0x06FF,
+    // 'Arabic': (char) => char >= 0x0600 && char <= 0x06FF,
     //'Syriac': (char) => char >= 0x0700 && char <= 0x074F,
-    'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F,
+    // 'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F,
     // 'Thaana': (char) => char >= 0x0780 && char <= 0x07BF,
     // 'NKo': (char) => char >= 0x07C0 && char <= 0x07FF,
     // 'Samaritan': (char) => char >= 0x0800 && char <= 0x083F,
     // 'Mandaic': (char) => char >= 0x0840 && char <= 0x085F,
     // 'Syriac Supplement': (char) => char >= 0x0860 && char <= 0x086F,
-    'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F,
-    'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF,
+    // 'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F,
+    // 'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF,
     // 'Devanagari': (char) => char >= 0x0900 && char <= 0x097F,
     // 'Bengali': (char) => char >= 0x0980 && char <= 0x09FF,
     // 'Gurmukhi': (char) => char >= 0x0A00 && char <= 0x0A7F,
@@ -45,7 +45,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Ethiopic': (char) => char >= 0x1200 && char <= 0x137F,
     // 'Ethiopic Supplement': (char) => char >= 0x1380 && char <= 0x139F,
     // 'Cherokee': (char) => char >= 0x13A0 && char <= 0x13FF,
-    'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F,
+    // 'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F,
     // 'Ogham': (char) => char >= 0x1680 && char <= 0x169F,
     // 'Runic': (char) => char >= 0x16A0 && char <= 0x16FF,
     // 'Tagalog': (char) => char >= 0x1700 && char <= 0x171F,
@@ -54,7 +54,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Tagbanwa': (char) => char >= 0x1760 && char <= 0x177F,
     'Khmer': (char) => char >= 0x1780 && char <= 0x17FF,
     // 'Mongolian': (char) => char >= 0x1800 && char <= 0x18AF,
-    'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF,
+    // 'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF,
     // 'Limbu': (char) => char >= 0x1900 && char <= 0x194F,
     // 'Tai Le': (char) => char >= 0x1950 && char <= 0x197F,
     // 'New Tai Lue': (char) => char >= 0x1980 && char <= 0x19DF,
@@ -108,25 +108,25 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Ethiopic Extended': (char) => char >= 0x2D80 && char <= 0x2DDF,
     // 'Cyrillic Extended-A': (char) => char >= 0x2DE0 && char <= 0x2DFF,
     // 'Supplemental Punctuation': (char) => char >= 0x2E00 && char <= 0x2E7F,
-    'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF,
-    'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF,
+    // 'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF,
+    // 'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF,
     'Ideographic Description Characters': (char) => char >= 0x2FF0 && char <= 0x2FFF,
     'CJK Symbols and Punctuation': (char) => char >= 0x3000 && char <= 0x303F,
-    'Hiragana': (char) => char >= 0x3040 && char <= 0x309F,
+    // 'Hiragana': (char) => char >= 0x3040 && char <= 0x309F,
     'Katakana': (char) => char >= 0x30A0 && char <= 0x30FF,
-    'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F,
-    'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F,
+    // 'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F,
+    // 'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F,
     'Kanbun': (char) => char >= 0x3190 && char <= 0x319F,
-    'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF,
+    // 'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF,
     'CJK Strokes': (char) => char >= 0x31C0 && char <= 0x31EF,
-    'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF,
+    // 'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF,
     'Enclosed CJK Letters and Months': (char) => char >= 0x3200 && char <= 0x32FF,
     'CJK Compatibility': (char) => char >= 0x3300 && char <= 0x33FF,
-    'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF,
+    // 'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF,
     'Yijing Hexagram Symbols': (char) => char >= 0x4DC0 && char <= 0x4DFF,
-    'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF,
-    'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F,
-    'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF,
+    // 'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF,
+    // 'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F,
+    // 'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF,
     // 'Lisu': (char) => char >= 0xA4D0 && char <= 0xA4FF,
     // 'Vai': (char) => char >= 0xA500 && char <= 0xA63F,
     // 'Cyrillic Extended-B': (char) => char >= 0xA640 && char <= 0xA69F,
@@ -140,7 +140,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Devanagari Extended': (char) => char >= 0xA8E0 && char <= 0xA8FF,
     // 'Kayah Li': (char) => char >= 0xA900 && char <= 0xA92F,
     // 'Rejang': (char) => char >= 0xA930 && char <= 0xA95F,
-    'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F,
+    // 'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F,
     // 'Javanese': (char) => char >= 0xA980 && char <= 0xA9DF,
     // 'Myanmar Extended-B': (char) => char >= 0xA9E0 && char <= 0xA9FF,
     // 'Cham': (char) => char >= 0xAA00 && char <= 0xAA5F,
@@ -151,21 +151,21 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Latin Extended-E': (char) => char >= 0xAB30 && char <= 0xAB6F,
     // 'Cherokee Supplement': (char) => char >= 0xAB70 && char <= 0xABBF,
     // 'Meetei Mayek': (char) => char >= 0xABC0 && char <= 0xABFF,
-    'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF,
-    'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF,
+    // 'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF,
+    // 'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF,
     // 'High Surrogates': (char) => char >= 0xD800 && char <= 0xDB7F,
     // 'High Private Use Surrogates': (char) => char >= 0xDB80 && char <= 0xDBFF,
     // 'Low Surrogates': (char) => char >= 0xDC00 && char <= 0xDFFF,
     'Private Use Area': (char) => char >= 0xE000 && char <= 0xF8FF,
-    'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF,
+    // 'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF,
     // 'Alphabetic Presentation Forms': (char) => char >= 0xFB00 && char <= 0xFB4F,
-    'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF,
+    // 'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF,
     // 'Variation Selectors': (char) => char >= 0xFE00 && char <= 0xFE0F,
     'Vertical Forms': (char) => char >= 0xFE10 && char <= 0xFE1F,
     // 'Combining Half Marks': (char) => char >= 0xFE20 && char <= 0xFE2F,
     'CJK Compatibility Forms': (char) => char >= 0xFE30 && char <= 0xFE4F,
     'Small Form Variants': (char) => char >= 0xFE50 && char <= 0xFE6F,
-    'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF,
+    // 'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF,
     'Halfwidth and Fullwidth Forms': (char) => char >= 0xFF00 && char <= 0xFFEF
     // 'Specials': (char) => char >= 0xFFF0 && char <= 0xFFFF,
     // 'Linear B Syllabary': (char) => char >= 0x10000 && char <= 0x1007F,
diff --git a/src/util/script_detection.test.ts b/src/util/script_detection.test.ts
index 825dc0b697..7e3f5a350d 100644
--- a/src/util/script_detection.test.ts
+++ b/src/util/script_detection.test.ts
@@ -1,9 +1,138 @@
-import {charInComplexShapingScript} from './script_detection';
+import {charAllowsIdeographicBreaking, charAllowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, charInRTLScript} from './script_detection';
+
+describe('charAllowsIdeographicBreaking', () => {
+    test('disallows ideographic breaking of Latin text', () => {
+        expect(charAllowsIdeographicBreaking('A'.codePointAt(0))).toBe(false);
+    });
+
+    test('allows ideographic breaking of ideographic punctuation', () => {
+        expect(charAllowsIdeographicBreaking('〈'.codePointAt(0))).toBe(true);
+    });
+
+    test('allows ideographic breaking of Bopomofo text', () => {
+        expect(charAllowsIdeographicBreaking('ㄎ'.codePointAt(0))).toBe(true);
+    });
+
+    test('allows ideographic breaking of Chinese and Vietnamese text', () => {
+        expect(charAllowsIdeographicBreaking('市'.codePointAt(0))).toBe(true);
+        expect(charAllowsIdeographicBreaking('𡔖'.codePointAt(0))).toBe(true);
+    });
+
+    test('disallows ideographic breaking of Korean text', () => {
+        expect(charAllowsIdeographicBreaking('아'.codePointAt(0))).toBe(false);
+    });
+
+    test('allows ideographic breaking of Japanese text', () => {
+        expect(charAllowsIdeographicBreaking('あ'.codePointAt(0))).toBe(true);
+        expect(charAllowsIdeographicBreaking('カ'.codePointAt(0))).toBe(true);
+    });
+
+    test('allows ideographic breaking of Yi text', () => {
+        expect(charAllowsIdeographicBreaking('ꉆ'.codePointAt(0))).toBe(true);
+    });
+});
+
+describe('charAllowsLetterSpacing', () => {
+    test('allows letter spacing of Latin text', () => {
+        expect(charAllowsLetterSpacing('A'.codePointAt(0))).toBe(true);
+    });
+
+    test('disallows letter spacing of Arabic text', () => {
+        // Arabic
+        expect(charAllowsLetterSpacing('۳'.codePointAt(0))).toBe(false);
+        // Arabic Supplement
+        expect(charAllowsLetterSpacing('ݣ'.codePointAt(0))).toBe(false);
+        // Arabic Extended-A
+        expect(charAllowsLetterSpacing('ࢳ'.codePointAt(0))).toBe(false);
+        // Arabic Extended-B
+        expect(charAllowsLetterSpacing('࢐'.codePointAt(0))).toBe(false);
+        // Arabic Presentation Forms-A
+        expect(charAllowsLetterSpacing('ﰤ'.codePointAt(0))).toBe(false);
+        // Arabic Presentation Forms-B
+        expect(charAllowsLetterSpacing('ﺽ'.codePointAt(0))).toBe(false);
+    });
+});
+
+describe('charHasUprightVerticalOrientation', () => {
+    test('rotates Latin text sideways', () => {
+        expect(charHasUprightVerticalOrientation('A'.codePointAt(0))).toBe(false);
+    });
+
+    test('keeps Bopomofo text upright', () => {
+        expect(charHasUprightVerticalOrientation('ㄎ'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Canadian Aboriginal text upright', () => {
+        expect(charHasUprightVerticalOrientation('ᐃ'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Chinese and Vietnamese text upright', () => {
+        expect(charHasUprightVerticalOrientation('市'.codePointAt(0))).toBe(true);
+        expect(charHasUprightVerticalOrientation('𡔖'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Korean text upright', () => {
+        expect(charHasUprightVerticalOrientation('아'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Japanese text upright', () => {
+        expect(charHasUprightVerticalOrientation('あ'.codePointAt(0))).toBe(true);
+        expect(charHasUprightVerticalOrientation('カ'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Yi text upright', () => {
+        expect(charHasUprightVerticalOrientation('ꉆ'.codePointAt(0))).toBe(true);
+    });
+});
 
 describe('charInComplexShapingScript', () => {
     test('recognizes that Arabic text needs complex shaping', () => {
+        // Non-Arabic
         expect(charInComplexShapingScript('3'.codePointAt(0))).toBe(false);
+        // Arabic
         expect(charInComplexShapingScript('۳'.codePointAt(0))).toBe(true);
+        // Arabic Supplement
+        expect(charInComplexShapingScript('ݣ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-A
+        expect(charInComplexShapingScript('ࢳ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-B
         expect(charInComplexShapingScript('࢐'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-A
+        expect(charInComplexShapingScript('ﰤ'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-B
+        expect(charInComplexShapingScript('ﺽ'.codePointAt(0))).toBe(true);
+    });
+});
+
+describe('charInRTLScript', () => {
+    test('does not identify direction-neutral text as right-to-left', () => {
+        expect(charInRTLScript('3'.codePointAt(0))).toBe(false);
+    });
+
+    test('identifies Arabic text as right-to-left', () => {
+        // Arabic
+        expect(charInRTLScript('۳'.codePointAt(0))).toBe(true);
+        // Arabic Supplement
+        expect(charInRTLScript('ݣ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-A
+        expect(charInRTLScript('ࢳ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-B
+        expect(charInRTLScript('࢐'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-A
+        expect(charInRTLScript('ﰤ'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-B
+        expect(charInRTLScript('ﺽ'.codePointAt(0))).toBe(true);
+    });
+
+    test('identifies Hebrew text as right-to-left', () => {
+        // Hebrew
+        expect(charInRTLScript('ה'.codePointAt(0))).toBe(true);
+        // Alphabetic Presentation Forms
+        expect(charInRTLScript('ﬡ'.codePointAt(0))).toBe(true);
+    });
+
+    test('identifies Thaana text as right-to-left', () => {
+        // Thaana
+        expect(charInRTLScript('ޘ'.codePointAt(0))).toBe(true);
     });
 });
diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts
index f0e051ee4f..4043e4e043 100644
--- a/src/util/script_detection.ts
+++ b/src/util/script_detection.ts
@@ -23,43 +23,71 @@ export function allowsLetterSpacing(chars: string) {
     return true;
 }
 
-export function charAllowsLetterSpacing(char: number) {
-    if (isChar['Arabic'](char)) return false;
-    if (isChar['Arabic Supplement'](char)) return false;
-    if (isChar['Arabic Extended-A'](char)) return false;
-    if (isChar['Arabic Extended-B'](char)) return false;
-    if (isChar['Arabic Presentation Forms-A'](char)) return false;
-    if (isChar['Arabic Presentation Forms-B'](char)) return false;
+/**
+ * Returns a regular expression matching the given script codes, skipping codes
+ * the runtime's RegExp engine rejects; matches nothing if none are supported.
+ */
+function sanitizedRegExpFromScriptCodes(scriptCodes: Array<string>): RegExp {
+    const supportedPropertyEscapes = scriptCodes.map(code => {
+        try {
+            return new RegExp(`\\p{sc=${code}}`, 'u').source;
+        } catch (e) {
+            return null;
+        }
+    }).filter(pe => pe);
+    return new RegExp(supportedPropertyEscapes.join('|') || '(?!)', 'u');
+}
 
-    return true;
+/**
+ * ISO 15924 script codes of scripts that disallow letter spacing as of Unicode
+ * 16.0.0.
+ *
+ * In general, cursive scripts are incompatible with letter spacing.
+ */
+const cursiveScriptCodes = [
+    'Arab', // Arabic
+    'Dupl', // Duployan
+    'Mong', // Mongolian
+    'Ougr', // Old Uyghur
+    'Syrc', // Syriac
+];
+
+const cursiveScriptRegExp = sanitizedRegExpFromScriptCodes(cursiveScriptCodes);
+
+export function charAllowsLetterSpacing(char: number) {
+    return !cursiveScriptRegExp.test(String.fromCodePoint(char));
 }
 
+/**
+ * ISO 15924 script codes of scripts that allow ideographic line breaking,
+ * including scripts beyond those considered ideographic in Unicode 16.0.0.
+ */
+const ideographicBreakingScriptCodes = [
+    'Bopo', // Bopomofo
+    'Hani', // Han
+    'Hira', // Hiragana
+    'Kana', // Katakana
+    'Kits', // Khitan Small Script
+    'Nshu', // Nushu
+    'Tang', // Tangut
+    'Yiii', // Yi
+];
+
+const ideographicBreakingRegExp = sanitizedRegExpFromScriptCodes(ideographicBreakingScriptCodes);
+
 export function charAllowsIdeographicBreaking(char: number) {
     // Return early for characters outside all ideographic ranges.
     if (char < 0x2E80) return false;
 
-    if (isChar['Bopomofo Extended'](char)) return true;
-    if (isChar['Bopomofo'](char)) return true;
     if (isChar['CJK Compatibility Forms'](char)) return true;
-    if (isChar['CJK Compatibility Ideographs'](char)) return true;
     if (isChar['CJK Compatibility'](char)) return true;
-    if (isChar['CJK Radicals Supplement'](char)) return true;
     if (isChar['CJK Strokes'](char)) return true;
     if (isChar['CJK Symbols and Punctuation'](char)) return true;
-    if (isChar['CJK Unified Ideographs Extension A'](char)) return true;
-    if (isChar['CJK Unified Ideographs'](char)) return true;
     if (isChar['Enclosed CJK Letters and Months'](char)) return true;
     if (isChar['Halfwidth and Fullwidth Forms'](char)) return true;
-    if (isChar['Hiragana'](char)) return true;
     if (isChar['Ideographic Description Characters'](char)) return true;
-    if (isChar['Kangxi Radicals'](char)) return true;
-    if (isChar['Katakana Phonetic Extensions'](char)) return true;
-    if (isChar['Katakana'](char)) return true;
     if (isChar['Vertical Forms'](char)) return true;
-    if (isChar['Yi Radicals'](char)) return true;
-    if (isChar['Yi Syllables'](char)) return true;
-
-    return false;
+    return ideographicBreakingRegExp.test(String.fromCodePoint(char));
 }
 
 // The following logic comes from
@@ -93,16 +121,12 @@ export function charHasUprightVerticalOrientation(char: number) {
     // upright in vertical writing mode.
     if (char < 0x1100) return false;
 
-    if (isChar['Bopomofo Extended'](char)) return true;
-    if (isChar['Bopomofo'](char)) return true;
     if (isChar['CJK Compatibility Forms'](char)) {
         if (!((char >= 0xFE49 /* dashed overline */ && char <= 0xFE4F) /* wavy low line */)) {
             return true;
         }
     }
-    if (isChar['CJK Compatibility Ideographs'](char)) return true;
     if (isChar['CJK Compatibility'](char)) return true;
-    if (isChar['CJK Radicals Supplement'](char)) return true;
     if (isChar['CJK Strokes'](char)) return true;
     if (isChar['CJK Symbols and Punctuation'](char)) {
         if (!((char >= 0x3008 /* left angle bracket */ && char <= 0x3011) /* right black lenticular bracket */) &&
@@ -111,19 +135,9 @@ export function charHasUprightVerticalOrientation(char: number) {
             return true;
         }
     }
-    if (isChar['CJK Unified Ideographs Extension A'](char)) return true;
-    if (isChar['CJK Unified Ideographs'](char)) return true;
     if (isChar['Enclosed CJK Letters and Months'](char)) return true;
-    if (isChar['Hangul Compatibility Jamo'](char)) return true;
-    if (isChar['Hangul Jamo Extended-A'](char)) return true;
-    if (isChar['Hangul Jamo Extended-B'](char)) return true;
-    if (isChar['Hangul Jamo'](char)) return true;
-    if (isChar['Hangul Syllables'](char)) return true;
-    if (isChar['Hiragana'](char)) return true;
     if (isChar['Ideographic Description Characters'](char)) return true;
     if (isChar['Kanbun'](char)) return true;
-    if (isChar['Kangxi Radicals'](char)) return true;
-    if (isChar['Katakana Phonetic Extensions'](char)) return true;
     if (isChar['Katakana'](char)) {
         if (char !== 0x30FC /* katakana-hiragana prolonged sound mark */) {
             return true;
@@ -149,12 +163,12 @@ export function charHasUprightVerticalOrientation(char: number) {
             return true;
         }
     }
-    if (isChar['Unified Canadian Aboriginal Syllabics'](char)) return true;
-    if (isChar['Unified Canadian Aboriginal Syllabics Extended'](char)) return true;
     if (isChar['Vertical Forms'](char)) return true;
     if (isChar['Yijing Hexagram Symbols'](char)) return true;
-    if (isChar['Yi Syllables'](char)) return true;
-    if (isChar['Yi Radicals'](char)) return true;
+
+    if (/* Canadian Aboriginal */ /\p{sc=Cans}/u.test(String.fromCodePoint(char))) return true;
+    if (/* Hangul */ /\p{sc=Hang}/u.test(String.fromCodePoint(char))) return true;
+    if (ideographicBreakingRegExp.test(String.fromCodePoint(char))) return true;
 
     return false;
 }
@@ -266,19 +280,56 @@ export function charHasRotatedVerticalOrientation(char: number) {
 }
 
 export function charInComplexShapingScript(char: number) {
-    return isChar['Arabic'](char) ||
-           isChar['Arabic Supplement'](char) ||
-           isChar['Arabic Extended-A'](char) ||
-           isChar['Arabic Extended-B'](char) ||
-           isChar['Arabic Presentation Forms-A'](char) ||
-           isChar['Arabic Presentation Forms-B'](char);
+    return /\p{sc=Arab}/u.test(String.fromCodePoint(char));
 }
 
+/**
+ * ISO 15924 script codes of scripts that are primarily written horizontally
+ * right-to-left according to Unicode 16.0.0.
+ */
+const rtlScriptCodes = [
+    'Adlm', // Adlam
+    'Arab', // Arabic
+    'Armi', // Imperial Aramaic
+    'Avst', // Avestan
+    'Chrs', // Chorasmian
+    'Cprt', // Cypriot
+    'Egyp', // Egyptian Hieroglyphs
+    'Elym', // Elymaic
+    'Gara', // Garay
+    'Hatr', // Hatran
+    'Hebr', // Hebrew
+    'Hung', // Old Hungarian
+    'Khar', // Kharoshthi
+    'Lydi', // Lydian
+    'Mand', // Mandaic
+    'Mani', // Manichaean
+    'Mend', // Mende Kikakui
+    'Merc', // Meroitic Cursive
+    'Mero', // Meroitic Hieroglyphs
+    'Narb', // Old North Arabian
+    'Nbat', // Nabataean
+    'Nkoo', // NKo
+    'Orkh', // Old Turkic
+    'Palm', // Palmyrene
+    'Phli', // Inscriptional Pahlavi
+    'Phlp', // Psalter Pahlavi
+    'Phnx', // Phoenician
+    'Prti', // Inscriptional Parthian
+    'Rohg', // Hanifi Rohingya
+    'Samr', // Samaritan
+    'Sarb', // Old South Arabian
+    'Sogo', // Old Sogdian
+    'Syrc', // Syriac
+    'Thaa', // Thaana
+    'Todr', // Todhri
+    'Yezi', // Yezidi
+];
+
+const rtlScriptRegExp = sanitizedRegExpFromScriptCodes(rtlScriptCodes);
+
 export function charInRTLScript(char: number) {
-    // Main blocks for Hebrew, Arabic, Thaana and other RTL scripts
-    return (char >= 0x0590 && char <= 0x08FF) ||
-        isChar['Arabic Presentation Forms-A'](char) ||
-        isChar['Arabic Presentation Forms-B'](char);
+    return rtlScriptRegExp.test(String.fromCodePoint(char));
 }
 
 export function charInSupportedScript(char: number, canRenderRTL: boolean) {