Simplify script detection using regular expressions (#4560)

* Test script detection functions
* Simplified script detection functions using regular expressions
* Automatically exclude unsupported script codes from regular expressions
* Consolidated ideographic breaking and vertical orientation logic
* Prefer local glyph rendering for all CJKV characters

Simplified the logic for preferring local glyph rendering to consider the script, which requires less maintenance than a hard-coded list of blocks.

---------

Co-authored-by: Harel M <[email protected]>
maplibre · Aug 15, 2024 · 42d6847 · 42d6847
1 parent 4797952
commit 42d6847
Show file tree

Hide file tree

Showing 8 changed files with 284 additions and 87 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,7 +11,7 @@
 - Fix right-to-left layout of labels that contain characters in the Arabic Extended-B code block. ([#4536](https://github.com/maplibre/maplibre-gl-js/pull/4536))
 - Fix 3D map freezing when camera is adjusted against map bounds. ([#4537](https://github.com/maplibre/maplibre-gl-js/issues/4537))
 - Fix `getStyle()` to return a clone so the object cannot be internally changed ([#4488](https://github.com/maplibre/maplibre-gl-js/issues/4488))
-
+- Prefer local glyph rendering for all CJKV characters, not just those in the CJK Unified Ideographs, Hiragana, Katakana, and Hangul Syllables blocks. ([#4560](https://github.com/maplibre/maplibre-gl-js/pull/4560))
 - - _...Add new stuff here..._
 
 ## 4.5.2

diff --git a/src/render/glyph_manager.test.ts b/src/render/glyph_manager.test.ts
@@ -93,7 +93,7 @@ describe('GlyphManager', () => {
     test('GlyphManager generates CJK PBF locally', async () => {
         const manager = createGlyphManager('sans-serif');
 
-        // character 平
+        // Chinese character píng 平
         const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x5e73]});
         expect(returnedGlyphs['Arial Unicode MS'][0x5e73].metrics.advance).toBe(0.5);
     });
@@ -114,6 +114,23 @@ describe('GlyphManager', () => {
         expect(returnedGlyphs['Arial Unicode MS'][0x3066].metrics.advance).toBe(0.5);
     });
 
+    test('GlyphManager consistently generates CJKV text locally', async () => {
+        const manager = createGlyphManager('sans-serif');
+
+        // Space
+        expect(manager._doesCharSupportLocalGlyph(0x0020)).toBe(false);
+        // Chinese character píng 平
+        expect(manager._doesCharSupportLocalGlyph(0x5e73)).toBe(true);
+        // Chinese character biáng 𰻞
+        expect(manager._doesCharSupportLocalGlyph(0x30EDE)).toBe(true);
+        // Katakana letter te テ
+        expect(manager._doesCharSupportLocalGlyph(0x30c6)).toBe(true);
+        // Hiragana letter te て
+        expect(manager._doesCharSupportLocalGlyph(0x3066)).toBe(true);
+        // Hangul letter a 아
+        expect(manager._doesCharSupportLocalGlyph(0xC544)).toBe(true);
+    });
+
     test('GlyphManager caches locally generated glyphs', async () => {
 
         const manager = createGlyphManager('sans-serif');

diff --git a/src/render/glyph_manager.ts b/src/render/glyph_manager.ts
@@ -1,7 +1,6 @@
 import {loadGlyphRange} from '../style/load_glyph_range';
 
 import TinySDF from '@mapbox/tiny-sdf';
-import {unicodeBlockLookup} from '../util/is_char_in_unicode_block';
 import {AlphaImage} from '../util/image';
 
 import type {StyleGlyph} from '../style/style_glyph';
@@ -120,13 +119,14 @@ export class GlyphManager {
     }
 
     _doesCharSupportLocalGlyph(id: number): boolean {
-        /* eslint-disable new-cap */
+        // The CJK Unified Ideographs blocks and Hangul Syllables blocks are
+        // spread across many glyph PBFs and are typically accessed very
+        // randomly. Preferring local rendering for these blocks reduces
+        // wasteful bandwidth consumption. For visual consistency within CJKV
+        // text, also include any other CJKV or siniform ideograph or hangul,
+        // hiragana, or katakana character.
         return !!this.localIdeographFontFamily &&
-            (unicodeBlockLookup['CJK Unified Ideographs'](id) ||
-            unicodeBlockLookup['Hangul Syllables'](id) ||
-            unicodeBlockLookup['Hiragana'](id) ||
-            unicodeBlockLookup['Katakana'](id));
-        /* eslint-enable new-cap */
+            /\p{Ideo}|\p{sc=Hang}|\p{sc=Hira}|\p{sc=Kana}/u.test(String.fromCodePoint(id));
     }
 
     _tinySDF(entry: Entry, stack: string, id: number): StyleGlyph {

diff --git a/src/style/style.ts b/src/style/style.ts
@@ -91,8 +91,8 @@ export type StyleOptions = {
     validate?: boolean;
     /**
      * Defines a CSS
-     * font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges.
-     * In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
+     * font-family for locally overriding generation of Chinese, Japanese, and Korean characters.
+     * For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
      * Set to `false` to enable font settings from the map's style for these characters.
      * Forces a full update.
      */

diff --git a/src/ui/map.ts b/src/ui/map.ts
@@ -289,8 +289,8 @@ export type MapOptions = {
     fitBoundsOptions?: FitBoundsOptions;
     /**
      * Defines a CSS
-     * font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges.
-     * In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
+     * font-family for locally overriding generation of Chinese, Japanese, and Korean characters.
+     * For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
      * Set to `false` to enable font settings from the map's style for these characters.
      * The purpose of this option is to avoid bandwidth-intensive glyph server requests. (See [Use locally generated ideographs](https://maplibre.org/maplibre-gl-js/docs/examples/local-ideographs).)
      * @defaultValue 'sans-serif'

diff --git a/src/util/is_char_in_unicode_block.ts b/src/util/is_char_in_unicode_block.ts
@@ -16,16 +16,16 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Cyrillic Supplement': (char) => char >= 0x0500 && char <= 0x052F,
     // 'Armenian': (char) => char >= 0x0530 && char <= 0x058F,
     //'Hebrew': (char) => char >= 0x0590 && char <= 0x05FF,
-    'Arabic': (char) => char >= 0x0600 && char <= 0x06FF,
+    // 'Arabic': (char) => char >= 0x0600 && char <= 0x06FF,
     //'Syriac': (char) => char >= 0x0700 && char <= 0x074F,
-    'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F,
+    // 'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F,
     // 'Thaana': (char) => char >= 0x0780 && char <= 0x07BF,
     // 'NKo': (char) => char >= 0x07C0 && char <= 0x07FF,
     // 'Samaritan': (char) => char >= 0x0800 && char <= 0x083F,
     // 'Mandaic': (char) => char >= 0x0840 && char <= 0x085F,
     // 'Syriac Supplement': (char) => char >= 0x0860 && char <= 0x086F,
-    'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F,
-    'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF,
+    // 'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F,
+    // 'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF,
     // 'Devanagari': (char) => char >= 0x0900 && char <= 0x097F,
     // 'Bengali': (char) => char >= 0x0980 && char <= 0x09FF,
     // 'Gurmukhi': (char) => char >= 0x0A00 && char <= 0x0A7F,
@@ -45,7 +45,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Ethiopic': (char) => char >= 0x1200 && char <= 0x137F,
     // 'Ethiopic Supplement': (char) => char >= 0x1380 && char <= 0x139F,
     // 'Cherokee': (char) => char >= 0x13A0 && char <= 0x13FF,
-    'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F,
+    // 'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F,
     // 'Ogham': (char) => char >= 0x1680 && char <= 0x169F,
     // 'Runic': (char) => char >= 0x16A0 && char <= 0x16FF,
     // 'Tagalog': (char) => char >= 0x1700 && char <= 0x171F,
@@ -54,7 +54,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Tagbanwa': (char) => char >= 0x1760 && char <= 0x177F,
     'Khmer': (char) => char >= 0x1780 && char <= 0x17FF,
     // 'Mongolian': (char) => char >= 0x1800 && char <= 0x18AF,
-    'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF,
+    // 'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF,
     // 'Limbu': (char) => char >= 0x1900 && char <= 0x194F,
     // 'Tai Le': (char) => char >= 0x1950 && char <= 0x197F,
     // 'New Tai Lue': (char) => char >= 0x1980 && char <= 0x19DF,
@@ -108,25 +108,25 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Ethiopic Extended': (char) => char >= 0x2D80 && char <= 0x2DDF,
     // 'Cyrillic Extended-A': (char) => char >= 0x2DE0 && char <= 0x2DFF,
     // 'Supplemental Punctuation': (char) => char >= 0x2E00 && char <= 0x2E7F,
-    'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF,
-    'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF,
+    // 'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF,
+    // 'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF,
     'Ideographic Description Characters': (char) => char >= 0x2FF0 && char <= 0x2FFF,
     'CJK Symbols and Punctuation': (char) => char >= 0x3000 && char <= 0x303F,
-    'Hiragana': (char) => char >= 0x3040 && char <= 0x309F,
+    // 'Hiragana': (char) => char >= 0x3040 && char <= 0x309F,
     'Katakana': (char) => char >= 0x30A0 && char <= 0x30FF,
-    'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F,
-    'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F,
+    // 'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F,
+    // 'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F,
     'Kanbun': (char) => char >= 0x3190 && char <= 0x319F,
-    'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF,
+    // 'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF,
     'CJK Strokes': (char) => char >= 0x31C0 && char <= 0x31EF,
-    'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF,
+    // 'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF,
     'Enclosed CJK Letters and Months': (char) => char >= 0x3200 && char <= 0x32FF,
     'CJK Compatibility': (char) => char >= 0x3300 && char <= 0x33FF,
-    'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF,
+    // 'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF,
     'Yijing Hexagram Symbols': (char) => char >= 0x4DC0 && char <= 0x4DFF,
-    'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF,
-    'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F,
-    'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF,
+    // 'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF,
+    // 'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F,
+    // 'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF,
     // 'Lisu': (char) => char >= 0xA4D0 && char <= 0xA4FF,
     // 'Vai': (char) => char >= 0xA500 && char <= 0xA63F,
     // 'Cyrillic Extended-B': (char) => char >= 0xA640 && char <= 0xA69F,
@@ -140,7 +140,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Devanagari Extended': (char) => char >= 0xA8E0 && char <= 0xA8FF,
     // 'Kayah Li': (char) => char >= 0xA900 && char <= 0xA92F,
     // 'Rejang': (char) => char >= 0xA930 && char <= 0xA95F,
-    'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F,
+    // 'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F,
     // 'Javanese': (char) => char >= 0xA980 && char <= 0xA9DF,
     // 'Myanmar Extended-B': (char) => char >= 0xA9E0 && char <= 0xA9FF,
     // 'Cham': (char) => char >= 0xAA00 && char <= 0xAA5F,
@@ -151,21 +151,21 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
     // 'Latin Extended-E': (char) => char >= 0xAB30 && char <= 0xAB6F,
     // 'Cherokee Supplement': (char) => char >= 0xAB70 && char <= 0xABBF,
     // 'Meetei Mayek': (char) => char >= 0xABC0 && char <= 0xABFF,
-    'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF,
-    'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF,
+    // 'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF,
+    // 'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF,
     // 'High Surrogates': (char) => char >= 0xD800 && char <= 0xDB7F,
     // 'High Private Use Surrogates': (char) => char >= 0xDB80 && char <= 0xDBFF,
     // 'Low Surrogates': (char) => char >= 0xDC00 && char <= 0xDFFF,
     'Private Use Area': (char) => char >= 0xE000 && char <= 0xF8FF,
-    'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF,
+    // 'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF,
     // 'Alphabetic Presentation Forms': (char) => char >= 0xFB00 && char <= 0xFB4F,
-    'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF,
+    // 'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF,
     // 'Variation Selectors': (char) => char >= 0xFE00 && char <= 0xFE0F,
     'Vertical Forms': (char) => char >= 0xFE10 && char <= 0xFE1F,
     // 'Combining Half Marks': (char) => char >= 0xFE20 && char <= 0xFE2F,
     'CJK Compatibility Forms': (char) => char >= 0xFE30 && char <= 0xFE4F,
     'Small Form Variants': (char) => char >= 0xFE50 && char <= 0xFE6F,
-    'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF,
+    // 'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF,
     'Halfwidth and Fullwidth Forms': (char) => char >= 0xFF00 && char <= 0xFFEF
     // 'Specials': (char) => char >= 0xFFF0 && char <= 0xFFFF,
     // 'Linear B Syllabary': (char) => char >= 0x10000 && char <= 0x1007F,

diff --git a/src/util/script_detection.test.ts b/src/util/script_detection.test.ts
@@ -1,9 +1,138 @@
-import {charInComplexShapingScript} from './script_detection';
+import {charAllowsIdeographicBreaking, charAllowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, charInRTLScript} from './script_detection';
+
+describe('charAllowsIdeographicBreaking', () => {
+    test('disallows ideographic breaking of Latin text', () => {
+        expect(charAllowsIdeographicBreaking('A'.codePointAt(0))).toBe(false);
+    });
+
+    test('allows ideographic breaking of ideographic punctuation', () => {
+        expect(charAllowsIdeographicBreaking('〈'.codePointAt(0))).toBe(true);
+    });
+
+    test('allows ideographic breaking of Bopomofo text', () => {
+        expect(charAllowsIdeographicBreaking('ㄎ'.codePointAt(0))).toBe(true);
+    });
+
+    test('allows ideographic breaking of Chinese and Vietnamese text', () => {
+        expect(charAllowsIdeographicBreaking('市'.codePointAt(0))).toBe(true);
+        expect(charAllowsIdeographicBreaking('𡔖'.codePointAt(0))).toBe(true);
+    });
+
+    test('disallows ideographic breaking of Korean text', () => {
+        expect(charAllowsIdeographicBreaking('아'.codePointAt(0))).toBe(false);
+    });
+
+    test('allows ideographic breaking of Japanese text', () => {
+        expect(charAllowsIdeographicBreaking('あ'.codePointAt(0))).toBe(true);
+        expect(charAllowsIdeographicBreaking('カ'.codePointAt(0))).toBe(true);
+    });
+
+    test('allows ideographic breaking of Yi text', () => {
+        expect(charAllowsIdeographicBreaking('ꉆ'.codePointAt(0))).toBe(true);
+    });
+});
+
+describe('charAllowsLetterSpacing', () => {
+    test('allows letter spacing of Latin text', () => {
+        expect(charAllowsLetterSpacing('A'.codePointAt(0))).toBe(true);
+    });
+
+    test('disallows letter spacing of Arabic text', () => {
+        // Arabic
+        expect(charAllowsLetterSpacing('۳'.codePointAt(0))).toBe(false);
+        // Arabic Supplement
+        expect(charAllowsLetterSpacing('ݣ'.codePointAt(0))).toBe(false);
+        // Arabic Extended-A
+        expect(charAllowsLetterSpacing('ࢳ'.codePointAt(0))).toBe(false);
+        // Arabic Extended-B
+        expect(charAllowsLetterSpacing('࢐'.codePointAt(0))).toBe(false);
+        // Arabic Presentation Forms-A
+        expect(charAllowsLetterSpacing('ﰤ'.codePointAt(0))).toBe(false);
+        // Arabic Presentation Forms-B
+        expect(charAllowsLetterSpacing('ﺽ'.codePointAt(0))).toBe(false);
+    });
+});
+
+describe('charHasUprightVerticalOrientation', () => {
+    test('rotates Latin text sideways', () => {
+        expect(charHasUprightVerticalOrientation('A'.codePointAt(0))).toBe(false);
+    });
+
+    test('keeps Bopomofo text upright', () => {
+        expect(charHasUprightVerticalOrientation('ㄎ'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Canadian Aboriginal text upright', () => {
+        expect(charHasUprightVerticalOrientation('ᐃ'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Chinese and Vietnamese text upright', () => {
+        expect(charHasUprightVerticalOrientation('市'.codePointAt(0))).toBe(true);
+        expect(charHasUprightVerticalOrientation('𡔖'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Korean text upright', () => {
+        expect(charHasUprightVerticalOrientation('아'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Japanese text upright', () => {
+        expect(charHasUprightVerticalOrientation('あ'.codePointAt(0))).toBe(true);
+        expect(charHasUprightVerticalOrientation('カ'.codePointAt(0))).toBe(true);
+    });
+
+    test('keeps Yi text upright', () => {
+        expect(charHasUprightVerticalOrientation('ꉆ'.codePointAt(0))).toBe(true);
+    });
+});
 
 describe('charInComplexShapingScript', () => {
     test('recognizes that Arabic text needs complex shaping', () => {
+        // Non-Arabic
         expect(charInComplexShapingScript('3'.codePointAt(0))).toBe(false);
+        // Arabic
         expect(charInComplexShapingScript('۳'.codePointAt(0))).toBe(true);
+        // Arabic Supplement
+        expect(charInComplexShapingScript('ݣ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-A
+        expect(charInComplexShapingScript('ࢳ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-B
         expect(charInComplexShapingScript('࢐'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-A
+        expect(charInComplexShapingScript('ﰤ'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-B
+        expect(charInComplexShapingScript('ﺽ'.codePointAt(0))).toBe(true);
+    });
+});
+
+describe('charInRTLScript', () => {
+    test('does not identify direction-neutral text as right-to-left', () => {
+        expect(charInRTLScript('3'.codePointAt(0))).toBe(false);
+    });
+
+    test('identifies Arabic text as right-to-left', () => {
+        // Arabic
+        expect(charInRTLScript('۳'.codePointAt(0))).toBe(true);
+        // Arabic Supplement
+        expect(charInRTLScript('ݣ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-A
+        expect(charInRTLScript('ࢳ'.codePointAt(0))).toBe(true);
+        // Arabic Extended-B
+        expect(charInRTLScript('࢐'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-A
+        expect(charInRTLScript('ﰤ'.codePointAt(0))).toBe(true);
+        // Arabic Presentation Forms-B
+        expect(charInRTLScript('ﺽ'.codePointAt(0))).toBe(true);
+    });
+
+    test('identifies Hebrew text as right-to-left', () => {
+        // Hebrew
+        expect(charInRTLScript('ה'.codePointAt(0))).toBe(true);
+        // Alphabetic Presentation Forms
+        expect(charInRTLScript('ﬡ'.codePointAt(0))).toBe(true);
+    });
+
+    test('identifies Thaana text as right-to-left', () => {
+        // Thaana
+        expect(charInRTLScript('ޘ'.codePointAt(0))).toBe(true);
     });
 });