Skip to content

Commit

Permalink
Simplify script detection using regular expressions (#4560)
Browse files Browse the repository at this point in the history
* Test script detection functions

* Simplified script detection functions using regular expressions

* Automatically exclude unsupported script codes from regular expressions

* Consolidated ideographic breaking and vertical orientation logic

* Prefer local glyph rendering for all CJKV characters

Simplified the logic for preferring local glyph rendering to consider the script, which requires less maintenance than a hard-coded list of blocks.

---------

Co-authored-by: Harel M <[email protected]>
  • Loading branch information
1ec5 and HarelM authored Aug 15, 2024
1 parent 4797952 commit 42d6847
Show file tree
Hide file tree
Showing 8 changed files with 284 additions and 87 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
- Fix right-to-left layout of labels that contain characters in the Arabic Extended-B code block. ([#4536](https://github.com/maplibre/maplibre-gl-js/pull/4536))
- Fix 3D map freezing when camera is adjusted against map bounds. ([#4537](https://github.com/maplibre/maplibre-gl-js/issues/4537))
- Fix `getStyle()` to return a clone so the object cannot be internally changed ([#4488](https://github.com/maplibre/maplibre-gl-js/issues/4488))

- Prefer local glyph rendering for all CJKV characters, not just those in the CJK Unified Ideographs, Hiragana, Katakana, and Hangul Syllables blocks. ([#4560](https://github.com/maplibre/maplibre-gl-js/pull/4560))
- - _...Add new stuff here..._

## 4.5.2
Expand Down
19 changes: 18 additions & 1 deletion src/render/glyph_manager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ describe('GlyphManager', () => {
// Requests a single CJK ideograph through getGlyphs() and checks the advance
// metric of the glyph that comes back.
test('GlyphManager generates CJK PBF locally', async () => {
// NOTE(review): createGlyphManager is defined outside this view; presumably its
// argument is the local ideograph font family — confirm against the helper.
const manager = createGlyphManager('sans-serif');

// Chinese character píng 平
const returnedGlyphs = await manager.getGlyphs({'Arial Unicode MS': [0x5e73]});
expect(returnedGlyphs['Arial Unicode MS'][0x5e73].metrics.advance).toBe(0.5);
});
Expand All @@ -114,6 +114,23 @@ describe('GlyphManager', () => {
expect(returnedGlyphs['Arial Unicode MS'][0x3066].metrics.advance).toBe(0.5);
});

// Checks _doesCharSupportLocalGlyph() against one representative code point
// per script, in a fixed order.
test('GlyphManager consistently generates CJKV text locally', async () => {
    const manager = createGlyphManager('sans-serif');

    // [code point, expected result]
    const expectations: Array<[number, boolean]> = [
        [0x0020, false], // Space
        [0x5e73, true], // Chinese character píng 平
        [0x30EDE, true], // Chinese character biáng 𰻞
        [0x30c6, true], // Katakana letter te テ
        [0x3066, true], // Hiragana letter te て
        [0xC544, true], // Hangul letter a 아
    ];

    for (const [codePoint, supported] of expectations) {
        expect(manager._doesCharSupportLocalGlyph(codePoint)).toBe(supported);
    }
});

test('GlyphManager caches locally generated glyphs', async () => {

const manager = createGlyphManager('sans-serif');
Expand Down
14 changes: 7 additions & 7 deletions src/render/glyph_manager.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import {loadGlyphRange} from '../style/load_glyph_range';

import TinySDF from '@mapbox/tiny-sdf';
import {unicodeBlockLookup} from '../util/is_char_in_unicode_block';
import {AlphaImage} from '../util/image';

import type {StyleGlyph} from '../style/style_glyph';
Expand Down Expand Up @@ -120,13 +119,14 @@ export class GlyphManager {
}

/**
 * Decides whether the glyph for a code point should be rendered locally
 * instead of being taken from a glyph PBF.
 *
 * @param id - The Unicode code point of the character.
 * @returns `true` only when `localIdeographFontFamily` is truthy and the
 * character is ideographic or belongs to the Hangul, Hiragana, or Katakana
 * script.
 * @throws RangeError from `String.fromCodePoint` if `id` is not a valid code
 * point (only evaluated when `localIdeographFontFamily` is truthy).
 */
_doesCharSupportLocalGlyph(id: number): boolean {
    // The CJK Unified Ideographs blocks and Hangul Syllables blocks are
    // spread across many glyph PBFs and are typically accessed very
    // randomly. Preferring local rendering for these blocks reduces
    // wasteful bandwidth consumption. For visual consistency within CJKV
    // text, also include any other CJKV or siniform ideograph or hangul,
    // hiragana, or katakana character.
    //
    // Matching on Unicode properties rather than on a fixed list of blocks
    // also covers ideographs outside the Basic Multilingual Plane
    // (for example U+30EDE).
    return !!this.localIdeographFontFamily &&
        /\p{Ideo}|\p{sc=Hang}|\p{sc=Hira}|\p{sc=Kana}/u.test(String.fromCodePoint(id));
}

_tinySDF(entry: Entry, stack: string, id: number): StyleGlyph {
Expand Down
4 changes: 2 additions & 2 deletions src/style/style.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ export type StyleOptions = {
validate?: boolean;
/**
* Defines a CSS
* font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges.
* In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
* font-family for locally overriding generation of Chinese, Japanese, and Korean characters.
* For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
* Set to `false` to enable font settings from the map's style for these characters.
* Forces a full update.
*/
Expand Down
4 changes: 2 additions & 2 deletions src/ui/map.ts
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ export type MapOptions = {
fitBoundsOptions?: FitBoundsOptions;
/**
* Defines a CSS
* font-family for locally overriding generation of glyphs in the 'CJK Unified Ideographs', 'Hiragana', 'Katakana' and 'Hangul Syllables' ranges.
* In these ranges, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
* font-family for locally overriding generation of Chinese, Japanese, and Korean characters.
* For these characters, font settings from the map's style will be ignored, except for font-weight keywords (light/regular/medium/bold).
* Set to `false` to enable font settings from the map's style for these characters.
* The purpose of this option is to avoid bandwidth-intensive glyph server requests. (See [Use locally generated ideographs](https://maplibre.org/maplibre-gl-js/docs/examples/local-ideographs).)
* @defaultValue 'sans-serif'
Expand Down
46 changes: 23 additions & 23 deletions src/util/is_char_in_unicode_block.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
// 'Cyrillic Supplement': (char) => char >= 0x0500 && char <= 0x052F,
// 'Armenian': (char) => char >= 0x0530 && char <= 0x058F,
//'Hebrew': (char) => char >= 0x0590 && char <= 0x05FF,
'Arabic': (char) => char >= 0x0600 && char <= 0x06FF,
// 'Arabic': (char) => char >= 0x0600 && char <= 0x06FF,
//'Syriac': (char) => char >= 0x0700 && char <= 0x074F,
'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F,
// 'Arabic Supplement': (char) => char >= 0x0750 && char <= 0x077F,
// 'Thaana': (char) => char >= 0x0780 && char <= 0x07BF,
// 'NKo': (char) => char >= 0x07C0 && char <= 0x07FF,
// 'Samaritan': (char) => char >= 0x0800 && char <= 0x083F,
// 'Mandaic': (char) => char >= 0x0840 && char <= 0x085F,
// 'Syriac Supplement': (char) => char >= 0x0860 && char <= 0x086F,
'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F,
'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF,
// 'Arabic Extended-B': (char) => char >= 0x0870 && char <= 0x089F,
// 'Arabic Extended-A': (char) => char >= 0x08A0 && char <= 0x08FF,
// 'Devanagari': (char) => char >= 0x0900 && char <= 0x097F,
// 'Bengali': (char) => char >= 0x0980 && char <= 0x09FF,
// 'Gurmukhi': (char) => char >= 0x0A00 && char <= 0x0A7F,
Expand All @@ -45,7 +45,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
// 'Ethiopic': (char) => char >= 0x1200 && char <= 0x137F,
// 'Ethiopic Supplement': (char) => char >= 0x1380 && char <= 0x139F,
// 'Cherokee': (char) => char >= 0x13A0 && char <= 0x13FF,
'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F,
// 'Unified Canadian Aboriginal Syllabics': (char) => char >= 0x1400 && char <= 0x167F,
// 'Ogham': (char) => char >= 0x1680 && char <= 0x169F,
// 'Runic': (char) => char >= 0x16A0 && char <= 0x16FF,
// 'Tagalog': (char) => char >= 0x1700 && char <= 0x171F,
Expand All @@ -54,7 +54,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
// 'Tagbanwa': (char) => char >= 0x1760 && char <= 0x177F,
'Khmer': (char) => char >= 0x1780 && char <= 0x17FF,
// 'Mongolian': (char) => char >= 0x1800 && char <= 0x18AF,
'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF,
// 'Unified Canadian Aboriginal Syllabics Extended': (char) => char >= 0x18B0 && char <= 0x18FF,
// 'Limbu': (char) => char >= 0x1900 && char <= 0x194F,
// 'Tai Le': (char) => char >= 0x1950 && char <= 0x197F,
// 'New Tai Lue': (char) => char >= 0x1980 && char <= 0x19DF,
Expand Down Expand Up @@ -108,25 +108,25 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
// 'Ethiopic Extended': (char) => char >= 0x2D80 && char <= 0x2DDF,
// 'Cyrillic Extended-A': (char) => char >= 0x2DE0 && char <= 0x2DFF,
// 'Supplemental Punctuation': (char) => char >= 0x2E00 && char <= 0x2E7F,
'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF,
'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF,
// 'CJK Radicals Supplement': (char) => char >= 0x2E80 && char <= 0x2EFF,
// 'Kangxi Radicals': (char) => char >= 0x2F00 && char <= 0x2FDF,
'Ideographic Description Characters': (char) => char >= 0x2FF0 && char <= 0x2FFF,
'CJK Symbols and Punctuation': (char) => char >= 0x3000 && char <= 0x303F,
'Hiragana': (char) => char >= 0x3040 && char <= 0x309F,
// 'Hiragana': (char) => char >= 0x3040 && char <= 0x309F,
'Katakana': (char) => char >= 0x30A0 && char <= 0x30FF,
'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F,
'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F,
// 'Bopomofo': (char) => char >= 0x3100 && char <= 0x312F,
// 'Hangul Compatibility Jamo': (char) => char >= 0x3130 && char <= 0x318F,
'Kanbun': (char) => char >= 0x3190 && char <= 0x319F,
'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF,
// 'Bopomofo Extended': (char) => char >= 0x31A0 && char <= 0x31BF,
'CJK Strokes': (char) => char >= 0x31C0 && char <= 0x31EF,
'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF,
// 'Katakana Phonetic Extensions': (char) => char >= 0x31F0 && char <= 0x31FF,
'Enclosed CJK Letters and Months': (char) => char >= 0x3200 && char <= 0x32FF,
'CJK Compatibility': (char) => char >= 0x3300 && char <= 0x33FF,
'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF,
// 'CJK Unified Ideographs Extension A': (char) => char >= 0x3400 && char <= 0x4DBF,
'Yijing Hexagram Symbols': (char) => char >= 0x4DC0 && char <= 0x4DFF,
'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF,
'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F,
'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF,
// 'CJK Unified Ideographs': (char) => char >= 0x4E00 && char <= 0x9FFF,
// 'Yi Syllables': (char) => char >= 0xA000 && char <= 0xA48F,
// 'Yi Radicals': (char) => char >= 0xA490 && char <= 0xA4CF,
// 'Lisu': (char) => char >= 0xA4D0 && char <= 0xA4FF,
// 'Vai': (char) => char >= 0xA500 && char <= 0xA63F,
// 'Cyrillic Extended-B': (char) => char >= 0xA640 && char <= 0xA69F,
Expand All @@ -140,7 +140,7 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
// 'Devanagari Extended': (char) => char >= 0xA8E0 && char <= 0xA8FF,
// 'Kayah Li': (char) => char >= 0xA900 && char <= 0xA92F,
// 'Rejang': (char) => char >= 0xA930 && char <= 0xA95F,
'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F,
// 'Hangul Jamo Extended-A': (char) => char >= 0xA960 && char <= 0xA97F,
// 'Javanese': (char) => char >= 0xA980 && char <= 0xA9DF,
// 'Myanmar Extended-B': (char) => char >= 0xA9E0 && char <= 0xA9FF,
// 'Cham': (char) => char >= 0xAA00 && char <= 0xAA5F,
Expand All @@ -151,21 +151,21 @@ export const unicodeBlockLookup: UnicodeBlockLookup = {
// 'Latin Extended-E': (char) => char >= 0xAB30 && char <= 0xAB6F,
// 'Cherokee Supplement': (char) => char >= 0xAB70 && char <= 0xABBF,
// 'Meetei Mayek': (char) => char >= 0xABC0 && char <= 0xABFF,
'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF,
'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF,
// 'Hangul Syllables': (char) => char >= 0xAC00 && char <= 0xD7AF,
// 'Hangul Jamo Extended-B': (char) => char >= 0xD7B0 && char <= 0xD7FF,
// 'High Surrogates': (char) => char >= 0xD800 && char <= 0xDB7F,
// 'High Private Use Surrogates': (char) => char >= 0xDB80 && char <= 0xDBFF,
// 'Low Surrogates': (char) => char >= 0xDC00 && char <= 0xDFFF,
'Private Use Area': (char) => char >= 0xE000 && char <= 0xF8FF,
'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF,
// 'CJK Compatibility Ideographs': (char) => char >= 0xF900 && char <= 0xFAFF,
// 'Alphabetic Presentation Forms': (char) => char >= 0xFB00 && char <= 0xFB4F,
'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF,
// 'Arabic Presentation Forms-A': (char) => char >= 0xFB50 && char <= 0xFDFF,
// 'Variation Selectors': (char) => char >= 0xFE00 && char <= 0xFE0F,
'Vertical Forms': (char) => char >= 0xFE10 && char <= 0xFE1F,
// 'Combining Half Marks': (char) => char >= 0xFE20 && char <= 0xFE2F,
'CJK Compatibility Forms': (char) => char >= 0xFE30 && char <= 0xFE4F,
'Small Form Variants': (char) => char >= 0xFE50 && char <= 0xFE6F,
'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF,
// 'Arabic Presentation Forms-B': (char) => char >= 0xFE70 && char <= 0xFEFF,
'Halfwidth and Fullwidth Forms': (char) => char >= 0xFF00 && char <= 0xFFEF
// 'Specials': (char) => char >= 0xFFF0 && char <= 0xFFFF,
// 'Linear B Syllabary': (char) => char >= 0x10000 && char <= 0x1007F,
Expand Down
131 changes: 130 additions & 1 deletion src/util/script_detection.test.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,138 @@
import {charInComplexShapingScript} from './script_detection';
import {charAllowsIdeographicBreaking, charAllowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, charInRTLScript} from './script_detection';

// Exercises charAllowsIdeographicBreaking() with sample characters from
// several scripts.
describe('charAllowsIdeographicBreaking', () => {
    // Runs the function under test on the first code point of `text`.
    const allowsBreaking = (text: string) => charAllowsIdeographicBreaking(text.codePointAt(0));

    test('disallows ideographic breaking of Latin text', () => {
        expect(allowsBreaking('A')).toBe(false);
    });

    test('allows ideographic breaking of ideographic punctuation', () => {
        expect(allowsBreaking('〈')).toBe(true);
    });

    test('allows ideographic breaking of Bopomofo text', () => {
        expect(allowsBreaking('ㄎ')).toBe(true);
    });

    test('allows ideographic breaking of Chinese and Vietnamese text', () => {
        for (const sample of ['市', '𡔖']) {
            expect(allowsBreaking(sample)).toBe(true);
        }
    });

    test('disallows ideographic breaking of Korean text', () => {
        expect(allowsBreaking('아')).toBe(false);
    });

    test('allows ideographic breaking of Japanese text', () => {
        for (const sample of ['あ', 'カ']) {
            expect(allowsBreaking(sample)).toBe(true);
        }
    });

    test('allows ideographic breaking of Yi text', () => {
        expect(allowsBreaking('ꉆ')).toBe(true);
    });
});

// Exercises charAllowsLetterSpacing() with a Latin letter and with one sample
// character from each Arabic block.
describe('charAllowsLetterSpacing', () => {
    test('allows letter spacing of Latin text', () => {
        expect(charAllowsLetterSpacing('A'.codePointAt(0))).toBe(true);
    });

    // The title names letter spacing, which is what the assertions below check.
    test('disallows letter spacing of Arabic text', () => {
        // Arabic
        expect(charAllowsLetterSpacing('۳'.codePointAt(0))).toBe(false);
        // Arabic Supplement
        expect(charAllowsLetterSpacing('ݣ'.codePointAt(0))).toBe(false);
        // Arabic Extended-A
        expect(charAllowsLetterSpacing('ࢳ'.codePointAt(0))).toBe(false);
        // Arabic Extended-B
        expect(charAllowsLetterSpacing('࢐'.codePointAt(0))).toBe(false);
        // Arabic Presentation Forms-A
        expect(charAllowsLetterSpacing('ﰤ'.codePointAt(0))).toBe(false);
        // Arabic Presentation Forms-B
        expect(charAllowsLetterSpacing('ﺽ'.codePointAt(0))).toBe(false);
    });
});

// Exercises charHasUprightVerticalOrientation() with sample characters from
// several scripts.
describe('charHasUprightVerticalOrientation', () => {
    // Runs the function under test on the first code point of `text`.
    const staysUpright = (text: string) => charHasUprightVerticalOrientation(text.codePointAt(0));

    test('rotates Latin text sideways', () => {
        expect(staysUpright('A')).toBe(false);
    });

    test('keeps Bopomofo text upright', () => {
        expect(staysUpright('ㄎ')).toBe(true);
    });

    test('keeps Canadian Aboriginal text upright', () => {
        expect(staysUpright('ᐃ')).toBe(true);
    });

    test('keeps Chinese and Vietnamese text upright', () => {
        for (const sample of ['市', '𡔖']) {
            expect(staysUpright(sample)).toBe(true);
        }
    });

    test('keeps Korean text upright', () => {
        expect(staysUpright('아')).toBe(true);
    });

    test('keeps Japanese text upright', () => {
        for (const sample of ['あ', 'カ']) {
            expect(staysUpright(sample)).toBe(true);
        }
    });

    test('keeps Yi text upright', () => {
        expect(staysUpright('ꉆ')).toBe(true);
    });
});

// Exercises charInComplexShapingScript() with an ASCII digit and with one
// sample character from each Arabic block.
describe('charInComplexShapingScript', () => {
    test('recognizes that Arabic text needs complex shaping', () => {
        // Non-Arabic
        expect(charInComplexShapingScript('3'.codePointAt(0))).toBe(false);

        const arabicSamples = [
            '۳', // Arabic
            'ݣ', // Arabic Supplement
            'ࢳ', // Arabic Extended-A
            '࢐', // Arabic Extended-B
            'ﰤ', // Arabic Presentation Forms-A
            'ﺽ', // Arabic Presentation Forms-B
        ];
        for (const sample of arabicSamples) {
            expect(charInComplexShapingScript(sample.codePointAt(0))).toBe(true);
        }
    });
});

// Exercises charInRTLScript() with an ASCII digit and with sample characters
// from the Arabic, Hebrew, and Thaana scripts.
describe('charInRTLScript', () => {
    // Runs the function under test on the first code point of `text`.
    const isRTL = (text: string) => charInRTLScript(text.codePointAt(0));

    test('does not identify direction-neutral text as right-to-left', () => {
        expect(isRTL('3')).toBe(false);
    });

    test('identifies Arabic text as right-to-left', () => {
        const arabicSamples = [
            '۳', // Arabic
            'ݣ', // Arabic Supplement
            'ࢳ', // Arabic Extended-A
            '࢐', // Arabic Extended-B
            'ﰤ', // Arabic Presentation Forms-A
            'ﺽ', // Arabic Presentation Forms-B
        ];
        for (const sample of arabicSamples) {
            expect(isRTL(sample)).toBe(true);
        }
    });

    test('identifies Hebrew text as right-to-left', () => {
        const hebrewSamples = [
            'ה', // Hebrew
            'ﬡ', // Alphabetic Presentation Forms
        ];
        for (const sample of hebrewSamples) {
            expect(isRTL(sample)).toBe(true);
        }
    });

    test('identifies Thaana text as right-to-left', () => {
        // Thaana
        expect(isRTL('ޘ')).toBe(true);
    });
});
Loading

0 comments on commit 42d6847

Please sign in to comment.