Break lines based on word segmentation

Replaced the custom word-break heuristics used when determining line breaks with a word segmenter (`Intl.Segmenter` with `granularity: 'word'`).
1ec5 · Aug 27, 2024 · 0a42e34 · 0a42e34
1 parent 7d8dcde
commit 0a42e34
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 144 deletions.
diff --git a/src/symbol/shaping.ts b/src/symbol/shaping.ts
@@ -1,6 +1,5 @@
 import {
     charHasUprightVerticalOrientation,
-    charAllowsIdeographicBreaking,
     charInComplexShapingScript,
     rtlScriptRegExp,
     splitByGraphemeCluster
@@ -70,6 +69,7 @@ function isEmpty(positionedLines: Array<PositionedLine>) {
 }
 
 const rtlCombiningMarkRegExp = new RegExp(`(${rtlScriptRegExp.source})([\\p{gc=Mn}\\p{gc=Mc}])`, 'gu');
+const wordSegmenter = new Intl.Segmenter(undefined, {granularity: 'word'});
 
 export type SymbolAnchor = 'center' | 'left' | 'right' | 'top' | 'bottom' | 'top-left' | 'top-right' | 'bottom-left' | 'bottom-right';
 export type TextJustify = 'left' | 'center' | 'right';
@@ -373,34 +373,6 @@ const whitespace: {
     [0x20]: true, // space
 };
 
-const breakable: {
-    [_: number]: boolean;
-} = {
-    [0x0a]: true, // newline
-    [0x20]: true, // space
-    [0x26]: true, // ampersand
-    [0x29]: true, // right parenthesis
-    [0x2b]: true, // plus sign
-    [0x2d]: true, // hyphen-minus
-    [0x2f]: true, // solidus
-    [0xad]: true, // soft hyphen
-    [0xb7]: true, // middle dot
-    [0x200b]: true, // zero-width space
-    [0x2010]: true, // hyphen
-    [0x2013]: true, // en dash
-    [0x2027]: true  // interpunct
-    // Many other characters may be reasonable breakpoints
-    // Consider "neutral orientation" characters at scriptDetection.charHasNeutralVerticalOrientation
-    // See https://github.com/mapbox/mapbox-gl-js/issues/3658
-};
-
-// Allow breaks depending on the following character
-const breakableBefore: {
-    [_: number]: boolean;
-} = {
-    [0x28]: true, // left parenthesis
-};
-
 function getGlyphAdvance(
     grapheme: string,
     section: SectionOptions,
@@ -465,17 +437,12 @@ function calculateBadness(lineWidth: number,
     return raggedness + Math.abs(penalty) * penalty;
 }
 
-function calculatePenalty(codePoint: number, nextCodePoint: number, penalizableIdeographicBreak: boolean) {
+function calculatePenalty(codePoint: number, nextCodePoint: number) {
     let penalty = 0;
     // Force break on newline
     if (codePoint === 0x0a) {
         penalty -= 10000;
     }
-    // Penalize breaks between characters that allow ideographic breaking because
-    // they are less preferable than breaks at spaces (or zero width spaces).
-    if (penalizableIdeographicBreak) {
-        penalty += 150;
-    }
 
     // Penalize open parenthesis at end of line
     if (codePoint === 0x28 || codePoint === 0xff08) {
@@ -555,49 +522,28 @@ export function determineLineBreaks(
     const potentialLineBreaks = [];
     const targetWidth = determineAverageLineWidth(logicalInput, spacing, maxWidth, glyphMap, imagePositions, layoutTextSize);
 
-    const hasServerSuggestedBreakpoints = logicalInput.text.indexOf('\u200b') >= 0;
-
     let currentX = 0;
-
-    let i = 0;
-    const chars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator]();
-    let char = chars.next();
-    const nextChars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator]();
-    nextChars.next();
-    let nextChar = nextChars.next();
-    const nextNextChars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator]();
-    nextNextChars.next();
-    nextNextChars.next();
-    let nextNextChar = nextNextChars.next();
-
-    while (!char.done) {
-        const section = logicalInput.getSection(i);
-        const segment = char.value;
-        const codePoint = segment.codePointAt(0);
-        if (!whitespace[codePoint]) currentX += getGlyphAdvance(segment, section, glyphMap, imagePositions, spacing, layoutTextSize);
-
-        // Ideographic characters, spaces, and word-breaking punctuation that often appear without
-        // surrounding spaces.
-        if (!nextChar.done) {
-            const ideographicBreak = charAllowsIdeographicBreaking(codePoint);
-            const nextSegment = nextChar.value;
-            const nextCodePoint = nextSegment.codePointAt(0);
-            if (breakable[codePoint] || ideographicBreak || section.imageName || (!nextNextChar.done && breakableBefore[nextCodePoint])) {
-
-                potentialLineBreaks.push(
-                    evaluateBreak(
-                        i + 1,
-                        currentX,
-                        targetWidth,
-                        potentialLineBreaks,
-                        calculatePenalty(codePoint, nextCodePoint, ideographicBreak && hasServerSuggestedBreakpoints),
-                        false));
+    let graphemeIndex = 0;
+    for (const {index: wordIndex, segment: word} of wordSegmenter.segment(logicalInput.text)) {
+        const graphemes = splitByGraphemeCluster(word);
+        for (const grapheme of graphemes) {
+            const section = logicalInput.getSection(graphemeIndex);
+            if (!!grapheme.trim()) {
+                currentX += getGlyphAdvance(grapheme, section, glyphMap, imagePositions, spacing, layoutTextSize);
             }
+            graphemeIndex++;
         }
-        i++;
-        char = chars.next();
-        nextChar = nextChars.next();
-        nextNextChar = nextNextChars.next();
+
+        const nextWordIndex = wordIndex + word.length;
+        const lastCodePoint = graphemes.at(-1).codePointAt(0);
+        const nextWordCodePoint = logicalInput.text.codePointAt(nextWordIndex);
+        if (!nextWordCodePoint) {
+            continue;
+        }
+
+        const penalty = calculatePenalty(lastCodePoint, nextWordCodePoint);
+        const lineBreak = evaluateBreak(graphemeIndex, currentX, targetWidth, potentialLineBreaks, penalty, false)
+        potentialLineBreaks.push(lineBreak);
     }
 
     return leastBadBreaks(

diff --git a/src/util/script_detection.test.ts b/src/util/script_detection.test.ts
@@ -1,39 +1,4 @@
-import {allowsLetterSpacing, charAllowsIdeographicBreaking, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection';
-
-describe('charAllowsIdeographicBreaking', () => {
-    test('disallows ideographic breaking of Latin text', () => {
-        expect(charAllowsIdeographicBreaking('A'.codePointAt(0))).toBe(false);
-        expect(charAllowsIdeographicBreaking('3'.codePointAt(0))).toBe(false);
-    });
-
-    test('allows ideographic breaking of ideographic punctuation', () => {
-        expect(charAllowsIdeographicBreaking('〈'.codePointAt(0))).toBe(true);
-    });
-
-    test('allows ideographic breaking of Bopomofo text', () => {
-        expect(charAllowsIdeographicBreaking('ㄎ'.codePointAt(0))).toBe(true);
-    });
-
-    test('allows ideographic breaking of Chinese and Vietnamese text', () => {
-        expect(charAllowsIdeographicBreaking('市'.codePointAt(0))).toBe(true);
-        expect(charAllowsIdeographicBreaking('𡔖'.codePointAt(0))).toBe(true);
-        expect(charAllowsIdeographicBreaking('麵'.codePointAt(0))).toBe(true);
-        expect(charAllowsIdeographicBreaking('𪚥'.codePointAt(0))).toBe(true);
-    });
-
-    test('disallows ideographic breaking of Korean text', () => {
-        expect(charAllowsIdeographicBreaking('아'.codePointAt(0))).toBe(false);
-    });
-
-    test('allows ideographic breaking of Japanese text', () => {
-        expect(charAllowsIdeographicBreaking('あ'.codePointAt(0))).toBe(true);
-        expect(charAllowsIdeographicBreaking('カ'.codePointAt(0))).toBe(true);
-    });
-
-    test('allows ideographic breaking of Yi text', () => {
-        expect(charAllowsIdeographicBreaking('ꉆ'.codePointAt(0))).toBe(true);
-    });
-});
+import {allowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection';
 
 describe('allowsLetterSpacing', () => {
     test('allows letter spacing of Latin text', () => {

diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts
@@ -28,13 +28,6 @@ export function splitByGraphemeCluster(text: string) {
     return baseSegments;
 }
 
-export function allowsIdeographicBreaking(chars: string) {
-    for (const char of chars) {
-        if (!charAllowsIdeographicBreaking(char.codePointAt(0))) return false;
-    }
-    return true;
-}
-
 export function allowsVerticalWritingMode(chars: string) {
     for (const char of chars) {
         if (charHasUprightVerticalOrientation(char.codePointAt(0))) return true;
@@ -94,30 +87,6 @@ const ideographicBreakingScriptCodes = [
 
 const ideographicBreakingRegExp = sanitizedRegExpFromScriptCodes(ideographicBreakingScriptCodes);
 
-export function charAllowsIdeographicBreaking(char: number) {
-    // Return early for characters outside all ideographic ranges.
-    if (char < 0x2E80) return false;
-
-    if (isChar['CJK Compatibility'](char)) return true;
-    if (isChar['CJK Compatibility Forms'](char)) return true;
-    if (isChar['CJK Radicals Supplement'](char)) return true;
-    if (isChar['CJK Strokes'](char)) return true;
-    if (isChar['CJK Symbols and Punctuation'](char)) return true;
-    if (isChar['Enclosed CJK Letters and Months'](char)) return true;
-    if (isChar['Enclosed Ideographic Supplement'](char)) return true;
-    if (isChar['Halfwidth and Fullwidth Forms'](char)) return true;
-    if (isChar['Ideographic Description Characters'](char)) return true;
-    if (isChar['Ideographic Symbols and Punctuation'](char)) return true;
-    if (isChar['Kana Extended-A'](char)) return true;
-    if (isChar['Kana Extended-B'](char)) return true;
-    if (isChar['Kana Supplement'](char)) return true;
-    if (isChar['Kangxi Radicals'](char)) return true;
-    if (isChar['Katakana Phonetic Extensions'](char)) return true;
-    if (isChar['Small Kana Extension'](char)) return true;
-    if (isChar['Vertical Forms'](char)) return true;
-    return ideographicBreakingRegExp.test(String.fromCodePoint(char));
-}
-
 // The following logic comes from
 // <https://www.unicode.org/Public/16.0.0/ucd/VerticalOrientation.txt>.
 // Keep it synchronized with
@@ -220,9 +189,25 @@ export function charHasUprightVerticalOrientation(char: number) {
     if (/* Canadian Aboriginal */ /\p{sc=Cans}/u.test(String.fromCodePoint(char))) return true;
     if (/* Egyptian Hieroglyphs */ /\p{sc=Egyp}/u.test(String.fromCodePoint(char))) return true;
     if (/* Hangul */ /\p{sc=Hang}/u.test(String.fromCodePoint(char))) return true;
-    if (charAllowsIdeographicBreaking(char)) return true;
 
-    return false;
+    if (isChar['CJK Compatibility'](char)) return true;
+    if (isChar['CJK Compatibility Forms'](char)) return true;
+    if (isChar['CJK Radicals Supplement'](char)) return true;
+    if (isChar['CJK Strokes'](char)) return true;
+    if (isChar['CJK Symbols and Punctuation'](char)) return true;
+    if (isChar['Enclosed CJK Letters and Months'](char)) return true;
+    if (isChar['Enclosed Ideographic Supplement'](char)) return true;
+    if (isChar['Halfwidth and Fullwidth Forms'](char)) return true;
+    if (isChar['Ideographic Description Characters'](char)) return true;
+    if (isChar['Ideographic Symbols and Punctuation'](char)) return true;
+    if (isChar['Kana Extended-A'](char)) return true;
+    if (isChar['Kana Extended-B'](char)) return true;
+    if (isChar['Kana Supplement'](char)) return true;
+    if (isChar['Kangxi Radicals'](char)) return true;
+    if (isChar['Katakana Phonetic Extensions'](char)) return true;
+    if (isChar['Small Kana Extension'](char)) return true;
+    if (isChar['Vertical Forms'](char)) return true;
+    return ideographicBreakingRegExp.test(String.fromCodePoint(char));
 }
 
 /**