Skip to content

Commit

Permalink
Break lines based on word segmentation
Browse files Browse the repository at this point in the history
Replaced custom word break heuristics when determining line breaks with a word segmenter.
  • Loading branch information
1ec5 committed Aug 27, 2024
1 parent 7d8dcde commit 0a42e34
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 144 deletions.
96 changes: 21 additions & 75 deletions src/symbol/shaping.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import {
charHasUprightVerticalOrientation,
charAllowsIdeographicBreaking,
charInComplexShapingScript,
rtlScriptRegExp,
splitByGraphemeCluster
Expand Down Expand Up @@ -70,6 +69,7 @@ function isEmpty(positionedLines: Array<PositionedLine>) {
}

// Matches a character accepted by `rtlScriptRegExp` that is immediately followed by a
// nonspacing (gc=Mn) or spacing (gc=Mc) combining mark. Group 1 captures the base character
// and group 2 the mark; the `g` flag makes the pattern stateful when used with exec()/test().
const rtlCombiningMarkRegExp = new RegExp(`(${rtlScriptRegExp.source})([\\p{gc=Mn}\\p{gc=Mc}])`, 'gu');
// Shared word-granularity segmenter, built once at module load. Passing `undefined` as the
// locale lets the runtime fall back to its default locale.
const wordSegmenter = new Intl.Segmenter(undefined, {granularity: 'word'});

/** Anchor keyword: the center, one of the four edges, or one of the four corners. */
export type SymbolAnchor = 'center' | 'left' | 'right' | 'top' | 'bottom' | 'top-left' | 'top-right' | 'bottom-left' | 'bottom-right';
/** Text justification keyword. */
export type TextJustify = 'left' | 'center' | 'right';
Expand Down Expand Up @@ -373,34 +373,6 @@ const whitespace: {
[0x20]: true, // space
};

// Code points after which a line break may be inserted. Absent keys read as `undefined`,
// so the table can be used directly in a truthiness check.
const breakable: Record<number, boolean> = {
    0x0a: true, // newline
    0x20: true, // space
    0x26: true, // ampersand
    0x29: true, // right parenthesis
    0x2b: true, // plus sign
    0x2d: true, // hyphen-minus
    0x2f: true, // solidus
    0xad: true, // soft hyphen
    0xb7: true, // middle dot
    0x200b: true, // zero-width space
    0x2010: true, // hyphen
    0x2013: true, // en dash
    0x2027: true, // interpunct
    // This list is not exhaustive: many other characters may be reasonable breakpoints.
    // Consider "neutral orientation" characters at scriptDetection.charHasNeutralVerticalOrientation
    // See https://github.com/mapbox/mapbox-gl-js/issues/3658
};

// Allow breaks depending on the following character
// Code points that permit a line break immediately *before* them, i.e. the decision
// depends on the character that follows the candidate break position.
const breakableBefore: Record<number, boolean> = {
    0x28: true, // left parenthesis
};

function getGlyphAdvance(
grapheme: string,
section: SectionOptions,
Expand Down Expand Up @@ -465,17 +437,12 @@ function calculateBadness(lineWidth: number,
return raggedness + Math.abs(penalty) * penalty;
}

function calculatePenalty(codePoint: number, nextCodePoint: number, penalizableIdeographicBreak: boolean) {
function calculatePenalty(codePoint: number, nextCodePoint: number) {
let penalty = 0;
// Force break on newline
if (codePoint === 0x0a) {
penalty -= 10000;
}
// Penalize breaks between characters that allow ideographic breaking because
// they are less preferable than breaks at spaces (or zero width spaces).
if (penalizableIdeographicBreak) {
penalty += 150;
}

// Penalize open parenthesis at end of line
if (codePoint === 0x28 || codePoint === 0xff08) {
Expand Down Expand Up @@ -555,49 +522,28 @@ export function determineLineBreaks(
const potentialLineBreaks = [];
const targetWidth = determineAverageLineWidth(logicalInput, spacing, maxWidth, glyphMap, imagePositions, layoutTextSize);

const hasServerSuggestedBreakpoints = logicalInput.text.indexOf('\u200b') >= 0;

let currentX = 0;

let i = 0;
const chars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator]();
let char = chars.next();
const nextChars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator]();
nextChars.next();
let nextChar = nextChars.next();
const nextNextChars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator]();
nextNextChars.next();
nextNextChars.next();
let nextNextChar = nextNextChars.next();

while (!char.done) {
const section = logicalInput.getSection(i);
const segment = char.value;
const codePoint = segment.codePointAt(0);
if (!whitespace[codePoint]) currentX += getGlyphAdvance(segment, section, glyphMap, imagePositions, spacing, layoutTextSize);

// Ideographic characters, spaces, and word-breaking punctuation that often appear without
// surrounding spaces.
if (!nextChar.done) {
const ideographicBreak = charAllowsIdeographicBreaking(codePoint);
const nextSegment = nextChar.value;
const nextCodePoint = nextSegment.codePointAt(0);
if (breakable[codePoint] || ideographicBreak || section.imageName || (!nextNextChar.done && breakableBefore[nextCodePoint])) {

potentialLineBreaks.push(
evaluateBreak(
i + 1,
currentX,
targetWidth,
potentialLineBreaks,
calculatePenalty(codePoint, nextCodePoint, ideographicBreak && hasServerSuggestedBreakpoints),
false));
let graphemeIndex = 0;
for (const {index: wordIndex, segment: word} of wordSegmenter.segment(logicalInput.text)) {
const graphemes = splitByGraphemeCluster(word);
for (const grapheme of graphemes) {
const section = logicalInput.getSection(graphemeIndex);
if (!!grapheme.trim()) {
currentX += getGlyphAdvance(grapheme, section, glyphMap, imagePositions, spacing, layoutTextSize);
}
graphemeIndex++;
}
i++;
char = chars.next();
nextChar = nextChars.next();
nextNextChar = nextNextChars.next();

const nextWordIndex = wordIndex + word.length;
const lastCodePoint = graphemes.at(-1).codePointAt(0);
const nextWordCodePoint = logicalInput.text.codePointAt(nextWordIndex);
if (!nextWordCodePoint) {
continue;
}

const penalty = calculatePenalty(lastCodePoint, nextWordCodePoint);
const lineBreak = evaluateBreak(graphemeIndex, currentX, targetWidth, potentialLineBreaks, penalty, false)
potentialLineBreaks.push(lineBreak);
}

return leastBadBreaks(
Expand Down
37 changes: 1 addition & 36 deletions src/util/script_detection.test.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,4 @@
import {allowsLetterSpacing, charAllowsIdeographicBreaking, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection';

describe('charAllowsIdeographicBreaking', () => {
    // Runs the predicate on the first code point of `sample`, so each expectation
    // can be written against a readable character literal.
    const allowsBreaking = (sample: string) => charAllowsIdeographicBreaking(sample.codePointAt(0));

    test('disallows ideographic breaking of Latin text', () => {
        expect(allowsBreaking('A')).toBe(false);
        expect(allowsBreaking('3')).toBe(false);
    });

    test('allows ideographic breaking of ideographic punctuation', () => {
        expect(allowsBreaking('〈')).toBe(true);
    });

    test('allows ideographic breaking of Bopomofo text', () => {
        expect(allowsBreaking('ㄎ')).toBe(true);
    });

    test('allows ideographic breaking of Chinese and Vietnamese text', () => {
        // Includes supplementary-plane ideographs, which occupy two UTF-16 code units.
        expect(allowsBreaking('市')).toBe(true);
        expect(allowsBreaking('𡔖')).toBe(true);
        expect(allowsBreaking('麵')).toBe(true);
        expect(allowsBreaking('𪚥')).toBe(true);
    });

    test('disallows ideographic breaking of Korean text', () => {
        expect(allowsBreaking('아')).toBe(false);
    });

    test('allows ideographic breaking of Japanese text', () => {
        expect(allowsBreaking('あ')).toBe(true);
        expect(allowsBreaking('カ')).toBe(true);
    });

    test('allows ideographic breaking of Yi text', () => {
        expect(allowsBreaking('ꉆ')).toBe(true);
    });
});
import {allowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection';

describe('allowsLetterSpacing', () => {
test('allows letter spacing of Latin text', () => {
Expand Down
51 changes: 18 additions & 33 deletions src/util/script_detection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ export function splitByGraphemeCluster(text: string) {
return baseSegments;
}

/**
 * Reports whether every code point of a string allows ideographic breaking.
 *
 * @param chars - The text to inspect; it is walked code point by code point, so a
 * surrogate pair counts as a single character.
 * @returns `true` when `charAllowsIdeographicBreaking` accepts every code point
 * (vacuously `true` for the empty string), otherwise `false`.
 */
export function allowsIdeographicBreaking(chars: string) {
    return Array.from(chars).every((char) => charAllowsIdeographicBreaking(char.codePointAt(0)));
}

export function allowsVerticalWritingMode(chars: string) {
for (const char of chars) {
if (charHasUprightVerticalOrientation(char.codePointAt(0))) return true;
Expand Down Expand Up @@ -94,30 +87,6 @@ const ideographicBreakingScriptCodes = [

const ideographicBreakingRegExp = sanitizedRegExpFromScriptCodes(ideographicBreakingScriptCodes);

// Names of the `isChar` block predicates that grant ideographic breaking, in the order
// in which they are consulted.
const ideographicBreakingBlockNames = [
    'CJK Compatibility',
    'CJK Compatibility Forms',
    'CJK Radicals Supplement',
    'CJK Strokes',
    'CJK Symbols and Punctuation',
    'Enclosed CJK Letters and Months',
    'Enclosed Ideographic Supplement',
    'Halfwidth and Fullwidth Forms',
    'Ideographic Description Characters',
    'Ideographic Symbols and Punctuation',
    'Kana Extended-A',
    'Kana Extended-B',
    'Kana Supplement',
    'Kangxi Radicals',
    'Katakana Phonetic Extensions',
    'Small Kana Extension',
    'Vertical Forms',
] as const;

/**
 * Reports whether a single code point allows ideographic breaking.
 *
 * @param char - The code point to classify.
 * @returns `true` if the code point falls in one of the listed blocks or matches
 * `ideographicBreakingRegExp`; `false` otherwise, including for every code point
 * below U+2E80.
 */
export function charAllowsIdeographicBreaking(char: number) {
    // Everything below U+2E80 is outside all ideographic ranges, so skip the lookups.
    if (char < 0x2E80) return false;

    if (ideographicBreakingBlockNames.some((blockName) => isChar[blockName](char))) return true;

    // Not in any listed block: defer to the script-based regular expression.
    return ideographicBreakingRegExp.test(String.fromCodePoint(char));
}

// The following logic comes from
// <https://www.unicode.org/Public/16.0.0/ucd/VerticalOrientation.txt>.
// Keep it synchronized with
Expand Down Expand Up @@ -220,9 +189,25 @@ export function charHasUprightVerticalOrientation(char: number) {
if (/* Canadian Aboriginal */ /\p{sc=Cans}/u.test(String.fromCodePoint(char))) return true;
if (/* Egyptian Hieroglyphs */ /\p{sc=Egyp}/u.test(String.fromCodePoint(char))) return true;
if (/* Hangul */ /\p{sc=Hang}/u.test(String.fromCodePoint(char))) return true;
if (charAllowsIdeographicBreaking(char)) return true;

return false;
if (isChar['CJK Compatibility'](char)) return true;
if (isChar['CJK Compatibility Forms'](char)) return true;
if (isChar['CJK Radicals Supplement'](char)) return true;
if (isChar['CJK Strokes'](char)) return true;
if (isChar['CJK Symbols and Punctuation'](char)) return true;
if (isChar['Enclosed CJK Letters and Months'](char)) return true;
if (isChar['Enclosed Ideographic Supplement'](char)) return true;
if (isChar['Halfwidth and Fullwidth Forms'](char)) return true;
if (isChar['Ideographic Description Characters'](char)) return true;
if (isChar['Ideographic Symbols and Punctuation'](char)) return true;
if (isChar['Kana Extended-A'](char)) return true;
if (isChar['Kana Extended-B'](char)) return true;
if (isChar['Kana Supplement'](char)) return true;
if (isChar['Kangxi Radicals'](char)) return true;
if (isChar['Katakana Phonetic Extensions'](char)) return true;
if (isChar['Small Kana Extension'](char)) return true;
if (isChar['Vertical Forms'](char)) return true;
return ideographicBreakingRegExp.test(String.fromCodePoint(char));
}

/**
Expand Down

0 comments on commit 0a42e34

Please sign in to comment.