Clean up GenerateBreakTest (#975)

This fulfils the following action item: UTC-155-A89 (Robin Leroy, PAG): “Document extra classes used for testing characters in the segmentation test HTML files for 11.0. [E.g. ZWJ_FE, CM1_CM, etc.] (Retargeted for 13.0, 14.0, 15.0.)” It also fixes #354, and it changes the pair table in LineBreakTest.html to show the three-way direct/indirect/prohibited break distinction (across spaces), like the old pair table in UAX14 (see https://www.unicode.org/notes/tn54/alba-2.html?v=9.0.0). As in #970, the test files are not diffable, but I tested them with ICU. I tried to improve the stability of the sample generation a little, but the result is not as fancy as what Mark suggests in https://www.unicode.org/notes/tn54/alba-2.html?v=9.0.0#p478. I might do that in another PR.
unicode-org · Nov 28, 2024 · 7ae67b4 · 7ae67b4
1 parent b1e89f4
commit 7ae67b4
Show file tree

Hide file tree

Showing 11 changed files with 24,043 additions and 23,212 deletions.
diff --git a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.txt
diff --git a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.txt
diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.txt
diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java
diff --git a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java
diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt
@@ -16,19 +16,15 @@ $V=\p{Grapheme_Cluster_Break=V}
 $T=\p{Grapheme_Cluster_Break=T}
 $LV=\p{Grapheme_Cluster_Break=LV}
 $LVT=\p{Grapheme_Cluster_Break=LVT}
-# Note: The following may overlap with the above
-# Note: ConjunctLinkingScripts is not used anymore, instead that list exists in the derivation of Indic_Conjunct_Break.
-# It is kept here so that the diff of the generated test cases compared to the Unicode 15.1 β is minimal.
-# TODO(egg): Consider removing in Unicode 16.0.
-$ConjunctLinkingScripts=[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}]
 $ConjunctLinker=\p{Indic_Conjunct_Break=Linker}
 $LinkingConsonant=\p{Indic_Conjunct_Break=Consonant}
 ##	$E_Base=\p{Grapheme_Cluster_Break=E_Base}
 ##	$E_Modifier=\p{Grapheme_Cluster_Break=E_Modifier}
 $ExtPict=\p{Extended_Pictographic}
-$ExtCccZwj=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]
+$ConjunctExtender=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]
 ##	$EBG=\p{Grapheme_Cluster_Break=E_Base_GAZ}
 ##	$Glue_After_Zwj=\p{Grapheme_Cluster_Break=Glue_After_Zwj}
+$XX = \p{Grapheme_Cluster_Break=Other}
 
 # RULES
 
@@ -47,7 +43,7 @@ $ExtCccZwj=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]
 # Only for extended grapheme clusters: Do not break before SpacingMarks, or after Prepend characters.
 9.1) × 	$SpacingMark
 9.2) $Prepend  ×
-9.3) $LinkingConsonant $ExtCccZwj* $ConjunctLinker $ExtCccZwj*  × $LinkingConsonant
+9.3) $LinkingConsonant $ConjunctExtender* $ConjunctLinker $ConjunctExtender*  × $LinkingConsonant
 ## Do not break within emoji modifier sequences or emoji zwj sequences.
 ##	10) $E_Base $Extend* × $E_Modifier
 11) $ExtPict $Extend* $ZWJ × $ExtPict
@@ -62,7 +58,7 @@ $ExtCccZwj=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]
 
 $AI=\p{Line_Break=Ambiguous}
 $AK=\p{Line_Break=Aksara}
-$AL=\p{Line_Break=Alphabetic}
+$ALorig=\p{Line_Break=Alphabetic}
 $AP=\p{Line_Break=Aksara_Prebase}
 $AS=\p{Line_Break=Aksara_Start}
 $B2=\p{Line_Break=Break_Both}
@@ -72,7 +68,7 @@ $BK=\p{Line_Break=Mandatory_Break}
 $CB=\p{Line_Break=Contingent_Break}
 $CL=\p{Line_Break=Close_Punctuation}
 $CP=\p{Line_Break=CP}
-$CM=\p{Line_Break=Combining_Mark}
+$CMorig=\p{Line_Break=Combining_Mark}
 $CR=\p{Line_Break=Carriage_Return}
 $EX=\p{Line_Break=Exclamation}
 $GL=\p{Line_Break=Glue}
@@ -88,13 +84,15 @@ $JT=\p{Line_Break=JT}
 $JV=\p{Line_Break=JV}
 $LF=\p{Line_Break=Line_Feed}
 $NL=\p{Line_Break=Next_Line}
-$NS=\p{Line_Break=Nonstarter}
+$NSorig=\p{Line_Break=Nonstarter}
 $NU=\p{Line_Break=Numeric}
 $OP=\p{Line_Break=Open_Punctuation}
 $PO=\p{Line_Break=Postfix_Numeric}
 $PR=\p{Line_Break=Prefix_Numeric}
 $QU=\p{Line_Break=Quotation}
-$SA=\p{Line_Break=Complex_Context}
+$SA_Mn=[\p{Line_Break=Complex_Context}&\p{gc=Mn}]
+$SA_Mc=[\p{Line_Break=Complex_Context}&\p{gc=Mc}]
+$SAmMnmMc=[\p{Line_Break=Complex_Context}-\p{gc=Mn}-\p{gc=Mc}]
 $SG=\p{Line_Break=Surrogate}
 $SP=\p{Line_Break=Space}
 $SY=\p{Line_Break=Break_Symbols}
@@ -109,20 +107,23 @@ $EB=\p{Line_Break=E_Base}
 $EM=\p{Line_Break=E_Modifier}
 $ZWJ=\p{Line_Break=ZWJ}
 
-$QU_Pi=[$QU & \p{gc=Pi}]
-$QU_Pf=[$QU & \p{gc=Pf}]
+$Pi = \p{gc=Pi}
+$Pf = \p{gc=Pf}
+
+$QU_Pi=[$QU & $Pi]
+$QU_Pf=[$QU & $Pf]
 
-$QUmPi=[$QU - \p{gc=Pi}]
-$QUmPf=[$QU - \p{gc=Pf}]
+$QUmPi=[$QU - $Pi]
+$QUmPf=[$QU - $Pf]
 
 $EastAsian   = [\p{ea=F}\p{ea=W}\p{ea=H}]
 $NonEastAsianBA = [$BA & [^$EastAsian]]
 
 $DottedCircle = [◌]
 $Hyphen = [\u2010]
 
-$CP30=[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
-$OP30=[$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
+$CPmEastAsian=[$CP-$EastAsian]
+$OPmEastAsian=[$OP-$EastAsian]
 
 $ExtPictUnassigned=[\p{Extended_Pictographic}&\p{gc=Cn}]
 
@@ -136,10 +137,11 @@ $eot=(?!.)
 
 # LB 1  Assign a line breaking class to each code point of the input. 
 # Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.
-# NOTE: CB is ok to fall through, but must handle others here.
-##	show $AL
-$AL=[$AI $AL $SG $XX $SA]
-$NS=[$NS $CJ]
+# In the absence of such criteria all characters with a specific combination of
+# original class and General_Category property value are resolved as follows:
+$AL=[$AI $ALorig $SG $XX $SAmMnmMc]
+$CM=[$CMorig $SA_Mn $SA_Mc]
+$NS=[$NSorig $CJ]
 
 # RULES
 
@@ -263,8 +265,8 @@ $NS=[$NS $CJ]
 # LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").
 29) $IS × ($AL | $HL)
 # LB 30  Do not break between letters, numbers or ordinary symbols and opening or closing punctuation.
-30.01) ($AL | $HL | $NU) × $OP30
-30.02) $CP30 × ($AL | $HL | $NU)
+30.01) ($AL | $HL | $NU) × $OPmEastAsian
+30.02) $CPmEastAsian × ($AL | $HL | $NU)
 # LB 30a  Break between two Regional Indicators if and only if there is an even number of them before the point being considered.
 30.11) $sot ($RI $RI)* $RI × $RI
 30.12) [^$RI] ($RI $RI)* $RI × $RI
@@ -291,6 +293,7 @@ $ATerm=\p{Sentence_Break=ATerm}
 $STerm=\p{Sentence_Break=STerm}
 $Close=\p{Sentence_Break=Close}
 $SContinue=\p{Sentence_Break=SContinue}
+$XX=\p{Sentence_Break=Other}
 $Any=.
 
 # SPECIAL EXTENSIONS
@@ -365,6 +368,7 @@ $ExtPict=\p{Extended_Pictographic}
 ##	$EBG=\p{Word_Break=E_Base_GAZ}
 ##	$Glue_After_Zwj=\p{Word_Break=Glue_After_Zwj}
 $WSegSpace=\p{Word_Break=WSegSpace}
+$XX=\p{Word_Break=Other}
 
 # MACROS