SegmenterDefault.txt: more remapping, less renaming (unicode-org#970)

* Use remap rules for word and sentence too
* No CM1 or ZWJ_O
* Regenerate UCD
* ^ rather than a variable called Not, UnicodeSet unions rather than |
* Regenerate UCD
markusicu · Nov 25, 2024 · d1bdcb8 · d1bdcb8
1 parent c4731c8
commit d1bdcb8
Show file tree

Hide file tree

Showing 7 changed files with 23,074 additions and 22,610 deletions.
diff --git a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.txt
diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.txt
diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html
diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt
diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt
@@ -72,7 +72,7 @@ $BK=\p{Line_Break=Mandatory_Break}
 $CB=\p{Line_Break=Contingent_Break}
 $CL=\p{Line_Break=Close_Punctuation}
 $CP=\p{Line_Break=CP}
-$CM1=\p{Line_Break=Combining_Mark}
+$CM=\p{Line_Break=Combining_Mark}
 $CR=\p{Line_Break=Carriage_Return}
 $EX=\p{Line_Break=Exclamation}
 $GL=\p{Line_Break=Glue}
@@ -107,7 +107,6 @@ $CJ=\p{Line_Break=Conditional_Japanese_Starter}
 $RI=\p{Line_Break=Regional_Indicator}
 $EB=\p{Line_Break=E_Base}
 $EM=\p{Line_Break=E_Modifier}
-$ZWJ_O=\p{Line_Break=ZWJ}
 $ZWJ=\p{Line_Break=ZWJ}
 
 $QU_Pi=[$QU & \p{gc=Pi}]
@@ -116,10 +115,10 @@ $QU_Pf=[$QU & \p{gc=Pf}]
 $QUmPi=[$QU - \p{gc=Pi}]
 $QUmPf=[$QU - \p{gc=Pf}]
 
-$NotEastAsian   = [^\p{ea=F}\p{ea=W}\p{ea=H}]
-$NonEastAsianBA = [$BA & $NotEastAsian]
+$EastAsian   = [\p{ea=F}\p{ea=W}\p{ea=H}]
+$NonEastAsianBA = [$BA & [^$EastAsian]]
 
-$DottedCircle = ◌
+$DottedCircle = [◌]
 $Hyphen = [\u2010]
 
 $CP30=[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
@@ -135,18 +134,13 @@ $eot=(?!.)
 
 # SPECIAL EXTENSIONS
 
-$CM=[$CM1 $ZWJ]
 # LB 1  Assign a line breaking class to each code point of the input. 
 # Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.
 # NOTE: CB is ok to fall through, but must handle others here.
 ##	show $AL
 $AL=[$AI $AL $SG $XX $SA]
 $NS=[$NS $CJ]
 
-# MACROS
-
-$Spec3a_=[^ $SP $BA $HY]
-
 # RULES
 
 # LB 4  Always break after hard line breaks (but never between CR and LF).
@@ -164,7 +158,7 @@ $Spec3a_=[^ $SP $BA $HY]
 # LB 8  Break before any character following a zero-width space, even if one or more spaces intervene.
 8) $ZW $SP* ÷
 # LB 8a  Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences)
-8.1) $ZWJ_O ×
+8.1) $ZWJ ×
 # LB 9  Do not break a combining character sequence; treat it as if it has the line breaking class
 # of the base character in all of the following rules. Treat ZWJ as if it were CM.
 9) (?<X>[^$BK $CR $LF $NL $SP $ZW]) ( $CM | $ZWJ )* → ${X}
@@ -176,7 +170,7 @@ $Spec3a_=[^ $SP $BA $HY]
 # LB 12  Do not break after NBSP and related characters.
 12) $GL ×
 # LB 12a Do not break before NBSP and related characters, except after spaces and hyphens.
-12.1) $Spec3a_ × $GL
+12.1) [^ $SP $BA $HY] × $GL
 # LB 13  Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces.
 13.01) × $EX
 13.02) × $CL
@@ -205,10 +199,10 @@ $Spec3a_=[^ $SP $BA $HY]
 19.01) × $QUmPi
 19.02) $QUmPf ×
 # LB 19a Unless surrounded by East Asian Characters, do not break either side of any unresolved quotation marks.
-19.10) $NotEastAsian × $QU
-19.11) × $QU ( $NotEastAsian | $eot )
-19.12) $QU × $NotEastAsian
-19.13) ( $sot | $NotEastAsian ) $QU ×
+19.10) [^$EastAsian] × $QU
+19.11) × $QU ( [^$EastAsian] | $eot )
+19.12) $QU × [^$EastAsian]
+19.13) ( $sot | [^$EastAsian] ) $QU ×
 # LB 20  Break before and after unresolved CB.
 20.01)  ÷ $CB
 20.02) $CB ÷
@@ -306,27 +300,10 @@ $Any=.
 ## Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.
 ##	$NotStuff=[^$OLetter $Upper $Lower $Sep]
 ##	# $ATerm and $Sterm are temporary, to match ICU until UTC decides.
-
-# WARNING: For Rule 5, now add format and extend to everything but Sep, Format, and Extend
-
-$FE=[$Format $Extend]
-$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]
-##	$NotSep_=[^ $Sep $CR $LF]
-##	$FE=$Extend* $Format*
-$Sp=($Sp $FE*)
-$Lower=($Lower $FE*)
-$Upper=($Upper $FE*)
-$OLetter=($OLetter $FE*)
-$Numeric=($Numeric $FE*)
-$ATerm=($ATerm $FE*)
-$STerm=($STerm $FE*)
-$Close=($Close $FE*)
-$SContinue=($SContinue $FE*)
-
 # MACROS
 
-$ParaSep = ($Sep | $CR | $LF)
-$SATerm = ($STerm | $ATerm)
+$ParaSep = [$Sep $CR $LF]
+$SATerm = [$STerm $ATerm]
 
 # RULES
 
@@ -337,18 +314,16 @@ $SATerm = ($STerm | $ATerm)
 4) $ParaSep  	÷
 ##	3.4) ( $Control | $CR | $LF ) 	÷
 ##	3.5) ÷ 	( $Control | $CR | $LF )
-# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
-# WARNING: Implemented as don't break before format (except after linebreaks),
-# AND add format and extend in all variables definitions that appear after this point!
-##	3.91) [^$Control | $CR | $LF] × 	$Extend
-5) × [$Format $Extend]
+# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.)
+# This also has the effect of: Any × (Format | Extend)
+5) (?<X>[^$ParaSep]) ( $Extend | $Format )* → ${X}
 # Do not break after full stop in certain contexts. [See note below.]
 # Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,
 # is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.
 # For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
 6) $ATerm 	× 	$Numeric
 7) ($Upper | $Lower) $ATerm 	× 	$Upper
-8) $ATerm $Close* $Sp* 	× 	$NotPreLower_* $Lower
+8) $ATerm $Close* $Sp* 	× 	[^ $OLetter $Upper $Lower $ParaSep $SATerm]* $Lower
 8.1) $SATerm $Close* $Sp* 	× 	($SContinue | $SATerm)
 # Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
 9) $SATerm $Close* 	× 	( $Close | $Sp | $ParaSep )
@@ -393,38 +368,12 @@ $WSegSpace=\p{Word_Break=WSegSpace}
 
 # MACROS
 
-$AHLetter=($ALetter | $Hebrew_Letter)
-$MidNumLetQ=($MidNumLet | $Single_Quote)
+$AHLetter=[$ALetter $Hebrew_Letter]
+$MidNumLetQ=[$MidNumLet $Single_Quote]
 ## WARNING: For Rule 4: Fixes for GC, Format
 ##	# Subtract Format from Control, since we don't want to break before/after
 ##	$Control=[$Control-$Format]
 
-# SPECIAL EXTENSIONS
-
-# Add format and extend to everything
-$FE=[$Format $Extend $ZWJ]
-
-$NotBreak_=[^ $Newline $CR $LF ]
-##	$FE= ($Extend | $Format)*
-$Katakana=($Katakana $FE*)
-$ALetter=($ALetter $FE*)
-$MidLetter=($MidLetter $FE*)
-$MidNum=($MidNum $FE*)
-$MidNumLet=($MidNumLet $FE*)
-$Numeric=($Numeric $FE*)
-$ExtendNumLet=($ExtendNumLet $FE*)
-$RI=($RI $FE*)
-$Hebrew_Letter=($Hebrew_Letter $FE*)
-$Double_Quote=($Double_Quote $FE*)
-$Single_Quote=($Single_Quote $FE*)
-##	$E_Base=($E_Base $FE*)
-##	$E_Modifier=($E_Modifier $FE*)
-##	$ZWJ=($ZWJ $FE*) # don't do this one!
-##	$Glue_After_Zwj=($Glue_After_Zwj $FE*)
-##	$EBG=($EBG $FE*)
-$AHLetter=($AHLetter $FE*)
-$MidNumLetQ=($MidNumLetQ $FE*)
-
 # RULES
 
 # Break at the start and end of text, unless the text is empty.
@@ -440,11 +389,9 @@ $MidNumLetQ=($MidNumLetQ $FE*)
 ##	3.5) ÷ 	( $Control | $CR | $LF )
 ##	3.9) × 	$Extend
 ##	3.91) [^$Control | $CR | $LF] × 	$Extend
-# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
-# WARNING: Implemented as don't break before format (except after linebreaks),
-# AND add format and extend in all variables definitions that appear after this point!
-##	4) × [$Format $Extend]
-4) $NotBreak_ × [$Format $Extend $ZWJ]
+# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.)
+# This also has the effect of: Any × (Format | Extend | ZWJ)
+4) (?<X>[^$CR $LF $Newline]) ($Extend | $Format | $ZWJ)* → ${X}
 
 # VANILLA RULES