-
Notifications
You must be signed in to change notification settings - Fork 8.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tools: add a powershell script to generate CPWD from the UCD (#5946)
This commit introduces Generate-CodepointWidthsFromUCD, a powershell (7+) script that will parse a UCD XML database in the UAX 42 format from https://www.unicode.org/Public/UCD/latest/ucdxml/ and generate CodepointWidthDetector's giant width array. By default, it will emit one UnicodeRange for every range of non-narrow glyphs with a different Width + Emoji + Emoji Presentation class; however, it can be run in "packing" and "full" mode. * Packing mode: ignore the width/emoji/pres class and combine adjacent runs that CPWD will treat the same. * This is for optimizing the number of individual ranges emitted into code. * Full mode: include narrow codepoints (helpful for visualization) It also supports overrides, provided in an XML document of the same format as the UCD itself. Entries in the overrides files are applied after the entire UCD is read and will replace any impacted ranges. The output (when packing) looks like this: ```c++ // Generated by Generate-CodepointWidthsFromUCD -Pack:True -Full:False // on 05/17/2020 02:47:55 (UTC) from Unicode 13.0.0. // 66182 (0x10286) codepoints covered. static constexpr std::array<UnicodeRange, 23> s_wideAndAmbiguousTable{ UnicodeRange{ 0xa1, 0xa1, CodepointWidth::Ambiguous }, UnicodeRange{ 0xa4, 0xa4, CodepointWidth::Ambiguous }, UnicodeRange{ 0xa7, 0xa8, CodepointWidth::Ambiguous }, . . . UnicodeRange{ 0x1f210, 0x1f23b, CodepointWidth::Wide }, UnicodeRange{ 0x1f37e, 0x1f393, CodepointWidth::Wide }, UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous }, }; ``` The output (when overriding) looks like this: ```c++ // Generated by Generate-CodepointWidthsFromUCD.ps1 -Pack:True -Full:False -NoOverrides:False // on 5/22/2020 11:17:39 PM (UTC) from Unicode 13.0.0. // 321205 (0x4E6B5) codepoints covered. // 240 (0xF0) codepoints overridden. static constexpr std::array<UnicodeRange, 23> s_wideAndAmbiguousTable{ UnicodeRange{ 0xa1, 0xa1, CodepointWidth::Ambiguous }, ... 
UnicodeRange{ 0xfe20, 0xfe2f, CodepointWidth::Narrow }, // narrow combining ligatures (split into left/right halves, which take 2 columns together) ... UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous }, }; ```
- Loading branch information
Showing
3 changed files
with
280 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,8 +8,9 @@ EXPCMDSTATE | |
href | ||
IBox | ||
IBind | ||
ICustom | ||
IClass | ||
IComparable | ||
ICustom | ||
IExplorer | ||
IMap | ||
IObject | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
#Requires -Version 7 | ||
# (we use the null coalescing operator) | ||
|
||
################################################################################ | ||
# This script generates an array suitable for replacing the body of | ||
# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1] | ||
# compliant with UAX#42[2]. | ||
# | ||
# This script supports a quasi-mandatory "overrides" file, overrides.xml. | ||
# If you do not have overrides, supply the -NoOverrides parameter. This was | ||
# developed for use with the CodepointWidthDetector, which has some override | ||
# ranges. | ||
# | ||
# This script was developed against the flat "no han unification" UCD | ||
# "ucd.nounihan.flat.xml". | ||
# It does not support the grouped database format, even though that format is | ||
# significantly smaller, which would provide a performance win on the admittedly | ||
# extremely rare occasion that we should need to regenerate our table. | ||
# | ||
# Invoke as ./Generate-xxx ucd.nounihan.flat.xml -Pack | Out-File -Encoding | ||
# UTF-8 Temporary.cpp | ||
# | ||
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/ | ||
# [2]: https://www.unicode.org/reports/tr42/ | ||
|
||
# Script parameters. The data can be supplied in one of two ways:
#   "Parsed"   - already-loaded XML documents (-InputObject / -OverrideObject)
#   "Unparsed" - file paths (-Path / -OverridePath) that the script loads itself
# "Unparsed" is declared as the default parameter set so that an invocation which
# names neither kind of input (for example, no arguments at all) resolves to the
# path defaults below instead of failing parameter-set resolution.
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
[CmdletBinding(DefaultParameterSetName="Unparsed")]
Param(
    # The UCD, already parsed into an XML document
    [Parameter(Position=0, ValueFromPipeline=$true, ParameterSetName="Parsed")]
    [System.Xml.XmlDocument]$InputObject,

    # The overrides, already parsed into an XML document
    [Parameter(Position=1, ValueFromPipeline=$true, ParameterSetName="Parsed")]
    [System.Xml.XmlDocument]$OverrideObject,

    # Path to the UCD XML file; read only when -InputObject was not supplied
    [Parameter(Position=0, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
    [string]$Path = "ucd.nounihan.flat.xml",

    # Path to the overrides XML file; read only when -OverrideObject was not
    # supplied and -NoOverrides is not set
    [Parameter(Position=1, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
    [string]$OverridePath = "overrides.xml",

    [switch]$Pack, # Pack tightly based on width
    [switch]$NoOverrides, # Do not include overrides
    [switch]$Full = $False # Include Narrow codepoints
)
|
||
# Width classes a codepoint can be assigned. The member names are written
# verbatim into the generated C++ (as CodepointWidth::<Name>).
Enum CodepointWidth {
    Narrow = 0
    Wide = 1
    Ambiguous = 2
    Invalid = 3 # produced when an entry's "ea" value is not one of the recognized classes
}
|
||
# UCD Functions {{{ | ||
# Returns the inclusive codepoint range covered by a UCD repertoire entry as two
# integers written to the pipeline: the start, then the end.
#
# An entry carries either a "cp" attribute (a single codepoint) or a
# "first-cp"/"last-cp" pair (a range of codepoints); the values are hexadecimal
# digits without a prefix. An entry with neither attribute yields 0, 0.
#
# The entry is read from the $entry parameter only; the function does not rely
# on any variable from the caller's scope.
Function Get-UCDEntryRange($entry) {
    $s = $e = 0
    If ($null -ne $entry.cp) {
        # Individual Codepoint
        $s = $e = [int]("0x"+$entry.cp)
    } ElseIf ($null -ne $entry."first-cp") {
        # Range of Codepoints
        $s = [int]("0x"+$entry."first-cp")
        $e = [int]("0x"+$entry."last-cp")
    }
    $s
    $e
}
|
||
# Computes the [CodepointWidth] for a UCD repertoire entry.
#
# An entry that is an emoji with emoji presentation (Emoji=Y and EPres=Y) is
# always Wide. Otherwise the East Asian Width attribute ("ea") decides:
#   N, Na, H -> Narrow
#   W, F     -> Wide
#   A        -> Ambiguous
# Anything else (including a missing attribute) is Invalid.
Function Get-UCDEntryWidth($entry) {
    $hasEmojiPresentation = ($entry.Emoji -eq "Y") -and ($entry.EPres -eq "Y")
    If ($hasEmojiPresentation) {
        Return [CodepointWidth]::Wide
    }

    $eastAsianWidth = $entry.ea
    If ($eastAsianWidth -in "N", "Na", "H") {
        Return [CodepointWidth]::Narrow
    }
    If ($eastAsianWidth -in "W", "F") {
        Return [CodepointWidth]::Wide
    }
    If ($eastAsianWidth -eq "A") {
        Return [CodepointWidth]::Ambiguous
    }
    Return [CodepointWidth]::Invalid
}
|
||
# Produces the key used to decide whether two adjacent entries may be merged
# into one range (entries merge only when their keys are equal).
#
# When the script-level -Pack switch is set, the key is just the computed width.
# Otherwise it is the East Asian Width class (with "F" folded into "W") followed
# by the Emoji and EPres attribute values.
Function Get-UCDEntryFlags($entry) {
    If (-not $script:Pack) {
        $widthClass = $entry.ea
        If ($widthClass -eq "F") {
            # Fullwidth and Wide are treated as the same class
            $widthClass = "W"
        }
        Return ("{0}{1}{2}" -f $widthClass, $entry.Emoji, $entry.EPres)
    }

    # If we're "pack"ing entries, only the computed width matters for telling them apart
    Return (Get-UCDEntryWidth $entry)
}
# }}} | ||
|
||
# A contiguous, inclusive run of codepoints sharing one width classification.
# Instances compare by Start (see CompareTo), which lets a list of them be
# binary-searched by codepoint.
Class UnicodeRange : System.IComparable {
    [int]$Start             # first codepoint (inclusive)
    [int]$End               # last codepoint (inclusive)
    [CodepointWidth]$Width  # computed by Get-UCDEntryWidth
    [string]$Flags          # merge key from Get-UCDEntryFlags
    [string]$Comment        # optional trailing comment; stays $null when there is none

    # Builds a range from a UCD (or overrides) repertoire element.
    UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
        $this.Start, $this.End = Get-UCDEntryRange $ucdEntry
        $this.Width = Get-UCDEntryWidth $ucdEntry
        $this.Flags = Get-UCDEntryFlags $ucdEntry

        If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
            $this.Comment = "Emoji=Y EPres=Y"
        }

        # An explicit comment attribute on the entry wins over the generated one
        If ($null -ne $ucdEntry.comment) {
            $this.Comment = $ucdEntry.comment
        }
    }

    # Copy constructor. Used when an existing range has to be split in two.
    UnicodeRange([UnicodeRange]$Source) {
        $this.Start = $Source.Start
        $this.End = $Source.End
        $this.Width = $Source.Width
        # Assigning $null to a [string] property would store "" instead, so the
        # string properties are only copied when they actually hold a value.
        If ($null -ne $Source.Flags) {
            $this.Flags = $Source.Flags
        }
        If ($null -ne $Source.Comment) {
            $this.Comment = $Source.Comment
        }
    }

    # Orders ranges by Start. $Other may be another range or a bare [int]
    # codepoint; the latter is what makes BinarySearch-by-codepoint possible.
    [int] CompareTo([object]$Other) {
        If ($Other -is [int]) {
            Return $this.Start - $Other
        }
        Return $this.Start - $Other.Start
    }

    # Tries to absorb $Other into this range by extending End.
    # Returns $true when merged; $false (leaving this range untouched) when the
    # two are not adjacent or carry different Flags.
    [bool] Merge([UnicodeRange]$Other) {
        # If there's more than one codepoint between them, don't merge
        If (($Other.Start - $this.End) -gt 1) {
            Return $false
        }

        # Flags are different: do not merge
        If ($this.Flags -ne $Other.Flags) {
            Return $false
        }

        $this.End = $Other.End
        Return $true
    }

    # Number of codepoints covered (both ends inclusive).
    [int] Length() {
        return $this.End - $this.Start + 1
    }
}

# A list of UnicodeRange objects kept sorted by Start, with support for
# overwriting an arbitrary span of codepoints with a new range.
Class UnicodeRangeList : System.Collections.Generic.List[Object] {
    UnicodeRangeList([int]$Capacity) : base($Capacity) { }

    # Returns the index of the range whose Start equals $codepoint, or, when
    # there is none, the index at which a range starting there would be inserted.
    [int] hidden _FindInsertionPoint([int]$codepoint) {
        $l = $this.BinarySearch($codepoint)
        If ($l -lt 0) {
            # Return value <0: value was not found, return value is bitwise complement the index of the first >= value
            Return -bNOT $l
        }
        Return $l
    }

    # Inserts $newRange, removing or trimming every existing range it overlaps:
    #  * ranges entirely covered by $newRange are dropped;
    #  * a range overlapped on its right edge keeps the part before $newRange;
    #  * a range overlapped on its left edge keeps the part after $newRange;
    #  * a range that strictly contains $newRange is split, keeping both parts.
    # Works on an empty list and when $newRange lies beyond the last range.
    ReplaceUnicodeRange([UnicodeRange]$newRange) {
        $subset = [System.Collections.Generic.List[Object]]::New(3)
        $subset.Add($newRange)

        $i = $this._FindInsertionPoint($newRange.Start)

        # Left overlap can only ever be one range: the one immediately before the
        # insertion point, if it reaches into the new range.
        $prev = $null
        If ($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
            $prev = $i - 1
            If ($this[$prev].End -gt $newRange.End) {
                # The new range sits strictly inside the previous one. Preserve
                # the part that follows the new range as a separate copy;
                # the part before it is produced by the truncation further down.
                $tail = [UnicodeRange]::new($this[$prev])
                $tail.Start = $newRange.End + 1
                $subset.Add($tail)
            }
        }

        # Right overlap can be Infinite (because we didn't account for End)
        # Find extent of right overlap. $next stays $null when nothing at or after
        # the insertion point is touched, including when the insertion point is
        # past the end of the list.
        $next = $null
        If ($i -lt $this.Count -and $this[$i].Start -le $newRange.End) {
            $next = $i
            While (($next -lt $this.Count - 1) -and ($this[$next + 1].Start -le $newRange.End)) {
                $next++
            }
        }

        If ($null -ne $next) {
            # Replace damaged elements after I with a truncated range
            $last = $this[$next]
            $this.RemoveRange($i, $next - $i + 1) # Remove damaged elements after I
            $last.Start = $newRange.End + 1
            If ($last.Start -le $last.End) {
                $subset.Add($last)
            }
        }

        If ($null -ne $prev) {
            # Replace damaged elements before I with a truncated range
            $first = $this[$prev]
            $this.RemoveRange($prev, $i - $prev) # Remove damaged elements (b/c we may not need to re-add them!)
            $first.End = $newRange.Start - 1
            If ($first.End -ge $first.Start) {
                $subset.Insert(0, $first)
            }
            $i = $prev # Update the insertion cursor
        }

        $this.InsertRange($i, $subset)
    }
}
|
||
# Ingest UCD
If ($null -eq $InputObject) {
    # -Raw hands the file to the XML parser as a single string rather than as
    # one pipeline object per line.
    $InputObject = [xml](Get-Content -Raw $Path)
}

$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
    # Sort by either cp or first-cp (for ranges)
    If ($null -ne $_.cp) {
        [int]("0x"+$_.cp)
    } ElseIf ($null -ne $_."first-cp") {
        [int]("0x"+$_."first-cp")
    }
}

If (-not $Full) {
    $UCDRepertoire = $UCDRepertoire | Where-Object {
        # Select everything Wide/Ambiguous/Full OR Emoji w/ Emoji Presentation
        ($_.ea -notin "N", "Na", "H") -or ($_.Emoji -eq "Y" -and $_.EPres -eq "Y")
    }
}

$ranges = [UnicodeRangeList]::New(1024)

# Build the range list, folding each entry into the previous range whenever it
# is adjacent and carries the same flags. $c counts the codepoints covered
# (before overrides are applied).
$c = 0
ForEach($v in $UCDRepertoire) {
    $range = [UnicodeRange]::new($v)
    $c += $range.Length()

    If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
        # Merged into last entry
        Continue
    }
    $ranges.Add([object]$range)
}

# Overrides are applied after the whole UCD has been read and replace any
# ranges they touch.
If (-not $NoOverrides) {
    If ($null -eq $OverrideObject) {
        $OverrideObject = [xml](Get-Content -Raw $OverridePath)
    }

    $OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
    $overrideCount = 0
    ForEach($v in $OverrideRepertoire) {
        $range = [UnicodeRange]::new($v)
        $overrideCount += $range.Length()
        $range.Comment = $range.Comment ?? "overridden without comment"
        $ranges.ReplaceUnicodeRange($range)
    }
}

# Emit Code
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
# [DateTime]::UtcNow is used instead of "Get-Date -AsUTC" because -AsUTC is not
# available on PowerShell 7.0, which "#Requires -Version 7" still admits.
" // on {0} (UTC) from {1}." -f [DateTime]::UtcNow, $InputObject.ucd.description
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
    " // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
}
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach($emitted in $ranges) {
    $comment = ""
    If ($null -ne $emitted.Comment) {
        # We only vend comments when we aren't packing tightly
        $comment = " // {0}" -f $emitted.Comment
    }
    " UnicodeRange{{ 0x{0:x}, 0x{1:x}, CodepointWidth::{2} }},{3}" -f $emitted.Start, $emitted.End, $emitted.Width, $comment
}
" };"