Implement String#unicode_normalize and String#unicode_normalized? #11226

Merged
110 changes: 107 additions & 3 deletions scripts/generate_unicode_data.cr
@@ -8,12 +8,20 @@ require "../src/compiler/crystal/formatter"

UCD_ROOT = "http://www.unicode.org/Public/#{Unicode::VERSION}/ucd/"

enum DecompositionType
None
Canonical
Compatibility
end

# Each entry in UnicodeData.txt
# (some info is missing but we don't use it yet)
record Entry,
codepoint : Int32,
name : String,
general_category : String,
decomposition_type : DecompositionType,
decomposition_mapping : Array(Int32)?,
upcase : Int32?,
downcase : Int32?,
casefold : Int32?
@@ -25,6 +33,8 @@ record SpecialCase,
record CaseRange, low : Int32, high : Int32, delta : Int32
record AlternateRange, low : Int32, high : Int32
record Stride, low : Int32, high : Int32, stride : Int32
record CanonicalCombiningClassRange, low : Int32, high : Int32, ccc : UInt8
record QuickCheckRange, low : Int32, high : Int32, result : Unicode::QuickCheckResult

def case_ranges(entries, &block)
ranges = [] of CaseRange
@@ -140,6 +150,9 @@ special_cases_downcase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase
special_cases_casefold = [] of SpecialCase
casefold_mapping = Hash(Int32, Int32).new
canonical_combining_classes = [] of CanonicalCombiningClassRange
full_composition_exclusions = Set(Int32).new
quick_checks = Unicode::NormalizationForm.values.to_h { |kind| {kind, Array(QuickCheckRange).new} }

url = "#{UCD_ROOT}CaseFolding.txt"
body = HTTP::Client.get(url).body
@@ -175,10 +188,29 @@ body.each_line do |line|
codepoint = pieces[0].to_i(16)
name = pieces[1]
general_category = pieces[2]
# don't read CanonicalCombiningClass here; the derived properties file has
# exact ranges
decomposition = pieces[5]
if decomposition.starts_with?('<')
decomposition_mapping = decomposition.partition("> ")[2].split.map(&.to_i(16))
decomposition_type = DecompositionType::Compatibility
else
decomposition_mapping = decomposition.presence.try &.split.map(&.to_i(16))
decomposition_type = decomposition_mapping.nil? ? DecompositionType::None : DecompositionType::Canonical
end
upcase = pieces[12].to_i?(16)
downcase = pieces[13].to_i?(16)
casefold = casefold_mapping[codepoint]?
entries << Entry.new(codepoint, name, general_category, upcase, downcase, casefold)
entries << Entry.new(
codepoint: codepoint,
name: name,
general_category: general_category,
decomposition_type: decomposition_type,
decomposition_mapping: decomposition_mapping,
upcase: upcase,
downcase: downcase,
casefold: casefold,
)
end

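A minimal standalone sketch (not part of the script) of how the two branches above split a UnicodeData.txt decomposition field: a `<tag>` prefix marks a compatibility mapping, anything else is a canonical mapping, and an empty field means no decomposition. The field values are only illustrative.

```crystal
# Illustrative only: sample decomposition fields in the UnicodeData.txt layout.
compat_field    = "<compat> 0020 0308" # compatibility mapping (tag prefix)
canonical_field = "0041 0300"          # canonical mapping (bare codepoints)
empty_field     = ""                   # no decomposition

# Compatibility: drop the "<tag> " prefix, then parse hex codepoints.
compat = compat_field.partition("> ")[2].split.map(&.to_i(16))
p compat # => [32, 776]

# Canonical: `presence` turns "" into nil, otherwise parse hex codepoints.
canonical = canonical_field.presence.try &.split.map(&.to_i(16))
p canonical # => [65, 768]

empty = empty_field.presence.try &.split.map(&.to_i(16))
p empty # => nil
```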
url = "#{UCD_ROOT}SpecialCasing.txt"
@@ -209,6 +241,39 @@ body.each_line do |line|
end
end

url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
line = line.strip

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
ccc = m[3].to_u8
next if ccc == 0
low = m[1].to_i(16)
high = m[2]?.try(&.to_i(16)) || low
canonical_combining_classes << CanonicalCombiningClassRange.new(low, high, ccc)
end
end

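For reference, a quick illustration (outside the script) of what that regex extracts from a line in the `extracted/DerivedCombiningClass.txt` layout; the range and ccc values here are only representative of the file format.

```crystal
# Illustrative only: a line in the "<low>..<high> ; <ccc> # comment" layout.
line = "0300..0314    ; 230 # Mn  [21] COMBINING GRAVE ACCENT..COMBINING REVERSED COMMA ABOVE"

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
  low  = m[1].to_i(16)                 # => 768 (single codepoints have no ".." part)
  high = m[2]?.try(&.to_i(16)) || low  # => 788
  ccc  = m[3].to_u8                    # => 230
  p({low, high, ccc})
end
```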
url = "#{UCD_ROOT}DerivedNormalizationProps.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
line = line.strip
break if line.starts_with?("# Derived Property: Expands_On_NFD")

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Full_Composition_Exclusion/)
low = m[1].to_i(16)
high = m[2]?.try(&.to_i(16)) || low
(low..high).each { |codepoint| full_composition_exclusions << codepoint }
elsif m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
low = m[1].to_i(16)
high = m[2]?.try(&.to_i(16)) || low
quick_check = quick_checks[Unicode::NormalizationForm.parse(m[3])]
result = m[4] == "M" ? Unicode::QuickCheckResult::Maybe : Unicode::QuickCheckResult::No
quick_check << QuickCheckRange.new(low, high, result)
end
end

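Similarly, a small sketch (with a made-up line in the DerivedNormalizationProps.txt layout) of how the second branch reads a `*_QC` property line before mapping it onto the enums used above:

```crystal
# Illustrative only: a *_QC property line; the range and values are merely
# representative of the file format, not actual UCD data.
line = "0300..0304    ; NFC_QC; M # ..."

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
  p m[3]        # => "NFC" -- fed to Unicode::NormalizationForm.parse above
  p m[4] == "M" # => true  -- mapped to Unicode::QuickCheckResult::Maybe above
end
```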
downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }

@@ -226,8 +291,47 @@ categories.each do |category|
all_strides[category] = strides entries, category, &.general_category
end

output = String.build do |str|
ECR.embed "#{__DIR__}/unicode_data.ecr", str
canonical_combining_classes.sort_by! &.low

canonical_decompositions = entries.compact_map do |entry|
next unless entry.decomposition_type.canonical?
mapping = entry.decomposition_mapping.not_nil!
raise "BUG: mapping longer than 2 codepoints" unless mapping.size <= 2
{entry.codepoint, mapping[0], mapping[1]? || 0}
end
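For context, canonical decomposition mappings in the UCD are at most two codepoints long (which the `raise` above guards), so each entry fits a fixed-width triple with `0` standing in for a missing second codepoint. Two well-known examples of the resulting shape:

```crystal
# Illustrative only: triples in the shape produced above.
p({0x00C1, 0x0041, 0x0301}) # Á => A + combining acute (two-codepoint mapping)
p({0x2126, 0x03A9, 0x0000}) # Ω (ohm sign) => Greek capital omega (singleton)
```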

# Instead of storing the codepoints for each compatibility decomposition as an
# individual `Array`, we store all of them in a single `Array` and refer to its
# subsequences using index and count.
compatibility_decomposition_data = [] of Int32
compatibility_decompositions = entries.compact_map do |entry|
next unless entry.decomposition_type.compatibility?
mapping = entry.decomposition_mapping.not_nil!

# We try to reuse any existing subsequences in the table that match this
# entry's decomposition mapping. This reduces the table size by over 40%,
# mainly due to singleton decompositions. It can be further optimized by
# solving the shortest common superstring problem.
index = (0..compatibility_decomposition_data.size - mapping.size).find do |i|
(0...mapping.size).all? do |j|
mapping[j] == compatibility_decomposition_data[i + j]
end
end
unless index
index = compatibility_decomposition_data.size
compatibility_decomposition_data.concat(mapping)
end

{entry.codepoint, index, mapping.size}
end
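A small sketch (hypothetical values, outside this script) of how an `{index, size}` pair refers back into the shared data array, which is what makes the subsequence reuse above pay off:

```crystal
# Illustrative only: a flat pool of decomposition codepoints plus
# {codepoint, index, size} entries pointing into it (hypothetical values).
data = [0x0020, 0x0308, 0x0041]

# One entry decomposing to data[0, 2], another to data[1, 1]:
entries = [{0x00A8, 0, 2}, {0x1234, 1, 1}]

entries.each do |codepoint, index, size|
  p({codepoint, data[index, size]}) # Array#[](start, count) slices the subsequence
end
```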

canonical_compositions = canonical_decompositions.compact_map do |codepoint, first, second|
next if second == 0 || full_composition_exclusions.includes?(codepoint)
{(first.to_i64 << 21) | second, codepoint}
end
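Since Unicode scalar values need at most 21 bits (they go up to U+10FFFF), packing the pair into a single `Int64` key is lossless. A minimal sketch of packing and unpacking, with an illustrative pair:

```crystal
# Illustrative only: pack a (first, second) codepoint pair into one Int64 key
# and recover both halves.
first, second = 0x0041, 0x0300 # e.g. A + combining grave

key = (first.to_i64 << 21) | second
p key >> 21             # => 65  (first)
p key & ((1 << 21) - 1) # => 768 (second)
```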

quick_checks.each_value &.sort_by! &.low

output = ECR.render "#{__DIR__}/unicode_data.ecr"
output = Crystal.format(output)
File.write("#{__DIR__}/../src/unicode/data.cr", output)