Implement String#unicode_normalize and String#unicode_normalized? #11226

Merged
110 changes: 107 additions & 3 deletions scripts/generate_unicode_data.cr
@@ -8,12 +8,20 @@ require "../src/compiler/crystal/formatter"

UCD_ROOT = "http://www.unicode.org/Public/#{Unicode::VERSION}/ucd/"

enum DecompositionType
None
Canonical
Compatibility
end

# Each entry in UnicodeData.txt
# (some info is missing but we don't use it yet)
record Entry,
codepoint : Int32,
name : String,
general_category : String,
decomposition_type : DecompositionType,
decomposition_mapping : Array(Int32)?,
upcase : Int32?,
downcase : Int32?,
casefold : Int32?
@@ -25,6 +33,8 @@ record SpecialCase,
record CaseRange, low : Int32, high : Int32, delta : Int32
record AlternateRange, low : Int32, high : Int32
record Stride, low : Int32, high : Int32, stride : Int32
record CanonicalCombiningClassRange, low : Int32, high : Int32, ccc : UInt8
record QuickCheckRange, low : Int32, high : Int32, result : Unicode::QuickCheckResult

def case_ranges(entries, &block)
ranges = [] of CaseRange
@@ -140,6 +150,9 @@ special_cases_downcase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase
special_cases_casefold = [] of SpecialCase
casefold_mapping = Hash(Int32, Int32).new
canonical_combining_classes = [] of CanonicalCombiningClassRange
full_composition_exclusions = Set(Int32).new
quick_checks = Unicode::NormalizationForm.values.to_h { |kind| {kind, Array(QuickCheckRange).new} }

url = "#{UCD_ROOT}CaseFolding.txt"
body = HTTP::Client.get(url).body
@@ -175,10 +188,29 @@ body.each_line do |line|
codepoint = pieces[0].to_i(16)
name = pieces[1]
general_category = pieces[2]
# don't read CanonicalCombiningClass here; the derived properties file has
# exact ranges
decomposition = pieces[5]
if decomposition.starts_with?('<')
decomposition_mapping = decomposition.partition("> ")[2].split.map(&.to_i(16))
decomposition_type = DecompositionType::Compatibility
else
decomposition_mapping = decomposition.presence.try &.split.map(&.to_i(16))
decomposition_type = decomposition_mapping.nil? ? DecompositionType::None : DecompositionType::Canonical
end
upcase = pieces[12].to_i?(16)
downcase = pieces[13].to_i?(16)
casefold = casefold_mapping[codepoint]?
entries << Entry.new(codepoint, name, general_category, upcase, downcase, casefold)
entries << Entry.new(
codepoint: codepoint,
name: name,
general_category: general_category,
decomposition_type: decomposition_type,
decomposition_mapping: decomposition_mapping,
upcase: upcase,
downcase: downcase,
casefold: casefold,
)
end

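A minimal standalone sketch (not part of the script) of how the two branches above split a UnicodeData.txt decomposition field: a `<tag>` prefix marks a compatibility mapping, anything else is a canonical mapping, and an empty field means no decomposition. The field values are only illustrative.

```crystal
# Illustrative only: sample decomposition fields in the UnicodeData.txt layout.
compat_field    = "<compat> 0020 0308" # compatibility mapping (tag prefix)
canonical_field = "0041 0300"          # canonical mapping (bare codepoints)
empty_field     = ""                   # no decomposition

# Compatibility: drop the "<tag> " prefix, then parse hex codepoints.
compat = compat_field.partition("> ")[2].split.map(&.to_i(16))
p compat # => [32, 776]

# Canonical: `presence` turns "" into nil, otherwise parse hex codepoints.
canonical = canonical_field.presence.try &.split.map(&.to_i(16))
p canonical # => [65, 768]

empty = empty_field.presence.try &.split.map(&.to_i(16))
p empty # => nil
```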
url = "#{UCD_ROOT}SpecialCasing.txt"
@@ -209,6 +241,39 @@ body.each_line do |line|
end
end

url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
line = line.strip

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
ccc = m[3].to_u8
next if ccc == 0
low = m[1].to_i(16)
high = m[2]?.try(&.to_i(16)) || low
canonical_combining_classes << CanonicalCombiningClassRange.new(low, high, ccc)
end
end

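For reference, a quick illustration (outside the script) of what that regex extracts from a line in the `extracted/DerivedCombiningClass.txt` layout; the range and ccc values here are only representative of the file format.

```crystal
# Illustrative only: a line in the "<low>..<high> ; <ccc> # comment" layout.
line = "0300..0314    ; 230 # Mn  [21] COMBINING GRAVE ACCENT..COMBINING REVERSED COMMA ABOVE"

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
  low  = m[1].to_i(16)                 # => 768 (single codepoints have no ".." part)
  high = m[2]?.try(&.to_i(16)) || low  # => 788
  ccc  = m[3].to_u8                    # => 230
  p({low, high, ccc})
end
```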
url = "#{UCD_ROOT}DerivedNormalizationProps.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
line = line.strip
break if line.starts_with?("# Derived Property: Expands_On_NFD")

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Full_Composition_Exclusion/)
low = m[1].to_i(16)
high = m[2]?.try(&.to_i(16)) || low
(low..high).each { |codepoint| full_composition_exclusions << codepoint }
elsif m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
low = m[1].to_i(16)
high = m[2]?.try(&.to_i(16)) || low
quick_check = quick_checks[Unicode::NormalizationForm.parse(m[3])]
result = m[4] == "M" ? Unicode::QuickCheckResult::Maybe : Unicode::QuickCheckResult::No
quick_check << QuickCheckRange.new(low, high, result)
end
end

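Similarly, a small sketch (with a made-up line in the DerivedNormalizationProps.txt layout) of how the second branch reads a `*_QC` property line before mapping it onto the enums used above:

```crystal
# Illustrative only: a *_QC property line; the range and values are merely
# representative of the file format, not actual UCD data.
line = "0300..0304    ; NFC_QC; M # ..."

if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
  p m[3]        # => "NFC" -- fed to Unicode::NormalizationForm.parse above
  p m[4] == "M" # => true  -- mapped to Unicode::QuickCheckResult::Maybe above
end
```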
downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }

@@ -226,8 +291,47 @@ categories.each do |category|
all_strides[category] = strides entries, category, &.general_category
end

output = String.build do |str|
ECR.embed "#{__DIR__}/unicode_data.ecr", str
canonical_combining_classes.sort_by! &.low

canonical_decompositions = entries.compact_map do |entry|
next unless entry.decomposition_type.canonical?
mapping = entry.decomposition_mapping.not_nil!
raise "BUG: mapping longer than 2 codepoints" unless mapping.size <= 2
{entry.codepoint, mapping[0], mapping[1]? || 0}
end
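For context, canonical decomposition mappings in the UCD are at most two codepoints long (which the `raise` above guards), so each entry fits a fixed-width triple with `0` standing in for a missing second codepoint. Two well-known examples of the resulting shape:

```crystal
# Illustrative only: triples in the shape produced above.
p({0x00C1, 0x0041, 0x0301}) # Á => A + combining acute (two-codepoint mapping)
p({0x2126, 0x03A9, 0x0000}) # Ω (ohm sign) => Greek capital omega (singleton)
```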

# Instead of storing the codepoints for each compatibility decomposition as an
# individual `Array`, we store all of them in a single `Array` and refer to its
# subsequences using index and count.
compatibility_decomposition_data = [] of Int32
compatibility_decompositions = entries.compact_map do |entry|
next unless entry.decomposition_type.compatibility?
mapping = entry.decomposition_mapping.not_nil!

# We try to reuse any existing subsequences in the table that match this
# entry's decomposition mapping. This reduces the table size by over 40%,
# mainly due to singleton decompositions. It can be further optimized by
# solving the shortest common superstring problem.
index = (0..compatibility_decomposition_data.size - mapping.size).find do |i|
(0...mapping.size).all? do |j|
mapping[j] == compatibility_decomposition_data[i + j]
end
end
unless index
index = compatibility_decomposition_data.size
compatibility_decomposition_data.concat(mapping)
end

{entry.codepoint, index, mapping.size}
end
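A small sketch (hypothetical values, outside this script) of how an `{index, size}` pair refers back into the shared data array, which is what makes the subsequence reuse above pay off:

```crystal
# Illustrative only: a flat pool of decomposition codepoints plus
# {codepoint, index, size} entries pointing into it (hypothetical values).
data = [0x0020, 0x0308, 0x0041]

# One entry decomposing to data[0, 2], another to data[1, 1]:
entries = [{0x00A8, 0, 2}, {0x1234, 1, 1}]

entries.each do |codepoint, index, size|
  p({codepoint, data[index, size]}) # Array#[](start, count) slices the subsequence
end
```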

canonical_compositions = canonical_decompositions.compact_map do |codepoint, first, second|
next if second == 0 || full_composition_exclusions.includes?(codepoint)
{(first.to_i64 << 21) | second, codepoint}
end
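Since Unicode scalar values need at most 21 bits (they go up to U+10FFFF), packing the pair into a single `Int64` key is lossless. A minimal sketch of packing and unpacking, with an illustrative pair:

```crystal
# Illustrative only: pack a (first, second) codepoint pair into one Int64 key
# and recover both halves.
first, second = 0x0041, 0x0300 # e.g. A + combining grave

key = (first.to_i64 << 21) | second
p key >> 21             # => 65  (first)
p key & ((1 << 21) - 1) # => 768 (second)
```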

quick_checks.each_value &.sort_by! &.low

output = ECR.render "#{__DIR__}/unicode_data.ecr"
output = Crystal.format(output)
File.write("#{__DIR__}/../src/unicode/data.cr", output)