From 06e9d005e452b4be0b1b790373a35e157e24cb69 Mon Sep 17 00:00:00 2001
From: Quinton Miller
Date: Tue, 21 Nov 2023 00:16:41 +0800
Subject: [PATCH] Generate `src/html/entities.cr` automatically

---
 .gitattributes                    |   2 +
 Makefile                          |   4 ++
 Makefile.win                      |   4 ++
 scripts/generate_html_entities.cr |  51 +++++++++++++
 scripts/html_entities.ecr         |  24 +++++++
 src/html/entities.cr              | 114 ++----------------------------
 6 files changed, 92 insertions(+), 107 deletions(-)
 create mode 100755 scripts/generate_html_entities.cr
 create mode 100644 scripts/html_entities.ecr

diff --git a/.gitattributes b/.gitattributes
index 78c53472cd74..f616edb09d75 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -10,6 +10,8 @@ lib/** linguist-vendored
 
 # produced by scripts/generate_windows_zone_names.cr
 src/crystal/system/win32/zone_names.cr linguist-generated
+# produced by scripts/generate_html_entities.cr
+src/html/entities.cr linguist-generated
 # produced by scripts/generate_ssl_server_defaults.cr
 src/openssl/ssl/defaults.cr linguist-generated
 # produced by scripts/generate_grapheme_properties.cr
diff --git a/Makefile b/Makefile
index 2ddcc270bdab..03a1aafd34ff 100644
--- a/Makefile
+++ b/Makefile
@@ -161,6 +161,10 @@ generate_data: src/crystal/system/win32/zone_names.cr
 src/crystal/system/win32/zone_names.cr: scripts/generate_windows_zone_names.cr
 	$(CRYSTAL) run $<
 
+generate_data: src/html/entities.cr
+src/html/entities.cr: scripts/generate_html_entities.cr scripts/html_entities.ecr
+	$(CRYSTAL) run $<
+
 .PHONY: install
 install: $(O)/crystal man/crystal.1.gz ## Install the compiler at DESTDIR
 	$(INSTALL) -d -m 0755 "$(BINDIR)/"
diff --git a/Makefile.win b/Makefile.win
index 1e1d63fdabb2..595a26f5fd00 100644
--- a/Makefile.win
+++ b/Makefile.win
@@ -163,6 +163,10 @@ generate_data: src\crystal\system\win32\zone_names.cr
 src\crystal\system\win32\zone_names.cr: scripts\generate_windows_zone_names.cr
 	$(CRYSTAL) run $<
 
+generate_data: src\html\entities.cr
+src\html\entities.cr: scripts\generate_html_entities.cr scripts\html_entities.ecr
+	$(CRYSTAL) run $<
+
 .PHONY: install
 install: $(O)\crystal.exe ## Install the compiler at prefix
 	$(call MKDIR,"$(BINDIR)")
diff --git a/scripts/generate_html_entities.cr b/scripts/generate_html_entities.cr
new file mode 100755
index 000000000000..d63592efe19b
--- /dev/null
+++ b/scripts/generate_html_entities.cr
@@ -0,0 +1,51 @@
+#! /usr/bin/env crystal
+
+# Regenerates `src/html/entities.cr` by rendering `scripts/html_entities.ecr`
+# with the entity list fetched from https://html.spec.whatwg.org/entities.json.
+
+require "http"
+require "json"
+require "ecr"
+
+record Entity, characters : String, codepoints : Array(Int32) do
+  include JSON::Serializable
+  include JSON::Serializable::Strict
+end
+
+single_char_entities = [] of {String, Entity}
+double_char_entities = [] of {String, Entity}
+
+HTTP::Client.get("https://html.spec.whatwg.org/entities.json") do |res|
+  Hash(String, Entity).from_json(res.body_io).each do |name, entity|
+    # `rchop` is a no-op when there is no trailing ';', so names differing
+    # only by that semicolon collapse here and are deduplicated below.
+    name = name.rchop(';').lchop?('&') || raise "Entity does not begin with &"
+
+    entities =
+      case entity.codepoints.size
+      when 1; single_char_entities
+      when 2; double_char_entities
+      else    raise "Unknown entity codepoint size"
+      end
+
+    entities << {name, entity}
+  end
+end
+
+single_char_entities.uniq!(&.first).sort_by!(&.first)
+double_char_entities.uniq!(&.first).sort_by!(&.first)
+
+# The generated tables are keyed by `Bytes`, and the expression this constant
+# replaces measured `Bytes#size`, so count bytes rather than characters.
+max_entity_name_size = {
+  single_char_entities.max_of { |name, _| name.bytesize },
+  double_char_entities.max_of { |name, _| name.bytesize },
+}.max
+
+path = "#{__DIR__}/../src/html/entities.cr"
+File.open(path, "w") do |file|
+  # The template is expanded in this scope and reads the locals computed above.
+  ECR.embed "#{__DIR__}/html_entities.ecr", file
+end
+
+`crystal tool format #{path}`
diff --git a/scripts/html_entities.ecr b/scripts/html_entities.ecr
new file mode 100644
index 000000000000..cfa1b64e92a0
--- /dev/null
+++ b/scripts/html_entities.ecr
@@ -0,0 +1,24 @@
+# This file was automatically generated by running:
+#
+#   scripts/generate_html_entities.cr
+#
+# DO NOT EDIT
+
+module HTML
+  # :nodoc:
+  SINGLE_CHAR_ENTITIES = {
+    <%- single_char_entities.each do |name, entity| -%>
+    <%= name.dump %>.to_slice => '\u{<%= "%06X" % entity.codepoints[0] %>}',
+    <%- end -%>
+  } of Bytes => Char
+
+  # :nodoc:
+  DOUBLE_CHAR_ENTITIES = {
+    <%- double_char_entities.each do |name, entity| -%>
+    <%= name.dump %>.to_slice => "\u{<%= "%04X" % entity.codepoints[0] %>}\u{<%= "%04X" % entity.codepoints[1] %>}",
+    <%- end -%>
+  } of Bytes => String
+
+  # :nodoc:
+  MAX_ENTITY_NAME_SIZE = <%= max_entity_name_size %>
+end
diff --git a/src/html/entities.cr b/src/html/entities.cr
index 3fe05fb63983..ee39c9b1d701 100644
--- a/src/html/entities.cr
+++ b/src/html/entities.cr
@@ -1,3 +1,9 @@
+# This file was automatically generated by running:
+#
+#   scripts/generate_html_entities.cr
+#
+# DO NOT EDIT
+
 module HTML
   # :nodoc:
   SINGLE_CHAR_ENTITIES = {
@@ -2033,112 +2039,6 @@ module HTML
     "zscr".to_slice => '\u{01D4CF}',
     "zwj".to_slice => '\u{00200D}',
     "zwnj".to_slice => '\u{00200C}',
-    "AElig".to_slice => '\u{0000C6}',
-    "AMP".to_slice => '\u{000026}',
-    "Aacute".to_slice => '\u{0000C1}',
-    "Acirc".to_slice => '\u{0000C2}',
-    "Agrave".to_slice => '\u{0000C0}',
-    "Aring".to_slice => '\u{0000C5}',
-    "Atilde".to_slice => '\u{0000C3}',
-    "Auml".to_slice => '\u{0000C4}',
-    "COPY".to_slice => '\u{0000A9}',
-    "Ccedil".to_slice => '\u{0000C7}',
-    "ETH".to_slice => '\u{0000D0}',
-    "Eacute".to_slice => '\u{0000C9}',
-    "Ecirc".to_slice => '\u{0000CA}',
-    "Egrave".to_slice => '\u{0000C8}',
-    "Euml".to_slice => '\u{0000CB}',
-    "GT".to_slice => '\u{00003E}',
-    "Iacute".to_slice => '\u{0000CD}',
-    "Icirc".to_slice => '\u{0000CE}',
-    "Igrave".to_slice => '\u{0000CC}',
-    "Iuml".to_slice => '\u{0000CF}',
-    "LT".to_slice => '\u{00003C}',
-    "Ntilde".to_slice => '\u{0000D1}',
-    "Oacute".to_slice => '\u{0000D3}',
-    "Ocirc".to_slice => '\u{0000D4}',
-    "Ograve".to_slice => '\u{0000D2}',
-    "Oslash".to_slice => '\u{0000D8}',
-    "Otilde".to_slice => '\u{0000D5}',
-    "Ouml".to_slice => '\u{0000D6}',
-    "QUOT".to_slice => '\u{000022}',
-    "REG".to_slice => '\u{0000AE}',
-    "THORN".to_slice => '\u{0000DE}',
-    "Uacute".to_slice => '\u{0000DA}',
-    "Ucirc".to_slice => '\u{0000DB}',
-    "Ugrave".to_slice => '\u{0000D9}',
-    "Uuml".to_slice => '\u{0000DC}',
-    "Yacute".to_slice => '\u{0000DD}',
-    "aacute".to_slice => '\u{0000E1}',
-    "acirc".to_slice => '\u{0000E2}',
-    "acute".to_slice => '\u{0000B4}',
-    "aelig".to_slice => '\u{0000E6}',
-    "agrave".to_slice => '\u{0000E0}',
-    "amp".to_slice => '\u{000026}',
-    "aring".to_slice => '\u{0000E5}',
-    "atilde".to_slice => '\u{0000E3}',
-    "auml".to_slice => '\u{0000E4}',
-    "brvbar".to_slice => '\u{0000A6}',
-    "ccedil".to_slice => '\u{0000E7}',
-    "cedil".to_slice => '\u{0000B8}',
-    "cent".to_slice => '\u{0000A2}',
-    "copy".to_slice => '\u{0000A9}',
-    "curren".to_slice => '\u{0000A4}',
-    "deg".to_slice => '\u{0000B0}',
-    "divide".to_slice => '\u{0000F7}',
-    "eacute".to_slice => '\u{0000E9}',
-    "ecirc".to_slice => '\u{0000EA}',
-    "egrave".to_slice => '\u{0000E8}',
-    "eth".to_slice => '\u{0000F0}',
-    "euml".to_slice => '\u{0000EB}',
-    "frac12".to_slice => '\u{0000BD}',
-    "frac14".to_slice => '\u{0000BC}',
-    "frac34".to_slice => '\u{0000BE}',
-    "gt".to_slice => '\u{00003E}',
-    "iacute".to_slice => '\u{0000ED}',
-    "icirc".to_slice => '\u{0000EE}',
-    "iexcl".to_slice => '\u{0000A1}',
-    "igrave".to_slice => '\u{0000EC}',
-    "iquest".to_slice => '\u{0000BF}',
-    "iuml".to_slice => '\u{0000EF}',
-    "laquo".to_slice => '\u{0000AB}',
-    "lt".to_slice => '\u{00003C}',
-    "macr".to_slice => '\u{0000AF}',
-    "micro".to_slice => '\u{0000B5}',
-    "middot".to_slice => '\u{0000B7}',
-    "nbsp".to_slice => '\u{0000A0}',
-    "not".to_slice => '\u{0000AC}',
-    "ntilde".to_slice => '\u{0000F1}',
-    "oacute".to_slice => '\u{0000F3}',
-    "ocirc".to_slice => '\u{0000F4}',
-    "ograve".to_slice => '\u{0000F2}',
-    "ordf".to_slice => '\u{0000AA}',
-    "ordm".to_slice => '\u{0000BA}',
-    "oslash".to_slice => '\u{0000F8}',
-    "otilde".to_slice => '\u{0000F5}',
-    "ouml".to_slice => '\u{0000F6}',
-    "para".to_slice => '\u{0000B6}',
-    "plusmn".to_slice => '\u{0000B1}',
-    "pound".to_slice => '\u{0000A3}',
-    "quot".to_slice => '\u{000022}',
-    "raquo".to_slice => '\u{0000BB}',
-    "reg".to_slice => '\u{0000AE}',
-    "sect".to_slice => '\u{0000A7}',
-    "shy".to_slice => '\u{0000AD}',
-    "sup1".to_slice => '\u{0000B9}',
-    "sup2".to_slice => '\u{0000B2}',
-    "sup3".to_slice => '\u{0000B3}',
-    "szlig".to_slice => '\u{0000DF}',
-    "thorn".to_slice => '\u{0000FE}',
-    "times".to_slice => '\u{0000D7}',
-    "uacute".to_slice => '\u{0000FA}',
-    "ucirc".to_slice => '\u{0000FB}',
-    "ugrave".to_slice => '\u{0000F9}',
-    "uml".to_slice => '\u{0000A8}',
-    "uuml".to_slice => '\u{0000FC}',
-    "yacute".to_slice => '\u{0000FD}',
-    "yen".to_slice => '\u{0000A5}',
-    "yuml".to_slice => '\u{0000FF}',
   } of Bytes => Char
 
   # :nodoc:
@@ -2239,5 +2139,5 @@ module HTML
   } of Bytes => String
 
   # :nodoc:
-  MAX_ENTITY_NAME_SIZE = Math.max(SINGLE_CHAR_ENTITIES.each_key.max_of(&.size), DOUBLE_CHAR_ENTITIES.each_key.max_of(&.size))
+  MAX_ENTITY_NAME_SIZE = 31
 end