diff --git a/.gitignore b/.gitignore index d34ed114972c5..87fec41415272 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ __pycache__/ /src/libcore/unicode/Scripts.txt /src/libcore/unicode/SpecialCasing.txt /src/libcore/unicode/UnicodeData.txt +/src/libcore/unicode/downloaded /stage[0-9]+/ /target target/ diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py index ae356c3ff4459..a0539cd9ca9b6 100755 --- a/src/libcore/unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -1,147 +1,350 @@ #!/usr/bin/env python -# This script uses the following Unicode tables: -# - DerivedCoreProperties.txt -# - DerivedNormalizationProps.txt -# - EastAsianWidth.txt -# - auxiliary/GraphemeBreakProperty.txt -# - PropList.txt -# - ReadMe.txt -# - Scripts.txt -# - UnicodeData.txt -# -# Since this should not require frequent updates, we just store this -# out-of-line and check the tables.rs file into git. +""" +Regenerate Unicode tables (tables.rs). +""" -import fileinput, re, os, sys, operator, math, datetime +# This script uses the Unicode tables as defined +# in the UnicodeFiles class. -# The directory in which this file resides. -fdir = os.path.dirname(os.path.realpath(__file__)) + "/" +# Since this should not require frequent updates, we just store this +# out-of-line and check the tables.rs file into git. -preamble = ''' +# Note that the "curl" program is required for operation. +# This script is compatible with Python 2.7 and 3.x. + +import argparse +import datetime +import fileinput +import itertools +import os +import re +import textwrap +import subprocess + +from collections import defaultdict, namedtuple + +try: + # Python 3 + from itertools import zip_longest + from io import StringIO +except ImportError: + # Python 2 compatibility + zip_longest = itertools.izip_longest + from StringIO import StringIO + +try: + # Completely optional type hinting + # (Python 2 compatible using comments, + # see: https://mypy.readthedocs.io/en/latest/python2.html) + # This is very helpful in typing-aware IDE like PyCharm. + from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple +except ImportError: + pass + + +# We don't use enum.Enum because of Python 2.7 compatibility. +class UnicodeFiles(object): + # ReadMe does not contain any Unicode data, we + # only use it to extract versions. + README = "ReadMe.txt" + + DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt" + DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt" + PROPS = "PropList.txt" + SCRIPTS = "Scripts.txt" + SPECIAL_CASING = "SpecialCasing.txt" + UNICODE_DATA = "UnicodeData.txt" + + +# The order doesn't really matter (Python < 3.6 won't preserve it), +# we only want to aggregate all the file names. +ALL_UNICODE_FILES = tuple( + value for name, value in UnicodeFiles.__dict__.items() + if not name.startswith("_") +) + +assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files" + +# The directory this file is located in. +THIS_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Where to download the Unicode data. The downloaded files +# will be placed in sub-directories named after Unicode version. 
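
# Illustrative aside, not part of the diff: a minimal sketch of the
# ALL_UNICODE_FILES aggregation used above. Filtering out names that start
# with "_" drops __module__, __doc__ and similar entries from the class
# __dict__, leaving only the plain file-name attributes, which is what the
# `len(...) == 7` assertion counts. The toy class below is invented.
class _ToyFiles(object):
    README = "ReadMe.txt"
    UNICODE_DATA = "UnicodeData.txt"

_toy_all = tuple(
    value for name, value in _ToyFiles.__dict__.items()
    if not name.startswith("_")
)
assert sorted(_toy_all) == ["ReadMe.txt", "UnicodeData.txt"]
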
+FETCH_DIR = os.path.join(THIS_DIR, "downloaded") + +FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}" +FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}" + +PREAMBLE = """\ // NOTE: The following code was generated by "./unicode.py", do not edit directly #![allow(missing_docs, non_upper_case_globals, non_snake_case)] use unicode::version::UnicodeVersion; use unicode::bool_trie::{{BoolTrie, SmallBoolTrie}}; -'''.format(year = datetime.datetime.now().year) +""".format(year=datetime.datetime.now().year) # Mapping taken from Table 12 from: # http://www.unicode.org/reports/tr44/#General_Category_Values -expanded_categories = { - 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], - 'Lm': ['L'], 'Lo': ['L'], - 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], - 'Nd': ['N'], 'Nl': ['N'], 'No': ['N'], - 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], - 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], - 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], - 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], - 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], +EXPANDED_CATEGORIES = { + "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"], + "Lm": ["L"], "Lo": ["L"], + "Mn": ["M"], "Mc": ["M"], "Me": ["M"], + "Nd": ["N"], "Nl": ["N"], "No": ["N"], + "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"], + "Pi": ["P"], "Pf": ["P"], "Po": ["P"], + "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"], + "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"], + "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"], } -# these are the surrogate codepoints, which are not valid rust characters -surrogate_codepoints = (0xd800, 0xdfff) +# This is the (inclusive) range of surrogate codepoints. +# These are not valid Rust characters. +SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff) + +UnicodeData = namedtuple( + "UnicodeData", ( + # Conversions: + "to_upper", "to_lower", "to_title", + + # Decompositions: canonical decompositions, compatibility decomp + "canon_decomp", "compat_decomp", + + # Grouped: general categories and combining characters + "general_categories", "combines", + ) +) + +UnicodeVersion = namedtuple( + "UnicodeVersion", ("major", "minor", "micro", "as_str") +) + + +def fetch_files(version=None): + # type: (str) -> UnicodeVersion + """ + Fetch all the Unicode files from unicode.org. + + This will use cached files (stored in `FETCH_DIR`) if they exist, + creating them if they don't. In any case, the Unicode version + is always returned. + + :param version: The desired Unicode version, as string. + (If None, defaults to latest final release available, + querying the unicode.org service). + """ + have_version = check_stored_version(version) + if have_version: + return have_version + + if version: + # Check if the desired version exists on the server. + get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name) + else: + # Extract the latest version. + get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name) + + readme_url = get_fetch_url(UnicodeFiles.README) + + print("Fetching: {}".format(readme_url)) + readme_content = subprocess.check_output(("curl", readme_url)) + + unicode_version = parse_readme_unicode_version( + readme_content.decode("utf8") + ) + + download_dir = get_unicode_dir(unicode_version) + if not os.path.exists(download_dir): + # For 2.7 compat, we don't use `exist_ok=True`. 
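
# Illustrative aside, not part of the diff: a rough sketch of how the URL
# templates and the per-version cache directory above fit together. It reuses
# the constants defined earlier in this file; the "12.1.0" version string is
# only an example value.
def _example_fetch_layout():
    filename = UnicodeFiles.UNICODE_DATA
    # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    latest_url = FETCH_URL_LATEST.format(filename=filename)
    # ftp://ftp.unicode.org/Public/12.1.0/ucd/UnicodeData.txt
    pinned_url = FETCH_URL_VERSION.format(version="12.1.0", filename=filename)
    # .../src/libcore/unicode/downloaded/12.1.0/UnicodeData.txt
    cached_copy = os.path.join(FETCH_DIR, "12.1.0", filename)
    return latest_url, pinned_url, cached_copy
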
+ os.makedirs(download_dir) + + for filename in ALL_UNICODE_FILES: + file_path = get_unicode_file_path(unicode_version, filename) + + if os.path.exists(file_path): + # Assume file on the server didn't change if it's been saved before. + continue + + if filename == UnicodeFiles.README: + with open(file_path, "wb") as fd: + fd.write(readme_content) + else: + url = get_fetch_url(filename) + print("Fetching: {}".format(url)) + subprocess.check_call(("curl", "-o", file_path, url)) + + return unicode_version + + +def check_stored_version(version): + # type: (Optional[str]) -> Optional[UnicodeVersion] + """ + Given desired Unicode version, return the version + if stored files are all present, and `None` otherwise. + """ + if not version: + # If no desired version specified, we should check what's the latest + # version, skipping stored version checks. + return None + + fetch_dir = os.path.join(FETCH_DIR, version) -def fetch(f): - path = fdir + os.path.basename(f) - if not os.path.exists(path): - os.system("curl -o {0}{1} ftp://ftp.unicode.org/Public/UNIDATA/{1}".format(fdir, f)) + for filename in ALL_UNICODE_FILES: + file_path = os.path.join(fetch_dir, filename) + + if not os.path.exists(file_path): + return None + + with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd: + return parse_readme_unicode_version(fd.read()) + + +def parse_readme_unicode_version(readme_content): + # type: (str) -> UnicodeVersion + """ + Parse the Unicode version contained in their `ReadMe.txt` file. + """ + # "Raw string" is necessary for \d not being treated as escape char + # (for the sake of compat with future Python versions). + # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior + pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" + groups = re.search(pattern, readme_content).groups() + + return UnicodeVersion(*map(int, groups), as_str=".".join(groups)) + + +def get_unicode_dir(unicode_version): + # type: (UnicodeVersion) -> str + """ + Indicate in which parent dir the Unicode data files should be stored. + + This returns a full, absolute path. + """ + return os.path.join(FETCH_DIR, unicode_version.as_str) + + +def get_unicode_file_path(unicode_version, filename): + # type: (UnicodeVersion, str) -> str + """ + Indicate where the Unicode data file should be stored. + """ + return os.path.join(get_unicode_dir(unicode_version), filename) - if not os.path.exists(path): - sys.stderr.write("cannot load %s" % f) - exit(1) def is_surrogate(n): - return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] - -def load_unicode_data(f): - fetch(f) - gencats = {} - to_lower = {} - to_upper = {} - to_title = {} - combines = {} - canon_decomp = {} - compat_decomp = {} - - udict = {} + # type: (int) -> bool + """ + Tell if given codepoint is a surrogate (not a valid Rust character). + """ + return SURROGATE_CODEPOINTS_RANGE[0] <= n <= SURROGATE_CODEPOINTS_RANGE[1] + + +def load_unicode_data(file_path): + # type: (str) -> UnicodeData + """ + Load main Unicode data. 
+ """ + # Conversions + to_lower = {} # type: Dict[int, Tuple[int, int, int]] + to_upper = {} # type: Dict[int, Tuple[int, int, int]] + to_title = {} # type: Dict[int, Tuple[int, int, int]] + + # Decompositions + compat_decomp = {} # type: Dict[int, List[int]] + canon_decomp = {} # type: Dict[int, List[int]] + + # Combining characters + # FIXME: combines are not used + combines = defaultdict(set) # type: Dict[str, Set[int]] + + # Categories + general_categories = defaultdict(set) # type: Dict[str, Set[int]] + category_assigned_codepoints = set() # type: Set[int] + + all_codepoints = {} + range_start = -1 - for line in fileinput.input(fdir + f): - data = line.split(';') + + for line in fileinput.input(file_path): + data = line.split(";") if len(data) != 15: continue - cp = int(data[0], 16) - if is_surrogate(cp): + codepoint = int(data[0], 16) + if is_surrogate(codepoint): continue if range_start >= 0: - for i in range(range_start, cp): - udict[i] = data + for i in range(range_start, codepoint): + all_codepoints[i] = data range_start = -1 if data[1].endswith(", First>"): - range_start = cp + range_start = codepoint continue - udict[cp] = data + all_codepoints[codepoint] = data - for code in udict: + for code, data in all_codepoints.items(): (code_org, name, gencat, combine, bidi, decomp, deci, digit, num, mirror, - old, iso, upcase, lowcase, titlecase) = udict[code] + old, iso, upcase, lowcase, titlecase) = data + + # Generate char to char direct common and simple conversions: - # generate char to char direct common and simple conversions - # uppercase to lowercase + # Uppercase to lowercase if lowcase != "" and code_org != lowcase: to_lower[code] = (int(lowcase, 16), 0, 0) - # lowercase to uppercase + # Lowercase to uppercase if upcase != "" and code_org != upcase: to_upper[code] = (int(upcase, 16), 0, 0) - # title case + # Title case if titlecase.strip() != "" and code_org != titlecase: to_title[code] = (int(titlecase, 16), 0, 0) - # store decomposition, if given - if decomp != "": - if decomp.startswith('<'): - seq = [] - for i in decomp.split()[1:]: - seq.append(int(i, 16)) - compat_decomp[code] = seq + # Store decomposition, if given + if decomp: + decompositions = decomp.split()[1:] + decomp_code_points = [int(i, 16) for i in decompositions] + + if decomp.startswith("<"): + # Compatibility decomposition + compat_decomp[code] = decomp_code_points else: - seq = [] - for i in decomp.split(): - seq.append(int(i, 16)) - canon_decomp[code] = seq - - # place letter in categories as appropriate - for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []): - if cat not in gencats: - gencats[cat] = [] - gencats[cat].append(code) - - # record combining class, if any + # Canonical decomposition + canon_decomp[code] = decomp_code_points + + # Place letter in categories as appropriate. + for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])): + general_categories[cat].add(code) + category_assigned_codepoints.add(code) + + # Record combining class, if any. if combine != "0": - if combine not in combines: - combines[combine] = [] - combines[combine].append(code) - - # generate Not_Assigned from Assigned - gencats["Cn"] = gen_unassigned(gencats["Assigned"]) - # Assigned is not a real category - del(gencats["Assigned"]) + combines[combine].add(code) + + # Generate Not_Assigned from Assigned. 
+ general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints) + # Other contains Not_Assigned - gencats["C"].extend(gencats["Cn"]) - gencats = group_cats(gencats) - combines = to_combines(group_cats(combines)) + general_categories["C"].update(general_categories["Cn"]) + + grouped_categories = group_categories(general_categories) - return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title) + # FIXME: combines are not used + return UnicodeData( + to_lower=to_lower, to_upper=to_upper, to_title=to_title, + compat_decomp=compat_decomp, canon_decomp=canon_decomp, + general_categories=grouped_categories, combines=combines, + ) -def load_special_casing(f, to_upper, to_lower, to_title): - fetch(f) - for line in fileinput.input(fdir + f): - data = line.split('#')[0].split(';') + +def load_special_casing(file_path, unicode_data): + # type: (str, UnicodeData) -> None + """ + Load special casing data and enrich given Unicode data. + """ + for line in fileinput.input(file_path): + data = line.split("#")[0].split(";") if len(data) == 5: code, lower, title, upper, _comment = data elif len(data) == 6: @@ -155,243 +358,399 @@ def load_special_casing(f, to_upper, to_lower, to_title): title = title.strip() upper = upper.strip() key = int(code, 16) - for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]: + for (map_, values) in ((unicode_data.to_lower, lower), + (unicode_data.to_upper, upper), + (unicode_data.to_title, title)): if values != code: - values = [int(i, 16) for i in values.split()] - for _ in range(len(values), 3): - values.append(0) - assert len(values) == 3 - map_[key] = values - -def group_cats(cats): - cats_out = {} - for cat in cats: - cats_out[cat] = group_cat(cats[cat]) - return cats_out - -def group_cat(cat): - cat_out = [] - letters = sorted(set(cat)) - cur_start = letters.pop(0) - cur_end = cur_start - for letter in letters: - assert letter > cur_end, \ - "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter)) - if letter == cur_end + 1: - cur_end = letter - else: - cat_out.append((cur_start, cur_end)) - cur_start = cur_end = letter - cat_out.append((cur_start, cur_end)) - return cat_out - -def ungroup_cat(cat): - cat_out = [] - for (lo, hi) in cat: - while lo <= hi: - cat_out.append(lo) - lo += 1 - return cat_out - -def gen_unassigned(assigned): - assigned = set(assigned) - return ([i for i in range(0, 0xd800) if i not in assigned] + - [i for i in range(0xe000, 0x110000) if i not in assigned]) - -def to_combines(combs): - combs_out = [] - for comb in combs: - for (lo, hi) in combs[comb]: - combs_out.append((lo, hi, comb)) - combs_out.sort(key=lambda comb: comb[0]) - return combs_out - -def format_table_content(f, content, indent): - line = " "*indent + split = values.split() + + codepoints = list(itertools.chain( + (int(i, 16) for i in split), + (0 for _ in range(len(split), 3)) + )) + + assert len(codepoints) == 3 + map_[key] = codepoints + + +def group_categories(mapping): + # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]] + """ + Group codepoints mapped in "categories". + """ + return {category: group_codepoints(codepoints) + for category, codepoints in mapping.items()} + + +def group_codepoints(codepoints): + # type: (Iterable[int]) -> List[Tuple[int, int]] + """ + Group integral values into continuous, disjoint value ranges. + + Performs value deduplication. + + :return: sorted list of pairs denoting start and end of codepoint + group values, both ends inclusive. 
+ + >>> group_codepoints([1, 2, 10, 11, 12, 3, 4]) + [(1, 4), (10, 12)] + >>> group_codepoints([1]) + [(1, 1)] + >>> group_codepoints([1, 5, 6]) + [(1, 1), (5, 6)] + >>> group_codepoints([]) + [] + """ + sorted_codes = sorted(set(codepoints)) + result = [] # type: List[Tuple[int, int]] + + if not sorted_codes: + return result + + next_codes = sorted_codes[1:] + start_code = sorted_codes[0] + + for code, next_code in zip_longest(sorted_codes, next_codes, fillvalue=None): + if next_code is None or next_code - code != 1: + result.append((start_code, code)) + start_code = next_code + + return result + + +def ungroup_codepoints(codepoint_pairs): + # type: (Iterable[Tuple[int, int]]) -> List[int] + """ + The inverse of group_codepoints -- produce a flat list of values + from value range pairs. + + >>> ungroup_codepoints([(1, 4), (10, 12)]) + [1, 2, 3, 4, 10, 11, 12] + >>> ungroup_codepoints([(1, 1), (5, 6)]) + [1, 5, 6] + >>> ungroup_codepoints(group_codepoints([1, 2, 7, 8])) + [1, 2, 7, 8] + >>> ungroup_codepoints([]) + [] + """ + return list(itertools.chain.from_iterable( + range(lo, hi + 1) for lo, hi in codepoint_pairs + )) + + +def get_unassigned_codepoints(assigned_codepoints): + # type: (Set[int]) -> Set[int] + """ + Given a set of "assigned" codepoints, return a set + of these that are not in assigned and not surrogate. + """ + return {i for i in range(0, 0x110000) + if i not in assigned_codepoints and not is_surrogate(i)} + + +def generate_table_lines(items, indent, wrap=98): + # type: (Iterable[str], int, int) -> Iterator[str] + """ + Given table items, generate wrapped lines of text with comma-separated items. + + This is a generator function. + + :param wrap: soft wrap limit (characters per line), integer. + """ + line = " " * indent first = True - for chunk in content.split(","): - if len(line) + len(chunk) < 98: + for item in items: + if len(line) + len(item) < wrap: if first: - line += chunk + line += item else: - line += ", " + chunk + line += ", " + item first = False else: - f.write(line + ",\n") - line = " "*indent + chunk - f.write(line) - -def load_properties(f, interestingprops): - fetch(f) - props = {} - re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)") - re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") - - for line in fileinput.input(fdir + os.path.basename(f)): - prop = None - d_lo = 0 - d_hi = 0 - m = re1.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(1) - prop = m.group(2) - else: - m = re2.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(2) - prop = m.group(3) + yield line + ",\n" + line = " " * indent + item + + yield line + + +def load_properties(file_path, interesting_props): + # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]] + """ + Load properties data and return in grouped form. + """ + props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]] + # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars + # (for the sake of compat with future Python versions). + # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior + re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)") + re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") + + for line in fileinput.input(file_path): + match = re1.match(line) or re2.match(line) + if match: + groups = match.groups() + + if len(groups) == 2: + # `re1` matched (2 groups). 
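
# Illustrative aside, not part of the diff: the itertools.chain expression in
# load_special_casing above pads every SpecialCasing mapping out to exactly
# three codepoints, so the generated Rust tables can use [char; 3]. The helper
# name below is invented; the sample values follow SpecialCasing.txt
# conventions (e.g. U+0130 lowercases to the two codepoints "0069 0307").
def _pad_to_three(values):
    split = values.split()
    return list(itertools.chain(
        (int(i, 16) for i in split),
        (0 for _ in range(len(split), 3)),
    ))

assert _pad_to_three("0069 0307") == [0x69, 0x307, 0]
assert _pad_to_three("0046 0046 0049") == [0x46, 0x46, 0x49]
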
+ d_lo, prop = groups + d_hi = d_lo else: - continue - if interestingprops and prop not in interestingprops: + d_lo, d_hi, prop = groups + else: + continue + + if interesting_props and prop not in interesting_props: continue - d_lo = int(d_lo, 16) - d_hi = int(d_hi, 16) - if prop not in props: - props[prop] = [] - props[prop].append((d_lo, d_hi)) - # optimize if possible + lo_value = int(d_lo, 16) + hi_value = int(d_hi, 16) + + props[prop].append((lo_value, hi_value)) + + # Optimize if possible. for prop in props: - props[prop] = group_cat(ungroup_cat(props[prop])) + props[prop] = group_codepoints(ungroup_codepoints(props[prop])) return props -def escape_char(c): - return "'\\u{%x}'" % c if c != 0 else "'\\0'" -def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True, - pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))): +def escape_char(c): + # type: (int) -> str + r""" + Escape a codepoint for use as Rust char literal. + + Outputs are OK to use as Rust source code as char literals + and they also include necessary quotes. + + >>> escape_char(97) + "'\\u{61}'" + >>> escape_char(0) + "'\\0'" + """ + return r"'\u{%x}'" % c if c != 0 else r"'\0'" + + +def format_char_pair(pair): + # type: (Tuple[int, int]) -> str + """ + Format a pair of two Rust chars. + """ + return "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1])) + + +def generate_table( + name, # type: str + items, # type: List[Tuple[int, int]] + decl_type="&[(char, char)]", # type: str + is_pub=True, # type: bool + format_item=format_char_pair, # type: Callable[[Tuple[int, int]], str] +): + # type: (...) -> Iterator[str] + """ + Generate a nicely formatted Rust constant "table" array. + + This generates actual Rust code. + """ pub_string = "" if is_pub: pub_string = "pub " - f.write(" %sconst %s: %s = &[\n" % (pub_string, name, t_type)) - data = "" + + yield " %sconst %s: %s = &[\n" % (pub_string, name, decl_type) + + data = [] first = True - for dat in t_data: + for item in items: if not first: - data += "," + data.append(",") first = False - data += pfun(dat) - format_table_content(f, data, 8) - f.write("\n ];\n\n") + data.extend(format_item(item)) -def compute_trie(rawdata, chunksize): + for table_line in generate_table_lines("".join(data).split(","), 8): + yield table_line + + yield "\n ];\n\n" + + +def compute_trie(raw_data, chunk_size): + # type: (List[int], int) -> Tuple[List[int], List[int]] + """ + Compute postfix-compressed trie. + + See: bool_trie.rs for more details. + + >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3) + ([0, 0, 1], [1, 2, 3, 4, 5, 6]) + >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3) + ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6]) + """ root = [] - childmap = {} + childmap = {} # type: Dict[Tuple[int, ...], int] child_data = [] - for i in range(len(rawdata) // chunksize): - data = rawdata[i * chunksize: (i + 1) * chunksize] - child = '|'.join(map(str, data)) + + assert len(raw_data) % chunk_size == 0, "Chunks must be equally sized" + + for i in range(len(raw_data) // chunk_size): + data = raw_data[i * chunk_size : (i + 1) * chunk_size] + + # Postfix compression of child nodes (data chunks) + # (identical child nodes are shared). + + # Make a tuple out of the list so it's hashable. 
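
# Illustrative aside, not part of the diff: the two line shapes accepted by
# re1/re2 in load_properties above, shown against sample lines in the
# PropList.txt / DerivedCoreProperties.txt format (a single codepoint versus
# an inclusive codepoint range; trailing "# ..." comments are simply ignored
# because only the start of the line has to match).
_single = re.match(r"^ *([0-9A-F]+) *; *(\w+)", "00AA          ; Alphabetic")
_ranged = re.match(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)", "0041..005A    ; Alphabetic")
assert _single.groups() == ("00AA", "Alphabetic")
assert _ranged.groups() == ("0041", "005A", "Alphabetic")
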
+ child = tuple(data) if child not in childmap: childmap[child] = len(childmap) child_data.extend(data) + root.append(childmap[child]) - return (root, child_data) -def emit_bool_trie(f, name, t_data, is_pub=True): - CHUNK = 64 + return root, child_data + + +def generate_bool_trie(name, codepoint_ranges, is_pub=True): + # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] + """ + Generate Rust code for BoolTrie struct. + + This yields string fragments that should be joined to produce + the final string. + + See: `bool_trie.rs`. + """ + chunk_size = 64 rawdata = [False] * 0x110000 - for (lo, hi) in t_data: + for (lo, hi) in codepoint_ranges: for cp in range(lo, hi + 1): rawdata[cp] = True - # convert to bitmap chunks of 64 bits each + # Convert to bitmap chunks of `chunk_size` bits each. chunks = [] - for i in range(0x110000 // CHUNK): + for i in range(0x110000 // chunk_size): chunk = 0 - for j in range(64): - if rawdata[i * 64 + j]: + for j in range(chunk_size): + if rawdata[i * chunk_size + j]: chunk |= 1 << j chunks.append(chunk) pub_string = "" if is_pub: pub_string = "pub " - f.write(" %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)) - f.write(" r1: [\n") - data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK]) - format_table_content(f, data, 12) - f.write("\n ],\n") + yield " %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name) + yield " r1: [\n" + data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size]) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" # 0x800..0x10000 trie - (r2, r3) = compute_trie(chunks[0x800 // CHUNK : 0x10000 // CHUNK], 64 // CHUNK) - f.write(" r2: [\n") - data = ','.join(str(node) for node in r2) - format_table_content(f, data, 12) - f.write("\n ],\n") - f.write(" r3: &[\n") - data = ','.join('0x%016x' % chunk for chunk in r3) - format_table_content(f, data, 12) - f.write("\n ],\n") + (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size) + yield " r2: [\n" + data = map(str, r2) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r3: &[\n" + data = ("0x%016x" % node for node in r3) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" # 0x10000..0x110000 trie - (mid, r6) = compute_trie(chunks[0x10000 // CHUNK : 0x110000 // CHUNK], 64 // CHUNK) + (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], + 64 // chunk_size) (r4, r5) = compute_trie(mid, 64) - f.write(" r4: [\n") - data = ','.join(str(node) for node in r4) - format_table_content(f, data, 12) - f.write("\n ],\n") - f.write(" r5: &[\n") - data = ','.join(str(node) for node in r5) - format_table_content(f, data, 12) - f.write("\n ],\n") - f.write(" r6: &[\n") - data = ','.join('0x%016x' % chunk for chunk in r6) - format_table_content(f, data, 12) - f.write("\n ],\n") - - f.write(" };\n\n") - -def emit_small_bool_trie(f, name, t_data, is_pub=True): - last_chunk = max(hi // 64 for (lo, hi) in t_data) + + yield " r4: [\n" + data = map(str, r4) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r5: &[\n" + data = map(str, r5) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r6: &[\n" + data = ("0x%016x" % node for node in r6) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " };\n\n" + + +def 
generate_small_bool_trie(name, codepoint_ranges, is_pub=True): + # type: (str, List[Tuple[int, int]], bool) -> Iterator[str] + """ + Generate Rust code for `SmallBoolTrie` struct. + + See: `bool_trie.rs`. + """ + last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges) n_chunks = last_chunk + 1 chunks = [0] * n_chunks - for (lo, hi) in t_data: + for (lo, hi) in codepoint_ranges: for cp in range(lo, hi + 1): - if cp // 64 >= len(chunks): - print(cp, cp // 64, len(chunks), lo, hi) + assert cp // 64 < len(chunks) chunks[cp // 64] |= 1 << (cp & 63) pub_string = "" if is_pub: pub_string = "pub " - f.write(" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n" - % (pub_string, name)) + + yield (" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n" + % (pub_string, name)) (r1, r2) = compute_trie(chunks, 1) - f.write(" r1: &[\n") - data = ','.join(str(node) for node in r1) - format_table_content(f, data, 12) - f.write("\n ],\n") - - f.write(" r2: &[\n") - data = ','.join('0x%016x' % node for node in r2) - format_table_content(f, data, 12) - f.write("\n ],\n") - - f.write(" };\n\n") - -def emit_property_module(f, mod, tbl, emit): - f.write("pub mod %s {\n" % mod) - for cat in sorted(emit): - if cat in ["Cc", "White_Space", "Pattern_White_Space"]: - emit_small_bool_trie(f, "%s_table" % cat, tbl[cat]) - f.write(" pub fn %s(c: char) -> bool {\n" % cat) - f.write(" %s_table.lookup(c)\n" % cat) - f.write(" }\n\n") + yield " r1: &[\n" + data = (str(node) for node in r1) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " r2: &[\n" + data = ("0x%016x" % node for node in r2) + for fragment in generate_table_lines(data, 12): + yield fragment + yield "\n ],\n" + + yield " };\n\n" + + +def generate_property_module(mod, grouped_categories, category_subset): + # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str] + """ + Generate Rust code for module defining properties. + """ + + yield "pub mod %s {\n" % mod + for cat in sorted(category_subset): + if cat in ("Cc", "White_Space", "Pattern_White_Space"): + generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat]) else: - emit_bool_trie(f, "%s_table" % cat, tbl[cat]) - f.write(" pub fn %s(c: char) -> bool {\n" % cat) - f.write(" %s_table.lookup(c)\n" % cat) - f.write(" }\n\n") - f.write("}\n\n") - -def emit_conversions_module(f, to_upper, to_lower, to_title): - f.write("pub mod conversions {") - f.write(""" + generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat]) + + for fragment in generator: + yield fragment + + yield " pub fn %s(c: char) -> bool {\n" % cat + yield " %s_table.lookup(c)\n" % cat + yield " }\n\n" + + yield "}\n\n" + + +def generate_conversions_module(unicode_data): + # type: (UnicodeData) -> Iterator[str] + """ + Generate Rust code for module defining conversions. 
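
# Illustrative aside, not part of the diff: unlike the old emit_* functions,
# which wrote straight to a file object, the generate_* functions above yield
# plain string fragments, so callers decide where the text goes (join them,
# or feed them into an in-memory buffer as main() does). The table name and
# codepoint range below are example values only.
_fragment = "".join(generate_small_bool_trie("Cc_table", [(0x00, 0x1F)]))
assert "pub const Cc_table: &super::SmallBoolTrie" in _fragment
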
+ """ + + yield "pub mod conversions {" + yield """ pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { None => [c, '\\0', '\\0'], @@ -408,80 +767,109 @@ def emit_conversions_module(f, to_upper, to_lower, to_title): fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option { table.binary_search_by(|&(key, _)| key.cmp(&c)).ok() - } + }\n\n""" + + decl_type = "&[(char, [char; 3])]" + format_conversion = lambda x: "({},[{},{},{}])".format(*( + escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2]) + )) + + for fragment in generate_table( + name="to_lowercase_table", + items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]), + decl_type=decl_type, + is_pub=False, + format_item=format_conversion + ): + yield fragment + + for fragment in generate_table( + name="to_uppercase_table", + items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]), + decl_type=decl_type, + is_pub=False, + format_item=format_conversion + ): + yield fragment + + yield "}\n" + + +def parse_args(): + # type: () -> argparse.Namespace + """ + Parse command line arguments. + """ + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-v", "--version", default=None, type=str, + help="Unicode version to use (if not specified," + " defaults to latest release).") + + return parser.parse_args() + + +def main(): + # type: () -> None + """ + Script entry point. + """ + args = parse_args() + + unicode_version = fetch_files(args.version) + print("Using Unicode version: {}".format(unicode_version.as_str)) + + # All the writing happens entirely in memory, we only write to file + # once we have generated the file content (it's not very large, <1 MB). + buf = StringIO() + buf.write(PREAMBLE) + + unicode_version_notice = textwrap.dedent(""" + /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of + /// `char` and `str` methods are based on. + #[unstable(feature = "unicode_version", issue = "49726")] + pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{ + major: {version.major}, + minor: {version.minor}, + micro: {version.micro}, + _priv: (), + }}; + """).format(version=unicode_version) + buf.write(unicode_version_notice) + + get_path = lambda f: get_unicode_file_path(unicode_version, f) + + unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA)) + load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data) + + want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", + "Cased", "Case_Ignorable", "Grapheme_Extend"} + derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived) + + props = load_properties(get_path(UnicodeFiles.PROPS), + {"White_Space", "Join_Control", "Noncharacter_Code_Point", + "Pattern_White_Space"}) + + # Category tables + for (name, categories, category_subset) in ( + ("general_category", unicode_data.general_categories, ["N", "Cc"]), + ("derived_property", derived, want_derived), + ("property", props, ["White_Space", "Pattern_White_Space"]) + ): + for fragment in generate_property_module(name, categories, category_subset): + buf.write(fragment) + + for fragment in generate_conversions_module(unicode_data): + buf.write(fragment) + + tables_rs_path = os.path.join(THIS_DIR, "tables.rs") + + # Actually write out the file content. + # Will overwrite the file if it exists. 
+ with open(tables_rs_path, "w") as fd: + fd.write(buf.getvalue()) + + print("Regenerated tables.rs.") -""") - t_type = "&[(char, [char; 3])]" - pfun = lambda x: "(%s,[%s,%s,%s])" % ( - escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])) - emit_table(f, "to_lowercase_table", - sorted(to_lower.items(), key=operator.itemgetter(0)), - is_pub=False, t_type = t_type, pfun=pfun) - emit_table(f, "to_uppercase_table", - sorted(to_upper.items(), key=operator.itemgetter(0)), - is_pub=False, t_type = t_type, pfun=pfun) - f.write("}\n\n") - -def emit_norm_module(f, canon, compat, combine, norm_props): - canon_keys = sorted(canon.keys()) - - compat_keys = sorted(compat.keys()) - - canon_comp = {} - comp_exclusions = norm_props["Full_Composition_Exclusion"] - for char in canon_keys: - if any(lo <= char <= hi for lo, hi in comp_exclusions): - continue - decomp = canon[char] - if len(decomp) == 2: - if decomp[0] not in canon_comp: - canon_comp[decomp[0]] = [] - canon_comp[decomp[0]].append( (decomp[1], char) ) - canon_comp_keys = sorted(canon_comp.keys()) if __name__ == "__main__": - r = fdir + "tables.rs" - if os.path.exists(r): - os.remove(r) - with open(r, "w") as rf: - # write the file's preamble - rf.write(preamble) - - # download and parse all the data - fetch("ReadMe.txt") - with open(fdir + "ReadMe.txt") as readme: - pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" - unicode_version = re.search(pattern, readme.read()).groups() - rf.write(""" -/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of -/// `char` and `str` methods are based on. -#[unstable(feature = "unicode_version", issue = "49726")] -pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion { - major: %s, - minor: %s, - micro: %s, - _priv: (), -}; -""" % unicode_version) - (canon_decomp, compat_decomp, gencats, combines, - to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt") - load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title) - want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", - "Cased", "Case_Ignorable", "Grapheme_Extend"] - derived = load_properties("DerivedCoreProperties.txt", want_derived) - scripts = load_properties("Scripts.txt", []) - props = load_properties("PropList.txt", - ["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"]) - norm_props = load_properties("DerivedNormalizationProps.txt", - ["Full_Composition_Exclusion"]) - - # category tables - for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \ - ("derived_property", derived, want_derived), \ - ("property", props, ["White_Space", "Pattern_White_Space"]): - emit_property_module(rf, name, cat, pfuns) - - # normalizations and conversions module - emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props) - emit_conversions_module(rf, to_upper, to_lower, to_title) - print("Regenerated tables.rs.") + main()
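
Usage note, not part of the diff: based on parse_args and main above, the script is driven as shown below (the pinned version string is only an example value). Downloaded files are cached under src/libcore/unicode/downloaded/<version>/ and tables.rs is rewritten next to the script.

    ./unicode.py               # fetch and use the latest published Unicode release
    ./unicode.py -v 12.1.0     # pin a specific Unicode version
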