diff --git a/docs/tar.md b/docs/tar.md index f9e82e507..1d3a05a5a 100644 --- a/docs/tar.md +++ b/docs/tar.md @@ -100,7 +100,7 @@ Rule that executes BSD `tar`. Most users should use the [`tar`](#tar) macro, rat | out | Resulting tar file to write. If absent, `[name].tar` is written. | Label | optional | `None` | | args | Additional flags permitted by BSD tar; see the man page. | List of strings | optional | `[]` | | compress | Compress the archive file with a supported algorithm. | String | optional | `""` | -| compute_unused_inputs | Whether to discover and prune input files that will not contribute to the archive.

Unused inputs are discovered by comparing the set of input files in `srcs` to the set of files referenced by `mtree`. Files not used for content by the mtree specification will not be read by the `tar` tool when creating the archive and can be pruned from the input set using the `unused_inputs_list` [mechanism](https://bazel.build/contribute/codebase#input-discovery).

Benefits: pruning unused input files can reduce the amount of work the build system must perform. Pruned files are not included in the action cache key; changes to them do not invalidate the cache entry, which can lead to higher cache hit rates. Actions do not need to block on the availability of pruned inputs, which can increase the available parallelism of builds. Pruned files do not need to be transferred to remote-execution workers, which can reduce network costs.

Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The comparison performed between `srcs` and `mtree` is currently inexact and may fail to handle handwritten or externally-derived mtree specifications. However, it is safe to use this feature when the lines found in `mtree` are derived from one or more `mtree_spec` rules, filtered and/or merged on whole-line basis only.

Possible values:

- `compute_unused_inputs = 1`: Always perform unused input discovery and pruning. - `compute_unused_inputs = 0`: Never discover or prune unused inputs. - `compute_unused_inputs = -1`: Discovery and pruning of unused inputs is controlled by the --[no]@aspect_bazel_lib//lib:tar_compute_unused_inputs flag. | Integer | optional | `-1` | +| compute_unused_inputs | Whether to discover and prune input files that will not contribute to the archive.

Unused inputs are discovered by comparing the set of input files in `srcs` to the set of files referenced by `mtree`. Files not used for content by the mtree specification will not be read by the `tar` tool when creating the archive and can be pruned from the input set using the `unused_inputs_list` [mechanism](https://bazel.build/contribute/codebase#input-discovery).

Benefits: pruning unused input files can reduce the amount of work the build system must perform. Pruned files are not included in the action cache key; changes to them do not invalidate the cache entry, which can lead to higher cache hit rates. Actions do not need to block on the availability of pruned inputs, which can increase the available parallelism of builds. Pruned files do not need to be transferred to remote-execution workers, which can reduce network costs.

Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The comparison performed between `srcs` and `mtree` is exact. There are no known circumstances where incorrect results are anticipated.

Possible values:

- `compute_unused_inputs = 1`: Always perform unused input discovery and pruning. - `compute_unused_inputs = 0`: Never discover or prune unused inputs. - `compute_unused_inputs = -1`: Discovery and pruning of unused inputs is controlled by the --[no]@aspect_bazel_lib//lib:tar_compute_unused_inputs flag. | Integer | optional | `-1` | | mode | A mode indicator from the following list, copied from the tar manpage:

- create: Create a new archive containing the specified items. - append: Like `create`, but new entries are appended to the archive. Note that this only works on uncompressed archives stored in regular files. The -f option is required. - list: List archive contents to stdout. - update: Like `append`, but new entries are added only if they have a modification date newer than the corresponding entry in the archive. Note that this only works on uncompressed archives stored in regular files. The -f option is required. - extract: Extract to disk from the archive. If a file with the same name appears more than once in the archive, each copy will be extracted, with later copies overwriting (replacing) earlier copies. | String | optional | `"create"` | | mtree | An mtree specification file | Label | required | | diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 2e9654044..c1d372571 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -1,5 +1,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("//lib:run_binary.bzl", "run_binary") load("//lib:utils.bzl", "is_bazel_7_or_greater") +load("//lib:write_source_files.bzl", "write_source_files") exports_files( [ @@ -8,6 +10,9 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", + "unvis_canonical.sed", + "vis_canonicalize.sed", + "vis_escape_nonascii.sed", ], visibility = ["//visibility:public"], ) @@ -279,9 +284,13 @@ bzl_library( bzl_library( name = "tar", - srcs = ["tar.bzl"], + srcs = [ + "tar.bzl", + "vis_escape_ascii.bzl", + ], visibility = ["//lib:__subpackages__"], deps = [ + ":strings.bzl", "@aspect_bazel_lib//lib:paths", "@bazel_skylib//rules:common_settings", ], @@ -362,6 +371,9 @@ bzl_library( name = "strings", srcs = ["strings.bzl"], visibility = ["//lib:__subpackages__"], + deps = [ + "@bazel_skylib//lib:types", + ], ) bzl_library( @@ -369,3 +381,30 @@ bzl_library( srcs = ["zstd_toolchain.bzl"], visibility = ["//lib:__subpackages__"], ) 
+ +run_binary( + name = "run_gen_vis_scripts", + outs = [ + "_unvis_canonical.sed", + "_vis_canonicalize.sed", + "_vis_escape_ascii.bzl", + "_vis_escape_nonascii.sed", + ], + args = [ + "unvis_canonical.sed=$(location _unvis_canonical.sed)", + "vis_canonicalize.sed=$(location _vis_canonicalize.sed)", + "vis_escape_ascii.bzl=$(location _vis_escape_ascii.bzl)", + "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)", + ], + tool = "//tools/gen_vis_scripts", +) + +write_source_files( + name = "write_vis_scripts", + files = { + "unvis_canonical.sed": ":_unvis_canonical.sed", + "vis_canonicalize.sed": ":_vis_canonicalize.sed", + "vis_escape_ascii.bzl": ":_vis_escape_ascii.bzl", + "vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed", + }, +) diff --git a/lib/private/strings.bzl b/lib/private/strings.bzl index acf5157ff..479b0cd10 100644 --- a/lib/private/strings.bzl +++ b/lib/private/strings.bzl @@ -1,5 +1,7 @@ "String utilities" +load("@bazel_skylib//lib:types.bzl", "types") + CHAR_TO_INT = { "\0": 0, "\1": 1, @@ -653,3 +655,104 @@ def split_args(s): if arg != "": args.append(arg) return args + +def maketrans(x): + """ + Return a translation table usable with translate(). + + Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans) + of the same name. + + Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not + possible. Entries for characters outside this range will trigger a failure. + + Args: + x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings) + to Unicode ordinals, strings, or None. Character keys will be converted to ordinals. + + Returns: + dict. The translation table. 
+ """ + + if not types.is_dict(x): + fail("if you give only one argument to maketrans it must be a dict") + + table = {} + + for (k, v) in x.items(): + if types.is_int(k): + if k > 0xFF: + fail("most Unicode is unsupported") + table[k] = v + elif types.is_string(k): + if len(k) != 1: + fail("string keys in translate table must be of length 1") + codepoint = ord(k) + if codepoint == None: + fail("could not compute ord('{}'), most Unicode is unsupported".format(k)) + table[codepoint] = v + else: + fail("keys in translate table must be strings or integers") + + return table + +def translate(s, table): + """ + Replace characters in a string according to a translation table. + + Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate) + of the same name. + + Characters with entries in the table are replaced in the output. + Characters mapped to None are deleted. + Characters absent from the table are mirrored to the output untouched. + + Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not + possible. Characters outside this range will be silently mirrored to the output without consulting + the translation table. + + Args: + s: str. Input string upon which to perform replacements. + table: dict. Translation table. Maps from Unicode ordinal (int) keys to other Unicode ordinals, strings, or None. + + Returns: + str. Output string derived from input string with substitutions and deletions applied from table. + """ + + if not types.is_string(s): + fail("first argument to translate must be a string") + if not types.is_dict(table): + fail("second argument to translate must be a dict") + + parts = [] + lit_start = None # Index of start of current run of literal (i.e. no-op translation) content, or None. + for (i, c) in enumerate(s.elems()): + codepoint = ord(c) + if codepoint != None and codepoint in table: + # Terminate the current literal run, if any.
+ if lit_start != None: + parts.append(s[lit_start:i]) + lit_start = None + + replacement = table[codepoint] + if replacement == None: + pass + elif types.is_int(replacement): + parts.append(chr(replacement)) + elif types.is_string(replacement): + parts.append(replacement) + else: + fail("character mapping must return integer, None or str") + + else: # No entry in translation table. + if lit_start == None: + lit_start = i + + # Flush the caudal literal run, if any. + if lit_start != None: + parts.append(s[lit_start:]) + lit_start = None + + if len(parts) == 1: + return parts[0] + return "".join(parts) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index dc5cff549..bbfbb1e6a 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -2,6 +2,8 @@ load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo") load("//lib:paths.bzl", "to_repository_relative_path") +load(":strings.bzl", str_translate = "translate") +load(":vis_escape_ascii.bzl", "VIS_ESCAPE_ASCII") TAR_TOOLCHAIN_TYPE = "@aspect_bazel_lib//lib:tar_toolchain_type" @@ -103,10 +105,8 @@ parallelism of builds. Pruned files do not need to be transferred to remote-exec workers, which can reduce network costs. Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The -comparison performed between `srcs` and `mtree` is currently inexact and may fail to -handle handwritten or externally-derived mtree specifications. However, it is safe to use -this feature when the lines found in `mtree` are derived from one or more `mtree_spec` -rules, filtered and/or merged on whole-line basis only. +comparison performed between `srcs` and `mtree` is exact. There are no known +circumstances where incorrect results are anticipated. 
Possible values: @@ -119,11 +119,15 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), + "_unvis_canonical": attr.label(allow_single_file = True, default = Label("//lib/private:unvis_canonical.sed")), + "_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")), + "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } _mtree_attrs = { "srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True), "out": attr.output(doc = "Resulting specification file to write"), + "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } def _add_compression_args(compress, args): @@ -188,15 +192,9 @@ def _is_unprunable(file): def _fmt_pruanble_inputs_line(file): if _is_unprunable(file): return None - - # The tar.prunable_inputs.txt file has a two columns: - # 1. vis-encoded paths of the files, used in comparison - # 2. un-vis-encoded paths of the files, used for reporting back to Bazel after filtering - path = file.path - return _vis_encode(path) + " " + path + return _vis_encode(file.path) def _fmt_keep_inputs_line(file): - # The tar.keep_inputs.txt file has a single column of vis-encoded paths of the files to keep. return _vis_encode(file.path) def _configured_unused_inputs_file(ctx, srcs, keep): @@ -243,26 +241,33 @@ def _configured_unused_inputs_file(ctx, srcs, keep): # * are not found in any content= or contents= keyword in the MTREE # * are not in the hardcoded KEEP_INPUTS set # - # Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation, stored in field 1, - # before being written out in the un-vis-encoded form Bazel understands, from field 2. 
+ # Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation + # before being written out in the un-vis-encoded form Bazel understands. # # Note: bsdtar (libarchive) accepts both content= and contents= to identify source file: # ref https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1640 - # - # TODO: Make comparison exact by converting all inputs to a canonical vis-encoded form before comparing. - # See also: https://github.com/bazel-contrib/bazel-lib/issues/794 ctx.actions.run_shell( outputs = [unused_inputs], - inputs = [prunable_inputs, keep_inputs, ctx.file.mtree], + inputs = [ + prunable_inputs, + keep_inputs, + ctx.file.mtree, + ctx.file._unvis_canonical, + ctx.file._vis_canonicalize, + ctx.file._vis_escape_nonascii, + ], tools = [coreutils], command = ''' "$COREUTILS" join -v 1 \\ - <("$COREUTILS" sort -u "$PRUNABLE_INPUTS") \\ + <(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ <("$COREUTILS" sort -u \\ - <(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\ - "$KEEP_INPUTS" \\ + <(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\ + | "$COREUTILS" cut -d'=' -f 2- \\ + | sed -Ef "$VIS_CANONICALIZE" \\ + ) \\ + <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ ) \\ - | "$COREUTILS" cut -d' ' -f 2- \\ + | sed -f "$UNVIS_CANONICAL" \\ > "$UNUSED_INPUTS" ''', env = { @@ -271,6 +276,9 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "KEEP_INPUTS": keep_inputs.path, "MTREE": ctx.file.mtree.path, "UNUSED_INPUTS": unused_inputs.path, + "UNVIS_CANONICAL": ctx.file._unvis_canonical.path, + "VIS_CANONICALIZE": ctx.file._vis_canonicalize.path, + "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, }, mnemonic = "UnusedTarInputs", toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type", @@ -278,7 +286,6 @@ def _configured_unused_inputs_file(ctx, srcs, keep): return unused_inputs - # TODO(3.0): 
Access field directly after minimum bazel_compatibility advanced to or beyond v7.0.0. def _repo_mapping_manifest(files_to_run): return getattr(files_to_run, "repo_mapping_manifest", None) @@ -372,8 +379,9 @@ def _to_rlocation_path(file, workspace): return workspace + "/" + file.short_path def _vis_encode(filename): - # TODO(#794): correctly encode all filenames by using vis(3) (or porting it) - return filename.replace(" ", "\\040") + # Escaping of non-ASCII bytes cannot be performed within Starlark. + # After writing content out, a second pass is performed with vis_escape_nonascii.sed. + return str_translate(filename, VIS_ESCAPE_ASCII) def _expand(file, expander, transform = to_repository_relative_path): expanded = expander.expand(file) @@ -400,6 +408,7 @@ def _expand(file, expander, transform = to_repository_relative_path): def _mtree_impl(ctx): out = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".spec") + unescaped = ctx.actions.declare_file(ctx.attr.name + ".spec.unescaped") content = ctx.actions.args() content.set_param_file_format("multiline") @@ -444,7 +453,18 @@ def _mtree_impl(ctx): _mtree_line(_vis_encode(runfiles_dir + "/_repo_mapping"), "file", content = _vis_encode(repo_mapping.path)), ) - ctx.actions.write(out, content = content) + ctx.actions.write(unescaped, content = content) + ctx.actions.run_shell( + outputs = [out], + inputs = [unescaped, ctx.file._vis_escape_nonascii], + command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"', + env = { + "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, + "UNESCAPED": unescaped.path, + "OUT": out.path, + }, + mnemonic = "EscapeNonAscii", + ) return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out])) diff --git a/lib/private/unvis_canonical.sed b/lib/private/unvis_canonical.sed new file mode 100644 index 000000000..9d6ec7e79 --- /dev/null +++ b/lib/private/unvis_canonical.sed @@ -0,0 +1,169 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. 
+# Replace octal escape sequences with the bytes they represent. +# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed + +s/\\000/\x00/g +s/\\001/\x01/g +s/\\002/\x02/g +s/\\003/\x03/g +s/\\004/\x04/g +s/\\005/\x05/g +s/\\006/\x06/g +s/\\007/\x07/g +s/\\010/\x08/g +s/\\011/\x09/g +s/\\012/\x0a/g +s/\\013/\x0b/g +s/\\014/\x0c/g +s/\\015/\x0d/g +s/\\016/\x0e/g +s/\\017/\x0f/g +s/\\020/\x10/g +s/\\021/\x11/g +s/\\022/\x12/g +s/\\023/\x13/g +s/\\024/\x14/g +s/\\025/\x15/g +s/\\026/\x16/g +s/\\027/\x17/g +s/\\030/\x18/g +s/\\031/\x19/g +s/\\032/\x1a/g +s/\\033/\x1b/g +s/\\034/\x1c/g +s/\\035/\x1d/g +s/\\036/\x1e/g +s/\\037/\x1f/g +s/\\040/\x20/g +s/\\177/\x7f/g +s/\\200/\x80/g +s/\\201/\x81/g +s/\\202/\x82/g +s/\\203/\x83/g +s/\\204/\x84/g +s/\\205/\x85/g +s/\\206/\x86/g +s/\\207/\x87/g +s/\\210/\x88/g +s/\\211/\x89/g +s/\\212/\x8a/g +s/\\213/\x8b/g +s/\\214/\x8c/g +s/\\215/\x8d/g +s/\\216/\x8e/g +s/\\217/\x8f/g +s/\\220/\x90/g +s/\\221/\x91/g +s/\\222/\x92/g +s/\\223/\x93/g +s/\\224/\x94/g +s/\\225/\x95/g +s/\\226/\x96/g +s/\\227/\x97/g +s/\\230/\x98/g +s/\\231/\x99/g +s/\\232/\x9a/g +s/\\233/\x9b/g +s/\\234/\x9c/g +s/\\235/\x9d/g +s/\\236/\x9e/g +s/\\237/\x9f/g +s/\\240/\xa0/g +s/\\241/\xa1/g +s/\\242/\xa2/g +s/\\243/\xa3/g +s/\\244/\xa4/g +s/\\245/\xa5/g +s/\\246/\xa6/g +s/\\247/\xa7/g +s/\\250/\xa8/g +s/\\251/\xa9/g +s/\\252/\xaa/g +s/\\253/\xab/g +s/\\254/\xac/g +s/\\255/\xad/g +s/\\256/\xae/g +s/\\257/\xaf/g +s/\\260/\xb0/g +s/\\261/\xb1/g +s/\\262/\xb2/g +s/\\263/\xb3/g +s/\\264/\xb4/g +s/\\265/\xb5/g +s/\\266/\xb6/g +s/\\267/\xb7/g +s/\\270/\xb8/g +s/\\271/\xb9/g +s/\\272/\xba/g +s/\\273/\xbb/g +s/\\274/\xbc/g +s/\\275/\xbd/g +s/\\276/\xbe/g +s/\\277/\xbf/g +s/\\300/\xc0/g +s/\\301/\xc1/g +s/\\302/\xc2/g +s/\\303/\xc3/g +s/\\304/\xc4/g +s/\\305/\xc5/g +s/\\306/\xc6/g +s/\\307/\xc7/g +s/\\310/\xc8/g +s/\\311/\xc9/g +s/\\312/\xca/g +s/\\313/\xcb/g +s/\\314/\xcc/g +s/\\315/\xcd/g +s/\\316/\xce/g +s/\\317/\xcf/g 
+s/\\320/\xd0/g +s/\\321/\xd1/g +s/\\322/\xd2/g +s/\\323/\xd3/g +s/\\324/\xd4/g +s/\\325/\xd5/g +s/\\326/\xd6/g +s/\\327/\xd7/g +s/\\330/\xd8/g +s/\\331/\xd9/g +s/\\332/\xda/g +s/\\333/\xdb/g +s/\\334/\xdc/g +s/\\335/\xdd/g +s/\\336/\xde/g +s/\\337/\xdf/g +s/\\340/\xe0/g +s/\\341/\xe1/g +s/\\342/\xe2/g +s/\\343/\xe3/g +s/\\344/\xe4/g +s/\\345/\xe5/g +s/\\346/\xe6/g +s/\\347/\xe7/g +s/\\350/\xe8/g +s/\\351/\xe9/g +s/\\352/\xea/g +s/\\353/\xeb/g +s/\\354/\xec/g +s/\\355/\xed/g +s/\\356/\xee/g +s/\\357/\xef/g +s/\\360/\xf0/g +s/\\361/\xf1/g +s/\\362/\xf2/g +s/\\363/\xf3/g +s/\\364/\xf4/g +s/\\365/\xf5/g +s/\\366/\xf6/g +s/\\367/\xf7/g +s/\\370/\xf8/g +s/\\371/\xf9/g +s/\\372/\xfa/g +s/\\373/\xfb/g +s/\\374/\xfc/g +s/\\375/\xfd/g +s/\\376/\xfe/g +s/\\377/\xff/g + +# Unvis of backslash must be applied last to avoid double-interpretation. +s/\\134/\\/g diff --git a/lib/private/vis_canonicalize.sed b/lib/private/vis_canonicalize.sed new file mode 100644 index 000000000..4944a8c94 --- /dev/null +++ b/lib/private/vis_canonicalize.sed @@ -0,0 +1,287 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. +# +# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial. +# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal. +# The remaining characters are not escaped; they represent themselves. 
+# +# Input is interpreted as libarchive would, with a wider set of escape sequences: +# * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings +# * \0 means NUL when not the start of an three-digit octal escape sequence +# * \s means SPACE +# * \ is valid as an ordinary backslash when not the start of a valid escape sequence +# +# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942 + +# Escaping of backslashes must be applied first to avoid double-interpretation. +s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g +s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g + +s/\\a/\\007/g +s/\\b/\\010/g +s/\\f/\\014/g +s/\\n/\\012/g +s/\\r/\\015/g +s/\\s/\\040/g +s/\\t/\\011/g +s/\\v/\\013/g + +# NUL special form must be disambiguated from ordinary octal escape sequences. +s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g + +# Remove octal escaping from characters that don't need it. +s/\\041/!/g +s/\\042/"/g +s/\\043/#/g +s/\\044/$/g +s/\\045/%/g +s/\\046/\&/g +s/\\047/'/g +s/\\050/(/g +s/\\051/)/g +s/\\052/*/g +s/\\053/+/g +s/\\054/,/g +s/\\055/-/g +s/\\056/./g +s:\\057:/:g +s/\\060/0/g +s/\\061/1/g +s/\\062/2/g +s/\\063/3/g +s/\\064/4/g +s/\\065/5/g +s/\\066/6/g +s/\\067/7/g +s/\\070/8/g +s/\\071/9/g +s/\\072/:/g +s/\\073/;/g +s/\\074/</g +s/\\075/=/g +s/\\076/>/g +s/\\077/?/g +s/\\100/@/g +s/\\101/A/g +s/\\102/B/g +s/\\103/C/g +s/\\104/D/g +s/\\105/E/g +s/\\106/F/g +s/\\107/G/g +s/\\110/H/g +s/\\111/I/g +s/\\112/J/g +s/\\113/K/g +s/\\114/L/g +s/\\115/M/g +s/\\116/N/g +s/\\117/O/g +s/\\120/P/g +s/\\121/Q/g +s/\\122/R/g +s/\\123/S/g +s/\\124/T/g +s/\\125/U/g +s/\\126/V/g +s/\\127/W/g +s/\\130/X/g +s/\\131/Y/g +s/\\132/Z/g +s/\\133/[/g +s/\\135/]/g +s/\\136/^/g +s/\\137/_/g +s/\\140/`/g +s/\\141/a/g +s/\\142/b/g +s/\\143/c/g +s/\\144/d/g +s/\\145/e/g +s/\\146/f/g +s/\\147/g/g +s/\\150/h/g +s/\\151/i/g +s/\\152/j/g +s/\\153/k/g +s/\\154/l/g +s/\\155/m/g +s/\\156/n/g +s/\\157/o/g +s/\\160/p/g +s/\\161/q/g
+s/\\162/r/g +s/\\163/s/g +s/\\164/t/g +s/\\165/u/g +s/\\166/v/g +s/\\167/w/g +s/\\170/x/g +s/\\171/y/g +s/\\172/z/g +s/\\173/{/g +s/\\174/|/g +s/\\175/}/g +s/\\176/~/g + +# Add octal escaping for characters that need it. +s/\x00/\\000/g +s/\x01/\\001/g +s/\x02/\\002/g +s/\x03/\\003/g +s/\x04/\\004/g +s/\x05/\\005/g +s/\x06/\\006/g +s/\x07/\\007/g +s/\x08/\\010/g +s/\x09/\\011/g +s/\x0b/\\013/g +s/\x0c/\\014/g +s/\x0d/\\015/g +s/\x0e/\\016/g +s/\x0f/\\017/g +s/\x10/\\020/g +s/\x11/\\021/g +s/\x12/\\022/g +s/\x13/\\023/g +s/\x14/\\024/g +s/\x15/\\025/g +s/\x16/\\026/g +s/\x17/\\027/g +s/\x18/\\030/g +s/\x19/\\031/g +s/\x1a/\\032/g +s/\x1b/\\033/g +s/\x1c/\\034/g +s/\x1d/\\035/g +s/\x1e/\\036/g +s/\x1f/\\037/g +s/\x20/\\040/g +s/\x7f/\\177/g +s/\x80/\\200/g +s/\x81/\\201/g +s/\x82/\\202/g +s/\x83/\\203/g +s/\x84/\\204/g +s/\x85/\\205/g +s/\x86/\\206/g +s/\x87/\\207/g +s/\x88/\\210/g +s/\x89/\\211/g +s/\x8a/\\212/g +s/\x8b/\\213/g +s/\x8c/\\214/g +s/\x8d/\\215/g +s/\x8e/\\216/g +s/\x8f/\\217/g +s/\x90/\\220/g +s/\x91/\\221/g +s/\x92/\\222/g +s/\x93/\\223/g +s/\x94/\\224/g +s/\x95/\\225/g +s/\x96/\\226/g +s/\x97/\\227/g +s/\x98/\\230/g +s/\x99/\\231/g +s/\x9a/\\232/g +s/\x9b/\\233/g +s/\x9c/\\234/g +s/\x9d/\\235/g +s/\x9e/\\236/g +s/\x9f/\\237/g +s/\xa0/\\240/g +s/\xa1/\\241/g +s/\xa2/\\242/g +s/\xa3/\\243/g +s/\xa4/\\244/g +s/\xa5/\\245/g +s/\xa6/\\246/g +s/\xa7/\\247/g +s/\xa8/\\250/g +s/\xa9/\\251/g +s/\xaa/\\252/g +s/\xab/\\253/g +s/\xac/\\254/g +s/\xad/\\255/g +s/\xae/\\256/g +s/\xaf/\\257/g +s/\xb0/\\260/g +s/\xb1/\\261/g +s/\xb2/\\262/g +s/\xb3/\\263/g +s/\xb4/\\264/g +s/\xb5/\\265/g +s/\xb6/\\266/g +s/\xb7/\\267/g +s/\xb8/\\270/g +s/\xb9/\\271/g +s/\xba/\\272/g +s/\xbb/\\273/g +s/\xbc/\\274/g +s/\xbd/\\275/g +s/\xbe/\\276/g +s/\xbf/\\277/g +s/\xc0/\\300/g +s/\xc1/\\301/g +s/\xc2/\\302/g +s/\xc3/\\303/g +s/\xc4/\\304/g +s/\xc5/\\305/g +s/\xc6/\\306/g +s/\xc7/\\307/g +s/\xc8/\\310/g +s/\xc9/\\311/g +s/\xca/\\312/g +s/\xcb/\\313/g +s/\xcc/\\314/g +s/\xcd/\\315/g 
+s/\xce/\\316/g +s/\xcf/\\317/g +s/\xd0/\\320/g +s/\xd1/\\321/g +s/\xd2/\\322/g +s/\xd3/\\323/g +s/\xd4/\\324/g +s/\xd5/\\325/g +s/\xd6/\\326/g +s/\xd7/\\327/g +s/\xd8/\\330/g +s/\xd9/\\331/g +s/\xda/\\332/g +s/\xdb/\\333/g +s/\xdc/\\334/g +s/\xdd/\\335/g +s/\xde/\\336/g +s/\xdf/\\337/g +s/\xe0/\\340/g +s/\xe1/\\341/g +s/\xe2/\\342/g +s/\xe3/\\343/g +s/\xe4/\\344/g +s/\xe5/\\345/g +s/\xe6/\\346/g +s/\xe7/\\347/g +s/\xe8/\\350/g +s/\xe9/\\351/g +s/\xea/\\352/g +s/\xeb/\\353/g +s/\xec/\\354/g +s/\xed/\\355/g +s/\xee/\\356/g +s/\xef/\\357/g +s/\xf0/\\360/g +s/\xf1/\\361/g +s/\xf2/\\362/g +s/\xf3/\\363/g +s/\xf4/\\364/g +s/\xf5/\\365/g +s/\xf6/\\366/g +s/\xf7/\\367/g +s/\xf8/\\370/g +s/\xf9/\\371/g +s/\xfa/\\372/g +s/\xfb/\\373/g +s/\xfc/\\374/g +s/\xfd/\\375/g +s/\xfe/\\376/g +s/\xff/\\377/g diff --git a/lib/private/vis_escape_ascii.bzl b/lib/private/vis_escape_ascii.bzl new file mode 100644 index 000000000..eac44ff00 --- /dev/null +++ b/lib/private/vis_escape_ascii.bzl @@ -0,0 +1,42 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. +"A translation table for vis-encoding the ASCII range for mtree." + +load(":strings.bzl", "maketrans") + +VIS_ESCAPE_ASCII = maketrans({ + 0: r"\000", + 1: r"\001", + 2: r"\002", + 3: r"\003", + 4: r"\004", + 5: r"\005", + 6: r"\006", + 7: r"\007", + 8: r"\010", + 9: r"\011", + 10: r"\012", + 11: r"\013", + 12: r"\014", + 13: r"\015", + 14: r"\016", + 15: r"\017", + 16: r"\020", + 17: r"\021", + 18: r"\022", + 19: r"\023", + 20: r"\024", + 21: r"\025", + 22: r"\026", + 23: r"\027", + 24: r"\030", + 25: r"\031", + 26: r"\032", + 27: r"\033", + 28: r"\034", + 29: r"\035", + 30: r"\036", + 31: r"\037", + 32: r"\040", + 92: r"\134", + 127: r"\177", +}) diff --git a/lib/private/vis_escape_nonascii.sed b/lib/private/vis_escape_nonascii.sed new file mode 100644 index 000000000..744713564 --- /dev/null +++ b/lib/private/vis_escape_nonascii.sed @@ -0,0 +1,132 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. 
+# Replace non-ASCII bytes with their octal escape sequences. +# Escaping of ASCII is done in Starlark prior to writing content out. + +s/\x80/\\200/g +s/\x81/\\201/g +s/\x82/\\202/g +s/\x83/\\203/g +s/\x84/\\204/g +s/\x85/\\205/g +s/\x86/\\206/g +s/\x87/\\207/g +s/\x88/\\210/g +s/\x89/\\211/g +s/\x8a/\\212/g +s/\x8b/\\213/g +s/\x8c/\\214/g +s/\x8d/\\215/g +s/\x8e/\\216/g +s/\x8f/\\217/g +s/\x90/\\220/g +s/\x91/\\221/g +s/\x92/\\222/g +s/\x93/\\223/g +s/\x94/\\224/g +s/\x95/\\225/g +s/\x96/\\226/g +s/\x97/\\227/g +s/\x98/\\230/g +s/\x99/\\231/g +s/\x9a/\\232/g +s/\x9b/\\233/g +s/\x9c/\\234/g +s/\x9d/\\235/g +s/\x9e/\\236/g +s/\x9f/\\237/g +s/\xa0/\\240/g +s/\xa1/\\241/g +s/\xa2/\\242/g +s/\xa3/\\243/g +s/\xa4/\\244/g +s/\xa5/\\245/g +s/\xa6/\\246/g +s/\xa7/\\247/g +s/\xa8/\\250/g +s/\xa9/\\251/g +s/\xaa/\\252/g +s/\xab/\\253/g +s/\xac/\\254/g +s/\xad/\\255/g +s/\xae/\\256/g +s/\xaf/\\257/g +s/\xb0/\\260/g +s/\xb1/\\261/g +s/\xb2/\\262/g +s/\xb3/\\263/g +s/\xb4/\\264/g +s/\xb5/\\265/g +s/\xb6/\\266/g +s/\xb7/\\267/g +s/\xb8/\\270/g +s/\xb9/\\271/g +s/\xba/\\272/g +s/\xbb/\\273/g +s/\xbc/\\274/g +s/\xbd/\\275/g +s/\xbe/\\276/g +s/\xbf/\\277/g +s/\xc0/\\300/g +s/\xc1/\\301/g +s/\xc2/\\302/g +s/\xc3/\\303/g +s/\xc4/\\304/g +s/\xc5/\\305/g +s/\xc6/\\306/g +s/\xc7/\\307/g +s/\xc8/\\310/g +s/\xc9/\\311/g +s/\xca/\\312/g +s/\xcb/\\313/g +s/\xcc/\\314/g +s/\xcd/\\315/g +s/\xce/\\316/g +s/\xcf/\\317/g +s/\xd0/\\320/g +s/\xd1/\\321/g +s/\xd2/\\322/g +s/\xd3/\\323/g +s/\xd4/\\324/g +s/\xd5/\\325/g +s/\xd6/\\326/g +s/\xd7/\\327/g +s/\xd8/\\330/g +s/\xd9/\\331/g +s/\xda/\\332/g +s/\xdb/\\333/g +s/\xdc/\\334/g +s/\xdd/\\335/g +s/\xde/\\336/g +s/\xdf/\\337/g +s/\xe0/\\340/g +s/\xe1/\\341/g +s/\xe2/\\342/g +s/\xe3/\\343/g +s/\xe4/\\344/g +s/\xe5/\\345/g +s/\xe6/\\346/g +s/\xe7/\\347/g +s/\xe8/\\350/g +s/\xe9/\\351/g +s/\xea/\\352/g +s/\xeb/\\353/g +s/\xec/\\354/g +s/\xed/\\355/g +s/\xee/\\356/g +s/\xef/\\357/g +s/\xf0/\\360/g +s/\xf1/\\361/g +s/\xf2/\\362/g +s/\xf3/\\363/g 
+s/\xf4/\\364/g +s/\xf5/\\365/g +s/\xf6/\\366/g +s/\xf7/\\367/g +s/\xf8/\\370/g +s/\xf9/\\371/g +s/\xfa/\\372/g +s/\xfb/\\373/g +s/\xfc/\\374/g +s/\xfd/\\375/g +s/\xfe/\\376/g +s/\xff/\\377/g diff --git a/lib/tests/strings_tests.bzl b/lib/tests/strings_tests.bzl index 177dae286..243f48d48 100644 --- a/lib/tests/strings_tests.bzl +++ b/lib/tests/strings_tests.bzl @@ -2,7 +2,7 @@ load("@bazel_skylib//lib:partial.bzl", "partial") load("@bazel_skylib//lib:unittest.bzl", "asserts", "unittest") -load("//lib/private:strings.bzl", "chr", "hex", "ord", "split_args") +load("//lib/private:strings.bzl", "chr", "hex", "maketrans", "ord", "split_args", "translate") def _ord_test_impl(ctx): env = unittest.begin(ctx) @@ -83,6 +83,29 @@ def _split_args_test_impl(ctx): split_args_test = unittest.make(_split_args_test_impl) +def _translate_test_impl(ctx): + env = unittest.begin(ctx) + + table = maketrans({ + "<": ">", + "!": None, + }) + + asserts.equals(env, "...", translate("...", table)) + asserts.equals(env, ">..", translate("<..", table)) + asserts.equals(env, ".>.", translate(".<.", table)) + asserts.equals(env, "..>", translate("..<", table)) + asserts.equals(env, "..", translate("!..", table)) + asserts.equals(env, "..", translate(".!.", table)) + asserts.equals(env, "..", translate("..!", table)) + asserts.equals(env, ">>>", translate("<<<", table)) + asserts.equals(env, "", translate("!!!", table)) + asserts.equals(env, ".>", translate(".$@".format(actual), + # HACK: under default and POSIX locales, MacOS 15.1 and Ubuntu 22.04 disagree on how files with Unicode filenames should be printed. + # LC_ALL=en_US may be inaccurate, but by using a dense 8-bit, single-byte encoding, + # we achieve the effect of leaving the bytes alone and producing a consistent output to assert against.
+ cmd = "LC_ALL=en_US $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), + # toolchains = ["@bsd_tar_toolchains//:resolved_toolchain"], ) diff --git "a/lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" "b/lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" new file mode 100644 index 000000000..388e04c99 --- /dev/null +++ "b/lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" @@ -0,0 +1 @@ +💯 \ No newline at end of file diff --git a/tools/gen_vis_scripts/BUILD.bazel b/tools/gen_vis_scripts/BUILD.bazel new file mode 100644 index 000000000..747e0d6d1 --- /dev/null +++ b/tools/gen_vis_scripts/BUILD.bazel @@ -0,0 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + +go_binary( + name = "gen_vis_scripts", + srcs = ["gen_vis_scripts.go"], + visibility = ["//lib/private:__pkg__"], +) diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go new file mode 100644 index 000000000..b1c4277fc --- /dev/null +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -0,0 +1,171 @@ +// Code generator for vis-encoding support scripts. +package main + +import ( + "fmt" + "io" + "log" + "os" + "strings" + "unicode" +) + +func main() { + for _, arg := range os.Args[1:] { + name, dest, ok := strings.Cut(arg, "=") + if !ok { + log.Fatal("invalid generation spec:", arg) + } + + f, err := os.Create(dest) + if err != nil { + log.Fatal(err) + } + defer mustClose(f) + + switch name { + case "vis_escape_ascii.bzl": + writeEscapeASCIIBzl(f) + case "vis_escape_nonascii.sed": + writeEscapeNonASCIISed(f) + case "vis_canonicalize.sed": + writeVisCanonicalizeSed(f) + case "unvis_canonical.sed": + writeUnvisCanonicalSed(f) + default: + log.Fatal("unknown generated content:", name) + } + } +} + +func mustClose(f *os.File) { + if err := f.Close(); err != nil { + log.Fatal(err) + } +} + +const newline rune = '\n' + +// Escape all characters identified by mtree(5) as requiring escaping. Plus whitespace. 
func shouldEscape(b byte) bool {
	// Anything above MaxASCII is escaped unconditionally, so the rune-based
	// predicates below only ever decide the outcome for 7-bit values.
	r := rune(b)
	return b == '\\' || b > unicode.MaxASCII || unicode.IsSpace(r) || !unicode.IsPrint(r)
}

// writeEscapeASCIIBzl writes a Starlark file defining VIS_ESCAPE_ASCII, a
// maketrans table mapping every ASCII code point for which shouldEscape
// reports true to its three-digit octal escape sequence.
func writeEscapeASCIIBzl(w io.Writer) {
	fmt.Fprintln(w, strings.TrimSpace(`
# Code generated by gen_vis_scripts. DO NOT EDIT.
"A translation table for vis-encoding the ASCII range for mtree."

load(":strings.bzl", "maketrans")

VIS_ESCAPE_ASCII = maketrans({
	`))

	for i := 0; i <= unicode.MaxASCII; i++ {
		b := byte(i)
		if !shouldEscape(b) {
			continue
		}
		// Raw Starlark string so the backslash reaches the output verbatim.
		fmt.Fprintf(w, `    %[1]d: r"\%03[1]o",`+"\n", b)
	}
	fmt.Fprintln(w, "})")
}

// writeEscapeNonASCIISed writes a sed script that replaces each byte in the
// range 0x80..0xFF with its three-digit octal escape sequence.
func writeEscapeNonASCIISed(w io.Writer) {
	fmt.Fprintln(w, strings.TrimSpace(`
# Code generated by gen_vis_scripts. DO NOT EDIT.
# Replace non-ASCII bytes with their octal escape sequences.
# Escaping of ASCII is done in Starlark prior to writing content out.
	`))
	fmt.Fprintln(w, "")

	for i := 0x80; i <= 0xFF; i++ {
		fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g`+"\n", i)
	}
}

// writeVisCanonicalizeSed writes a sed script that rewrites vis-encoded text
// into the canonical form described in the script's own header: bytes for
// which shouldEscape reports true appear as three-digit octal escapes and
// every other byte appears as itself.
func writeVisCanonicalizeSed(w io.Writer) {
	// The symbolic escapes are rewritten to octal. Backspace is byte 0x08,
	// i.e. octal 010; "\008" is not an octal escape and would never be
	// decoded by the matching unvis script.
	fmt.Fprintln(w, strings.TrimSpace(`
# Code generated by gen_vis_scripts. DO NOT EDIT.
#
# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial.
# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal.
# The remaining characters are not escaped; they represent themselves.
#
# Input is interpreted as libarchive would, with a wider set of escape sequences:
#   * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings
#   * \0 means NUL when not the start of an three-digit octal escape sequence
#   * \s means SPACE
#   * \ is valid as an ordinary backslash when not the start of a valid escape sequence
#
# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942

# Escaping of backslashes must be applied first to avoid double-interpretation.
s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g
s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g

s/\\a/\\007/g
s/\\b/\\010/g
s/\\f/\\014/g
s/\\n/\\012/g
s/\\r/\\015/g
s/\\s/\\040/g
s/\\t/\\011/g
s/\\v/\\013/g

# NUL special form must be disambiguated from ordinary octal escape sequences.
s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g
	`))
	fmt.Fprintln(w, "")

	fmt.Fprintln(w, "# Remove octal escaping from characters that don't need it.")
	for i := 0; i <= 0xFF; i++ {
		b := byte(i)
		if shouldEscape(b) {
			continue
		}
		switch b {
		case '/':
			// '/' would end the s/// command early; switch delimiter to ':'.
			fmt.Fprintf(w, `s:\\%03[1]o:%[1]c:g`+"\n", b)
		case '&':
			// A bare '&' in a sed replacement stands for the matched text,
			// which would turn this rule into a no-op; escape it.
			fmt.Fprintf(w, `s/\\%03[1]o/\%[1]c/g`+"\n", b)
		default:
			fmt.Fprintf(w, `s/\\%03[1]o/%[1]c/g`+"\n", b)
		}
	}
	fmt.Fprintln(w, "")

	fmt.Fprintln(w, "# Add octal escaping for characters that need it.")
	for i := 0; i <= 0xFF; i++ {
		b := byte(i)
		if !shouldEscape(b) {
			continue
		}
		// Backslash is handled by the header rules above. No rule is emitted
		// for newline (sed processes input line by line).
		if b == '\\' || b == '\n' {
			continue
		}
		fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g`+"\n", b)
	}
}

// writeUnvisCanonicalSed writes a sed script that replaces the octal escape
// sequences of the canonical form with the bytes they represent. The rule for
// backslash is emitted last.
func writeUnvisCanonicalSed(w io.Writer) {
	fmt.Fprintln(w, strings.TrimSpace(`
# Code generated by gen_vis_scripts. DO NOT EDIT.
# Replace octal escape sequences with the bytes they represent.
# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed
	`))
	fmt.Fprintln(w, "")

	for i := 0x00; i <= 0xFF; i++ {
		b := byte(i)
		// Backslash gets its own rule after the loop.
		if b == '\\' || !shouldEscape(b) {
			continue
		}
		fmt.Fprintf(w, `s/\\%03[1]o/\x%02[1]x/g`+"\n", b)
	}
	fmt.Fprintln(w, "")

	fmt.Fprintln(w, strings.TrimSpace(`
# Unvis of backslash must be applied last to avoid double-interpretation.
s/\\134/\\/g
	`))
}