From 272b77086cb90eb73126866edb9f98f4111c5531 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Thu, 2 Apr 2020 17:27:56 -0400 Subject: [PATCH] extract: add predicate support (#32) Allows selective extraction and/or recording what's extracted. --- README.md | 11 +++++++++-- src/Tar.jl | 45 ++++++++++++++++++++++++++++++++++----------- src/extract.jl | 19 +++++++++++++++---- test/runtests.jl | 29 ++++++++++++++++++++++++++++- 4 files changed, 86 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index d5dc024..69b8c19 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ None of these are exported, however: the recommended usage is to do `import Tar` create([ predicate, ] dir, [ tarball ]) -> tarball -* `predicate :: Function` +* `predicate :: String --> Bool` * `dir :: AbstractString` * `tarball :: Union{AbstractString, IO}` @@ -134,8 +134,9 @@ will be included in the archive. ### Tar.extract - extract(tarball, [ dir ]) -> dir + extract([ predicate, ] tarball, [ dir ]) -> dir +* `predicate :: Header --> Bool` * `tarball :: Union{AbstractString, IO}` * `dir :: AbstractString` @@ -146,6 +147,12 @@ archive contents will be read from that IO stream. The archive is extracted to which can be created as a new directory. If `dir` is not specified, the archive is extracted into a temporary directory which is returned by `extract`. +If a `predicate` function is passed, it is called on each `Header` object that +is encountered while extracting `tarball`, and the entry is only extracted if +`predicate(hdr)` is true. This can be used to selectively extract only parts of +an archive, to skip entries that cause `extract` to throw an error, or to record +what is extracted during the extraction process. 
+ ### Tar.list list(tarball; [ strict = true ]) -> Vector{Header} diff --git a/src/Tar.jl b/src/Tar.jl index a335f41..6efbb25 100644 --- a/src/Tar.jl +++ b/src/Tar.jl @@ -12,12 +12,14 @@ include("header.jl") include("create.jl") include("extract.jl") +const true_predicate = _ -> true + ## official API: create, list, extract """ create([ predicate, ] dir, [ tarball ]) -> tarball - predicate :: Function + predicate :: String --> Bool dir :: AbstractString tarball :: Union{AbstractString, IO} @@ -64,8 +66,9 @@ function create(predicate::Function, dir::AbstractString) end create(dir::AbstractString, tarball::Union{AbstractString, IO}) = - create(p->true, dir, tarball) -create(dir::AbstractString) = create(p->true, dir) + create(true_predicate, dir, tarball) +create(dir::AbstractString) = + create(true_predicate, dir) """ list(tarball; [ strict = true ]) -> Vector{Header} @@ -94,8 +97,9 @@ list(tarball::IO; raw::Bool=false, strict::Bool=!raw) = list_tarball(tarball, raw=raw, strict=strict) """ - extract(tarball, [ dir ]) -> dir + extract([ predicate, ] tarball, [ dir ]) -> dir + predicate :: Header --> Bool tarball :: Union{AbstractString, IO} dir :: AbstractString @@ -105,28 +109,47 @@ archive contents will be read from that IO stream. The archive is extracted to `dir` which must either be an existing empty directory or a non-existent path which can be created as a new directory. If `dir` is not specified, the archive is extracted into a temporary directory which is returned by `extract`. + +If a `predicate` function is passed, it is called on each `Header` object that +is encountered while extracting `tarball`, and the entry is only extracted if +`predicate(hdr)` is true. This can be used to selectively extract only parts of +an archive, to skip entries that cause `extract` to throw an error, or to record +what is extracted during the extraction process. 
""" -function extract(tarball::Union{AbstractString, IO}, dir::AbstractString) +function extract( + predicate::Function, + tarball::Union{AbstractString, IO}, + dir::AbstractString, +) extract_tarball_check(tarball) extract_dir_check(dir) if ispath(dir) - extract_tarball(tarball, dir) + extract_tarball(predicate, tarball, dir) else mkdir(dir) - extract_cleanup(tarball, dir) + extract_tarball_with_cleanup(predicate, tarball, dir) end return dir end -function extract(tarball::Union{AbstractString, IO}) +function extract(predicate::Function, tarball::Union{AbstractString, IO}) extract_tarball_check(tarball) dir = mktempdir() - extract_cleanup(tarball, dir) + extract_tarball_with_cleanup(predicate, tarball, dir) return dir end -function extract_cleanup(tarball::Union{AbstractString, IO}, dir::AbstractString) - try extract_tarball(tarball, dir) +extract(tarball::Union{AbstractString, IO}, dir::AbstractString) = + extract(true_predicate, tarball, dir) +extract(tarball::Union{AbstractString, IO}) = + extract(true_predicate, tarball) + +function extract_tarball_with_cleanup( + predicate::Function, + tarball::Union{AbstractString, IO}, + dir::AbstractString, +) + try extract_tarball(predicate, tarball, dir) catch rm(dir, force=true, recursive=true) rethrow() diff --git a/src/extract.jl b/src/extract.jl index 66ef20b..8d8e7f4 100644 --- a/src/extract.jl +++ b/src/extract.jl @@ -13,22 +13,24 @@ function list_tarball( hdr === nothing && break strict && check_header(hdr) push!(headers, hdr) - skip(tar, 512 * ((hdr.size + 511) ÷ 512)) + skip_data(tar, hdr.size) end return headers end function extract_tarball( + predicate::Function, tarball::AbstractString, root::String; buf::Vector{UInt8} = Vector{UInt8}(undef, 512), ) open(tarball) do tar - extract_tarball(tar, root, buf=buf) + extract_tarball(predicate, tar, root, buf=buf) end end function extract_tarball( + predicate::Function, tar::IO, root::String; buf::Vector{UInt8} = Vector{UInt8}(undef, 512), @@ -37,6 +39,11 @@ function 
extract_tarball( while !eof(tar) hdr = read_header(tar, buf=buf) hdr === nothing && break + # check if we should extract or skip + if !predicate(hdr) + skip_data(tar, hdr.size) + continue + end check_header(hdr) # normalize path and check for symlink attacks path = "" @@ -44,9 +51,9 @@ function extract_tarball( for part in split(hdr.path, '/') (isempty(part) || part == ".") && continue path in links && error(""" - refusing to extract path with symlink prefix, possible attack + Refusing to extract path with symlink prefix, possible attack * symlink prefix: $(repr(path)) - * path: $(repr(hdr.path)) + * extracted path: $(repr(hdr.path)) """) path = isempty(path) ? part : "$path/$part" push!(parts, part) @@ -232,6 +239,10 @@ function read_standard_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef, return Header(path, to_symbolic_type(type), mode, size, link) end +function skip_data(tar::IO, size::Integer) + skip(tar, 512 * ((size + 511) ÷ 512)) +end + index_range(offset::Int, length::Int) = offset .+ (1:length) read_header_chr(buf::Vector{UInt8}, offset::Int) = Char(buf[offset+1]) diff --git a/test/runtests.jl b/test/runtests.jl index 6d26925..d7e169f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,7 +77,7 @@ end end function check_tree_hash(hash::Vector{UInt8}, root::AbstractString) - @test tree_hash(root) == hash + @test string(tree_hash(root)) == string(hash) rm(root, recursive=true) end @@ -408,4 +408,31 @@ end Tar.extract(io, dir) end rm(dir, recursive=true) + + # generate a version of dir with .skip entries + dir = make_test_dir(true) + tarball = Tar.create(dir) + rm(dir, recursive=true) + + # predicate to skip paths ending in `.skip` + predicate = hdr -> !any(splitext(p)[2] == ".skip" for p in split(hdr.path, '/')) + + # extract(predicate::Function, tarball::String) + dir = Tar.extract(predicate, tarball) + check_tree_hash(hash, dir) + # extract(predicate::Function, tarball::String, dir::String) + dir = tempname() + Tar.extract(predicate, 
tarball, dir) + check_tree_hash(hash, dir) + # extract(predicate::Function, tarball::IO) + dir = open(tarball) do io + Tar.extract(predicate, io) + end + check_tree_hash(hash, dir) + # extract(predicate::Function, tarball::IO, dir::String) — non-existent + dir = tempname() + open(tarball) do io + Tar.extract(predicate, io, dir) + end + check_tree_hash(hash, dir) end