Skip to content

Commit

Permalink
extract: add predicate support (#32)
Browse files Browse the repository at this point in the history
Allows selective extraction and/or recording what's extracted.
  • Loading branch information
StefanKarpinski authored Apr 2, 2020
1 parent 2f5aaf4 commit 272b770
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 18 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ None of these are exported, however: the recommended usage is to do `import Tar`

create([ predicate, ] dir, [ tarball ]) -> tarball

* `predicate :: Function`
* `predicate :: String --> Bool`
* `dir :: AbstractString`
* `tarball :: Union{AbstractString, IO}`

Expand All @@ -134,8 +134,9 @@ will be included in the archive.

### Tar.extract

extract(tarball, [ dir ]) -> dir
extract([ predicate, ] tarball, [ dir ]) -> dir

* `predicate :: Header --> Bool`
* `tarball :: Union{AbstractString, IO}`
* `dir :: AbstractString`

Expand All @@ -146,6 +147,12 @@ archive contents will be read from that IO stream. The archive is extracted to
which can be created as a new directory. If `dir` is not specified, the archive
is extracted into a temporary directory which is returned by `extract`.

If a `predicate` function is passed, it is called on each `Header` object that
is encountered while extracting `tarball`, and the entry is only extracted if
`predicate(hdr)` is true. This can be used to selectively extract only parts of
an archive, to skip entries that cause `extract` to throw an error, or to record
what is extracted during the extraction process.

### Tar.list

list(tarball; [ strict = true ]) -> Vector{Header}
Expand Down
45 changes: 34 additions & 11 deletions src/Tar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ include("header.jl")
include("create.jl")
include("extract.jl")

# Predicate that ignores its argument and always returns `true`. The
# predicate-less `create` and `extract` methods pass it along so that they
# share the implementation of the predicate-taking methods.
const true_predicate = _ -> true

## official API: create, list, extract

"""
create([ predicate, ] dir, [ tarball ]) -> tarball
predicate :: Function
predicate :: String --> Bool
dir :: AbstractString
tarball :: Union{AbstractString, IO}
Expand Down Expand Up @@ -64,8 +66,9 @@ function create(predicate::Function, dir::AbstractString)
end

create(dir::AbstractString, tarball::Union{AbstractString, IO}) =
create(p->true, dir, tarball)
create(dir::AbstractString) = create(p->true, dir)
create(true_predicate, dir, tarball)
create(dir::AbstractString) =
create(true_predicate, dir)

"""
list(tarball; [ strict = true ]) -> Vector{Header}
Expand Down Expand Up @@ -94,8 +97,9 @@ list(tarball::IO; raw::Bool=false, strict::Bool=!raw) =
list_tarball(tarball, raw=raw, strict=strict)

"""
extract(tarball, [ dir ]) -> dir
extract([ predicate, ] tarball, [ dir ]) -> dir
predicate :: Header --> Bool
tarball :: Union{AbstractString, IO}
dir :: AbstractString
Expand All @@ -105,28 +109,47 @@ archive contents will be read from that IO stream. The archive is extracted to
`dir` which must either be an existing empty directory or a non-existent path
which can be created as a new directory. If `dir` is not specified, the archive
is extracted into a temporary directory which is returned by `extract`.
If a `predicate` function is passed, it is called on each `Header` object that
is encountered while extracting `tarball`, and the entry is only extracted if
`predicate(hdr)` is true. This can be used to selectively extract only parts of
an archive, to skip entries that cause `extract` to throw an error, or to record
what is extracted during the extraction process.
"""
function extract(tarball::Union{AbstractString, IO}, dir::AbstractString)
function extract(
predicate::Function,
tarball::Union{AbstractString, IO},
dir::AbstractString,
)
extract_tarball_check(tarball)
extract_dir_check(dir)
if ispath(dir)
extract_tarball(tarball, dir)
extract_tarball(predicate, tarball, dir)
else
mkdir(dir)
extract_cleanup(tarball, dir)
extract_tarball_with_cleanup(predicate, tarball, dir)
end
return dir
end

function extract(tarball::Union{AbstractString, IO})
function extract(predicate::Function, tarball::Union{AbstractString, IO})
extract_tarball_check(tarball)
dir = mktempdir()
extract_cleanup(tarball, dir)
extract_tarball_with_cleanup(predicate, tarball, dir)
return dir
end

function extract_cleanup(tarball::Union{AbstractString, IO}, dir::AbstractString)
try extract_tarball(tarball, dir)
# Predicate-less entry points. Each one forwards to the predicate-taking
# method of the same shape, supplying `true_predicate` as the filter.
function extract(tarball::Union{AbstractString, IO}, dir::AbstractString)
    return extract(true_predicate, tarball, dir)
end

function extract(tarball::Union{AbstractString, IO})
    return extract(true_predicate, tarball)
end

function extract_tarball_with_cleanup(
predicate::Function,
tarball::Union{AbstractString, IO},
dir::AbstractString,
)
try extract_tarball(predicate, tarball, dir)
catch
rm(dir, force=true, recursive=true)
rethrow()
Expand Down
19 changes: 15 additions & 4 deletions src/extract.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,24 @@ function list_tarball(
hdr === nothing && break
strict && check_header(hdr)
push!(headers, hdr)
skip(tar, 512 * ((hdr.size + 511) ÷ 512))
skip_data(tar, hdr.size)
end
return headers
end

function extract_tarball(
predicate::Function,
tarball::AbstractString,
root::String;
buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
)
open(tarball) do tar
extract_tarball(tar, root, buf=buf)
extract_tarball(predicate, tar, root, buf=buf)
end
end

function extract_tarball(
predicate::Function,
tar::IO,
root::String;
buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
Expand All @@ -37,16 +39,21 @@ function extract_tarball(
while !eof(tar)
hdr = read_header(tar, buf=buf)
hdr === nothing && break
# check if we should extract or skip
if !predicate(hdr)
skip_data(tar, hdr.size)
continue
end
check_header(hdr)
# normalize path and check for symlink attacks
path = ""
parts = String[]
for part in split(hdr.path, '/')
(isempty(part) || part == ".") && continue
path in links && error("""
refusing to extract path with symlink prefix, possible attack
Refusing to extract path with symlink prefix, possible attack
* symlink prefix: $(repr(path))
* path: $(repr(hdr.path))
* extracted path: $(repr(hdr.path))
""")
path = isempty(path) ? part : "$path/$part"
push!(parts, part)
Expand Down Expand Up @@ -232,6 +239,10 @@ function read_standard_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef,
return Header(path, to_symbolic_type(type), mode, size, link)
end

# Advance `tar` past `size` bytes of entry data, rounded up to a whole number
# of 512-byte blocks. Returns whatever `skip` returns.
function skip_data(tar::IO, size::Integer)
    blocks = (size + 511) ÷ 512
    return skip(tar, 512 * blocks)
end

index_range(offset::Int, length::Int) = offset .+ (1:length)

read_header_chr(buf::Vector{UInt8}, offset::Int) = Char(buf[offset+1])
Expand Down
29 changes: 28 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ end
end

function check_tree_hash(hash::Vector{UInt8}, root::AbstractString)
@test tree_hash(root) == hash
@test string(tree_hash(root)) == string(hash)
rm(root, recursive=true)
end

Expand Down Expand Up @@ -408,4 +408,31 @@ end
Tar.extract(io, dir)
end
rm(dir, recursive=true)

# generate a version of dir with .skip entries
dir = make_test_dir(true)
tarball = Tar.create(dir)
rm(dir, recursive=true)

# predicate to skip entries where any path component has the `.skip` extension
predicate = hdr -> !any(splitext(p)[2] == ".skip" for p in split(hdr.path, '/'))

# extract(predicate::Function, tarball::String)
dir = Tar.extract(predicate, tarball)
check_tree_hash(hash, dir)
# extract(predicate::Function, tarball::String, dir::String)
dir = tempname()
Tar.extract(predicate, tarball, dir)
check_tree_hash(hash, dir)
# extract(predicate::Function, tarball::IO)
dir = open(tarball) do io
Tar.extract(predicate, io)
end
check_tree_hash(hash, dir)
# extract(predicate::Function, tarball::IO, dir::String) — non-existent
dir = tempname()
open(tarball) do io
Tar.extract(predicate, io, dir)
end
check_tree_hash(hash, dir)
end

0 comments on commit 272b770

Please sign in to comment.