-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add dataset caching with OhMyArtifacts.jl, Arrow.jl & Serialization.jl
- Loading branch information
Showing
6 changed files
with
123 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,15 +4,18 @@ authors = ["Frankie Robertson <[email protected]> and contributors"] | |
version = "0.1.0" | ||
|
||
[deps] | ||
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" | ||
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" | ||
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" | ||
CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" | ||
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" | ||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" | ||
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" | ||
OhMyArtifacts = "cf8be1f4-309d-442e-839d-29d2a0af6cb7" | ||
RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" | ||
Reexport = "189a3867-3050-52da-a836-e630ba90ab69" | ||
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" | ||
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
UrlDownload = "856ac37a-3032-4c1c-9122-f86d88358c8b" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
using Arrow | ||
using DataFrames | ||
using OhMyArtifacts | ||
using Serialization | ||
|
||
|
||
# Holds the value returned by `@my_artifacts_toml!`; left unassigned until `__init__` runs.
const rdataget_artifacts = Ref{String}()

"""
    __init__()

Module initialisation hook: evaluate `@my_artifacts_toml!(versioned=false)` and store
the result in `rdataget_artifacts`.
"""
function __init__()
    toml_path = @my_artifacts_toml!(versioned=false)
    rdataget_artifacts[] = toml_path
end
|
||
"""
    mk_artifact_name(package_name, version, dataset_name, varname)

Build the artifact name `cran__<package>__<version>__<dataset>`, followed by
`__<varname>` unless `varname` is `nothing`.
"""
function mk_artifact_name(package_name, version, dataset_name, varname)
    if isnothing(varname)
        suffix = ""
    else
        suffix = "__" * varname
    end
    return "cran__" * package_name * "__" * version * "__" * dataset_name * suffix
end
|
||
# First three bytes of a stream written by `Serialization.serialize`. `load_artifact`
# compares a file's leading bytes against this to choose between `deserialize` and Arrow.
# Declared `const` so the global is typed and cannot be rebound by accident.
const JL_SER_MAGIC = Vector{UInt8}(b"7JL")
|
||
"""
    load_artifact(path)

Read the cached file at `path`.

The first three bytes are compared with `JL_SER_MAGIC`: on a match the file is read
with `deserialize`, otherwise it is opened as an Arrow table and wrapped in a
`DataFrame`.
"""
function load_artifact(path)
    header = open(path) do stream
        read(stream, 3)
    end
    header == JL_SER_MAGIC && return deserialize(path)
    return DataFrame(Arrow.Table(path))
end
|
||
"""
    serialize_artifact(io, data::Union{CSV.File, DataFrame})

Write tabular `data` to `io` with `Arrow.write`.
"""
# NOTE(review): `CSV` is not imported in the visible part of this file — presumably the
# enclosing module brings it into scope; confirm.
serialize_artifact(io, data::Union{CSV.File, DataFrame}) = Arrow.write(io, data)
|
||
"""
    serialize_artifact(io, data)

Fallback method: write `data` to `io` with `Serialization.serialize`.
"""
serialize_artifact(io, data) = serialize(io, data)
|
||
"""
    cran_arrow_artifact(package_name, dataset_name, varname=nothing, version="auto";
                        cran_mirror=default_cran_mirror)

Load a dataset from a CRAN package through the OhMyArtifacts cache.

The artifact is looked up under the name built by `mk_artifact_name`. On a cache miss
the package is obtained with `get_package_cached`, the dataset is extracted with
`table_from_data_dir`, written with `serialize_artifact`, and the resulting artifact is
bound under that name. When `version == "auto"`, the artifact is additionally bound under
the name containing the version that was actually obtained, unless that name is already
bound.

# Arguments
- `package_name`, `dataset_name`: used to build the artifact name and passed to the
  package/dataset helpers.
- `varname`: optional variable name; a value equal to `dataset_name` is treated as
  `nothing` so both spellings share one artifact name.
- `version`: version label used in the artifact name. It is not passed to
  `get_package_cached`; a warning is logged when an explicit version differs from the
  version that was obtained.

# Keywords
- `cran_mirror`: passed through to `get_package_cached`.

# Returns
The result of `load_artifact` on the cached file.
"""
function cran_arrow_artifact(
    package_name,
    dataset_name,
    varname=nothing,
    version="auto";
    cran_mirror=default_cran_mirror,
)
    @debug "Saving RDataGet.jl artifact metadata to $(rdataget_artifacts[])"
    if dataset_name == varname
        # normalise this case for the artifact name
        varname = nothing
    end
    name = mk_artifact_name(package_name, version, dataset_name, varname)
    filename = name * ".arrow"
    # Set inside the creation closure below, and only when `version == "auto"`.
    additional_name = nothing
    artifact_hash = my_artifact_hash(name, rdataget_artifacts[])
    if isnothing(artifact_hash)
        artifact_hash = create_my_artifact() do artifact_dir
            # Fetch and extract before opening the output file, so a failed download
            # does not leave an empty file behind in `artifact_dir`.
            package, got_version = get_package_cached(package_name, cran_mirror)
            if version == "auto"
                additional_name = mk_artifact_name(
                    package_name, got_version, dataset_name, varname
                )
            elseif got_version != version
                # The requested version is only a label here; make the mismatch visible
                # instead of silently caching another release under the requested name.
                @warn "Requested version $(version) of CRAN package $(package_name), but version $(got_version) was obtained; caching it under the requested version's name"
            end
            tbl = table_from_data_dir(package, dataset_name, varname)
            full_path = joinpath(artifact_dir, filename)
            open(full_path, "w") do io
                serialize_artifact(io, tbl)
            end
            return full_path
        end
        bind_my_artifact!(rdataget_artifacts[], name, artifact_hash)
    end
    # Also register the artifact under its resolved-version name, keeping any existing
    # binding for that name untouched.
    if additional_name !== nothing &&
        isnothing(my_artifact_hash(additional_name, rdataget_artifacts[]))
        bind_my_artifact!(rdataget_artifacts[], additional_name, artifact_hash)
    end
    return load_artifact(my_artifact_path(artifact_hash))
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters