diff --git a/docs/src/Data.toml b/docs/src/Data.toml index a9d6176..372f8d9 100644 --- a/docs/src/Data.toml +++ b/docs/src/Data.toml @@ -16,8 +16,8 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" # Data stored in FileSystem is either File (a file) or FileTree (a directory/folder) type="File" # Path with posix `/` separators. - # Use @__DIR__ for paths relative to Data.toml - path="@__DIR__/data/file.txt" + # Relative paths are relative to the location of Data.toml + path="data/file.txt" # A second example [[datasets]] @@ -28,7 +28,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" type="FileTree" - path="@__DIR__/data/csvset" + path="data/csvset" # Further datasets can be added as desired # [[datasets]] diff --git a/docs/src/reference.md b/docs/src/reference.md index a99ac4b..4334183 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -3,12 +3,18 @@ ## Using datasets The primary mechanism for loading datasets is the `dataset` function, coupled -with `open()` to open the resulting `DataSet` as some Julia type. In addition, -DataSets.jl provides two macros [`@datafunc`](@ref) and [`@datarun`](@ref) to -help in creating program entry points and running them. +with `open()` to open the resulting `DataSet` as some Julia type. ```@docs dataset +``` + +In addition, DataSets.jl provides two macros [`@datafunc`](@ref) and +[`@datarun`](@ref) to help in creating program entry points and running them. +Note that these APIs aren't fully formed and might be deprecated before +DataSets-1.0. + +```@docs @datafunc @datarun ``` @@ -46,6 +52,14 @@ DataSets.ActiveDataProject DataSets.TomlFileDataProject ``` +### Modifying datasets + +The metadata for a dataset may be updated using `config!` + +```@docs +DataSets.config! 
+``` + ## Data Models for files and directories DataSets provides some builtin data models [`File`](@ref) and @@ -62,7 +76,7 @@ newdir ## Storage Drivers -To add a new kind of data storage backend, implement [`DataSets.add_storage_driver`](@ref) +To add a new kind of data storage backend, call [`DataSets.add_storage_driver`](@ref) ```@docs DataSets.add_storage_driver diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index bc8b53b..acac62b 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -12,6 +12,7 @@ DocTestFilters = [ r"path =.*", r"@.*", r"(?<=IOStream\().*", + r"(?<=TomlFileDataProject \[).*", ] ``` @@ -71,7 +72,7 @@ global configuration this is also possible: ```jldoctest julia> project = DataSets.load_project("src/Data.toml") -DataSets.DataProject: +DataSets.TomlFileDataProject [.../DataSets/docs/src/Data.toml]: 📄 a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26 📁 a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a diff --git a/src/DataSet.jl b/src/DataSet.jl index 14278d7..8a1843d 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -5,57 +5,38 @@ unopinionated about the underlying storage mechanism. The data in a `DataSet` has a type which implies an index; the index can be used to partition the data for processing. """ -struct DataSet - # For now, the representation `conf` contains data read directly from the - # TOML. Once the design has settled we might get some explicit fields and - # do validation. +mutable struct DataSet + project # AbstractDataProject owning this DataSet uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these. 
+ # The representation `conf` contains "configuration data" read directly from + # the TOML (or other data project source, eg json API etc) conf - function DataSet(conf) - _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) - _check_keys(conf["storage"], DataSet, ["driver"=>String]) - check_dataset_name(conf["name"]) - new(UUID(conf["uuid"]), conf) + function DataSet(project, conf) + _validate_dataset_config(conf) + new(project, UUID(conf["uuid"]), conf) end +end - #= - name::String # Default name for convenience. - # The binding to an actual name is managed by the data - # project. - storage # Storage config and driver definition - maps::Vector{DataMap} - - # Generic dictionary of other properties... for now. Required properties - # will be moved - _other::Dict{Symbol,Any} - - #storage_id # unique identifier in storage backend, if it exists - #owner # Project or user who owns the data - #description::String - #type # Some representation of the type of data? - # # An array, blob, table, tree, etc - #cachable::Bool # Can the data be cached? It might not for data governance - # # reasons or it might change commonly. 
- ## A set of identifiers - #tags::Set{String} - =# -end - -_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T -_key_match(config, k::String) = haskey(config, k) - -function _check_keys(config, context, keys) - missed_keys = filter(k->!_key_match(config, k), keys) - if !isempty(missed_keys) - error(""" - Missing expected keys in $context: - $missed_keys - - In TOML fragment: - $(sprint(TOML.print,config)) - """) - end +DataSet(conf) = DataSet(nothing, conf) + +function _validate_dataset_config(conf) + _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) + _check_keys(conf["storage"], DataSet, ["driver"=>String]) + _check_optional_keys(conf, + "description"=>AbstractString, + "tags"=>VectorOf(AbstractString)) + check_dataset_name(conf["name"]) +end + +function Base.show(io::IO, d::DataSet) + print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)") +end + +function Base.show(io::IO, ::MIME"text/plain", d::DataSet) + println(io, "DataSet instance:") + println(io) + TOML.print(io, d.conf) end """ @@ -104,18 +85,29 @@ function check_dataset_name(name::AbstractString) end end -# Hacky thing until we figure out which fields DataSet should actually have. 
+#------------------------------------------------------------------------------- +# API for DataSet type function Base.getproperty(d::DataSet, name::Symbol) - if name in fieldnames(DataSet) - return getfield(d, name) + if name === :uuid + getfield(d, :uuid) + elseif name === :conf + getfield(d, :conf) else getfield(d, :conf)[string(name)] end end +function Base.setproperty!(d::DataSet, name::Symbol, x) + config!(d; name=>x) +end + Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) +function data_project(dataset::DataSet) + return getfield(dataset, :project) +end + # Split the fragment section as a '/' separated RelPath function dataspec_fragment_as_path(d::DataSet) if haskey(d, "dataspec") @@ -127,17 +119,35 @@ function dataspec_fragment_as_path(d::DataSet) return nothing end -function Base.show(io::IO, d::DataSet) - print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)") +function config!(dataset::DataSet; kws...) + config!(data_project(dataset), dataset; kws...) end -function Base.show(io::IO, ::MIME"text/plain", d::DataSet) - println(io, "DataSet instance:") - println(io) - TOML.print(io, d.conf) +# The default case of a dataset config update when the update is independent of +# the project. (In general, projects may supply extra constraints.) +function config!(::Nothing, dataset::DataSet; kws...) + for (k,v) in pairs(kws) + if k in (:uuid, :name) + error("Cannot modify dataset config with key $k") + # TODO: elseif k === :storage + # Check consistency using storage driver API? + end + # TODO: Fold these schema checks in with _validate_dataset_config + # somehow. 
+ if k === :description + if !(v isa AbstractString) + error("Dataset description must be a string") + end + elseif k === :tags + if !(v isa AbstractVector && all(x isa AbstractString for x in v)) + error("Dataset tags must be a vector of strings") + end + end + dataset.conf[string(k)] = v + end + return dataset end - #------------------------------------------------------------------------------- # Functions for opening datasets diff --git a/src/DataSets.jl b/src/DataSets.jl index 150f565..ca15b19 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -218,6 +218,8 @@ end #------------------------------------------------------------------------------- +include("utils.jl") + # Application entry points include("entrypoint.jl") diff --git a/src/data_project.jl b/src/data_project.jl index cf99441..9458c82 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -52,9 +52,39 @@ function dataset(proj::AbstractDataProject, spec::AbstractString) conf = copy(dataset.conf) conf["dataspec"] = dataspec - return DataSet(conf) + # FIXME: This copy is problematic now that datasets can be mutated with + # `DataSets.config!()` as "dataspec" will infect the dataset when it's + # saved again. + return DataSet(data_project(dataset), conf) end +""" + config!(name::AbstractString; kws...) + config!(proj::AbstractDataProject, name::AbstractString; kws...) + + config!(dataset::DataSet; kws...) + +Update the configuration of `dataset` with the given keyword arguments and +persist it in the dataset's project storage. The versions which take a `name` +use that name to search within the given data project. + +# Examples + +Update the description of the dataset named `"SomeData"` in the global project: +``` +DataSets.config!("SomeData"; description="This is a description") +``` + +Alternatively, setting `DataSet` properties can be used to update metadata. For +example, to tag the dataset `"SomeData"` with tags `"A"` and `"B"`: 
+``` +ds = dataset("SomeData") +ds.tags = ["A", "B"] +``` +""" +function config!(project::AbstractDataProject, name::AbstractString; kws...) + config!(project[name]; kws...) +end # Percent-decode a string according to the URI escaping rules. # Vendored from URIs.jl for now to avoid depending on that entire package for @@ -282,22 +312,18 @@ end #------------------------------------------------------------------------------- """ - load_project(path; auto_update=false) - load_project(config_dict) - -Load a data project from a system `path` referring to a TOML file. If -`auto_update` is true, the returned project will monitor the file for updates -and reload when necessary. + load_project(path) -Alternatively, create a `DataProject` from a an existing dictionary -`config_dict`, which should be in the Data.toml format. +Load a data project from a system `path` referring to a TOML file. See also [`load_project!`](@ref). """ -function load_project(path::AbstractString; auto_update=false) +function load_project(path::AbstractString; auto_update=true) sys_path = abspath(path) - auto_update ? TomlFileDataProject(sys_path) : - _load_project(read(sys_path,String), dirname(sys_path)) + if !auto_update + Base.depwarn("`auto_update` is deprecated", :load_project) + end + TomlFileDataProject(sys_path) end function load_project(config::AbstractDict; kws...) @@ -312,7 +338,7 @@ function load_project(config::AbstractDict; kws...) end proj = DataProject() for dataset_conf in config["datasets"] - dataset = DataSet(dataset_conf) + dataset = DataSet(proj, dataset_conf) proj[dataset.name] = dataset end if haskey(config, "drivers") @@ -326,3 +352,16 @@ function load_project(config::AbstractDict; kws...) proj end +function project_toml(proj::DataProject) + # FIXME: Preserve other unknown keys here for forward compatibility. 
+ conf = Dict( + "data_config_version"=>CURRENT_DATA_CONFIG_VERSION, + "datasets"=>[d.conf for (n,d) in proj.datasets], + "drivers"=>proj.drivers + ) + return sprint(TOML.print, conf) +end + +function config!(name::AbstractString; kws...) + config!(PROJECT, name; kws...) +end diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 65a9979..79f3773 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -4,8 +4,8 @@ A cache of file content, parsed with an arbitrary parser function. This is a modified and generalized version of `Base.CachedTOMLDict`. -Getting the value of the cache with `f[]` will automatically update the parsed -value whenever the file changes. +Getting the value of the cache with `get_cache(f)` will automatically update +the parsed value whenever the file changes. """ mutable struct CachedParsedFile{T} path::String @@ -38,7 +38,7 @@ function CachedParsedFile{T}(parser::Function, path::String) where T ) end -function Base.getindex(f::CachedParsedFile) +function get_cache(f::CachedParsedFile, allow_refresh=true) s = stat(f.path) time_since_cached = time() - f.mtime rough_mtime_granularity = 0.1 # seconds @@ -59,6 +59,9 @@ function Base.getindex(f::CachedParsedFile) f.mtime = s.mtime f.size = s.size f.hash = new_hash + if !allow_refresh + error("The file at $(f.path) was written externally") + end @debug "Cache of file $(repr(f.path)) invalid, reparsing..." 
return f.d = f.parser(content) end @@ -66,19 +69,39 @@ function Base.getindex(f::CachedParsedFile) return f.d end +function set_cache(f::CachedParsedFile, content::AbstractString) + mktemp(dirname(f.path)) do tmppath, tmpio + write(tmpio, content) + close(tmpio) + # Uses mktemp() + mv() to atomically overwrite the file + mv(tmppath, f.path, force=true) + end + s = stat(f.path) + f.inode = s.inode + f.mtime = s.mtime + f.size = s.size + f.hash = sha1(content) +end + function Base.show(io::IO, m::MIME"text/plain", f::CachedParsedFile) println(io, "Cache of file $(repr(f.path)) with value") - show(io, m, f[]) + show(io, m, get_cache(f)) end # Parse Data.toml into DataProject which updates when the file does. -function parse_and_cache_project(sys_path::AbstractString) +function parse_and_cache_project(proj, sys_path::AbstractString) sys_data_dir = dirname(sys_path) CachedParsedFile{DataProject}(sys_path) do content if isnothing(content) DataProject() else - _load_project(String(content), sys_data_dir) + inner_proj = _load_project(String(content), sys_data_dir) + for d in inner_proj + # Hack; we steal ownership from the DataProject here. + # What's a better way to do this? + setfield!(d, :project, proj) + end + inner_proj end end end @@ -87,19 +110,19 @@ end abstract type AbstractTomlFileDataProject <: AbstractDataProject end function Base.get(proj::AbstractTomlFileDataProject, name::AbstractString, default) - get(_get_cached(proj), name, default) + get(get_cache(proj), name, default) end function Base.keys(proj::AbstractTomlFileDataProject) - keys(_get_cached(proj)) + keys(get_cache(proj)) end function Base.iterate(proj::AbstractTomlFileDataProject, state=nothing) # This is a little complex because we want iterate to work even if the # active project changes concurrently, which means wrapping up the initial - # result of _get_cached with the iterator state. + # result of get_cache with the iterator state. 
if isnothing(state) - cached_values = values(_get_cached(proj)) + cached_values = values(get_cache(proj)) if isnothing(cached_values) return nothing end @@ -116,9 +139,20 @@ function Base.iterate(proj::AbstractTomlFileDataProject, state=nothing) end end -Base.pairs(proj::AbstractTomlFileDataProject) = pairs(_get_cached(proj)) +Base.pairs(proj::AbstractTomlFileDataProject) = pairs(get_cache(proj)) + +data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(get_cache(proj)) -data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(_get_cached(proj)) +function config!(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) + if data_project(dataset) !== proj + error("dataset must belong to project") + end + # Here we accept the update independently of the project - Data.toml should + # be able to manage any dataset config. + config!(nothing, dataset; kws...) + set_cache(proj, project_toml(get_cache(proj, false))) + return dataset +end #------------------------------------------------------------------------------- """ @@ -128,15 +162,23 @@ filesystem. 
mutable struct TomlFileDataProject <: AbstractTomlFileDataProject path::String cache::CachedParsedFile{DataProject} + function TomlFileDataProject(path::String) + proj = new(path) + proj.cache = parse_and_cache_project(proj, path) + proj + end +end + +function get_cache(proj::TomlFileDataProject, refresh=true) + get_cache(proj.cache, refresh) end -function TomlFileDataProject(path::String) - cache = parse_and_cache_project(path) - TomlFileDataProject(path, cache) +function set_cache(proj::TomlFileDataProject, content::AbstractString) + set_cache(proj.cache, content) end -function _get_cached(proj::TomlFileDataProject) - proj.cache[] +function local_data_abspath(proj::TomlFileDataProject, relpath) + return joinpath(dirname(proj.path), relpath) end project_name(proj::TomlFileDataProject) = proj.path @@ -160,7 +202,7 @@ end function ActiveDataProject() proj = ActiveDataProject(nothing, DataProject()) - _get_cached(proj) + get_cache(proj) proj end @@ -170,37 +212,58 @@ function _active_project_data_toml(project_path=Base.active_project(false)) joinpath(dirname(project_path), "Data.toml") end -function _get_cached(proj::ActiveDataProject) +function get_cache(proj::ActiveDataProject, allow_refresh=true) active_project = Base.active_project(false) if proj.active_project_path != active_project + if !allow_refresh + error("The current project path was changed") + end # The unusual case: active project has changed. if isnothing(active_project) proj.cache = DataProject() else data_toml = _active_project_data_toml(active_project) # Need to re-cache - proj.cache = parse_and_cache_project(data_toml) + proj.cache = parse_and_cache_project(proj, data_toml) end proj.active_project_path = active_project end - proj.cache isa DataProject ? proj.cache : proj.cache[] + proj.cache isa DataProject ? 
proj.cache : get_cache(proj.cache, allow_refresh) +end + +function set_cache(proj::ActiveDataProject, content::AbstractString) + if proj.cache isa DataProject + error("No current active project") + else + set_cache(proj.cache, content) + end +end + +function local_data_abspath(proj::ActiveDataProject, relpath) + if isnothing(proj.active_project_path) + error("No active project") + end + return joinpath(dirname(proj.active_project_path), relpath) end project_name(::ActiveDataProject) = _active_project_data_toml() #------------------------------------------------------------------------------- -function _fill_template(toml_path, toml_str) - # Super hacky templating for paths relative to the toml file. - # We really should have something a lot nicer here... - if Sys.iswindows() - toml_path = replace(toml_path, '\\'=>'/') +function _fill_template(toml_str) + if occursin("@__DIR__", toml_str) + Base.depwarn(""" + Using @__DIR__ in Data.toml is deprecated. Use a '/'-separated + relative path instead.""", + :_fill_template) + return replace(toml_str, "@__DIR__"=>".") + else + return toml_str end - toml_str = replace(toml_str, "@__DIR__"=>toml_path) end function _load_project(content::AbstractString, sys_data_dir) - toml_str = _fill_template(sys_data_dir, content) + toml_str = _fill_template(content) config = TOML.parse(toml_str) load_project(config) end @@ -225,13 +288,17 @@ function from_path(path::AbstractString) throw(ArgumentError(msg)) end + path_key = Sys.isunix() ? "unix_path" : + Sys.iswindows() ? 
"windows_path" : + error("Unknown system: cannot determine path type") + conf = Dict( "name"=>make_valid_dataset_name(path), "uuid"=>string(uuid4()), "storage"=>Dict( "driver"=>"FileSystem", "type"=>dtype, - "path"=>abspath(path), + path_key=>abspath(path), ) ) diff --git a/src/filesystem.jl b/src/filesystem.jl index 199403b..73d1803 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -1,24 +1,6 @@ """ Root storage object for trees which are rooted in the file system (in git terminology, there exists a "working copy") - -## Metadata spec - -For File: -``` - [datasets.storage] - driver="FileSystem" - type="File" - path=\$(path_to_file) -``` - -For FileTree: -``` - [datasets.storage] - driver="FileSystem" - type="FileTree" - path=\$(path_to_directory) -``` """ mutable struct FileSystemRoot path::String @@ -247,10 +229,83 @@ end #-------------------------------------------------- +# FileSystem storage driver + +""" + local_data_abspath(project, relpath) + +Return the absolute path of data on disk where `relpath` is relative to +`project`. + +This function must be implemented for any `AbstractDataProject` subtype which +intends to support the `FileSystem` data driver. 
+""" +function local_data_abspath +end + + +function local_data_abspath(::Nothing, path) + error("Path must be absolute for DataSets without parent data projects") +end -# Filesystem storage driver + +""" +## Metadata spec + +For File: +``` + [datasets.storage] + driver="FileSystem" + type="File" + \$path_key=\$(path_string) +``` + +For FileTree: +``` + [datasets.storage] + driver="FileSystem" + type="FileTree" + \$path_key=\$(path_string) +``` + +`path_key` should be one of the following forms: +``` + path=\$(relative_slash_separated_path_to_file) + unix_path=\$(absolute_unix_path_to_file) + windows_path=\$(absolute_windows_path_to_file) +``` +""" function connect_filesystem(f, config, dataset) - path = config["path"] + # Path keys can take one of the three forms documented above: + if haskey(config, "path") + pathstr = config["path"] + # Local absolute paths are not portable. Previously these were allowed + # in the "path" key, but those are now deprecated in favor of + # system-specific path keys unix_path or windows_path + if isabspath(pathstr) + Base.depwarn(""" + Absolute paths in Data.toml are deprecated. Instead, use relative + paths (separated with `/`) relative to the Data.toml location.""", + :connect_filesystem) + path = pathstr + else + if '\\' in pathstr && Sys.iswindows() + # Heuristic deprecation warning for windows paths in Data.toml + Base.depwarn( + "Relative paths in Data.toml should be separated with '/' characters.", + :connect_filesystem) + pathstr = join(split(pathstr, '\\'), '/') + end + relpath = joinpath(split(pathstr, '/')...) 
+ path = local_data_abspath(data_project(dataset), relpath) + end + elseif haskey(config, "unix_path") && Sys.isunix() + path = config["unix_path"] + elseif haskey(config, "windows_path") && Sys.iswindows() + path = config["windows_path"] + else + error("No \"path\" key found for FileSystem storage driver.") + end type = config["type"] if type in ("File", "Blob") isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) diff --git a/src/utils.jl b/src/utils.jl new file mode 100644 index 0000000..a2cc260 --- /dev/null +++ b/src/utils.jl @@ -0,0 +1,38 @@ +# Some basic utilities to validate "config-like" data +# +# (Perhaps these could be replaced with the use of JSON schema or some such?) + +_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T +_key_match(config, k::String) = haskey(config, k) + +function _check_keys(config, context, keys) + missed_keys = filter(k->!_key_match(config, k), keys) + if !isempty(missed_keys) + error(""" + Missing expected keys in $context: + $missed_keys + + In DataSet fragment: + $(sprint(TOML.print,config)) + """) + end +end + +struct VectorOf + T +end + +function _check_optional_keys(config, context, keys...) + for (k, check) in keys + if haskey(config, k) + v = config[k] + if check isa Type && !(v isa check) + error("""Invalid DataSet key $k. Expected type $check""") + elseif check isa VectorOf && !(v isa AbstractVector && + all(x isa check.T for x in v)) + error("""Invalid DataSet key $k""") + end + end + end +end + diff --git a/test/Data.toml b/test/Data.toml index d9e85d2..3ddbdc0 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -12,16 +12,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" - - # TODO: We'd like a layering abstraction. 
- - # [[datasets.maps]] - # type="File" - # - # [[datasets.maps]] - # type="text" - # parameters={encoding="UTF-8"} + path="data/file.txt" [[datasets]] description="A text file with namespace" @@ -31,7 +22,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" + path="data/file.txt" #-------------------------------------------------- [[datasets]] @@ -42,15 +33,7 @@ uuid="2d126588-5f76-4e53-8245-87dc91625bf4" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/people.csv.gz" - - #[[datasets.maps]] - #type="GZip" - # - #[[datasets.maps]] - #type="CSV" - #parameters={delim=","} - + path="data/people.csv.gz" #-------------------------------------------------- [[datasets]] @@ -60,10 +43,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" type="FileTree" - path="@__DIR__/data/csvset" - - # TODO: Add data maps here which expose it logically as a single CSV? - + path="data/csvset" #-------------------------------------------------- # Data embedded in the TOML diff --git a/test/active_project/Data.toml b/test/active_project/Data.toml index 03158c8..64f90df 100644 --- a/test/active_project/Data.toml +++ b/test/active_project/Data.toml @@ -1,4 +1,4 @@ -data_config_version=0 +data_config_version=1 [[datasets]] description="A text file" @@ -8,4 +8,4 @@ uuid="314996ef-12be-40d0-912c-9755af354fdb" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" + path="data/file.txt" diff --git a/test/backend_compat.jl b/test/backend_compat.jl index c2f8335..27914b8 100644 --- a/test/backend_compat.jl +++ b/test/backend_compat.jl @@ -82,7 +82,7 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) @test read(open(dataset(proj, "old_backend_tree"))[path"b.txt"], String) == "b" end -@testset "Compat for renaming Blob->File, BlobTree->FileTree" begin +@testset "Compat for @__DIR__ and renaming Blob->File, 
BlobTree->FileTree" begin proj = DataSets.load_project(joinpath(@__DIR__, "DataCompat.toml")) text_data = dataset(proj, "a_text_file") diff --git a/test/projects.jl b/test/projects.jl index 5ba13e4..61a49f8 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -6,7 +6,8 @@ using DataSets: TomlFileDataProject, ActiveDataProject, StackedDataProject, - project_name + project_name, + config! test_project_names = ["a_text_file", "a_tree_example", @@ -33,19 +34,13 @@ test_project_names = ["a_text_file", # identity @test project_name(proj) == abspath("Data.toml") - - # Test @__DIR__ templating - # Use `cleanpath` as there's currently a mixture of / and \ on windows - # which does work, but is quite ugly. - cleanpath(p) = replace(p, '\\'=>'/') - @test cleanpath(proj["a_text_file"].storage["path"]) == cleanpath(joinpath(@__DIR__, "data", "file.txt")) end @testset "TomlFileDataProject live updates" begin # Test live updating when the file is rewritten mktemp() do path,io write(io, """ - data_config_version=0 + data_config_version=1 [[datasets]] description="A text file" @@ -55,7 +50,7 @@ end [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" + path="data/file.txt" """) flush(io) @@ -74,7 +69,7 @@ end [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file2.txt" + path="data/file2.txt" """) flush(io) @@ -136,3 +131,63 @@ end @test project_name(DataSets.PROJECT.projects[1]) == joinpath(@__DIR__, "Data.toml") end +@testset "config!() metadata update" begin + # Test that config!() updates dataset metadata and persists it to Data.toml + mktempdir() do tmppath + data_toml_path = joinpath(tmppath, "Data.toml") + open(data_toml_path, write=true) do io + write(io, """ + data_config_version=1 + + [[datasets]] + description="A" + name="a_text_file" + uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="FileSystem" + type="File" + path="data/file.txt" + """) + end + + proj = TomlFileDataProject(data_toml_path) + @testset "config!(proj, 
...)" begin + @test dataset(proj, "a_text_file").description == "A" + config!(proj, "a_text_file", description="B") + config!(proj, "a_text_file", tags=Any["xx", "yy"]) + @test dataset(proj, "a_text_file").description == "B" + @test dataset(proj, "a_text_file").tags == ["xx", "yy"] + end + + @testset "Persistence on disk" begin + proj2 = TomlFileDataProject(data_toml_path) + @test dataset(proj2, "a_text_file").description == "B" + @test dataset(proj2, "a_text_file").tags == ["xx", "yy"] + end + + @testset "config! via DataSet instances" begin + ds = dataset(proj, "a_text_file") + config!(ds, description = "C") + @test dataset(proj, "a_text_file").description == "C" + ds.description = "D" + @test dataset(proj, "a_text_file").description == "D" + end + + @testset "description and tags validation" begin + ds = dataset(proj, "a_text_file") + @test_throws Exception config!(ds, description = 1) + @test_throws Exception config!(ds, tags = "hi") + end + + @testset "global config! methods" begin + empty!(DataSets.PROJECT) + pushfirst!(DataSets.PROJECT, TomlFileDataProject(data_toml_path)) + + config!("a_text_file", description="X") + @test dataset("a_text_file").description == "X" + + empty!(DataSets.PROJECT) + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index a7d65fc..0b2925d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,7 +20,7 @@ end @testset "DataSet config from Dict" begin config = Dict( - "data_config_version"=>0, + "data_config_version"=>1, "datasets"=>[Dict( "description"=>"A text file", "name"=>"a_text_file",