DataSet configuration #46

Merged
merged 7 commits on May 29, 2022

6 changes: 3 additions & 3 deletions docs/src/Data.toml
@@ -16,8 +16,8 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26"
# Data stored in FileSystem is either File (a file) or FileTree (a directory/folder)
type="File"
# Path with posix `/` separators.
# Use @__DIR__ for paths relative to Data.toml
path="@__DIR__/data/file.txt"
# Relative paths are relative to the location of Data.toml
path="data/file.txt"

# A second example
[[datasets]]
@@ -28,7 +28,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a"
[datasets.storage]
driver="FileSystem"
type="FileTree"
path="@__DIR__/data/csvset"
path="data/csvset"

# Further datasets can be added as desired
# [[datasets]]
22 changes: 18 additions & 4 deletions docs/src/reference.md
@@ -3,12 +3,18 @@
## Using datasets

The primary mechanism for loading datasets is the `dataset` function, coupled
with `open()` to open the resulting `DataSet` as some Julia type. In addition,
DataSets.jl provides two macros [`@datafunc`](@ref) and [`@datarun`](@ref) to
help in creating program entry points and running them.
with `open()` to open the resulting `DataSet` as some Julia type.

```@docs
dataset
```

In addition, DataSets.jl provides two macros [`@datafunc`](@ref) and
[`@datarun`](@ref) to help in creating program entry points and running them.
Note that these APIs aren't fully formed and might be deprecated before
DataSets-1.0.

```@docs
@datafunc
@datarun
```
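
For orientation, a brief sketch of the loading pattern described above, assuming the `"a_text_file"` dataset from the example Data.toml is available in the active project:

```julia
using DataSets

# Look up a DataSet by name in the active data project stack.
ds = dataset("a_text_file")

# Open a File dataset directly as a String...
content = open(String, ds)

# ...or as an IO stream scoped to a do-block.
open(IO, ds) do io
    println(read(io, String))
end
```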
@@ -46,6 +52,14 @@ DataSets.ActiveDataProject
DataSets.TomlFileDataProject
```

### Modifying datasets

The metadata for a dataset may be updated using `config!`:

```@docs
DataSets.config!
```
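
As a quick illustration (a sketch, assuming a dataset named `"SomeData"` exists in `project`):

```julia
# Update metadata through a specific project rather than the global one.
DataSets.config!(project, "SomeData"; tags=["raw", "2022"])
```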

## Data Models for files and directories

DataSets provides some builtin data models [`File`](@ref) and
@@ -62,7 +76,7 @@ newdir

## Storage Drivers

To add a new kind of data storage backend, implement [`DataSets.add_storage_driver`](@ref)
To add a new kind of data storage backend, call [`DataSets.add_storage_driver`](@ref)
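
As a rough sketch of what a driver can look like (the `connect_mydriver` and `"MyDriver"` names are hypothetical, and this assumes drivers are callbacks invoked as `driver(user_func, storage_config, dataset)`, following the pattern of the built-in FileSystem driver):

```julia
# A driver receives the `storage` section of the dataset's configuration,
# sets up a storage object, and passes it to `user_func`.
function connect_mydriver(user_func, storage_config, dataset)
    path = storage_config["path"]   # driver-specific configuration keys
    io = open(path, "r")            # acquire the underlying resource
    try
        user_func(io)               # hand the storage object back to DataSets
    finally
        close(io)                   # release it when the user function returns
    end
end

DataSets.add_storage_driver("MyDriver" => connect_mydriver)
```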

```@docs
DataSets.add_storage_driver
3 changes: 2 additions & 1 deletion docs/src/tutorial.md
@@ -12,6 +12,7 @@ DocTestFilters = [
r"path =.*",
r"@.*",
r"(?<=IOStream\().*",
r"(?<=TomlFileDataProject \[).*",
]
```

@@ -71,7 +72,7 @@ global configuration this is also possible:

```jldoctest
julia> project = DataSets.load_project("src/Data.toml")
DataSets.DataProject:
DataSets.TomlFileDataProject [.../DataSets/docs/src/Data.toml]:
📄 a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26
📁 a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a

122 changes: 66 additions & 56 deletions src/DataSet.jl
@@ -5,57 +5,38 @@ unopinionated about the underlying storage mechanism.
The data in a `DataSet` has a type which implies an index; the index can be
used to partition the data for processing.
"""
struct DataSet
    # For now, the representation `conf` contains data read directly from the
    # TOML. Once the design has settled we might get some explicit fields and
    # do validation.
mutable struct DataSet
    project    # AbstractDataProject owning this DataSet
    uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these.
    # The representation `conf` contains "configuration data" read directly from
    # the TOML (or other data project source, e.g. a JSON API)
    conf

    function DataSet(conf)
        _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String])
        _check_keys(conf["storage"], DataSet, ["driver"=>String])
        check_dataset_name(conf["name"])
        new(UUID(conf["uuid"]), conf)
    function DataSet(project, conf)
        _validate_dataset_config(conf)
        new(project, UUID(conf["uuid"]), conf)
    end
end

    #=
    name::String     # Default name for convenience.
                     # The binding to an actual name is managed by the data
                     # project.
    storage          # Storage config and driver definition
    maps::Vector{DataMap}

    # Generic dictionary of other properties... for now. Required properties
    # will be moved
    _other::Dict{Symbol,Any}

    #storage_id      # unique identifier in storage backend, if it exists
    #owner           # Project or user who owns the data
    #description::String
    #type            # Some representation of the type of data?
    #                # An array, blob, table, tree, etc
    #cachable::Bool  # Can the data be cached? It might not for data governance
    #                # reasons or it might change commonly.
    ## A set of identifiers
    #tags::Set{String}
    =#
end

_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T
_key_match(config, k::String) = haskey(config, k)

function _check_keys(config, context, keys)
    missed_keys = filter(k->!_key_match(config, k), keys)
    if !isempty(missed_keys)
        error("""
              Missing expected keys in $context:
              $missed_keys

              In TOML fragment:
              $(sprint(TOML.print,config))
              """)
    end
DataSet(conf) = DataSet(nothing, conf)

function _validate_dataset_config(conf)
    _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String])
    _check_keys(conf["storage"], DataSet, ["driver"=>String])
    _check_optional_keys(conf,
                         "description"=>AbstractString,
                         "tags"=>VectorOf(AbstractString))
    check_dataset_name(conf["name"])
end
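
For concreteness, a minimal configuration that passes these checks looks like the following sketch (values echo the example Data.toml earlier in this PR):

```julia
conf = Dict(
    "uuid"    => "b498f769-a7f6-4f67-8d74-40b770398f26",
    "name"    => "a_text_file",
    "storage" => Dict("driver" => "FileSystem",
                      "type"   => "File",
                      "path"   => "data/file.txt"),
)
ds = DataSet(conf)   # validates the config and wraps it; project defaults to nothing
```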

function Base.show(io::IO, d::DataSet)
    print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)")
end

function Base.show(io::IO, ::MIME"text/plain", d::DataSet)
    println(io, "DataSet instance:")
    println(io)
    TOML.print(io, d.conf)
end

"""
@@ -104,18 +85,29 @@ function check_dataset_name(name::AbstractString)
    end
end

# Hacky thing until we figure out which fields DataSet should actually have.
#-------------------------------------------------------------------------------
# API for DataSet type
function Base.getproperty(d::DataSet, name::Symbol)
    if name in fieldnames(DataSet)
        return getfield(d, name)
    if name === :uuid
        getfield(d, :uuid)
    elseif name === :conf
        getfield(d, :conf)
    else
        getfield(d, :conf)[string(name)]
    end
end

function Base.setproperty!(d::DataSet, name::Symbol, x)
    config!(d; name=>x)
end

Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name)
Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name)

function data_project(dataset::DataSet)
    return getfield(dataset, :project)
end
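
Taken together, these definitions make a `DataSet` behave like a read-through view of its configuration. A brief illustration (a sketch, assuming `ds` has a `description` entry in its configuration):

```julia
ds.uuid            # a real struct field
ds.description     # anything else falls through to ds.conf["description"]
ds["description"]  # equivalent dictionary-style access
ds.description = "Updated"  # setproperty! routes through config!, persisting
                            # the change via the owning project
```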

# Split the fragment section as a '/' separated RelPath
function dataspec_fragment_as_path(d::DataSet)
    if haskey(d, "dataspec")
@@ -127,17 +119,35 @@ function dataspec_fragment_as_path(d::DataSet)
    return nothing
end

function Base.show(io::IO, d::DataSet)
    print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)")
function config!(dataset::DataSet; kws...)
    config!(data_project(dataset), dataset; kws...)
end

function Base.show(io::IO, ::MIME"text/plain", d::DataSet)
    println(io, "DataSet instance:")
    println(io)
    TOML.print(io, d.conf)
# The default case of a dataset config update when the update is independent of
# the project. (In general, projects may supply extra constraints.)
function config!(::Nothing, dataset::DataSet; kws...)
    for (k,v) in pairs(kws)
        if k in (:uuid, :name)
            error("Cannot modify dataset config with key $k")
        # TODO: elseif k === :storage
        #     Check consistency using storage driver API?
        end
        # TODO: Fold these schema checks in with _validate_dataset_config
        # somehow.
        if k === :description
            if !(v isa AbstractString)
                error("Dataset description must be a string")
            end
        elseif k === :tags
            if !(v isa AbstractVector && all(x isa AbstractString for x in v))
                error("Dataset tags must be a vector of strings")
            end
        end
        dataset.conf[string(k)] = v
    end
    return dataset
end


#-------------------------------------------------------------------------------
# Functions for opening datasets

2 changes: 2 additions & 0 deletions src/DataSets.jl
@@ -218,6 +218,8 @@ end


#-------------------------------------------------------------------------------
include("utils.jl")

# Application entry points
include("entrypoint.jl")

65 changes: 52 additions & 13 deletions src/data_project.jl
@@ -52,9 +52,39 @@ function dataset(proj::AbstractDataProject, spec::AbstractString)
    conf = copy(dataset.conf)
    conf["dataspec"] = dataspec

    return DataSet(conf)
    # FIXME: This copy is problematic now that datasets can be mutated with
    # `DataSets.config!()` as "dataspec" will infect the dataset when it's
    # saved again.
    return DataSet(data_project(dataset), conf)
end

"""
    config!(name::AbstractString; kws...)
    config!(proj::AbstractDataProject, name::AbstractString; kws...)

    config!(dataset::DataSet; kws...)

Update the configuration of `dataset` with the given keyword arguments and
persist it in the dataset's project storage. The versions which take a `name`
use that name to search within the given data project.

# Examples

Update the description of the dataset named `"SomeData"` in the global project:
```
DataSets.config!("SomeData"; description="This is a description")
```

Alternatively, `DataSet` properties may be set to update metadata. For example,
to tag the dataset `"SomeData"` with tags `"A"` and `"B"`:
```
ds = dataset("SomeData")
ds.tags = ["A", "B"]
```
"""
function config!(project::AbstractDataProject, name::AbstractString; kws...)
    config!(project[name]; kws...)
end

# Percent-decode a string according to the URI escaping rules.
# Vendored from URIs.jl for now to avoid depending on that entire package for
@@ -282,22 +312,18 @@ end

#-------------------------------------------------------------------------------
"""
    load_project(path; auto_update=false)
    load_project(config_dict)

Load a data project from a system `path` referring to a TOML file. If
`auto_update` is true, the returned project will monitor the file for updates
and reload when necessary.
    load_project(path)

Alternatively, create a `DataProject` from an existing dictionary
`config_dict`, which should be in the Data.toml format.
Load a data project from a system `path` referring to a TOML file.

See also [`load_project!`](@ref).
"""
function load_project(path::AbstractString; auto_update=true)
    sys_path = abspath(path)
    auto_update ? TomlFileDataProject(sys_path) :
                  _load_project(read(sys_path,String), dirname(sys_path))
    if !auto_update
        Base.depwarn("`auto_update` is deprecated", :load_project)
    end
    TomlFileDataProject(sys_path)
end
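
Under the new default, typical usage looks like the following sketch (the path is illustrative):

```julia
proj = DataSets.load_project("docs/src/Data.toml")
# proj is a TomlFileDataProject, which re-reads Data.toml when the file changes.
```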

function load_project(config::AbstractDict; kws...)
@@ -312,7 +338,7 @@ function load_project(config::AbstractDict; kws...)
    end
    proj = DataProject()
    for dataset_conf in config["datasets"]
        dataset = DataSet(dataset_conf)
        dataset = DataSet(proj, dataset_conf)
        proj[dataset.name] = dataset
    end
    if haskey(config, "drivers")
@@ -326,3 +352,16 @@ function load_project(config::AbstractDict; kws...)
    proj
end

function project_toml(proj::DataProject)
    # FIXME: Preserve other unknown keys here for forward compatibility.
    conf = Dict(
        "data_config_version"=>CURRENT_DATA_CONFIG_VERSION,
        "datasets"=>[d.conf for (n,d) in proj.datasets],
        "drivers"=>proj.drivers
    )
    return sprint(TOML.print, conf)
end

function config!(name::AbstractString; kws...)
    config!(PROJECT, name; kws...)
end