From d8b26da38acb26d89ddcf79efe3190ce6c0b2ee2 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Sat, 2 Nov 2019 14:35:13 +0000 Subject: [PATCH 1/7] init perf branch --- Project.toml | 16 +++++++--------- src/MLJBase.jl | 28 ++++++++++++++++------------ src/init.jl | 32 ++++++++++++++++++++------------ test/runtests.jl | 6 +++--- 4 files changed, 46 insertions(+), 36 deletions(-) diff --git a/Project.toml b/Project.toml index 95e190b1..7c13c6c8 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.7.3" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -17,23 +18,20 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -CategoricalArrays = "<0.5.3, 0.7" -Distributions = "0.21.3" -Missings = "0.4.3" +CategoricalArrays = "^0.7" +LossFunctions = "^0.5" OrderedCollections = "1.1" -Requires = "^0.5.2" -ScientificTypes = "0.2.0" -StatsBase = "0.32" -Tables = "<0.1.19, 0.2" +ScientificTypes = "^0.2" +StatsBase = "^0.32" +Tables = "^0.2" julia = "1" [extras] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [targets] -test = ["CSV", "DataFrames", "Distances", "LossFunctions", "Test", "TypedTables"] +test = ["CSV", "DataFrames", "Distances", "Test", "TypedTables"] diff --git a/src/MLJBase.jl b/src/MLJBase.jl index 351c8f17..4cf0f3ad 100644 --- a/src/MLJBase.jl +++ b/src/MLJBase.jl @@ -1,6 +1,6 @@ # Users of this module should first read the document # https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/ -module MLJBase +module MLJBase export MLJType, Model, Supervised, Unsupervised export Deterministic, Probabilistic, Interval @@ -58,24 +58,23 @@ import Base: @__doc__ using Tables using OrderedCollections # already a dependency of StatsBase -import Distributions -import Distributions: pdf, mode using CategoricalArrays -using OrderedCollections -import CategoricalArrays -using ScientificTypes -import ScientificTypes: trait # to be extended: import StatsBase: fit, predict, fit! import Missings.levels +import Distributions +import Distributions: pdf, mode + +using ScientificTypes # from Standard Library: + using Statistics using Random using InteractiveUtils - +using LossFunctions ## CONSTANTS @@ -88,13 +87,11 @@ const DEFAULT_SHOW_DEPTH = 0 include("utilities.jl") - ## BASE TYPES abstract type MLJType end include("equality.jl") # equality for MLJType objects - ## ABSTRACT MODEL TYPES # for storing hyperparameters: @@ -178,7 +175,6 @@ function best end # message): clean!(model::Model) = "" - ## TRAITS """ @@ -218,6 +214,14 @@ include("mlj_model_macro.jl") include("metadata_utilities.jl") # __init__() function: -include("init.jl") +# include("init.jl") + +ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:supervised_model] = + x-> x isa Supervised +ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:unsupervised_model] = + x-> x isa Unsupervised +ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:measure] = is_measure + +include("loss_functions_interface.jl") end # module diff --git a/src/init.jl b/src/init.jl index 605fa3b8..621f36f5 100644 --- a/src/init.jl +++ b/src/init.jl @@ -1,13 +1,21 @@ -using Requires +# using Requires -function __init__() - ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:supervised_model] = - x-> x isa Supervised - ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:unsupervised_model] = - x-> x isa Unsupervised - ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:measure] = is_measure - @require(CSV="336ed68f-0bac-5ca0-87d4-7b16caf5d00b", - include("datasets_requires.jl")) - @require(LossFunctions="30fc2ffe-d236-52d8-8643-a9d8f7c094a7", - include("loss_functions_interface.jl")) -end +ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:supervised_model] = + x-> x isa Supervised +ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:unsupervised_model] = + x-> x isa Unsupervised +ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:measure] = is_measure + +include("loss_functions_interface.jl") + +# function __init__() +# ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:supervised_model] = +# x-> x isa Supervised +# ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:unsupervised_model] = +# x-> x isa Unsupervised +# ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:measure] = is_measure +# # @require(CSV="336ed68f-0bac-5ca0-87d4-7b16caf5d00b", +# # include("datasets_requires.jl")) +# # @require(LossFunctions="30fc2ffe-d236-52d8-8643-a9d8f7c094a7", +# # include("loss_functions_interface.jl")) +# end diff --git a/test/runtests.jl b/test/runtests.jl index 486c971d..dcda82b8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,9 +29,9 @@ end @test include("info.jl") end -@testset "datasets" begin - @test include("datasets.jl") -end +# @testset "datasets" begin +# @test include("datasets.jl") +# end @testset "tasks" begin @test include("tasks.jl") From 32dc6826e8b06b1dff20856a84f015d633b025b1 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Tue, 5 Nov 2019 18:22:30 +0100 Subject: [PATCH 2/7] removing CSV dep --- Project.toml | 3 +- src/MLJBase.jl | 2 +- src/data.jl | 21 ++--- src/datasets_requires.jl | 182 ++++++++++++++++++++++++++++++++++----- 4 files changed, 173 insertions(+), 35 deletions(-) diff --git a/Project.toml b/Project.toml index 7c13c6c8..2c182235 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.7.3" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" @@ -20,7 +21,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] CategoricalArrays = "^0.7" LossFunctions = "^0.5" -OrderedCollections = "1.1" +OrderedCollections = "^1.1" ScientificTypes = "^0.2" StatsBase = "^0.32" Tables = "^0.2" diff --git a/src/MLJBase.jl b/src/MLJBase.jl index 4cf0f3ad..a4ad1fbc 100644 --- a/src/MLJBase.jl +++ b/src/MLJBase.jl @@ -56,7 +56,7 @@ export std import Base.== import Base: @__doc__ -using Tables +using Tables, DelimitedFiles using OrderedCollections # already a dependency of StatsBase using CategoricalArrays diff --git a/src/data.jl b/src/data.jl index 6ca47955..a8130ce5 100644 --- a/src/data.jl +++ b/src/data.jl @@ -44,6 +44,7 @@ function partition(rows::AbstractVector{Int}, fractions...; shuffle::Bool=false, return tuple(rowss...) end + """ t1, t2, ...., tk = unnpack(table, t1, t2, ... tk; wrap_singles=false) @@ -51,7 +52,7 @@ Split any Tables.jl compatible `table` into smaller tables (or vectors) `t1, t2, ..., tk` by making selections *without replacement* from the column names defined by the tests `t1`, `t2`, ..., `tk`. A *test* is any object `t` such that `t(name)` is `true` -or `false` for each column `name::Symbol` of `table`. +or `false` for each column `name::Symbol` of `table`. Whenever a returned table contains a single column, it is converted to a vector unless `wrap_singles=true`. @@ -66,7 +67,7 @@ semicolon): ``` julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=[:A, :B]) julia> Z, XY = unpack(table, ==(:z), !=(:w); - :x=>Continuous, :y=>Multiclass) + :x=>Continuous, :y=>Multiclass) julia> XY 2×2 DataFrame │ Row │ x │ y │ @@ -80,7 +81,6 @@ julia> Z 10.0 20.0 ``` - """ function unpack(X, tests...; wrap_singles=false, pairs...) @@ -106,13 +106,13 @@ function unpack(X, tests...; wrap_singles=false, pairs...) counter += 1 end return Tuple(unpacked) - end ## DEALING WITH CATEGORICAL ELEMENTS -CategoricalElement{U} = Union{CategoricalValue{<:Any,U},CategoricalString{U}} +const CategoricalElement{U} = Union{CategoricalValue{<:Any,U},CategoricalString{U}} + """ classes(x) @@ -178,13 +178,14 @@ int(A::AbstractArray) = broadcast(int, A) # get the integer representation of a level given pool (private # method): -int(pool::CategoricalPool, level) = pool.order[pool.invindex[level]] +int(pool::CategoricalPool, level) = pool.order[pool.invindex[level]] struct CategoricalDecoder{T,R} # <: MLJType pool::CategoricalPool{T,R} invorder::Vector{Int} end + """ d = decoder(x) @@ -208,7 +209,6 @@ integer arrays, in which case `d` is broadcast over all elements. *Warning:* It is *not* true that `int(d(u)) == u` always holds. See also: [`int`](@ref), [`classes`](@ref). - """ decoder(element::CategoricalElement) = CategoricalDecoder(element.pool, sortperm(element.pool.order)) @@ -235,7 +235,6 @@ Convert a Tables.jl compatible table source `X` into an `Matrix`; or, if `X` is a `AbstractMatrix`, return `X`. Optimized for column-based sources. Rows of the table or input matrix, correspond to rows of the output, unless `transpose=true`. - """ matrix(X; kwargs...) = matrix(Val(ScientificTypes.trait(X)), X; kwargs...) matrix(::Val{:other}, X; kwargs...) = throw(ArgumentError) @@ -283,7 +282,6 @@ If a `prototype` is specified, then the matrix is materialized as a table of the preferred sink type of `prototype`, rather than wrapped. Note that if `protottype` is *not* specified, then `MLJ.matrix(MLJ.table(A))` is essentially a non-operation. - """ function table(cols::NamedTuple; prototype=NamedTuple()) Tables.istable(prototype) || error("`prototype` is not a table. ") @@ -429,11 +427,11 @@ nrows(::Val{:other}, v::AbstractVector) = length(v) ## ACCESSORS FOR ABSTRACT MATRICES -selectrows(::Val{:other}, A::AbstractMatrix, r) = A[r, :] +selectrows(::Val{:other}, A::AbstractMatrix, r) = A[r, :] selectrows(::Val{:other}, A::CategoricalMatrix, r) = @inbounds A[r, :] # single row selection must return a matrix! -selectrows(::Val{:other}, A::AbstractMatrix, r::Integer) = A[r:r, :] +selectrows(::Val{:other}, A::AbstractMatrix, r::Integer) = A[r:r, :] selectrows(::Val{:other}, A::CategoricalMatrix, r::Integer) = @inbounds A[r:r, :] @@ -462,4 +460,3 @@ nrows(::Val{:other}, A::AbstractMatrix) = size(A, 1) # select(::Val{:sparse}, X, r::Integer, c::AbstractVector{Symbol}) = X[r,sort(c)] # select(::Val{:sparse}, X, r::Integer, ::Colon) = X[r,:] # select(::Val{:sparse}, X, r, c) = X[r,sort(c)] - diff --git a/src/datasets_requires.jl b/src/datasets_requires.jl index c18713e0..aaec05b3 100644 --- a/src/datasets_requires.jl +++ b/src/datasets_requires.jl @@ -1,32 +1,172 @@ # see also the macro versions in datasets.jl -using .CSV +# ------------------------------------------------------- +# To add a new dataset assuming it has a header and is, at path +# `data/newdataset.csv` +# +# 1. start by loading it with CSV: +# +# fpath = joinpath("datadir", "newdataset.csv") +# data = CSV.read(fpath, copycols=true, +# categorical=true) +# +# 2. load it with DelimitedFiles and Tables +# +# data_raw, data_header = readdlm(fpath, ',', header=true) +# data_table = Tables.table(data_raw; header=Symbol.(vec(data_header))) +# +# 3. retrieve the conversions: +# +# for (n, st) in zip(names(data), scitype_union.(eachcol(data))) +# println(":$n=>$st,") +# end +# +# 4. copy and paste the result in a coerce +# +# data_table = coerce(data_table, ...) +# +# ------------------------------------------------------- -export load_boston, load_ames, load_iris -export load_reduced_ames -export load_crabs +export load_boston, + load_ames, + load_iris, + load_reduced_ames, + load_crabs -datadir = joinpath(@__DIR__, "..", "data") +const DATA_DIR = joinpath(@__DIR__, "..", "data") -load_boston() = CSV.read(joinpath(datadir, "Boston.csv"), copycols=true, - categorical=true) +const COERCE_BOSTON = (:Chas => Count,) -function load_reduced_ames() - df = CSV.read(joinpath(datadir, "reduced_ames.csv"), copycols=true, - categorical=true) - return coerce(df, :OverallQual => OrderedFactor, - :GarageCars => Count, - :YearBuilt => Continuous, - :YearRemodAdd => Continuous) -end +const COERCE_REDUCED_AMES = ( + :target => Continuous, + :OverallQual => OrderedFactor, + :GrLivArea => Continuous, + :Neighborhood => Multiclass, + :x1stFlrSF => Continuous, + :TotalBsmtSF => Continuous, + :BsmtFinSF1 => Continuous, + :LotArea => Continuous, + :GarageCars => Count, + :MSSubClass => Multiclass, + :GarageArea => Count, + :YearRemodAdd => Continuous, + :YearBuilt => Continuous) + +const COERCE_AMES = ( + :Id => Count, + :MSSubClass => Multiclass, + :MSZoning => Multiclass, + :LotFrontage => Continuous, + :LotArea => Count, + :Street => Multiclass, + :LotShape => Multiclass, + :LandContour => Multiclass, + :LotConfig => Multiclass, + :LandSlope => Multiclass, + :Neighborhood => Multiclass, + :Condition1 => Multiclass, + :Condition2 => Multiclass, + :BldgType => Multiclass, + :HouseStyle => Multiclass, + :OverallQual => Count, + :OverallCond => Count, + :YearBuilt => Count, + :YearRemodAdd => Count, + :RoofStyle => Multiclass, + :RoofMatl => Multiclass, + :Exterior1st => Multiclass, + :Exterior2nd => Multiclass, + :MasVnrType => Multiclass, + :MasVnrArea => Count, + :ExterQual => Multiclass, + :ExterCond => Multiclass, + :Foundation => Multiclass, + :BsmtQual => Multiclass, + :BsmtCond => Multiclass, + :BsmtExposure => Multiclass, + :BsmtFinType1 => Multiclass, + :BsmtFinSF1 => Continuous, + :BsmtFinType2 => Multiclass, + :BsmtFinSF2 => Count, + :BsmtUnfSF => Count, + :TotalBsmtSF => Continuous, + :Heating => Multiclass, + :HeatingQC => Multiclass, + :CentralAir => Multiclass, + :Electrical => Multiclass, + :x1stFlrSF => Count, + :x2ndFlrSF => Count, + :LowQualFinSF => Count, + :GrLivArea => Count, + :BsmtFullBath => Count, + :BsmtHalfBath => Count, + :FullBath => Count, + :HalfBath => Count, + :BedroomAbvGr => Count, + :KitchenAbvGr => Count, + :KitchenQual => Multiclass, + :TotRmsAbvGrd => Count, + :Functional => Multiclass, + :Fireplaces => Count, + :FireplaceQu => Multiclass, + :GarageType => Multiclass, + :GarageYrBlt => Continuous, + :GarageFinish => Multiclass, + :GarageCars => Count, + :GarageArea => Count, + :GarageQual => Multiclass, + :GarageCond => Multiclass, + :PavedDrive => Multiclass, + :WoodDeckSF => Count, + :OpenPorchSF => Count, + :EnclosedPorch => Count, + :x3SsnPorch => Count, + :ScreenPorch => Count, + :PoolArea => Count, + :MiscVal => Count, + :MoSold => Count, + :YrSold => Count, + :SaleType => Multiclass, + :SaleCondition => Multiclass, + :target => Continuous) + +const COERCE_IRIS = ( + :sepal_length => Continuous, + :sepal_width => Continuous, + :petal_length => Continuous, + :petal_width => Continuous, + :target => Multiclass) -load_ames() = CSV.read(joinpath(datadir, "ames.csv"), copycols=true, - categorical=true) +const COERCE_CRABS = ( + :sp => Multiclass, + :sex => Multiclass, + :index => Count, + :FL => Continuous, + :RW => Continuous, + :CL => Continuous, + :CW => Continuous, + :BD => Continuous) -load_iris() = CSV.read(joinpath(datadir, "iris.csv"), pool=true, copycols=true, - categorical=true) +typeof(COERCE_CRABS) + +""" +load_dataset(fpath, coercions) + +Load one of standard dataset like Boston etc assuming the file is a comma separated file with +a header. +""" +function load_dataset(fname::String, coercions::Tuple) + fpath = joinpath(DATA_DIR, fname) + data_raw, data_header = readdlm(fpath, ',', header=true) + data_table = Tables.table(data_raw; header=Symbol.(vec(data_header))) + return coerce(data_table, coercions...) +end -load_crabs() = CSV.read(joinpath(datadir, "crabs.csv"), pool=true, - copycols=true, categorical=true) +load_boston() = load_dataset("Boston.csv", COERCE_BOSTON) +load_reduced_ames() = load_dataset("reduced_ames.csv", COERCE_REDUCED_AMES) +load_ames() = load_dataset("ames.csv", COERCE_AMES) +load_iris() = load_dataset("iris.csv", COERCE_IRIS) +load_crabs() = load_dataset("crabs.csv", COERCE_CRABS) +load_crabs() From 36e6c6a37f75d78f36c444adf7b06d31ffe14a22 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Tue, 5 Nov 2019 18:44:53 +0100 Subject: [PATCH 3/7] removing requires from deps --- Project.toml | 1 - src/init.jl | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/Project.toml b/Project.toml index 2c182235..cbe6d79b 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,6 @@ LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/src/init.jl b/src/init.jl index 621f36f5..e5613f88 100644 --- a/src/init.jl +++ b/src/init.jl @@ -7,15 +7,3 @@ ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:unsupervised_model] = ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:measure] = is_measure include("loss_functions_interface.jl") - -# function __init__() -# ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:supervised_model] = -# x-> x isa Supervised -# ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:unsupervised_model] = -# x-> x isa Unsupervised -# ScientificTypes.TRAIT_FUNCTION_GIVEN_NAME[:measure] = is_measure -# # @require(CSV="336ed68f-0bac-5ca0-87d4-7b16caf5d00b", -# # include("datasets_requires.jl")) -# # @require(LossFunctions="30fc2ffe-d236-52d8-8643-a9d8f7c094a7", -# # include("loss_functions_interface.jl")) -# end From a74258f47946d7380063b254d592d397955f3a13 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Tue, 5 Nov 2019 22:24:54 +0100 Subject: [PATCH 4/7] uncommenting the datasets tests --- test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index dcda82b8..486c971d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,9 +29,9 @@ end @test include("info.jl") end -# @testset "datasets" begin -# @test include("datasets.jl") -# end +@testset "datasets" begin + @test include("datasets.jl") +end @testset "tasks" begin @test include("tasks.jl") From 6ab4bdb9d6d70b4c9cb1abce60f361106b95c484 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Tue, 5 Nov 2019 22:41:31 +0100 Subject: [PATCH 5/7] fixing things --- src/MLJBase.jl | 9 +- src/datasets.jl | 168 +++++++++++++++++++++++++++++++++++++- src/datasets_requires.jl | 172 --------------------------------------- 3 files changed, 171 insertions(+), 178 deletions(-) delete mode 100644 src/datasets_requires.jl diff --git a/src/MLJBase.jl b/src/MLJBase.jl index a4ad1fbc..a64fee77 100644 --- a/src/MLJBase.jl +++ b/src/MLJBase.jl @@ -29,9 +29,10 @@ export UnivariateFinite, average # distributions.jl export SupervisedTask, UnsupervisedTask, MLJTask # tasks.jl export X_and_y, X_, y_, nrows, nfeatures # tasks.jl export info # info.jl -export @load_boston, @load_ames, @load_iris # datasets.jl -export @load_reduced_ames # datasets.jl -export @load_crabs # datasets.jl +export load_boston, load_ames, load_iris, + load_reduced_ames, load_crabs, + @load_boston, @load_ames, @load_iris, + @load_reduced_ames, @load_crabs # datasets.jl export orientation, reports_each_observation # measures.jl export is_feature_dependent # measures.jl export default_measure, value # measures.jl @@ -203,7 +204,7 @@ include("data.jl") include("distributions.jl") include("info.jl") -include("datasets.jl") # importing CSV will also load datasets_requires.jl +include("datasets.jl") include("tasks.jl") include("measures.jl") diff --git a/src/datasets.jl b/src/datasets.jl index 7e7ba8a0..2ba2095d 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -1,4 +1,169 @@ -# see also the non-macro versions in datasets_requires.jl +# see also the macro versions in datasets.jl + +# ------------------------------------------------------- +# To add a new dataset assuming it has a header and is, at path +# `data/newdataset.csv` +# +# 1. start by loading it with CSV: +# +# fpath = joinpath("datadir", "newdataset.csv") +# data = CSV.read(fpath, copycols=true, +# categorical=true) +# +# 2. load it with DelimitedFiles and Tables +# +# data_raw, data_header = readdlm(fpath, ',', header=true) +# data_table = Tables.table(data_raw; header=Symbol.(vec(data_header))) +# +# 3. retrieve the conversions: +# +# for (n, st) in zip(names(data), scitype_union.(eachcol(data))) +# println(":$n=>$st,") +# end +# +# 4. copy and paste the result in a coerce +# +# data_table = coerce(data_table, ...) +# +# ------------------------------------------------------- + +const DATA_DIR = joinpath(@__DIR__, "..", "data") + +const COERCE_BOSTON = (:Chas => Count,) + +const COERCE_REDUCED_AMES = ( + :target => Continuous, + :OverallQual => OrderedFactor, + :GrLivArea => Continuous, + :Neighborhood => Multiclass, + :x1stFlrSF => Continuous, + :TotalBsmtSF => Continuous, + :BsmtFinSF1 => Continuous, + :LotArea => Continuous, + :GarageCars => Count, + :MSSubClass => Multiclass, + :GarageArea => Count, + :YearRemodAdd => Continuous, + :YearBuilt => Continuous) + +const COERCE_AMES = ( + :Id => Count, + :MSSubClass => Multiclass, + :MSZoning => Multiclass, + :LotFrontage => Continuous, + :LotArea => Count, + :Street => Multiclass, + :LotShape => Multiclass, + :LandContour => Multiclass, + :LotConfig => Multiclass, + :LandSlope => Multiclass, + :Neighborhood => Multiclass, + :Condition1 => Multiclass, + :Condition2 => Multiclass, + :BldgType => Multiclass, + :HouseStyle => Multiclass, + :OverallQual => Count, + :OverallCond => Count, + :YearBuilt => Count, + :YearRemodAdd => Count, + :RoofStyle => Multiclass, + :RoofMatl => Multiclass, + :Exterior1st => Multiclass, + :Exterior2nd => Multiclass, + :MasVnrType => Multiclass, + :MasVnrArea => Count, + :ExterQual => Multiclass, + :ExterCond => Multiclass, + :Foundation => Multiclass, + :BsmtQual => Multiclass, + :BsmtCond => Multiclass, + :BsmtExposure => Multiclass, + :BsmtFinType1 => Multiclass, + :BsmtFinSF1 => Continuous, + :BsmtFinType2 => Multiclass, + :BsmtFinSF2 => Count, + :BsmtUnfSF => Count, + :TotalBsmtSF => Continuous, + :Heating => Multiclass, + :HeatingQC => Multiclass, + :CentralAir => Multiclass, + :Electrical => Multiclass, + :x1stFlrSF => Count, + :x2ndFlrSF => Count, + :LowQualFinSF => Count, + :GrLivArea => Count, + :BsmtFullBath => Count, + :BsmtHalfBath => Count, + :FullBath => Count, + :HalfBath => Count, + :BedroomAbvGr => Count, + :KitchenAbvGr => Count, + :KitchenQual => Multiclass, + :TotRmsAbvGrd => Count, + :Functional => Multiclass, + :Fireplaces => Count, + :FireplaceQu => Multiclass, + :GarageType => Multiclass, + :GarageYrBlt => Continuous, + :GarageFinish => Multiclass, + :GarageCars => Count, + :GarageArea => Count, + :GarageQual => Multiclass, + :GarageCond => Multiclass, + :PavedDrive => Multiclass, + :WoodDeckSF => Count, + :OpenPorchSF => Count, + :EnclosedPorch => Count, + :x3SsnPorch => Count, + :ScreenPorch => Count, + :PoolArea => Count, + :MiscVal => Count, + :MoSold => Count, + :YrSold => Count, + :SaleType => Multiclass, + :SaleCondition => Multiclass, + :target => Continuous) + +const COERCE_IRIS = ( + :sepal_length => Continuous, + :sepal_width => Continuous, + :petal_length => Continuous, + :petal_width => Continuous, + :target => Multiclass) + +const COERCE_CRABS = ( + :sp => Multiclass, + :sex => Multiclass, + :index => Count, + :FL => Continuous, + :RW => Continuous, + :CL => Continuous, + :CW => Continuous, + :BD => Continuous) + +typeof(COERCE_CRABS) + +""" +load_dataset(fpath, coercions) + +Load one of standard dataset like Boston etc assuming the file is a comma separated file with +a header. +""" +function load_dataset(fname::String, coercions::Tuple) + fpath = joinpath(DATA_DIR, fname) + data_raw, data_header = readdlm(fpath, ',', header=true) + data_table = Tables.table(data_raw; header=Symbol.(vec(data_header))) + return coerce(data_table, coercions...) +end + + +load_boston() = load_dataset("Boston.csv", COERCE_BOSTON) +load_reduced_ames() = load_dataset("reduced_ames.csv", COERCE_REDUCED_AMES) +load_ames() = load_dataset("ames.csv", COERCE_AMES) +load_iris() = load_dataset("iris.csv", COERCE_IRIS) +load_crabs() = load_dataset("crabs.csv", COERCE_CRABS) + +load_crabs() """Load a well-known public regression dataset with `Continuous` features.""" macro load_boston() @@ -44,4 +209,3 @@ macro load_crabs() (X, y) end end - diff --git a/src/datasets_requires.jl b/src/datasets_requires.jl deleted file mode 100644 index aaec05b3..00000000 --- a/src/datasets_requires.jl +++ /dev/null @@ -1,172 +0,0 @@ -# see also the macro versions in datasets.jl - -# ------------------------------------------------------- -# To add a new dataset assuming it has a header and is, at path -# `data/newdataset.csv` -# -# 1. start by loading it with CSV: -# -# fpath = joinpath("datadir", "newdataset.csv") -# data = CSV.read(fpath, copycols=true, -# categorical=true) -# -# 2. load it with DelimitedFiles and Tables -# -# data_raw, data_header = readdlm(fpath, ',', header=true) -# data_table = Tables.table(data_raw; header=Symbol.(vec(data_header))) -# -# 3. retrieve the conversions: -# -# for (n, st) in zip(names(data), scitype_union.(eachcol(data))) -# println(":$n=>$st,") -# end -# -# 4. copy and paste the result in a coerce -# -# data_table = coerce(data_table, ...) -# -# ------------------------------------------------------- - -export load_boston, - load_ames, - load_iris, - load_reduced_ames, - load_crabs - -const DATA_DIR = joinpath(@__DIR__, "..", "data") - -const COERCE_BOSTON = (:Chas => Count,) - -const COERCE_REDUCED_AMES = ( - :target => Continuous, - :OverallQual => OrderedFactor, - :GrLivArea => Continuous, - :Neighborhood => Multiclass, - :x1stFlrSF => Continuous, - :TotalBsmtSF => Continuous, - :BsmtFinSF1 => Continuous, - :LotArea => Continuous, - :GarageCars => Count, - :MSSubClass => Multiclass, - :GarageArea => Count, - :YearRemodAdd => Continuous, - :YearBuilt => Continuous) - -const COERCE_AMES = ( - :Id => Count, - :MSSubClass => Multiclass, - :MSZoning => Multiclass, - :LotFrontage => Continuous, - :LotArea => Count, - :Street => Multiclass, - :LotShape => Multiclass, - :LandContour => Multiclass, - :LotConfig => Multiclass, - :LandSlope => Multiclass, - :Neighborhood => Multiclass, - :Condition1 => Multiclass, - :Condition2 => Multiclass, - :BldgType => Multiclass, - :HouseStyle => Multiclass, - :OverallQual => Count, - :OverallCond => Count, - :YearBuilt => Count, - :YearRemodAdd => Count, - :RoofStyle => Multiclass, - :RoofMatl => Multiclass, - :Exterior1st => Multiclass, - :Exterior2nd => Multiclass, - :MasVnrType => Multiclass, - :MasVnrArea => Count, - :ExterQual => Multiclass, - :ExterCond => Multiclass, - :Foundation => Multiclass, - :BsmtQual => Multiclass, - :BsmtCond => Multiclass, - :BsmtExposure => Multiclass, - :BsmtFinType1 => Multiclass, - :BsmtFinSF1 => Continuous, - :BsmtFinType2 => Multiclass, - :BsmtFinSF2 => Count, - :BsmtUnfSF => Count, - :TotalBsmtSF => Continuous, - :Heating => Multiclass, - :HeatingQC => Multiclass, - :CentralAir => Multiclass, - :Electrical => Multiclass, - :x1stFlrSF => Count, - :x2ndFlrSF => Count, - :LowQualFinSF => Count, - :GrLivArea => Count, - :BsmtFullBath => Count, - :BsmtHalfBath => Count, - :FullBath => Count, - :HalfBath => Count, - :BedroomAbvGr => Count, - :KitchenAbvGr => Count, - :KitchenQual => Multiclass, - :TotRmsAbvGrd => Count, - :Functional => Multiclass, - :Fireplaces => Count, - :FireplaceQu => Multiclass, - :GarageType => Multiclass, - :GarageYrBlt => Continuous, - :GarageFinish => Multiclass, - :GarageCars => Count, - :GarageArea => Count, - :GarageQual => Multiclass, - :GarageCond => Multiclass, - :PavedDrive => Multiclass, - :WoodDeckSF => Count, - :OpenPorchSF => Count, - :EnclosedPorch => Count, - :x3SsnPorch => Count, - :ScreenPorch => Count, - :PoolArea => Count, - :MiscVal => Count, - :MoSold => Count, - :YrSold => Count, - :SaleType => Multiclass, - :SaleCondition => Multiclass, - :target => Continuous) - -const COERCE_IRIS = ( - :sepal_length => Continuous, - :sepal_width => Continuous, - :petal_length => Continuous, - :petal_width => Continuous, - :target => Multiclass) - -const COERCE_CRABS = ( - :sp => Multiclass, - :sex => Multiclass, - :index => Count, - :FL => Continuous, - :RW => Continuous, - :CL => Continuous, - :CW => Continuous, - :BD => Continuous) - -typeof(COERCE_CRABS) - -""" -load_dataset(fpath, coercions) - -Load one of standard dataset like Boston etc assuming the file is a comma separated file with -a header. -""" -function load_dataset(fname::String, coercions::Tuple) - fpath = joinpath(DATA_DIR, fname) - data_raw, data_header = readdlm(fpath, ',', header=true) - data_table = Tables.table(data_raw; header=Symbol.(vec(data_header))) - return coerce(data_table, coercions...) -end - - -load_boston() = load_dataset("Boston.csv", COERCE_BOSTON) -load_reduced_ames() = load_dataset("reduced_ames.csv", COERCE_REDUCED_AMES) -load_ames() = load_dataset("ames.csv", COERCE_AMES) -load_iris() = load_dataset("iris.csv", COERCE_IRIS) -load_crabs() = load_dataset("crabs.csv", COERCE_CRABS) - -load_crabs() From ede435b2f8bf1bf84a8341e6b2625b17fc64035c Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Tue, 5 Nov 2019 22:58:29 +0100 Subject: [PATCH 6/7] removing import CSV from macro --- src/datasets.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/datasets.jl b/src/datasets.jl index 2ba2095d..4c8dea55 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -168,7 +168,6 @@ load_crabs() """Load a well-known public regression dataset with `Continuous` features.""" macro load_boston() quote - import CSV y, X = unpack(load_boston(), ==(:MedV), x->x != :Chas) (X, y) end @@ -177,7 +176,6 @@ end """Load a reduced version of the well-known Ames Housing task""" macro load_reduced_ames() quote - import CSV y, X = unpack(load_reduced_ames(), ==(:target), x-> true) (X, y) end @@ -186,7 +184,6 @@ end """Load the full version of the well-known Ames Housing task.""" macro load_ames() quote - import CSV y, X = unpack(load_ames(), ==(:target), x->x != :Id) (X, y) end @@ -195,7 +192,6 @@ end """Load a well-known public classification task with nominal features.""" macro load_iris() quote - import CSV y, X = unpack(load_iris(), ==(:target), x-> true) (X, y) end @@ -204,7 +200,6 @@ end """Load a well-known crab classification dataset with nominal features.""" macro load_crabs() quote - import CSV y, X = unpack(load_crabs(), ==(:sp), x-> !(x in [:sex, :index])) (X, y) end From 44aa3ff8a8db093917a5f591fff9ef2c11faefe1 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Wed, 6 Nov 2019 09:03:17 +0100 Subject: [PATCH 7/7] removing extraneous line --- src/datasets.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets.jl b/src/datasets.jl index 4c8dea55..b292cf96 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -163,7 +163,6 @@ load_ames() = load_dataset("ames.csv", COERCE_AMES) load_iris() = load_dataset("iris.csv", COERCE_IRIS) load_crabs() = load_dataset("crabs.csv", COERCE_CRABS) -load_crabs() """Load a well-known public regression dataset with `Continuous` features.""" macro load_boston()