From 0912bc0715966f2fd0bfeec5b74992f6a65d05c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 10:00:28 +0200 Subject: [PATCH 1/8] upgrade Julia 1.0 --- .travis.yml | 3 ++- REQUIRE | 6 +++--- appveyor.yml | 6 +++--- src/DataFramesMeta.jl | 2 +- src/compositedataframe.jl | 7 ++++--- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6c57d455..03305145 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,10 @@ language: julia julia: - - 0.6 + - 0.7 - nightly os: - linux + - osx notifications: email: false # script: diff --git a/REQUIRE b/REQUIRE index 15109ced..dddf46a1 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,3 +1,3 @@ -julia 0.6 -DataFrames 0.11 -Compat 0.57.0 +julia 0.7 +DataFrames 0.13 +Compat 1.0 diff --git a/appveyor.yml b/appveyor.yml index b55935bb..68f6c50b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,7 +1,7 @@ environment: matrix: - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe" - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe" + - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.7/julia-0.7-latest-win32.exe" + - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7-latest-win64.exe" - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe" - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe" @@ -36,7 +36,7 @@ install: build_script: # Need to convert from shallow to complete for Pkg.clone to work - IF EXIST .git\shallow (git fetch --unshallow) - - C:\projects\julia\bin\julia -e "versioninfo(); + - C:\projects\julia\bin\julia -e "using InteractiveUtils; InteractiveUtils.versioninfo(); Pkg.clone(pwd(), \"DataFramesMeta\"); Pkg.build(\"DataFramesMeta\")" test_script: diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index 487a07f3..8da8bc31 100644 
--- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -620,7 +620,7 @@ end ############################################################################## -function select(d::Union{AbstractDataFrame, Associative}; kwargs...) +function select(d::Union{AbstractDataFrame, AbstractDict}; kwargs...) result = typeof(d)() for (k, v) in kwargs result[k] = v diff --git a/src/compositedataframe.jl b/src/compositedataframe.jl index 6d410a8c..6e0d96ff 100644 --- a/src/compositedataframe.jl +++ b/src/compositedataframe.jl @@ -175,9 +175,10 @@ struct CDFRowIterator{T <: AbstractCompositeDataFrame} end DataFrames.eachrow(df::AbstractCompositeDataFrame) = CDFRowIterator(df, nrow(df)) -Base.start(itr::CDFRowIterator) = 1 -Base.done(itr::CDFRowIterator, i::Int) = i > itr.len -Base.next(itr::CDFRowIterator, i::Int) = (row(itr.df, i), i + 1) +function Base.iterate(itr::CDFRowIterator, i=1) + i > itr.len && return nothing + return (row(itr.df, i), i+1) +end Base.size(itr::CDFRowIterator) = (size(itr.df, 1), ) Base.length(itr::CDFRowIterator) = size(itr.df, 1) Base.getindex(itr::CDFRowIterator, i::Any) = row(itr.df, i) From e4ce6e97dc3fa1a0d963346f2901f726e2d3d6e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 11:21:27 +0200 Subject: [PATCH 2/8] additional fixes --- src/DataFramesMeta.jl | 15 +++++++++------ test/dict.jl | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index 8da8bc31..1fb46226 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -133,14 +133,14 @@ julia> df = DataFrame(x = 1:3, y = [2, 1, 2]); julia> x = [2, 1, 0]; -julia> @with(df, :y + 1) -3-element DataArrays.DataArray{Int64,1}: +julia> @with(df, :y .+ 1) +3-element Array{Int64,1}: 3 2 3 -julia> @with(df, :x + x) # the two x's are different -3-element DataArrays.DataArray{Int64,1}: +julia> @with(df, :x + x) +3-element Array{Int64,1}: 3 3 3 @@ -155,14 +155,14 @@ julia> @with df begin 
10.0 julia> @with(df, df[:x .> 1, ^(:y)]) # The ^ means leave the :y alone -2-element DataArrays.DataArray{Int64,1}: +2-element Array{Int64,1}: 1 2 julia> colref = :x; julia> @with(df, :y + _I_(colref)) # Equivalent to df[:y] + df[colref] -3-element DataArrays.DataArray{Int64,1}: +3-element Array{Int64,1}: 3 3 5 @@ -302,6 +302,9 @@ select(d::AbstractDataFrame, arg) = d[arg] ## ############################################################################## +# needed on Julia 1.0 till #1489 in DataFrames is merged +orderby(d::DataFrame, arg::DataFrame) = d[sortperm(arg), :] + function orderby(d::AbstractDataFrame, args...) D = typeof(d)(args...) d[sortperm(D), :] diff --git a/test/dict.jl b/test/dict.jl index 1001bb03..3944736d 100644 --- a/test/dict.jl +++ b/test/dict.jl @@ -33,8 +33,8 @@ d2 = @select(d, :y, z = :y + :s, :e) @test DataFramesMeta.with_helper(:df, :(f.(1))) == :(f.(1)) @test DataFramesMeta.with_helper(:df, :(f.(b + c))) == :(f.(b + c)) @test DataFramesMeta.with_helper(:df, Expr(:., :a, QuoteNode(:b))) == Expr(:., :a, QuoteNode(:b)) -@test DataFramesMeta.select_helper(:df, 1).args[2].args[1].args[2].args[3] == 1 -@test DataFramesMeta.select_helper(:df, QuoteNode(:a)).args[2].args[1].args[2].args[2].args[2].args[2].args[3].args[1] == :a +@test DataFramesMeta.select_helper(:df, 1).args[2].args[2].args[2].args[3] == 1 +@test DataFramesMeta.select_helper(:df, QuoteNode(:a)).args[2].args[2].args[2].args[2].args[2].args[2].args[3].args[1] == :a @test DataFramesMeta.expandargs(QuoteNode(:a)) == Expr(:kw, :a, QuoteNode(:a)) @test DataFramesMeta.byrow_find_newcols(:(;), Any[]) == (Any[], Any[]) From 5786f8a77ce2021fdebb1a3e75c654852aaaaec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 12:23:36 +0200 Subject: [PATCH 3/8] fix @newcol --- src/byrow.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/byrow.jl b/src/byrow.jl index 0f063281..249daaf8 100644 --- a/src/byrow.jl +++ b/src/byrow.jl @@ 
-34,7 +34,7 @@ function byrow_find_newcols(e::Expr, newcol_decl) ea = e.args[3] end # expression to assign a new column to df - return (nothing, Any[Expr(:kw, ea.args[1], Expr(:call, ea.args[2], :_N))]) + return (nothing, Any[Expr(:kw, ea.args[1], Expr(:call, ea.args[2], undef, :_N))]) else if isempty(e.args) return (e.args, Any[]) From ddd3799c926fe4371070cd2222688535326731e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 14:31:28 +0200 Subject: [PATCH 4/8] many fixes for 1.0 --- src/DataFramesMeta.jl | 4 +- src/byrow.jl | 14 ++----- src/compositedataframe.jl | 2 + test/REQUIRE | 1 - test/cdftimings.jl | 3 +- test/chaining.jl | 16 ++++---- test/data.table.timings.jl | 3 +- test/grouping.jl | 8 ++-- test/linqmacro.jl | 4 +- test/runtests.jl | 3 +- test/speedtests.jl | 79 +++++++++++++++++++------------------- test/timings.jl | 43 +++++++++++---------- 12 files changed, 93 insertions(+), 87 deletions(-) diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index 1fb46226..b0175689 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -393,10 +393,10 @@ end function transform(g::GroupedDataFrame; kwargs...) result = DataFrame(g) idx2 = cumsum(Int[size(g[i],1) for i in 1:length(g)]) - idx1 = [1; 1 + idx2[1:end-1]] + idx1 = [1; 1 .+ idx2[1:end-1]] for (k, v) in kwargs first = v(g[1]) - result[k] = Array{eltype(first)}(size(result, 1)) + result[k] = Array{eltype(first)}(undef, size(result, 1)) result[idx1[1]:idx2[1], k] = first for i in 2:length(g) result[idx1[i]:idx2[i], k] = v(g[i]) diff --git a/src/byrow.jl b/src/byrow.jl index 249daaf8..80003236 100644 --- a/src/byrow.jl +++ b/src/byrow.jl @@ -10,18 +10,12 @@ export @byrow! # Recursive function that traverses the syntax tree of e, replaces instances of # ":(:(x))" with ":x[row]". 
function byrow_replace(e::Expr) - # target is the Expr that is built to replace ":(:(x))" - target = Expr(:ref) - # Traverse the syntax tree of e - if e.head != :quote - return Expr(e.head, (isempty(e.args) ? e.args : map(x -> byrow_replace(x), e.args))...) - else - push!(target.args, e, :row) - return target - end + Expr(e.head, (isempty(e.args) ? e.args : map(byrow_replace, e.args))...) end +byrow_replace(e::QuoteNode) = Expr(:ref, e, :row) + # Set the base case for helper, i.e. for when expand hits an object of type # other than Expr (generally a Symbol or a literal). byrow_replace(x) = x @@ -34,7 +28,7 @@ function byrow_find_newcols(e::Expr, newcol_decl) ea = e.args[3] end # expression to assign a new column to df - return (nothing, Any[Expr(:kw, ea.args[1], Expr(:call, ea.args[2], undef, :_N))]) + return (nothing, Any[Expr(:kw, ea.args[1], Expr(:call, ea.args[2], :undef, :_N))]) else if isempty(e.args) return (e.args, Any[]) diff --git a/src/compositedataframe.jl b/src/compositedataframe.jl index 6e0d96ff..3e77fe32 100644 --- a/src/compositedataframe.jl +++ b/src/compositedataframe.jl @@ -86,6 +86,8 @@ function CompositeDataFrame(columns::Vector{Any}, typeconv = Expr(:call, rowtypename, [Expr(:ref, Expr(:(.), :d, QuoteNode(nm)), :i) for nm in cnames]...) 
row_method = Expr(:function, :( DataFramesMeta.row(d::$typename, i::Integer) ), typeconv) row_call = :($typename($columns...)) + all_statements = Expr(:block, type_definition, column_definition, row_method, row_call) + println(all_statements) eval(inmodule, Expr(:block, type_definition, column_definition, row_method, row_call)) end diff --git a/test/REQUIRE b/test/REQUIRE index 8525c5e3..e2749de5 100644 --- a/test/REQUIRE +++ b/test/REQUIRE @@ -1,2 +1 @@ Lazy -DataArrays 0.7.0 \ No newline at end of file diff --git a/test/cdftimings.jl b/test/cdftimings.jl index 607965ec..c9ace852 100644 --- a/test/cdftimings.jl +++ b/test/cdftimings.jl @@ -2,7 +2,8 @@ module CompositeDataFrameTimings using Compat.Random -using DataArrays, DataFrames +# using DataArrays, DataFrames +using DataFrames using DataFramesMeta srand(1) diff --git a/test/chaining.jl b/test/chaining.jl index 7f265ff7..66b6b62a 100644 --- a/test/chaining.jl +++ b/test/chaining.jl @@ -4,8 +4,10 @@ using Compat.Test, Compat.Random using DataFrames using DataFramesMeta using Lazy +using Statistics +using Random -srand(1) +Random.seed!(1) n = 100 df = DataFrame(a = rand(1:3, n), b = ["a","b","c","d"][rand(1:4, n)], @@ -17,13 +19,13 @@ x = @by(x, :b, meanX = mean(:x), meanY = mean(:y)) x = @orderby(x, :b, -:meanX) x = @select(x, var = :b, :meanX, :meanY) -x_as = @as _ begin +x_as = @as _x_ begin df - @where(_, :a .> 2) - @transform(_, y = 10 * :x) - @by(_, :b, meanX = mean(:x), meanY = mean(:y)) - @orderby(_, :b, -:meanX) - @select(_, var = :b, :meanX, :meanY) + @where(_x_, :a .> 2) + @transform(_x_, y = 10 * :x) + @by(_x_, :b, meanX = mean(:x), meanY = mean(:y)) + @orderby(_x_, :b, -:meanX) + @select(_x_, var = :b, :meanX, :meanY) end # @> is broken in 0.7 Lazy diff --git a/test/data.table.timings.jl b/test/data.table.timings.jl index d1e61e45..a0331bd9 100644 --- a/test/data.table.timings.jl +++ b/test/data.table.timings.jl @@ -3,7 +3,8 @@ using Compat.Random using DataFrames, DataFramesMeta -using 
CategoricalArrays, DataArrays +# using CategoricalArrays, DataArrays +using CategoricalArrays N=10_0000; K=100 srand(1) diff --git a/test/grouping.jl b/test/grouping.jl index 8cc6a9d0..abc36611 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3,19 +3,21 @@ module TestGrouping using Compat.Test -using DataArrays, DataFrames +using DataFrames +# using DataArrays, DataFrames using DataFramesMeta +using Statistics d = DataFrame(n = 1:20, x = [3, 3, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 3, 1, 1, 2]) g = groupby(d, :x, sort=true) @test @where(d, :x .== 3) == DataFramesMeta.where(d, x -> x[:x] .== 3) @test DataFrame(@where(g, length(:x) > 5)) == DataFrame(DataFramesMeta.where(g, x -> length(x[:x]) > 5)) -@test DataFrame(@where(g, length(:x) > 5))[:n][1:3] == @data [5, 6, 7] +@test DataFrame(@where(g, length(:x) > 5))[:n][1:3] == [5, 6, 7] @test DataFrame(DataFramesMeta.orderby(g, x -> mean(x[:n]))) == DataFrame(@orderby(g, mean(:n))) -@test (@transform(g, y = :n - median(:n)))[1,:y] == -5.0 +@test (@transform(g, y = :n .- median(:n)))[1,:y] == -5.0 d = DataFrame(n = 1:20, x = [3, 3, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 3, 1, 1, 2]) g = groupby(d, :x, sort=true) diff --git a/test/linqmacro.jl b/test/linqmacro.jl index b3ad87d5..38905aea 100644 --- a/test/linqmacro.jl +++ b/test/linqmacro.jl @@ -3,8 +3,10 @@ module TestLinqMacro using Compat.Test using DataFrames using DataFramesMeta +using Statistics +using Random -srand(100) +Random.seed!(100) n = 100 df = DataFrame(a = rand(1:3, n), b = ["a","b","c","d"][rand(1:4, n)], diff --git a/test/runtests.jl b/test/runtests.jl index ca9779dd..77aaf27f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,8 @@ anyerrors = false my_tests = ["dict.jl", "dataframes.jl", - "compositedataframes.jl", + # remove from tests until figured out how to fix it + # "compositedataframes.jl", "grouping.jl", "chaining.jl", "linqmacro.jl"] diff --git a/test/speedtests.jl b/test/speedtests.jl index 19e803d6..96398270 100644 
--- a/test/speedtests.jl +++ b/test/speedtests.jl @@ -1,7 +1,8 @@ module SpeedTests using Compat.Test, Compat.Random -using DataArrays, DataFrames +using DataFrames +# using DataArrays, DataFrames using DataFramesMeta using Devectorize @@ -9,13 +10,13 @@ srand(1) const n = 5_000_000 a = rand(n) b = rand(n) -da = data(a) -db = data(b) +# da = data(a) +# db = data(b) df = DataFrame(a = da, b = db) df2 = DataFrame(Any[a, b]) names!(df2, [:a, :b]) -Base.values(da::DataArray) = da.data +# Base.values(da::DataArray) = da.data function dot1(a::Vector, b::Vector) x = 0.0 @@ -25,14 +26,14 @@ function dot1(a::Vector, b::Vector) return x end -function dot2(da::DataVector, db::DataVector) - T = eltype(da) - x = 0.0 - for i in 1:length(da) - x += da[i]::T * db[i]::T - end - return x -end +# function dot2(da::DataVector, db::DataVector) +# T = eltype(da) +# x = 0.0 +# for i in 1:length(da) +# x += da[i]::T * db[i]::T +# end +# return x +# end function dot3(df::DataFrame) da, db = df[:a], df[:b] @@ -49,31 +50,31 @@ function dot4(df::DataFrame) return dot2(da, db) end -function dot5(da::DataVector, db::DataVector) - x = 0.0 - for i in 1:length(da) - x += da.data[i] * db.data[i] - end - return x -end +# function dot5(da::DataVector, db::DataVector) +# x = 0.0 +# for i in 1:length(da) +# x += da.data[i] * db.data[i] +# end +# return x +# end -function dot6(da::DataVector, db::DataVector) - x = 0.0 - for i in 1:length(da) - x += values(da)[i] * values(db)[i] - end - return x -end +# function dot6(da::DataVector, db::DataVector) +# x = 0.0 +# for i in 1:length(da) +# x += values(da)[i] * values(db)[i] +# end +# return x +# end -function dot7(da::DataVector, db::DataVector) - x = 0.0 - for i in 1:length(da) - if !(isna(da, i) || isna(da, i)) - x += values(da)[i] * values(db)[i] - end - end - return x -end +# function dot7(da::DataVector, db::DataVector) +# x = 0.0 +# for i in 1:length(da) +# if !(isna(da, i) || isna(da, i)) +# x += values(da)[i] * values(db)[i] +# end +# end +# return 
x +# end function dot8(a::Vector, b::Vector) x = 0.0 @@ -104,12 +105,12 @@ end ## t10 = @elapsed dot10(df) t1 = @elapsed dot1(a, b) -t2 = @elapsed dot2(da, db) +# t2 = @elapsed dot2(da, db) t3 = @elapsed dot3(df) t4 = @elapsed dot4(df) -t5 = @elapsed dot5(da, db) -t6 = @elapsed dot6(da, db) -t7 = @elapsed dot7(da, db) +# t5 = @elapsed dot5(da, db) +# t6 = @elapsed dot6(da, db) +# t7 = @elapsed dot7(da, db) t8 = @elapsed dot8(a, b) t9 = @elapsed dot9(df) diff --git a/test/timings.jl b/test/timings.jl index 31731f0c..6eba89ec 100644 --- a/test/timings.jl +++ b/test/timings.jl @@ -2,35 +2,36 @@ module DataFramesTimings using Compat.Random -using DataArrays, DataFrames +using DataFrames +# using DataArrays, DataFrames using DataFramesMeta srand(1) const n = 5_000_000 a = rand(n) b = rand(n) -da = data(a) -db = data(b) +# da = data(a) +# db = data(b) df = DataFrame(a = da, b = db) -Base.values(da::DataArray) = da.data +# Base.values(da::DataArray) = da.data -function dot1(da::DataVector, db::DataVector) - T = eltype(da) - x = 0.0 - for i in 1:length(da) - x += da[i]::T * db[i]::T - end - return x -end +# function dot1(da::DataVector, db::DataVector) +# T = eltype(da) +# x = 0.0 +# for i in 1:length(da) +# x += da[i]::T * db[i]::T +# end +# return x +# end -function dot2(da::DataVector, db::DataVector) - x = 0.0 - for i in 1:length(da) - x += da.data[i] * db.data[i] - end - return x -end +# function dot2(da::DataVector, db::DataVector) +# x = 0.0 +# for i in 1:length(da) +# x += da.data[i] * db.data[i] +# end +# return x +# end function dot3(df::DataFrame) da, db = df[:a], df[:b] @@ -63,8 +64,8 @@ function dot5(df::DataFrame) end end -@show t1 = @elapsed dot1(da, db) -@show t2 = @elapsed dot2(da, db) +# @show t1 = @elapsed dot1(da, db) +# @show t2 = @elapsed dot2(da, db) @show t3 = @elapsed dot3(df) @show t4 = @elapsed dot4(df) @show t5 = @elapsed dot5(df) From e31f9b3c21ce216bcdf54e8ccc1e379621bbcc5a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 17:00:53 +0200 Subject: [PATCH 5/8] Removed DataArrays dependencies in tests and CompositeDataFrame type --- src/DataFramesMeta.jl | 3 +- src/compositedataframe.jl | 203 ------------------------------------ test/cdftimings.jl | 79 -------------- test/compositedataframes.jl | 82 --------------- test/data.table.timings.jl | 17 --- test/grouping.jl | 1 - test/runtests.jl | 2 - test/speedtests.jl | 52 --------- test/timings.jl | 25 ----- 9 files changed, 1 insertion(+), 463 deletions(-) delete mode 100644 src/compositedataframe.jl delete mode 100644 test/cdftimings.jl delete mode 100644 test/compositedataframes.jl diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index b0175689..0c3acd7c 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -3,9 +3,8 @@ module DataFramesMeta using DataFrames, Compat # Basics: -export @with, @ix, @where, @orderby, @transform, @by, @based_on, @select +export @with, @where, @orderby, @transform, @by, @based_on, @select -include("compositedataframe.jl") include("linqmacro.jl") include("byrow.jl") diff --git a/src/compositedataframe.jl b/src/compositedataframe.jl deleted file mode 100644 index 3e77fe32..00000000 --- a/src/compositedataframe.jl +++ /dev/null @@ -1,203 +0,0 @@ -export AbstractCompositeDataFrame, AbstractCompositeDataFrameRow, - CompositeDataFrame, row - -""" - AbstractCompositeDataFrame - -An abstract type that is an `AbstractDataFrame`. Each type that inherits from -this is expected to be a type-stable data frame. -""" -abstract type AbstractCompositeDataFrame <: AbstractDataFrame end - -abstract type AbstractCompositeDataFrameRow end - - -""" - row(cdf::AbstractCompositeDataFrame, i) - -Return row `i` of `cdf` as a `CompositeDataFrameRow`. This object has -the same fields as `cdf` where the type of each field is taken from the `eltype` -of the field in `cdf`. - -See also `eachrow(cdf)`. 
- -```jldoctest -julia> using DataFramesMeta - -julia> df = CompositeDataFrame(x = 1:3, y = [2, 1, 6]); - -julia> dfr = row(df, 3); - -julia> dfr.y -6 -``` -""" -row() = nothing - -""" - CompositeDataFrame(columns::Vector{Any}, cnames::Vector{Symbol}; - inmodule = DataFramesMeta) - CompositeDataFrame(columns::Vector{Any}, cnames::Vector{Symbol}, typename::Symbol; - inmodule = DataFramesMeta) - CompositeDataFrame(; inmodule = DataFramesMeta, kwargs...) - CompositeDataFrame(typename::Symbol; inmodule = DataFramesMeta, kwargs...) - - -A constructor of an `AbstractCompositeDataFrame` that mimics the `DataFrame` -constructor. This returns a composite type (not immutable) that is an -`AbstractCompositeDataFrame`. - -This uses `eval` to create a new type within the module specified by the -`inmodule` keyword argument. - -### Arguments - -* `columns` : contains the contents of the columns -* `cnames` : the names of the columns -* `typename` : the optional name of the type created -* `kwargs` : the key gives the column names, and the value is the column contents -* `inmodule = DataFramesMeta` : a keyword argument to specify what module you - want to define the type in. Consider passing `current_module()` - (`VERSION < v"0.7-") or `@__MODULE__` depending on your julia version. 
- -### Examples - -```jldoctest -julia> using DataFramesMeta - -julia> df = CompositeDataFrame(Any[1:3, [2, 1, 2]], [:x, :y]); - -julia> df = CompositeDataFrame(x = 1:3, y = [2, 1, 2]); - -julia> df = CompositeDataFrame(:MyDF, x = 1:3, y = [2, 1, 2]); -``` -""" -function CompositeDataFrame(columns::Vector{Any}, - cnames::Vector{Symbol} = gennames(length(columns)), - typename::Symbol = Symbol("CompositeDF", gensym()); - inmodule = DataFramesMeta) - rowtypename = Symbol(typename, "Row") - # TODO: length checks - type_definition = :(mutable struct $typename <: AbstractCompositeDataFrame end) - type_definition.args[3].args = Any[:($(cnames[i]) :: $(typeof(columns[i]))) for i in 1:length(columns)] - ## do the same for the row iterator type: - column_definition = :(struct $rowtypename <: AbstractCompositeDataFrameRow end) - column_definition.args[3].args = Any[:($(cnames[i]) :: $(eltype(columns[i]))) for i in 1:length(columns)] - typeconv = Expr(:call, rowtypename, [Expr(:ref, Expr(:(.), :d, QuoteNode(nm)), :i) for nm in cnames]...) - row_method = Expr(:function, :( DataFramesMeta.row(d::$typename, i::Integer) ), typeconv) - row_call = :($typename($columns...)) - all_statements = Expr(:block, type_definition, column_definition, row_method, row_call) - println(all_statements) - eval(inmodule, Expr(:block, type_definition, column_definition, row_method, row_call)) -end - -CompositeDataFrame(; inmodule = DataFramesMeta, kwargs...) = - CompositeDataFrame(Any[ v for (k, v) in kwargs ], - Symbol[ k for (k, v) in kwargs ], - inmodule = inmodule) -CompositeDataFrame(typename::Symbol; inmodule = DataFramesMeta, kwargs...) 
= - CompositeDataFrame(Any[ v for (k, v) in kwargs ], - Symbol[ k for (k, v) in kwargs ], - typename, - inmodule = inmodule) - -# CompositeDataFrame(df::DataFrame) = CompositeDataFrame(df.columns, names(df)) - -CompositeDataFrame(adf::AbstractDataFrame, inmodule = DataFramesMeta) = - CompositeDataFrame(DataFrames.columns(adf), names(adf), inmodule = inmodule) - -CompositeDataFrame(adf::AbstractDataFrame, nms::Vector{Symbol}, inmodule = DataFramesMeta) = - CompositeDataFrame(DataFrames.columns(adf), nms, inmodule = inmodule) - - -DataFrames.DataFrame(cdf::AbstractCompositeDataFrame) = DataFrame(DataFrames.columns(cdf), names(cdf)) - - -######################################### -## basic stuff -######################################### - -Base.names(cdf::T) where {T<:AbstractCompositeDataFrame} = fieldnames(T) - -DataFrames.ncol(cdf::AbstractCompositeDataFrame) = length(names(cdf)) -DataFrames.nrow(cdf::AbstractCompositeDataFrame) = ncol(cdf) > 0 ? length(getfield(cdf, 1))::Int : 0 - -DataFrames.columns(cdf::AbstractCompositeDataFrame) = Any[ getfield(cdf, i) for i in 1:length(cdf) ] - -function Base.hcat(df1::AbstractCompositeDataFrame, df2::AbstractCompositeDataFrame) - nms = DataFrames.make_unique([names(df1); names(df2)]) - columns = Any[DataFrames.columns(df1)..., DataFrames.columns(df2)...] 
- return CompositeDataFrame(columns, nms) -end -Base.hcat(df1::DataFrame, df2::AbstractCompositeDataFrame) = hcat(df1, DataFrame(df2)) -Base.hcat(df1::AbstractCompositeDataFrame, df2::AbstractDataFrame) = hcat(DataFrame(df1), DataFrame(df2)) -Base.hcat(df1::AbstractDataFrame, df2::AbstractCompositeDataFrame) = hcat(DataFrame(df1), DataFrame(df2)) - -DataFrames.index(cdf::AbstractCompositeDataFrame) = DataFrames.Index(names(cdf)) - -######################################### -## getindex -######################################### - -Base.getindex(cdf::AbstractCompositeDataFrame, col_inds::DataFrames.ColumnIndex) = getfield(cdf, col_inds) -function Base.getindex(cdf::AbstractCompositeDataFrame, col_inds::AbstractVector{T}) where {T<:DataFrames.ColumnIndex} - CompositeDataFrame(Any[ getfield(cdf, col_inds[i]) for i = 1:length(col_inds) ], names(cdf)[col_inds]) -end -Base.getindex(cdf::AbstractCompositeDataFrame, row_inds, col_inds::DataFrames.ColumnIndex) = getfield(cdf, col_inds)[row_inds] -Base.getindex(cdf::AbstractCompositeDataFrame, row_inds, col_inds) = - CompositeDataFrame(Any[ getfield(cdf, col_inds[i])[row_inds] for i = 1:length(col_inds) ], - Symbol[ names(cdf)[i] for i = 1:length(col_inds) ]) -Base.getindex(cdf::AbstractCompositeDataFrame, row_inds, ::Colon) = typeof(cdf)([getfield(cdf, i)[row_inds] for i in 1:length(cdf)]...) - -function Base.getindex(cdf::AbstractCompositeDataFrame, row_inds, col_inds::UnitRange) - if col_inds.start == 1 && col_inds.stop == length(cdf) - return typeof(cdf)([ getfield(cdf, i)[row_inds] for i in 1:length(cdf) ]...) - else - return CompositeDataFrame(Any[ getfield(cdf, col_inds[i])[row_inds] for i = 1:length(col_inds) ], names(cdf)[col_inds]) - end -end - -######################################### -## Row iterator -######################################### - -""" - CDFRowIterator - -An iterator over the rows of an `AbstractCompositeDataFrame`. 
Each row -is an immutable type with the same names as the parent composite data frame. -This iterator is created by calling `eachrow(df)` where `df` is an -`AbstractCompositeDataFrame`. - -See also `row(cdf, i)`. -""" -struct CDFRowIterator{T <: AbstractCompositeDataFrame} - df::T - len::Int -end -DataFrames.eachrow(df::AbstractCompositeDataFrame) = CDFRowIterator(df, nrow(df)) - -function Base.iterate(itr::CDFRowIterator, i=1) - i > itr.len && return nothing - return (row(itr.df, i), i+1) -end -Base.size(itr::CDFRowIterator) = (size(itr.df, 1), ) -Base.length(itr::CDFRowIterator) = size(itr.df, 1) -Base.getindex(itr::CDFRowIterator, i::Any) = row(itr.df, i) -Base.map(f::Function, dfri::CDFRowIterator) = [f(row) for row in dfri] - -######################################### -## LINQ-like operations -######################################### - - -DataFrames.order(d::AbstractCompositeDataFrame; args...) = - d[sortperm(DataFrame(args...)), :] - -transform(d::AbstractCompositeDataFrame; kwargs...) = - CompositeDataFrame(Any[DataFrames.columns(d)..., [ isa(v, Function) ? v(d) : v for (k,v) in kwargs ]...], - Symbol[names(d)..., [ k for (k,v) in kwargs ]...]) - -select(d::AbstractCompositeDataFrame; kwargs...) 
= - CompositeDataFrame(Any[ v for (k,v) in kwargs ], - Symbol[ k for (k,v) in kwargs ]) diff --git a/test/cdftimings.jl b/test/cdftimings.jl deleted file mode 100644 index c9ace852..00000000 --- a/test/cdftimings.jl +++ /dev/null @@ -1,79 +0,0 @@ - -module CompositeDataFrameTimings - -using Compat.Random -# using DataArrays, DataFrames -using DataFrames -using DataFramesMeta - -srand(1) -const n = 5_000_000 -a = rand(n) -b = rand(n) -c = rand(1:5, n) -cdf = CompositeDataFrame(a = a, b = b, c = c) -df = DataFrame(cdf) - -function dot1(df::AbstractDataFrame) - x = 0.0 - for i in 1:size(df, 1) - x += df[:a][i] * df[:a][i] - end - return x -end - -function dot2(df::AbstractDataFrame) - x = 0.0 - for i in 1:size(df, 1) - x += df[i,:a] * df[i,:a] - end - return x -end - -function dot3(df::AbstractDataFrame) - @with df begin - x = 0.0 - for i in 1:length(:a) - x += (:a)[i] * (:b)[i] - end - x - end -end - -function dot4(df::AbstractDataFrame) - x = 0.0 - for r in eachrow(df) - x += r.a * r.b - end - return x -end - -function dot5(df::AbstractDataFrame) - x = 0.0 - for i in 1:nrow(df) - x += df.a[i] * df.b[i] - end - return x -end - -function dot6(df::AbstractDataFrame) - x = 0.0 - for i in 1:nrow(df) - r = row(df, i) - x += r.a * r.b - end - return x - -end - -@show t1 = @elapsed dot1(df) -@show t2 = @elapsed dot2(df) -@show t3 = @elapsed dot3(df) -@show t1c = @elapsed dot1(cdf) -@show t2c = @elapsed dot2(cdf) -@show t3c = @elapsed dot3(cdf) -@show t4c = @elapsed dot4(cdf) -@show t5c = @elapsed dot5(cdf) -@show t6c = @elapsed dot6(cdf) - -end # module diff --git a/test/compositedataframes.jl b/test/compositedataframes.jl deleted file mode 100644 index edad8fa7..00000000 --- a/test/compositedataframes.jl +++ /dev/null @@ -1,82 +0,0 @@ - -module TestCompositeDataFrames - -using Compat.Test -using DataFrames -using DataFramesMeta - -df = CompositeDataFrame(A = [1, 2, 3], B = [2, 1, 2]) -x = [2, 1, 0] - -@test df.A == df[:A] -@test size(df[1:2,:]) == (2,2) -@test size(df[[1, 
2],:]) == (2,2) -@test size(df[[1, 2], [:A]]) == (2,1) -@test size(df[:, [:A]]) == (3,1) -@test size(df[:, 1:2]) == (3,2) -@test size(df[:, [1]]) == (3,1) -@test size(df[df.A .< 3, :]) == (2,2) -@test size([df df]) == (3,4) - -@test @with(df, :A + 1) == df[:A] + 1 -@test @with(df, :A + :B) == df[:A] + df[:B] -@test @with(df, :A + x) == df[:A] + x -x = @with df begin - res = 0.0 - for i in 1:length(:A) - res += :A[i] * :B[i] - end - res -end -@test x == sum(df[:A] .* df[:B]) -@test @with(df, df[:A .> 1, ^([:B, :A])]) == df[df[:A] .> 1, [:B, :A]] -@test @with(df, DataFrame(a = :A * 2, b = :A + :B)) == DataFrame(a = df[:A] * 2, b = df[:A] + df[:B]) - -@test @where(df, :A .> 1) == df[df[:A] .> 1,:] -@test @where(df, :B .> 1) == df[df[:B] .> 1,:] -@test @where(df, :A .> x) == df[df[:A] .> x,:] -@test @where(df, :B .> x) == df[df[:B] .> x,:] -@test @where(df, :A .> :B) == df[df[:A] .> df[:B],:] - -@test @orderby(df, :B)[:A] == [2, 1, 3] - -df2 = @transform(df, C = :A + 1) -@test df2.C == df.A + 1 -@test df2[:C] == df2[:A] + 1 - -df3 = @select(df, :B, C = :A + 1, :A) -@test df3.C == df.A + 1 - -df = CompositeDataFrame(:MyDF, A = [1, 2, 3], B = [2, 1, 2]) -@test typeof(df) == DataFramesMeta.MyDF -@test df.A == df[:A] - -res = 0 -for x in eachrow(df) - res += x.A * x.B -end -@test res == sum(df.A .* df.B) - -itr = eachrow(df) -@test size(itr) == (3,) -@test length(itr) == 3 -@test getindex(itr, 1) == row(itr.df, 1) -@test map(itr) do row - row.A * row.B -end == [2, 2, 6] - -df2 = DataFrame(C = [1, 2, 3], D = [2, 1, 2]) -df3 = CompositeDataFrame(:DF3, C = [1, 2, 3], D = [2, 1, 2]) - -@test names(hcat(df, df2)) == [:A, :B, :C, :D] -@test names(hcat(df2, df)) == [:C, :D, :A, :B] -@test names(hcat(df, df3)) == [:A, :B, :C, :D] - -@test CompositeDataFrame(DataFrame(df)) == df -@test names(CompositeDataFrame(DataFrame(df), [:C, :D])) == [:C, :D] -@test row() == nothing -@test df[1, :A] == 1 -@test names(df[[1, 2]]) == [:A, :B] -@test names(df[1:1]) == [:A] - -end # module diff 
--git a/test/data.table.timings.jl b/test/data.table.timings.jl index a0331bd9..91a6115b 100644 --- a/test/data.table.timings.jl +++ b/test/data.table.timings.jl @@ -3,7 +3,6 @@ using Compat.Random using DataFrames, DataFramesMeta -# using CategoricalArrays, DataArrays using CategoricalArrays N=10_0000; K=100 @@ -51,21 +50,6 @@ DMA = DataFrame( v3 = Array{Union{Float64, Missing}}(rand(N)) # numeric e.g. 23.5749 ); -# DataArray version - -DDA = DataFrame( - id1 = DataArray(rand([Symbol("id", i) for i=1:K], N)), # large groups (char) - id2 = DataArray(rand([Symbol("id", i) for i=1:K], N)), # large groups (char) - id3 = DataArray(rand([Symbol("id", i) for i=1:N÷K], N)), # small groups (char) - id4 = DataArray(rand(1:K, N)), # large groups (int) - id5 = DataArray(rand(1:K, N)), # large groups (int) - id6 = DataArray(rand(1:N÷K, N)), # small groups (int) - v1 = DataArray(rand(1:5, N)), # int in range [1,5] - v2 = DataArray(rand(1:5, N)), # int in range [1,5] - v3 = DataArray(rand(N)) # numeric e.g. 
23.5749 -); - - function dt_timings(D) @time @by(D, :id1, sv =sum(:v1)); @time @by(D, :id1, sv =sum(:v1)); @@ -83,6 +67,5 @@ end dt_timings(DA) dt_timings(DCA) dt_timings(DMA) -dt_timings(DDA) @profile @by(DA, :id1, sv =sum(:v1)); diff --git a/test/grouping.jl b/test/grouping.jl index abc36611..931c07a8 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4,7 +4,6 @@ module TestGrouping using Compat.Test using DataFrames -# using DataArrays, DataFrames using DataFramesMeta using Statistics diff --git a/test/runtests.jl b/test/runtests.jl index 77aaf27f..962dc54b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,8 +3,6 @@ anyerrors = false my_tests = ["dict.jl", "dataframes.jl", - # remove from tests until figured out how to fix it - # "compositedataframes.jl", "grouping.jl", "chaining.jl", "linqmacro.jl"] diff --git a/test/speedtests.jl b/test/speedtests.jl index 96398270..867a3b7b 100644 --- a/test/speedtests.jl +++ b/test/speedtests.jl @@ -2,7 +2,6 @@ module SpeedTests using Compat.Test, Compat.Random using DataFrames -# using DataArrays, DataFrames using DataFramesMeta using Devectorize @@ -10,14 +9,10 @@ srand(1) const n = 5_000_000 a = rand(n) b = rand(n) -# da = data(a) -# db = data(b) df = DataFrame(a = da, b = db) df2 = DataFrame(Any[a, b]) names!(df2, [:a, :b]) -# Base.values(da::DataArray) = da.data - function dot1(a::Vector, b::Vector) x = 0.0 for i in 1:length(a) @@ -26,15 +21,6 @@ function dot1(a::Vector, b::Vector) return x end -# function dot2(da::DataVector, db::DataVector) -# T = eltype(da) -# x = 0.0 -# for i in 1:length(da) -# x += da[i]::T * db[i]::T -# end -# return x -# end - function dot3(df::DataFrame) da, db = df[:a], df[:b] T = eltype(da) @@ -50,32 +36,6 @@ function dot4(df::DataFrame) return dot2(da, db) end -# function dot5(da::DataVector, db::DataVector) -# x = 0.0 -# for i in 1:length(da) -# x += da.data[i] * db.data[i] -# end -# return x -# end - -# function dot6(da::DataVector, db::DataVector) -# x = 0.0 -# for i in 
1:length(da) -# x += values(da)[i] * values(db)[i] -# end -# return x -# end - -# function dot7(da::DataVector, db::DataVector) -# x = 0.0 -# for i in 1:length(da) -# if !(isna(da, i) || isna(da, i)) -# x += values(da)[i] * values(db)[i] -# end -# end -# return x -# end - function dot8(a::Vector, b::Vector) x = 0.0 for i in 1:length(a) @@ -96,21 +56,9 @@ function dot9(df::DataFrame) end end -## function dot10(df::DataFrame) -## @with df begin -## @devec x = sum(:a .* :b) -## x -## end -## end -## t10 = @elapsed dot10(df) - t1 = @elapsed dot1(a, b) -# t2 = @elapsed dot2(da, db) t3 = @elapsed dot3(df) t4 = @elapsed dot4(df) -# t5 = @elapsed dot5(da, db) -# t6 = @elapsed dot6(da, db) -# t7 = @elapsed dot7(da, db) t8 = @elapsed dot8(a, b) t9 = @elapsed dot9(df) diff --git a/test/timings.jl b/test/timings.jl index 6eba89ec..f298b717 100644 --- a/test/timings.jl +++ b/test/timings.jl @@ -3,36 +3,14 @@ module DataFramesTimings using Compat.Random using DataFrames -# using DataArrays, DataFrames using DataFramesMeta srand(1) const n = 5_000_000 a = rand(n) b = rand(n) -# da = data(a) -# db = data(b) df = DataFrame(a = da, b = db) -# Base.values(da::DataArray) = da.data - -# function dot1(da::DataVector, db::DataVector) -# T = eltype(da) -# x = 0.0 -# for i in 1:length(da) -# x += da[i]::T * db[i]::T -# end -# return x -# end - -# function dot2(da::DataVector, db::DataVector) -# x = 0.0 -# for i in 1:length(da) -# x += da.data[i] * db.data[i] -# end -# return x -# end - function dot3(df::DataFrame) da, db = df[:a], df[:b] T = eltype(da) @@ -57,15 +35,12 @@ function dot5(df::DataFrame) @with df begin x = 0.0 for i in 1:length(:a) - ## x += values(:a)[i] * values(:b)[i] x += (:a).data[i] * (:b).data[i] end x end end -# @show t1 = @elapsed dot1(da, db) -# @show t2 = @elapsed dot2(da, db) @show t3 = @elapsed dot3(df) @show t4 = @elapsed dot4(df) @show t5 = @elapsed dot5(df) From 8e4eea3fae84ce16aa20737789349a2e31dad6a0 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 17:20:51 +0200 Subject: [PATCH 6/8] major documentation update --- README.md | 133 +++++------------------------------------- src/DataFramesMeta.jl | 23 ++++---- src/linqmacro.jl | 2 +- 3 files changed, 28 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index f9603169..00183ebe 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Travis](https://travis-ci.org/JuliaStats/DataFramesMeta.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/DataFramesMeta.jl) [![AppVeyor](https://ci.appveyor.com/api/projects/status/github/juliastats/dataframesmeta.jl?branch=master&svg=true)](https://ci.appveyor.com/project/tshort/dataframesmeta-jl/branch/master) -Metaprogramming tools for DataFrames and Associative objects. +Metaprogramming tools for DataFrames and AbstractDict objects. These macros improve performance and provide more convenient syntax. # Features @@ -19,13 +19,13 @@ These macros improve performance and provide more convenient syntax. a symbol. Here are some examples: ```julia -using DataArrays, DataFrames +using DataFrames using DataFramesMeta df = DataFrame(x = 1:3, y = [2, 1, 2]) x = [2, 1, 0] -@with(df, :y + 1) +@with(df, :y .+ 1) @with(df, :x + x) # the two x's are different x = @with df begin @@ -40,7 +40,6 @@ end colref = :x @with(df, :y + _I_(colref)) # Equivalent to df[:y] + df[colref] - ``` This works for Associative types, too: @@ -55,14 +54,12 @@ d = Dict(:s => 3, :y => 44, :d => 5) `@with` is the fundamental macro used by the other metaprogramming utilities. -`@with` creates a function, so scope within `@with` is a [hard -scope](http://docs.julialang.org/en/release-0.4/manual/variables-and-scoping/#hard-local-scope), -as with `do`-blocks or other function definitions. Variables in the parent -can be read. Writing to variables in the parent scope differs depending on -the type of scope of the parent. 
If the parent scope is a global scope, then -a variable cannot be assigned without using the `global` keyword. If the parent -scope is a local scope (inside a function or let block for example), the `global` -keyword is not needed to assign to that parent scope. +`@with` creates a function, so scope within `@with` is a local scope. +Variables in the parent can be read. Writing to variables in the parent scope +differs depending on the type of scope of the parent. If the parent scope is a +global scope, then a variable cannot be assigned without using the `global` keyword. +If the parent scope is a local scope (inside a function or let block for example), +the `global` keyword is not needed to assign to that parent scope. ## `@where` @@ -85,13 +82,10 @@ Column selections and transformations. Also works with Associative types. ## `@transform` -Add additional columns based on keyword arguments. This is available -in both function and macro versions with the macro version allowing -direct reference to columns using the colon syntax: +Add additional columns based on keyword arguments. ```julia -transform(df, newCol = cos(df[:x]), anotherCol = df[:x]^2 + 3*df[:x] + 4) -@transform(df, newCol = cos(:x), anotherCol = :x^2 + 3*:x + 4) +@transform(df, newCol = cos.(:x), anotherCol = :x.^2 + 3*:x .+ 4) ``` `@transform` works for Associative types, too. @@ -120,7 +114,7 @@ of `x` within `@byrow!`. `byrow!` also supports special syntax for allocating new columns to make `byrow!` more useful for data transformations. The syntax `@newcol x::Array{Int}` allocates a new column `:x` with an `Array` container with eltype -`Int`. Note that the returned AbstractDataFrame includes these new columns, but +`Int`. Note that the returned `AbstractDataFrame` includes these new columns, but the original `df` is not affected. Here is an example where two new columns are added: @@ -128,7 +122,7 @@ added: df = DataFrame(A = 1:3, B = [2, 1, 2]) df2 = @byrow! 
df begin @newcol colX::Array{Float64} - @newcol colY::DataArray{Int} + @newcol colY::Array{Int} :colX = :B == 2 ? pi * :A : :B if :A > 1 :colY = :A * :B @@ -155,28 +149,10 @@ functions. @select select Select -Chaining operations is a useful way to manipulate data. There are -several ways to do this. This is still in flux in base Julia -(https://github.com/JuliaLang/julia/issues/5571). Here is one option -from [Lazy.jl](https://github.com/one-more-minute/Lazy.jl) by Mike -Innes: - -```julia -x_thread = @> begin - df - @transform(y = 10 * :x) - @where(:a .> 2) - @by(:b, meanX = mean(:x), meanY = mean(:y)) - @orderby(:meanX) - @select(:meanX, :meanY, var = :b) -end -``` +## LINQ macro -## Alternative LINQ macro - -As another experiment, there is also a `@linq` macro that supports -chaining and all of the functionality defined in other macros. Here is -an example of `@linq`: +There is also a `@linq` macro that supports chaining and all of the +functionality defined in other macros. Here is an example of `@linq`: ```julia x_thread = @linq df |> @@ -255,83 +231,6 @@ then called. Operations are efficient because: All of the other macros are based on `@with`. -# CompositeDataFrame - -A `CompositeDataFrame` is a type-stable `AbstractDataFrame` built using composite -types. Each column is a field in a composite type. `CompositeDataFrame` is an -abstract type; each concrete composite type inherits from this. The advantages -of this approach are: - -* You can access single columns directly using `df.colA`. This is type stable, - so code should be faster. (There is still the function boundary to worry - about.) - -* All indexing operations can be done currently. - -Some downsides include: - -* As an abuse of the type system, creating a new type for each change to - a `CompositeDataFrame` may waste memory. - -* You cannot change the structure of a `CompositeDataFrame` once created. - It is nearly like an immutable object. 
For example to add a column, you need - to do something like: - -```julia - transform(df, newcol = df.colA + 5) -``` - -An advantage of this is that the API becomes more functional. All -manipulations of the `CompositeDataFrame` return a new object. -Normally, this doesn't create much more memory. - -To create a CompositeDataFrame, use `CompositeDataFrame`: - -```julia -n = 10 -d = CompositeDataFrame(a = 1:n, b = rand(10), c = rand(1:3, n)) -``` - -You can also name the type of the `CompositeDataFrame` by including that as the -first symbol: - -```julia -n = 10 -d = CompositeDataFrame(:MyDF, a = 1:n, b = rand(n), c = rand(1:3, n)) -``` - -You can also define a `CompositeDataFrame` manually as follows. If you do this, -you are responsible for keeping each column the same length. - -```julia -immutable MyDF <: AbstractCompositeDataFrame - a::Vector{Int} - b::Vector{Float64} - c::DataVector{Float64} -end - -MyDF(n::Integer) = MyDF(zeros(Int, n), zeros(n), zeros(n)) -d = MyDF(10) -``` - -Note that a `CompositeDataFrame` is type stable with field access like `df.colA` -but not with `getindex` indexing like `df[:colA]`. `df[:colA]` works, but it is -not type stable. - -Type-stable access to rows is also provided using `row(d, i)` or the iterator -`eachrow(d)`. Here is an example: - -```julia -n = 10 -d = CompositeDataFrame(:MyDF, a = 1:n, b = rand(10), c = DataArray(rand(1:3, n))) -x = row(d, 5) -x.a # 5 -y = [x.a * x.b for x in eachrow(d)] -``` - -In the example above, the call to `CompositeDataFrame` creates the type `MyDF` -that holds the composite data frame and another type `MyDFRow` that is used by -`row` and `eachrow`. # Package Maintenance diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index 0c3acd7c..a4e933f8 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -100,13 +100,13 @@ then called. 
Operations are efficient because: The following ```julia -@with(d, :a + :b + 1) +@with(d, :a .+ :b .+ 1) ``` becomes ```julia -tempfun(a, b) = a + b + 1 +tempfun(a, b) = a .+ b .+ 1 tempfun(d[:a], d[:b]) ``` @@ -167,13 +167,12 @@ julia> @with(df, :y + _I_(colref)) # Equivalent to df[:y] + df[colref] 5 ``` -`@with` creates a function, so scope within `@with` is a *hard scope*, -as with `do`-blocks or other function definitions. Variables in the parent -can be read. Writing to variables in the parent scope differs depending on -the type of scope of the parent. If the parent scope is a global scope, then -a variable cannot be assigned without using the `global` keyword. If the parent -scope is a local scope (inside a function or let block for example), the `global` -keyword is not needed to assign to that parent scope. +`@with` creates a function, so scope within `@with` is a local scope. +Variables in the parent can be read. Writing to variables in the parent scope +differs depending on the type of scope of the parent. If the parent scope is a +global scope, then a variable cannot be assigned without using the `global` keyword. +If the parent scope is a local scope (inside a function or let block for example), +the `global` keyword is not needed to assign to that parent scope. """ macro with(d, body) @@ -337,7 +336,7 @@ Sort by criteria. Normally used to sort groups in GroupedDataFrames. ### Examples ```jldoctest -julia> using DataFrames, DataFramesMeta +julia> using DataFrames, DataFramesMeta, Statistics julia> d = DataFrame(n = 1:20, x = [3, 3, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 3, 1, 1, 2]); @@ -444,7 +443,7 @@ Dict{Symbol,Int64} with 4 entries: julia> df = DataFrame(A = 1:3, B = [2, 1, 2]); -julia> @transform(df, a = 2 * :A, x = :A + :B) +julia> @transform(df, a = 2 * :A, x = :A .+ :B) 3×4 DataFrames.DataFrame │ Row │ A │ B │ a │ x │ ├─────┼───┼───┼───┼───┤ @@ -558,7 +557,7 @@ Split-apply-combine in one step. 
### Examples ```jldoctest -julia> using DataFrames, DataFramesMeta +julia> using DataFrames, DataFramesMeta, Statistics julia> df = DataFrame( a = repeat(1:4, outer = 2), diff --git a/src/linqmacro.jl b/src/linqmacro.jl index 1bfad3c3..34ad6d4c 100644 --- a/src/linqmacro.jl +++ b/src/linqmacro.jl @@ -27,7 +27,7 @@ The following embedded function calls are equivalent to their macro version: ### Examples ```jldoctest -julia> using DataFrames, DataFramesMeta +julia> using DataFrames, DataFramesMeta, Statistics julia> n = 100; From 32939387a7409f5b4a70df4b7aa4ece7d44a8e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Aug 2018 18:31:02 +0200 Subject: [PATCH 7/8] update appveyor.yml --- appveyor.yml | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 68f6c50b..5365f4f7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,18 +1,18 @@ environment: matrix: - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.7/julia-0.7-latest-win32.exe" - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7-latest-win64.exe" - - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe" - - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe" + - julia_version: 0.7 + - julia_version: 1 + - julia_version: nightly + +platform: + - x86 # 32-bit + - x64 # 64-bit branches: only: - master - /release-.*/ -skip_commits: - message: /\[av skip\]/ - notifications: - provider: Email on_build_success: false @@ -20,24 +20,12 @@ notifications: on_build_status_changed: false install: - - ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12" -# If there's a newer build queued for the same PR, cancel this one - - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` - 
https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` - Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` - throw "There are newer queued builds for this pull request, failing early." } -# Download most recent Julia Windows binary - - ps: (new-object net.webclient).DownloadFile( - $env:JULIA_URL, - "C:\projects\julia-binary.exe") -# Run installer silently, output to C:\projects\julia - - C:\projects\julia-binary.exe /S /D=C:\projects\julia + - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) build_script: -# Need to convert from shallow to complete for Pkg.clone to work - - IF EXIST .git\shallow (git fetch --unshallow) - - C:\projects\julia\bin\julia -e "using InteractiveUtils; InteractiveUtils.versioninfo(); - Pkg.clone(pwd(), \"DataFramesMeta\"); Pkg.build(\"DataFramesMeta\")" + - echo "%JL_BUILD_SCRIPT%" + - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" test_script: - - C:\projects\julia\bin\julia --check-bounds=yes -e "Pkg.test(\"DataFramesMeta\")" + - echo "%JL_TEST_SCRIPT%" + - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" From ee2effa0ad56f58ae4860296e77f22f569c65526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 21 Aug 2018 21:00:15 +0200 Subject: [PATCH 8/8] cleanup after a review --- .travis.yml | 1 + README.md | 12 +++++++----- REQUIRE | 1 - appveyor.yml | 2 +- test/chaining.jl | 1 + 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 03305145..f03b0c76 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ language: julia julia: - 0.7 + - 1.0 - nightly os: - linux diff --git a/README.md b/README.md index 00183ebe..ecaef087 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ 
[![Travis](https://travis-ci.org/JuliaStats/DataFramesMeta.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/DataFramesMeta.jl) [![AppVeyor](https://ci.appveyor.com/api/projects/status/github/juliastats/dataframesmeta.jl?branch=master&svg=true)](https://ci.appveyor.com/project/tshort/dataframesmeta-jl/branch/master) -Metaprogramming tools for DataFrames and AbstractDict objects. +Metaprogramming tools for DataFrames.jl and `AbstractDict` objects. These macros improve performance and provide more convenient syntax. # Features @@ -42,7 +42,7 @@ colref = :x @with(df, :y + _I_(colref)) # Equivalent to df[:y] + df[colref] ``` -This works for Associative types, too: +This works for `AbstractDict` types, too: ```julia y = 3 @@ -73,7 +73,7 @@ Select row subsets. ## `@select` -Column selections and transformations. Also works with Associative types. +Column selections and transformations. Also works with `AbstractDict` types. ```julia @select(df, :x, :y, :z) @@ -88,7 +88,7 @@ Add additional columns based on keyword arguments. @transform(df, newCol = cos.(:x), anotherCol = :x.^2 + 3*:x .+ 4) ``` -`@transform` works for Associative types, too. +`@transform` works for `AbstractDict` types, too. ## `@byrow!` @@ -122,10 +122,12 @@ added: df = DataFrame(A = 1:3, B = [2, 1, 2]) df2 = @byrow! df begin @newcol colX::Array{Float64} - @newcol colY::Array{Int} + @newcol colY::Array{Union{Int,Missing}} :colX = :B == 2 ? 
pi * :A : :B if :A > 1 :colY = :A * :B + else + :colY = missing end end ``` diff --git a/REQUIRE b/REQUIRE index dddf46a1..d951fd79 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,3 +1,2 @@ julia 0.7 DataFrames 0.13 -Compat 1.0 diff --git a/appveyor.yml b/appveyor.yml index 5365f4f7..2a628342 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,7 +1,7 @@ environment: matrix: - julia_version: 0.7 - - julia_version: 1 + - julia_version: 1.0 - julia_version: nightly platform: diff --git a/test/chaining.jl b/test/chaining.jl index 66b6b62a..2f4c0561 100644 --- a/test/chaining.jl +++ b/test/chaining.jl @@ -28,6 +28,7 @@ x_as = @as _x_ begin @select(_x_, var = :b, :meanX, :meanY) end +# Uncomment and add to README.md when it starts working: # @> is broken in 0.7 Lazy #x_thread = @> begin # df