diff --git a/NEWS.md b/NEWS.md index 81a9d67825..504d2386b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,6 +74,8 @@ which if set to `true` makes them retun a `SubDataFrame` view into the passed data frame. * add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449)) +* passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine` + with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476)) ## Deprecated diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 46aee96257..c92d84cb6b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -994,9 +994,10 @@ end @inline function Base.filter((cols, f)::Pair, df::AbstractDataFrame; view::Bool=false) int_cols = index(df)[cols] # it will be AbstractVector{Int} or Int if length(int_cols) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = [f() for _ in axes(df, 1)] + else + rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...) end - rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...) return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -1006,9 +1007,10 @@ end AbstractVector{<:Symbol}}}, df::AbstractDataFrame; view::Bool=false) if length(cols) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = [f() for _ in axes(df, 1)] + else + rowidxs = _filter_helper(f, (df[!, i] for i in cols)...) end - rowidxs = _filter_helper(f, (df[!, i] for i in cols)...) return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -1018,9 +1020,10 @@ _filter_helper(f, cols...)::BitVector = ((x...) -> f(x...)::Bool).(cols...) view::Bool=false) df_tmp = select(df, cols.cols, copycols=false) if ncol(df_tmp) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = [f(NamedTuple()) for _ in axes(df, 1)] + else + rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp)) end - rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp)) return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -1101,7 +1104,7 @@ julia> filter!(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df) │ 3 │ 1 │ b │ ``` """ -Base.filter!(f, df::AbstractDataFrame) = _filter!_helper(df, f, eachrow(df)) +Base.filter!(f, df::AbstractDataFrame) = delete!(df, findall(!f, eachrow(df))) Base.filter!((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) = _filter!_helper(df, f, df[!, col]) Base.filter!((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) = @@ -1115,17 +1118,20 @@ Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) = function _filter!_helper(df::AbstractDataFrame, f, cols...) if length(cols) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = findall(x -> !f(), axes(df, 1)) + else + rowidxs = findall(((x...) -> !(f(x...)::Bool)).(cols...)) end - return delete!(df, findall(((x...) -> !(f(x...)::Bool)).(cols...))) + return delete!(df, rowidxs) end function Base.filter!((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame) dff = select(df, cols.cols, copycols=false) if ncol(dff) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + return delete!(df, findall(x -> !f(NamedTuple()), axes(df, 1))) + else + return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f) end - return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f) end _filter!_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) = diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 76393ccd85..88500bd77d 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -67,10 +67,20 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool) function normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any,<:Pair{<:Base.Callable, - <:Union{Symbol, AbstractString, DataType, - AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}}}, + sel::Pair{<:ColumnIndex, + <:Pair{<:Base.Callable, + <:Union{Symbol, AbstractString}}}, + renamecols::Bool) + src, (fun, dst) = sel + return idx[src] => fun => Symbol(dst) +end + +function normalize_selection(idx::AbstractIndex, + sel::Pair{<:Any, + <:Pair{<:Base.Callable, + <:Union{Symbol, AbstractString, DataType, + AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}}}, renamecols::Bool) lls = last(last(sel)) if lls isa DataType @@ -170,31 +180,29 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function _transformation_helper(df::AbstractDataFrame, - col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, - @nospecialize(fun)) - if col_idx === nothing - return fun(df) - elseif col_idx isa Int - return fun(df[!, col_idx]) - elseif col_idx isa AsTable - tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) - if isempty(tbl) && fun isa ByRow - return [fun.fun(NamedTuple()) for _ in 1:nrow(df)] - else - return fun(tbl) - end +_transformation_helper(df::AbstractDataFrame, col_idx::Nothing, fun) = fun(df) +_transformation_helper(df::AbstractDataFrame, col_idx::Int, fun) = fun(df[!, col_idx]) + +_empty_astable_helper(fun, len) = [fun(NamedTuple()) for _ in 1:len] + +function _transformation_helper(df::AbstractDataFrame, col_idx::AsTable, fun) + tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) + if isempty(tbl) && fun isa ByRow + return _empty_astable_helper(fun.fun, nrow(df)) else - # it should be fast enough here as we do not expect to do it millions of times - @assert col_idx isa AbstractVector{Int} - if isempty(col_idx) && fun isa ByRow - return [fun.fun() for _ in 1:nrow(df)] - else - cdf = eachcol(df) - return fun(map(c -> cdf[c], col_idx)...) - end + return fun(tbl) + end +end + +_empty_selector_helper(fun, len) = [fun() for _ in 1:len] + +function _transformation_helper(df::AbstractDataFrame, col_idx::AbstractVector{Int}, fun) + if isempty(col_idx) && fun isa ByRow + return _empty_selector_helper(fun.fun, nrow(df)) + else + cdf = eachcol(df) + return fun(map(c -> cdf[c], col_idx)...) end - throw(ErrorException("unreachable reached")) end function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol}, @@ -656,7 +664,7 @@ julia> select!(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stat ``` """ -select!(df::DataFrame, args...; renamecols::Bool=true) = +select!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) = _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols)) function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -676,7 +684,7 @@ Equivalent to `select!(df, :, args...)`. See [`select!`](@ref) for detailed rules regarding accepted values for `args`. """ -transform!(df::DataFrame, args...; renamecols::Bool=true) = +transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) = select!(df, :, args..., renamecols=renamecols) function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -810,7 +818,7 @@ julia> select(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats ``` """ -select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = +select(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) = manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols) function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -831,7 +839,7 @@ Equivalent to `select(df, :, args..., copycols=copycols)`. See [`select`](@ref) for detailed rules regarding accepted values for `args`. """ -transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = +transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) = select(df, :, args..., copycols=copycols, renamecols=renamecols) function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -935,7 +943,7 @@ julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stat │ 3 │ (mean = 6.0, std = 3.0) │ 6.0 │ 3.0 │ ``` """ -combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = +combine(df::AbstractDataFrame, @nospecialize(args...); renamecols::Bool=true) = manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols) function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -964,7 +972,7 @@ manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool, renamecols::Bool) = manipulate(df, [c], copycols=copycols, keeprows=keeprows, renamecols=renamecols) -function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool) +function manipulate(df::DataFrame, @nospecialize(cs...); copycols::Bool, keeprows::Bool, renamecols::Bool) cs_vec = [] for v in cs if v isa AbstractVecOrMat{<:Pair} @@ -1061,7 +1069,7 @@ function manipulate(dfv::SubDataFrame, args::MultiColumnIndex; end end -function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool, +function manipulate(dfv::SubDataFrame, @nospecialize(args...); copycols::Bool, keeprows::Bool, renamecols::Bool) if copycols cs_vec = [] diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index b6f0595019..30ef6ab0e1 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -711,7 +711,11 @@ end function do_call(f::Any, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::Tuple{}, i::Integer) - f() + if f isa ByRow + return [f.fun() for _ in 1:(ends[i] - starts[i] + 1)] + else + return f() + end end function do_call(f::Any, idx::AbstractVector{<:Integer}, @@ -753,8 +757,12 @@ end function do_call(f::Any, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::NamedTuple, i::Integer) - idx = idx[starts[i]:ends[i]] - return f(map(c -> view(c, idx), incols)) + if f isa ByRow && isempty(incols) + return [f.fun(NamedTuple()) for _ in 1:(ends[i] - starts[i] + 1)] + else + idx = idx[starts[i]:ends[i]] + return f(map(c -> view(c, idx), incols)) + end end function do_call(f::Any, idx::AbstractVector{<:Integer}, diff --git a/test/data.jl b/test/data.jl index cd9f167865..fff7a74cec 100644 --- a/test/data.jl +++ b/test/data.jl @@ -453,11 +453,45 @@ end @test filter(AsTable("x") => testfun, df) == DataFrame(x=[3, 2], y=["b", "a"]) filter!(AsTable("x") => testfun, df) @test df == DataFrame(x=[3, 2], y=["b", "a"]) +end + +@testset "empty arg to filter and filter!" begin + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) + + @test filter([] => () -> true, df) == df + @test filter(AsTable(r"z") => x -> true, df) == df + @test filter!([] => () -> true, copy(df)) == df + @test filter!(AsTable(r"z") => x -> true, copy(df)) == df + + flipflop0 = let + state = false + () -> (state = !state) + end + + flipflop1 = let + state = false + x -> (state = !state) + end - @test_throws ArgumentError filter([] => () -> true, df) - @test_throws ArgumentError filter(AsTable(r"z") => () -> true, df) - @test_throws ArgumentError filter!([] => () -> true, df) - @test_throws ArgumentError filter!(AsTable(r"z") => () -> true, df) + @test filter([] => flipflop0, df) == df[[1,3], :] + @test filter(Int[] => flipflop0, df) == df[[1,3], :] + @test filter(String[] => flipflop0, df) == df[[1,3], :] + @test filter(Symbol[] => flipflop0, df) == df[[1,3], :] + @test filter(r"z" => flipflop0, df) == df[[1,3], :] + @test filter(Not(All()) => flipflop0, df) == df[[1,3], :] + @test filter(AsTable(r"z") => flipflop1, df) == df[[1,3], :] + @test filter(AsTable([]) => flipflop1, df) == df[[1,3], :] + @test filter!([] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(Int[] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(String[] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(Symbol[] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(r"z" => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(Not(All()) => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(AsTable(r"z") => flipflop1, copy(df)) == df[[1,3], :] + @test filter!(AsTable([]) => flipflop1, copy(df)) == df[[1,3], :] + + @test_throws MethodError filter([] => flipflop1, df) + @test_throws MethodError filter(AsTable([]) => flipflop0, df) end @testset "names with cols" begin diff --git a/test/grouping.jl b/test/grouping.jl index 26b81ad058..27a20a3ceb 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -2874,4 +2874,45 @@ end @test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) end +@testset "empty ByRow" begin + inc0 = let + state = 0 + () -> (state += 1) + end + + inc1 = let + state = 0 + x -> (state += 1) + end + + df = DataFrame(a=[1,1,1,2,2,3,4,4,5,5,5,5], b=1:12) + gdf = groupby_checked(df, :a) + + @test select(gdf, [] => ByRow(inc0) => :bin) == + DataFrame(a=df.a, bin=1:12) + @test combine(gdf, [] => ByRow(inc0) => :bin) == + DataFrame(a=df.a, bin=13:24) + @test select(gdf, AsTable([]) => ByRow(inc1) => :bin) == + DataFrame(a=df.a, bin=1:12) + @test combine(gdf, AsTable([]) => ByRow(inc1) => :bin) == + DataFrame(a=df.a, bin=13:24) + @test combine(gdf[Not(2)], [] => ByRow(inc0) => :bin) == + DataFrame(a=df.a[Not(4:5)], bin=25:34) + @test combine(gdf[Not(2)], AsTable([]) => ByRow(inc1) => :bin) == + DataFrame(a=df.a[Not(4:5)], bin=25:34) + + # note that type inference in a comprehension does not always work + @test isequal_coltyped(combine(gdf[[]], [] => ByRow(inc0) => :bin), + DataFrame(a=Int[], bin=Any[])) + @test isequal_coltyped(combine(gdf[[]], [] => ByRow(rand) => :bin), + DataFrame(a=Int[], bin=Float64[])) + @test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(inc1) => :bin), + DataFrame(a=Int[], bin=Any[])) + @test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(x -> rand()) => :bin), + DataFrame(a=Int[], bin=Float64[])) + + @test_throws MethodError select(gdf, [] => ByRow(inc1) => :bin) + @test_throws MethodError select(gdf, AsTable([]) => ByRow(inc0) => :bin) +end + end # module