diff --git a/NEWS.md b/NEWS.md index 384219f6d..ddaab8785 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,9 @@ treated as if they were wrapped in `Cols` and does not throw an error when a vector of duplicate indices is passed when doing column selection ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) +* `reduce` performing `vcat` on a collection of data frames + now accepts the `init` keyword argument + ([#3310](https://github.com/JuliaData/DataFrames.jl/pull/3310)) * Allow to pass column names in `DataFrame` constructor that replace the names generated by default ([#3320](https://github.com/JuliaData/DataFrames.jl/pull/3320)) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 5d558b03b..9dc47849a 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1777,11 +1777,20 @@ Base.vcat(dfs::AbstractDataFrame...; cols::Union{Symbol, AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal, source::Union{Nothing, Symbol, AbstractString, - Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing) + Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing, + init::AbstractDataFrame=DataFrame()) Efficiently reduce the given vector or tuple of `AbstractDataFrame`s with `vcat`. +See the [`vcat`](@ref) docstring for a description of the keyword arguments `cols` +and `source`. + +The keyword argument `init` is the initial value to use in the reductions. +It must be a data frame that has zero rows. It is not taken into account when +computing the value of the `source` column nor when determining metadata +of the produced data frame. + The column order, names, and types of the resulting `DataFrame`, and the behavior of `cols` and `source` keyword arguments follow the rules specified for [`vcat`](@ref) of `AbstractDataFrame`s. 
@@ -1854,8 +1863,14 @@ function Base.reduce(::typeof(vcat), cols::Union{Symbol, AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal, source::Union{Nothing, SymbolOrString, - Pair{<:SymbolOrString, <:AbstractVector}}=nothing) - res = _vcat(AbstractDataFrame[df for df in dfs if ncol(df) != 0]; cols=cols) + Pair{<:SymbolOrString, <:AbstractVector}}=nothing, + init::AbstractDataFrame=DataFrame()) + if nrow(init) > 0 + throw(ArgumentError("init data frame must have zero rows")) + end + dfs_init = AbstractDataFrame[emptycolmetadata!(copy(init))] + append!(dfs_init, dfs) + res = _vcat(AbstractDataFrame[df for df in dfs_init if ncol(df) != 0]; cols=cols) # only handle table-level metadata, as column-level metadata was done in _vcat _merge_matching_table_note_metadata!(res, dfs) @@ -1899,8 +1914,13 @@ end # definition needed to avoid dispatch ambiguity Base.reduce(::typeof(vcat), dfs::SentinelArrays.ChainedVector{T, A} where {T<:AbstractDataFrame, - A<:AbstractVector{T}}) = - reduce(vcat, collect(AbstractDataFrame, dfs)) + A<:AbstractVector{T}}; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + source::Union{Nothing, SymbolOrString, + Pair{<:SymbolOrString, <:AbstractVector}}=nothing, + init::AbstractDataFrame=DataFrame()) = + reduce(vcat, collect(AbstractDataFrame, dfs), cols=cols, source=source, init=init) function _vcat(dfs::AbstractVector{AbstractDataFrame}; cols::Union{Symbol, AbstractVector{Symbol}, diff --git a/test/dataframe.jl b/test/dataframe.jl index 8b4e6b9d5..971d7626d 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1900,9 +1900,34 @@ end DataFrame(c=[missing, missing])) end +@testset "vcat init" begin + dfs = [DataFrame(a=1), DataFrame(b=2)] + @test reduce(vcat, dfs, cols=:union, source=:x, init=DataFrame(c=[])) ≅ + DataFrame(c=missing, a=[1, missing], b=[missing, 2], x=1:2) + @test reduce(vcat, dfs, cols=:union, source=:x => ["a", "b"], init=DataFrame(c=[])) ≅ + DataFrame(c=missing, a=[1, 
missing], b=[missing, 2], x=["a", "b"]) + + dfs = [DataFrame(a=1), DataFrame(a=2)] + df_init = DataFrame(c=[]) + metadata!(df_init, "k1", "v1", style=:note) + colmetadata!(df_init, :c, "kc", "vc", style=:note) + metadata!(dfs[1], "k2", "v2", style=:note) + metadata!(dfs[2], "k2", "v2", style=:note) + colmetadata!(dfs[1], :a, "ka", "va", style=:note) + colmetadata!(dfs[2], :a, "ka", "va", style=:note) + res = reduce(vcat, dfs, cols=:union, source=:x => ["a", "b"], init=df_init) + @test metadata(res) == Dict("k2" => "v2") + @test colmetadata(res) == Dict(:a => Dict("ka" => "va")) + + @test_throws ArgumentError reduce(vcat, dfs, cols=:union, source=:x, init=DataFrame(c=[1])) +end + @testset "vcat ChainedVector ambiguity" begin dfs = DataFrames.SentinelArrays.ChainedVector([[DataFrame(a=1)], [DataFrame(a=2)]]) @test reduce(vcat, dfs) == DataFrame(a=1:2) + dfs = DataFrames.SentinelArrays.ChainedVector([[DataFrame(a=1)], [DataFrame(b=2)]]) + @test reduce(vcat, dfs, cols=:union, source=:x, init=DataFrame(c=[])) ≅ + DataFrame(c=missing, a=[1, missing], b=[missing, 2], x=1:2) end @testset "names for Type, predicate + standard tests of cols" begin