JuliaData · bkamins · Jun 13, 2023 · Apr 6, 2023 · Apr 7, 2023 · May 23, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -13,6 +13,9 @@
   treated as if they were wrapped in `Cols` and does not throw an error
   when a vector of duplicate indices is passed when doing column selection
   ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302))
+* `reduce` performing `vcat` on a collection of data frames
+  now accepts the `init` keyword argument
+  ([#3310](https://github.com/JuliaData/DataFrames.jl/pull/3310))
 * Allow to pass column names in `DataFrame` constructor that replace
   the names generated by default
   ([#3320](https://github.com/JuliaData/DataFrames.jl/pull/3320))

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -1777,11 +1777,20 @@ Base.vcat(dfs::AbstractDataFrame...;
            cols::Union{Symbol, AbstractVector{Symbol},
                        AbstractVector{<:AbstractString}}=:setequal,
            source::Union{Nothing, Symbol, AbstractString,
-                         Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing)
+                         Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing,
+           init::AbstractDataFrame=DataFrame())
 
 Efficiently reduce the given vector or tuple of `AbstractDataFrame`s with
 `vcat`.
 
+See the [`vcat`](@ref) docstring for a description of the keyword arguments
+`cols` and `source`.
+
+The keyword argument `init` is the initial value to use in the reduction.
+It must be a data frame that has zero rows. It is not taken into account when
+computing the value of the `source` column nor when determining the metadata
+of the produced data frame.
+
 The column order, names, and types of the resulting `DataFrame`, and the
 behavior of `cols` and `source` keyword arguments follow the rules specified for
 [`vcat`](@ref) of `AbstractDataFrame`s.
@@ -1854,8 +1863,14 @@ function Base.reduce(::typeof(vcat),
                      cols::Union{Symbol, AbstractVector{Symbol},
                                  AbstractVector{<:AbstractString}}=:setequal,
                      source::Union{Nothing, SymbolOrString,
-                                   Pair{<:SymbolOrString, <:AbstractVector}}=nothing)
-    res = _vcat(AbstractDataFrame[df for df in dfs if ncol(df) != 0]; cols=cols)
+                                   Pair{<:SymbolOrString, <:AbstractVector}}=nothing,
+                     init::AbstractDataFrame=DataFrame())
+    if nrow(init) > 0
+        throw(ArgumentError("init data frame must have zero rows"))
+    end
+    dfs_init = AbstractDataFrame[emptycolmetadata!(copy(init))]
+    append!(dfs_init, dfs)
+    res = _vcat(AbstractDataFrame[df for df in dfs_init if ncol(df) != 0]; cols=cols)
     # only handle table-level metadata, as column-level metadata was done in _vcat
     _merge_matching_table_note_metadata!(res, dfs)
 
@@ -1899,8 +1914,13 @@ end
 # definition needed to avoid dispatch ambiguity
 Base.reduce(::typeof(vcat),
             dfs::SentinelArrays.ChainedVector{T, A} where {T<:AbstractDataFrame,
-                                                           A<:AbstractVector{T}}) =
-    reduce(vcat, collect(AbstractDataFrame, dfs))
+                                                           A<:AbstractVector{T}};
+            cols::Union{Symbol, AbstractVector{Symbol},
+                        AbstractVector{<:AbstractString}}=:setequal,
+            source::Union{Nothing, SymbolOrString,
+                            Pair{<:SymbolOrString, <:AbstractVector}}=nothing,
+            init::AbstractDataFrame=DataFrame()) =
+    reduce(vcat, collect(AbstractDataFrame, dfs), cols=cols, source=source, init=init)
 
 function _vcat(dfs::AbstractVector{AbstractDataFrame};
                cols::Union{Symbol, AbstractVector{Symbol},

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -1900,9 +1900,34 @@ end
                            DataFrame(c=[missing, missing]))
 end
 
+@testset "vcat init" begin
+    dfs = [DataFrame(a=1), DataFrame(b=2)]
+    @test reduce(vcat, dfs, cols=:union, source=:x, init=DataFrame(c=[])) ≅
+          DataFrame(c=missing, a=[1, missing], b=[missing, 2], x=1:2)
+    @test reduce(vcat, dfs, cols=:union, source=:x => ["a", "b"], init=DataFrame(c=[])) ≅
+          DataFrame(c=missing, a=[1, missing], b=[missing, 2], x=["a", "b"])
+
+    dfs = [DataFrame(a=1), DataFrame(a=2)]
+    df_init = DataFrame(c=[])
+    metadata!(df_init, "k1", "v1", style=:note)
+    colmetadata!(df_init, :c, "kc", "vc", style=:note)
+    metadata!(dfs[1], "k2", "v2", style=:note)
+    metadata!(dfs[2], "k2", "v2", style=:note)
+    colmetadata!(dfs[1], :a, "ka", "va", style=:note)
+    colmetadata!(dfs[2], :a, "ka", "va", style=:note)
+    res = reduce(vcat, dfs, cols=:union, source=:x => ["a", "b"], init=df_init)
+    @test metadata(res) == Dict("k2" => "v2")
+    @test colmetadata(res) == Dict(:a => Dict("ka" => "va"))
+
+    @test_throws ArgumentError reduce(vcat, dfs, cols=:union, source=:x, init=DataFrame(c=[1]))
+end
+
 @testset "vcat ChainedVector ambiguity" begin
     dfs = DataFrames.SentinelArrays.ChainedVector([[DataFrame(a=1)], [DataFrame(a=2)]])
     @test reduce(vcat, dfs) == DataFrame(a=1:2)
+    dfs = DataFrames.SentinelArrays.ChainedVector([[DataFrame(a=1)], [DataFrame(b=2)]])
+    @test reduce(vcat, dfs, cols=:union, source=:x, init=DataFrame(c=[])) ≅
+          DataFrame(c=missing, a=[1, missing], b=[missing, 2], x=1:2)
 end
 
 @testset "names for Type, predicate + standard tests of cols" begin