JuliaData · nalimilan · Dec 29, 2017 · Dec 15, 2017 · Dec 24, 2017 · Dec 26, 2017
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -110,7 +110,7 @@ names!(df::AbstractDataFrame, vals)
 * `df` : the AbstractDataFrame
 * `vals` : column names, normally a Vector{Symbol} the same length as
   the number of columns in `df`
-* `allow_duplicates` : if `false` (the default), an error will be raised
+* `makeunique` : if `false` (the default), an error will be raised
   if duplicate names are found; if `true`, duplicate names will be suffixed
   with `_i` (`i` starting at 1 for the first duplicate).
 
@@ -125,12 +125,16 @@ names!(df::AbstractDataFrame, vals)
 df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
 names!(df, [:a, :b, :c])
 names!(df, [:a, :b, :a])  # throws ArgumentError
-names!(df, [:a, :b, :a], allow_duplicates=true)  # renames second :a to :a_1
+names!(df, [:a, :b, :a], makeunique=true)  # renames second :a to :a_1
 ```
 
 """
-function names!(df::AbstractDataFrame, vals; allow_duplicates=false)
-    names!(index(df), vals; allow_duplicates=allow_duplicates)
+# TODO: drop the deprecated `allow_duplicates` keyword (superseded by `makeunique`) once the deprecation period ends
+function names!(df::AbstractDataFrame, vals; allow_duplicates=false, makeunique::Bool=false)
+    if allow_duplicates  # warn only when the deprecated keyword is actually passed as true
+        Base.depwarn("Keyword argument allow_duplicates is deprecated. Use makeunique.", :names!)
+    end
+    names!(index(df), vals, allow_duplicates=allow_duplicates, makeunique=makeunique)  # both keywords are forwarded to the method for index(df)
     return df
 end
 
@@ -172,6 +176,9 @@ rename(f::Function, df::AbstractDataFrame)
 
 * `::AbstractDataFrame` : the updated result
 
+Renames are applied one at a time, in sequence. Each new name must not already exist
+in the `DataFrame` at the moment its rename is applied.
+
 **Examples**
 
 ```julia
@@ -678,18 +685,24 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c])
 
 # hcat's first argument must be an AbstractDataFrame
 # or AbstractVector if the second argument is AbstractDataFrame
-# Trailing arguments (currently) may also be vectors or scalars.
+# Trailing arguments (currently) may also be vectors.
 
 # hcat! is defined in DataFrames/DataFrames.jl
 # Its first argument (currently) must be a DataFrame.
 
 # catch-all to cover cases where indexing returns a DataFrame and copy doesn't
-Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x)
-Base.hcat(x, df::AbstractDataFrame) = hcat!(x, df[:, :])
-Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame) = hcat!(df1[:, :], df2)
 
-Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...)
-Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
+Base.hcat(df::AbstractDataFrame, x; makeunique::Bool=false) =
+    hcat!(df[:, :], x, makeunique=makeunique)  # df[:, :] rather than copy(df): see the catch-all note above
+Base.hcat(x, df::AbstractDataFrame; makeunique::Bool=false) =
+    hcat!(x, df[:, :], makeunique=makeunique)  # same as above with the data frame as second argument
+Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false) =
+    hcat!(df1[:, :], df2, makeunique=makeunique)  # only df1 is re-indexed; df2 is passed through as-is
+Base.hcat(df::AbstractDataFrame, x, y...; makeunique::Bool=false) =
+    hcat!(hcat(df, x, makeunique=makeunique), y..., makeunique=makeunique)  # pairwise hcat first, then the remaining arguments go to hcat!
+Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
+          makeunique::Bool=false) =
+    hcat!(hcat(df1, df2, makeunique=makeunique), dfn..., makeunique=makeunique)  # same folding, for all-data-frame arguments
 
 @generated function promote_col_type(cols::AbstractVector...)
     T = mapreduce(x -> Missings.T(eltype(x)), promote_type, cols)

diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl
@@ -307,5 +307,5 @@ DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
     append!(sink.columns[col], column)
 end
 
-
-Data.close!(df::DataFrameStream) = DataFrame(collect(Any, df.columns), Symbol.(df.header))
+Data.close!(df::DataFrameStream, makeunique::Bool=false) =
+    DataFrame(collect(Any, df.columns), Symbol.(df.header), makeunique=makeunique)  # NOTE(review): makeunique is positional in this signature (`,` not `;`), unlike the keyword used by the other methods in this change — confirm intended
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -48,9 +48,11 @@ Base.length(x::RowIndexMap) = length(x.orig)
 
 # composes the joined data table using the maps between the left and right
 # table rows and the indices of rows in the result
+
 function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
                               left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap,
-                              right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap)
+                              right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap;
+                              makeunique::Bool=false)
     @assert length(left_ixs) == length(right_ixs)
     # compose left half of the result taking all left columns
     all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig)
@@ -98,7 +100,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
         copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
         permute!(cols[i+ncleft], right_perm)
     end
-    res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
+    res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), makeunique=makeunique)
 
     if length(rightonly_ixs.join) > 0
         # some left rows are missing, so the values of the "on" columns
@@ -211,7 +213,7 @@ function update_row_maps!(left_table::AbstractDataFrame,
 end
 
 """
-    join(df1, df2; on = Symbol[], kind = :inner)
+    join(df1, df2; on = Symbol[], kind = :inner, makeunique = false)
 
 Join two `DataFrame` objects
 
@@ -239,6 +241,11 @@ Join two `DataFrame` objects
   - `:cross` : a full Cartesian product of the key combinations; every
     row of `df1` is matched with every row of `df2`
 
+* `makeunique` : how to handle columns with duplicate names other than `on` in joined tables:
+
+  - `false` (the default) : throw an error if duplicate column names are present
+  - `true` : duplicate column names in `df2` will be deduplicated by adding a suffix
+
 For the three join operations that may introduce missing values (`:outer`, `:left`,
 and `:right`), all columns of the returned data table will support missing values.
 
@@ -272,10 +279,10 @@ join(name, job2, on = :ID => :identifier)
 function Base.join(df1::AbstractDataFrame,
                    df2::AbstractDataFrame;
                    on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[],
-                   kind::Symbol = :inner)
+                   kind::Symbol = :inner, makeunique::Bool=false)
     if kind == :cross
         (on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'."))
-        return crossjoin(df1, df2)
+        return crossjoin(df1, df2, makeunique=makeunique)
     elseif on == Symbol[]
         throw(ArgumentError("Missing join argument 'on'."))
     end
@@ -285,19 +292,23 @@ function Base.join(df1::AbstractDataFrame,
     if kind == :inner
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),
-                                                            true, false, true, false)...)
+                                                            true, false, true, false)...,
+                                                            makeunique=makeunique)
     elseif kind == :left
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),
-                                                            true, true, true, false)...)
+                                                            true, true, true, false)...,
+                                                            makeunique=makeunique)
     elseif kind == :right
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfr_on, joiner.dfl_on,
                                                             group_rows(joiner.dfl_on),
-                                                            true, true, true, false)[[3, 4, 1, 2]]...)
+                                                            true, true, true, false)[[3, 4, 1, 2]]...,
+                                                            makeunique=makeunique)
     elseif kind == :outer
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),
-                                                            true, true, true, true)...)
+                                                            true, true, true, true)...,
+                                                            makeunique=makeunique)
     elseif kind == :semi
         # hash the right rows
         dfr_on_grp = group_rows(joiner.dfr_on)
@@ -331,10 +342,10 @@ function Base.join(df1::AbstractDataFrame,
     end
 end
 
-function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame)
+function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false)
     r1, r2 = size(df1, 1), size(df2, 1)
+    colindex = merge(index(df1), index(df2), makeunique=makeunique)  # now built before `cols`; NOTE(review): presumably so a name clash errors before the columns are repeated — confirm
     cols = Any[[repeat(c, inner=r2) for c in columns(df1)];
                [repeat(c, outer=r1) for c in columns(df2)]]
-    colindex = merge(index(df1), index(df2))
     DataFrame(cols, colindex)
 end