Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecation warning when using insert! with duplicate column name #1308

Merged
merged 9 commits into from
Dec 29, 2017
Merged
33 changes: 23 additions & 10 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ names!(df::AbstractDataFrame, vals)
* `df` : the AbstractDataFrame
* `vals` : column names, normally a Vector{Symbol} the same length as
the number of columns in `df`
* `allow_duplicates` : if `false` (the default), an error will be raised
* `makeunique` : if `false` (the default), an error will be raised
if duplicate names are found; if `true`, duplicate names will be suffixed
with `_i` (`i` starting at 1 for the first duplicate).

Expand All @@ -125,12 +125,16 @@ names!(df::AbstractDataFrame, vals)
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
names!(df, [:a, :b, :c])
names!(df, [:a, :b, :a]) # throws ArgumentError
names!(df, [:a, :b, :a], allow_duplicates=true) # renames second :a to :a_1
names!(df, [:a, :b, :a], makeunique=true) # renames second :a to :a_1
```

"""
function names!(df::AbstractDataFrame, vals; allow_duplicates=false)
names!(index(df), vals; allow_duplicates=allow_duplicates)
# TODO: remove allow_duplicates after deprecation period
function names!(df::AbstractDataFrame, vals; allow_duplicates=false, makeunique::Bool=false)
if allow_duplicates
Base.depwarn("Keyword argument allow_duplicates is deprecated. Use makeunique.", :names!)
end
names!(index(df), vals, allow_duplicates=allow_duplicates, makeunique=makeunique)
return df
end

Expand Down Expand Up @@ -172,6 +176,9 @@ rename(f::Function, df::AbstractDataFrame)

* `::AbstractDataFrame` : the updated result

New names are processed sequentially. A new name must not already exist in the `DataFrame`
at the moment an attempt to rename a column is performed.

**Examples**

```julia
Expand Down Expand Up @@ -678,18 +685,24 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c])

# hcat's first argument must be an AbstractDataFrame
# or AbstractVector if the second argument is AbstractDataFrame
# Trailing arguments (currently) may also be vectors or scalars.
# Trailing arguments (currently) may also be vectors.

# hcat! is defined in DataFrames/DataFrames.jl
# Its first argument (currently) must be a DataFrame.

# catch-all to cover cases where indexing returns a DataFrame and copy doesn't
Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x)
Base.hcat(x, df::AbstractDataFrame) = hcat!(x, df[:, :])
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame) = hcat!(df1[:, :], df2)

Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
Base.hcat(df::AbstractDataFrame, x; makeunique::Bool=false) =
hcat!(df[:, :], x, makeunique=makeunique)
Base.hcat(x, df::AbstractDataFrame; makeunique::Bool=false) =
hcat!(x, df[:, :], makeunique=makeunique)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false) =
hcat!(df1[:, :], df2, makeunique=makeunique)
Base.hcat(df::AbstractDataFrame, x, y...; makeunique::Bool=false) =
hcat!(hcat(df, x, makeunique=makeunique), y..., makeunique=makeunique)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
makeunique::Bool=false) =
hcat!(hcat(df1, df2, makeunique=makeunique), dfn..., makeunique=makeunique)

@generated function promote_col_type(cols::AbstractVector...)
T = mapreduce(x -> Missings.T(eltype(x)), promote_type, cols)
Expand Down
4 changes: 2 additions & 2 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,5 +307,5 @@ DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
append!(sink.columns[col], column)
end


Data.close!(df::DataFrameStream) = DataFrame(collect(Any, df.columns), Symbol.(df.header))
Data.close!(df::DataFrameStream, makeunique::Bool=false) =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need this? This implements a particular DataStreams interface which is supposed to be completely generic, so the caller should not have to adapt to DataFrame specificities. We should follow the rule adopted by DataStreams in general with regard to duplicate column names.

@quinnj Should we automatically deduplicate column names, or throw an error?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to be explicit rather than implicitly assume one type of behavior without thinking about it. Of course if @quinnj has a clear opinion let us implement it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I've just tested, and it turns out CSV.jl will happily return duplicated column names. So let's always pass makeunique=true to DataFrames, and remove the keyword argument to close!.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

DataFrame(collect(Any, df.columns), Symbol.(df.header), makeunique=makeunique)
33 changes: 22 additions & 11 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,11 @@ Base.length(x::RowIndexMap) = length(x.orig)

# composes the joined data table using the maps between the left and right
# table rows and the indices of rows in the result

function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap,
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap)
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap;
makeunique::Bool=false)
@assert length(left_ixs) == length(right_ixs)
# compose left half of the result taking all left columns
all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig)
Expand Down Expand Up @@ -98,7 +100,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
permute!(cols[i+ncleft], right_perm)
end
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), makeunique=makeunique)

if length(rightonly_ixs.join) > 0
# some left rows are missing, so the values of the "on" columns
Expand Down Expand Up @@ -211,7 +213,7 @@ function update_row_maps!(left_table::AbstractDataFrame,
end

"""
join(df1, df2; on = Symbol[], kind = :inner)
join(df1, df2; on = Symbol[], kind = :inner, makeunique = false)

Join two `DataFrame` objects

Expand Down Expand Up @@ -239,6 +241,11 @@ Join two `DataFrame` objects
- `:cross` : a full Cartesian product of the key combinations; every
row of `df1` is matched with every row of `df2`

* `makeunique` : how to handle columns with duplicate names other than `on` in joined tables:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make this more concise? A bullet list sounds too much for a mere boolean. Why not use a description similar to that used for names!?

Also, I know I used it, but I don't think "deduplicate" is a great term for an official documentation. "Make unique" sounds better.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK - fixing


- `false` : throw an error if duplicate column names are present
- `true` : duplicate column names in `df2` will be deduplicated by adding a suffix

For the three join operations that may introduce missing values (`:outer`, `:left`,
and `:right`), all columns of the returned data table will support missing values.

Expand Down Expand Up @@ -272,10 +279,10 @@ join(name, job2, on = :ID => :identifier)
function Base.join(df1::AbstractDataFrame,
df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[],
kind::Symbol = :inner)
kind::Symbol = :inner, makeunique::Bool=false)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? By definition, joins match columns with identical names, so that shouldn't be a problem? Anyway it should be in the docstring.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is needed. Join matches on on keyword argument not on columns having identical names. And we can have columns with identical names on which we do not join.
See the example as it works now:

julia> x = DataFrame(rand(3,3))
3×3 DataFrames.DataFrame
│ Row │ x1       │ x2       │ x3       │
├─────┼──────────┼──────────┼──────────┤
│ 1   │ 0.97283  │ 0.200961 │ 0.591387 │
│ 2   │ 0.391169 │ 0.532865 │ 0.630848 │
│ 3   │ 0.090204 │ 0.320481 │ 0.775537 │

julia> y = DataFrame(rand(3,3))
3×3 DataFrames.DataFrame
│ Row │ x1       │ x2       │ x3       │
├─────┼──────────┼──────────┼──────────┤
│ 1   │ 0.470765 │ 0.468833 │ 0.199083 │
│ 2   │ 0.303053 │ 0.992384 │ 0.140139 │
│ 3   │ 0.284372 │ 0.816792 │ 0.675941 │

julia> x[:id] = 1:3
1:3

julia> y[:id] = 1:3
1:3

julia> join(x, y, on=:id)
3×7 DataFrames.DataFrame
│ Row │ x1       │ x2       │ x3       │ id │ x1_1     │ x2_1     │ x3_1     │
├─────┼──────────┼──────────┼──────────┼────┼──────────┼──────────┼──────────┤
│ 1   │ 0.97283  │ 0.200961 │ 0.591387 │ 1  │ 0.470765 │ 0.468833 │ 0.199083 │
│ 2   │ 0.391169 │ 0.532865 │ 0.630848 │ 2  │ 0.303053 │ 0.992384 │ 0.140139 │
│ 3   │ 0.090204 │ 0.320481 │ 0.775537 │ 3  │ 0.284372 │ 0.816792 │ 0.675941 │

I will improve the docstring.

if kind == :cross
(on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'."))
return crossjoin(df1, df2)
return crossjoin(df1, df2, makeunique=makeunique)
elseif on == Symbol[]
throw(ArgumentError("Missing join argument 'on'."))
end
Expand All @@ -285,19 +292,23 @@ function Base.join(df1::AbstractDataFrame,
if kind == :inner
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
true, false, true, false)...)
true, false, true, false)...,
makeunique=makeunique)
elseif kind == :left
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
true, true, true, false)...)
true, true, true, false)...,
makeunique=makeunique)
elseif kind == :right
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfr_on, joiner.dfl_on,
group_rows(joiner.dfl_on),
true, true, true, false)[[3, 4, 1, 2]]...)
true, true, true, false)[[3, 4, 1, 2]]...,
makeunique=makeunique)
elseif kind == :outer
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
true, true, true, true)...)
true, true, true, true)...,
makeunique=makeunique)
elseif kind == :semi
# hash the right rows
dfr_on_grp = group_rows(joiner.dfr_on)
Expand Down Expand Up @@ -331,10 +342,10 @@ function Base.join(df1::AbstractDataFrame,
end
end

function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame)
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false)
r1, r2 = size(df1, 1), size(df2, 1)
colindex = merge(index(df1), index(df2), makeunique=makeunique)
cols = Any[[repeat(c, inner=r2) for c in columns(df1)];
[repeat(c, outer=r1) for c in columns(df2)]]
colindex = merge(index(df1), index(df2))
DataFrame(cols, colindex)
end
Loading