-
Notifications
You must be signed in to change notification settings - Fork 370
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Deprecation warning when using insert! with duplicate column name #1308
Changes from 7 commits
e55a20a
d46554f
f4eb9c4
a9a21f7
700e9fc
38234a0
d533090
311415a
14af911
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,9 +48,11 @@ Base.length(x::RowIndexMap) = length(x.orig) | |
|
||
# composes the joined data table using the maps between the left and right | ||
# table rows and the indices of rows in the result | ||
|
||
function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, | ||
left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, | ||
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) | ||
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap; | ||
makeunique::Bool=false) | ||
@assert length(left_ixs) == length(right_ixs) | ||
# compose left half of the result taking all left columns | ||
all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) | ||
|
@@ -98,7 +100,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, | |
copy!(cols[i+ncleft], view(col, all_orig_right_ixs)) | ||
permute!(cols[i+ncleft], right_perm) | ||
end | ||
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon))) | ||
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), makeunique=makeunique) | ||
|
||
if length(rightonly_ixs.join) > 0 | ||
# some left rows are missing, so the values of the "on" columns | ||
|
@@ -211,7 +213,7 @@ function update_row_maps!(left_table::AbstractDataFrame, | |
end | ||
|
||
""" | ||
join(df1, df2; on = Symbol[], kind = :inner) | ||
join(df1, df2; on = Symbol[], kind = :inner, makeunique = false) | ||
|
||
Join two `DataFrame` objects | ||
|
||
|
@@ -239,6 +241,11 @@ Join two `DataFrame` objects | |
- `:cross` : a full Cartesian product of the key combinations; every | ||
row of `df1` is matched with every row of `df2` | ||
|
||
* `makeunique` : how to handle columns with duplicate names other than `on` in joined tables: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you make this more concise? A bullet list sounds like too much for a mere boolean. Why not use a description similar to that used for […]? Also, I know I used it, but I don't think "deduplicate" is a great term for official documentation. "Make unique" sounds better. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK - fixing |
||
|
||
- `false` : throw an error if duplicate column names are present | ||
- `true` : duplicate column names in `df2` will be deduplicated by adding a suffix | ||
|
||
For the three join operations that may introduce missing values (`:outer`, `:left`, | ||
and `:right`), all columns of the returned data table will support missing values. | ||
|
||
|
@@ -272,10 +279,10 @@ join(name, job2, on = :ID => :identifier) | |
function Base.join(df1::AbstractDataFrame, | ||
df2::AbstractDataFrame; | ||
on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[], | ||
kind::Symbol = :inner) | ||
kind::Symbol = :inner, makeunique::Bool=false) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this needed? By definition, joins match columns with identical names, so that shouldn't be a problem? Anyway it should be in the docstring. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is needed. Join matches on the `on` columns only, so the remaining columns of the two tables can still have duplicate names.
I will improve the docstring. |
||
if kind == :cross | ||
(on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'.")) | ||
return crossjoin(df1, df2) | ||
return crossjoin(df1, df2, makeunique=makeunique) | ||
elseif on == Symbol[] | ||
throw(ArgumentError("Missing join argument 'on'.")) | ||
end | ||
|
@@ -285,19 +292,23 @@ function Base.join(df1::AbstractDataFrame, | |
if kind == :inner | ||
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on, | ||
group_rows(joiner.dfr_on), | ||
true, false, true, false)...) | ||
true, false, true, false)..., | ||
makeunique=makeunique) | ||
elseif kind == :left | ||
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on, | ||
group_rows(joiner.dfr_on), | ||
true, true, true, false)...) | ||
true, true, true, false)..., | ||
makeunique=makeunique) | ||
elseif kind == :right | ||
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfr_on, joiner.dfl_on, | ||
group_rows(joiner.dfl_on), | ||
true, true, true, false)[[3, 4, 1, 2]]...) | ||
true, true, true, false)[[3, 4, 1, 2]]..., | ||
makeunique=makeunique) | ||
elseif kind == :outer | ||
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on, | ||
group_rows(joiner.dfr_on), | ||
true, true, true, true)...) | ||
true, true, true, true)..., | ||
makeunique=makeunique) | ||
elseif kind == :semi | ||
# hash the right rows | ||
dfr_on_grp = group_rows(joiner.dfr_on) | ||
|
@@ -331,10 +342,10 @@ function Base.join(df1::AbstractDataFrame, | |
end | ||
end | ||
|
||
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame) | ||
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false) | ||
r1, r2 = size(df1, 1), size(df2, 1) | ||
colindex = merge(index(df1), index(df2), makeunique=makeunique) | ||
cols = Any[[repeat(c, inner=r2) for c in columns(df1)]; | ||
[repeat(c, outer=r1) for c in columns(df2)]] | ||
colindex = merge(index(df1), index(df2)) | ||
DataFrame(cols, colindex) | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we really need this? This implements a particular DataStreams interface which is supposed to be completely generic, so the caller should not have to adapt to `DataFrame` specificities. We should follow the rule adopted by DataStreams in general with regard to duplicate column names. @quinnj Should we automatically deduplicate column names, or throw an error?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to be explicit rather than implicitly assume one type of behavior without thinking about it. Of course if @quinnj has a clear opinion let us implement it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I've just tested, and it turns out CSV.jl will happily return duplicated column names. So let's always pass `makeunique=true` to DataFrames, and remove the keyword argument to `close!`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK