From f39613ae8f4d57f722f4e9dc1f417ece50dd4e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 15 Dec 2021 12:48:24 +0100 Subject: [PATCH 1/4] produce a better error message in disallowmissing! and disallowmissing --- src/abstractdataframe/abstractdataframe.jl | 20 +++++++++++++++----- src/dataframe/dataframe.jl | 12 ++++++++++-- test/dataframe.jl | 10 +++++----- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 419581f1f8..f32ee66b1c 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -493,7 +493,7 @@ Get a data frame with the `n` first rows of `df`. If `view=false` a freshly allocated `DataFrame` is returned. If `view=true` then a `SubDataFrame` view into `df` is returned. """ -@inline Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) = +@inline Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) = view ? Base.view(df, 1:min(n ,nrow(df)), :) : df[1:min(n, nrow(df)), :] """ @@ -511,7 +511,7 @@ Get a data frame with the `n` last rows of `df`. If `view=false` a freshly allocated `DataFrame` is returned. If `view=true` then a `SubDataFrame` view into `df` is returned. """ -@inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) = +@inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) = view ? Base.view(df, max(1, nrow(df)-n+1):nrow(df), :) : df[max(1, nrow(df)-n+1):nrow(df), :] @@ -2060,10 +2060,20 @@ function Missings.disallowmissing(df::AbstractDataFrame, for i in axes(df, 2) x = df[!, i] if i in idxcols - if !error && Missing <: eltype(x) && any(ismissing, x) - y = x + if Missing <: eltype(x) + # any(ismissing, x) is cheap so we accept paying it + # to get a better error message + if any(ismissing, x) + if error + throw(ArgumentError("Missing value found in column number $i")) + else + y = x + end + else + y = disallowmissing(x) + end else - y = disallowmissing(x) + y = x end push!(newcols, y === x ? copy(y) : y) else diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 42a491bc9e..7fec8d3080 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1190,8 +1190,16 @@ function disallowmissing! end function disallowmissing!(df::DataFrame, col::ColumnIndex; error::Bool=true) x = df[!, col] - if !(!error && Missing <: eltype(x) && any(ismissing, x)) - df[!, col] = disallowmissing(x) + if Missing <: eltype(x) + # any(ismissing, x) is cheap so we accept paying it + # to get a better error message + if any(ismissing, x) + if error + throw(ArgumentError("Missing value found in column number $col")) + end + else + df[!, col] = disallowmissing(x) + end end return df end diff --git a/test/dataframe.jl b/test/dataframe.jl index 190b7fe1d6..0c71b38eaa 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1131,7 +1131,7 @@ end @test isa(df[!, 1], Vector{Union{Int, Missing}}) @test !isa(df[!, 2], Vector{Union{Int, Missing}}) df[1, 1] = missing - @test_throws MethodError disallowmissing!(df, 1) + @test_throws ArgumentError disallowmissing!(df, 1) tmpcol = df[!, 1] disallowmissing!(df, 1, error=false) @test df[!, 1] === tmpcol @@ -1145,7 +1145,7 @@ end @test isa(df[!, 1], Vector{Union{Int, Missing}}) @test !isa(df[!, 2], Vector{Union{Int, Missing}}) df[1, 1] = missing - @test_throws MethodError disallowmissing!(df, Not(Not(1))) + @test_throws ArgumentError disallowmissing!(df, Not(Not(1))) tmpcol = df[!, 1] disallowmissing!(df, Not(Not(1)), error=false) @test df[!, 1] === tmpcol @@ -1206,7 +1206,7 @@ end @test eltype(df[!, 1]) <: Union{CategoricalValue{Int}, Missing} @test eltype(df[!, 2]) <: Union{CategoricalValue{String}, Missing} df[1, 2] = missing - @test_throws MissingException disallowmissing!(df) + @test_throws ArgumentError disallowmissing!(df) tmpcol =df[!, 2] disallowmissing!(df, error=false) @test df[!, 2] === tmpcol @@ -1331,9 +1331,9 @@ end end end - @test_throws MethodError disallowmissing(DataFrame(x=[missing])) + @test_throws ArgumentError disallowmissing(DataFrame(x=[missing])) @test disallowmissing(DataFrame(x=[missing]), error=false) ≅ DataFrame(x=[missing]) - @test_throws MethodError disallowmissing(DataFrame(x=[1, missing])) + @test_throws ArgumentError disallowmissing(DataFrame(x=[1, missing])) @test disallowmissing(DataFrame(x=[1, missing]), error=false) ≅ DataFrame(x=[1, missing]) df = DataFrame(x=[1], y=Union{Int, Missing}[1], z=[missing]) From 8277460f917f3fa4eee8d9e573344b4a66e73353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 15 Dec 2021 20:32:58 +0100 Subject: [PATCH 2/4] improve error message --- src/abstractdataframe/abstractdataframe.jl | 9 ++++++--- src/dataframe/dataframe.jl | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 3121c618ed..098e0d474f 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2061,11 +2061,14 @@ function Missings.disallowmissing(df::AbstractDataFrame, x = df[!, i] if i in idxcols if Missing <: eltype(x) - # any(ismissing, x) is cheap so we accept paying it + # finding missing is cheap so we accept paying it # to get a better error message - if any(ismissing, x) + row = findfirst(ismissing, x) + if row !== nothing if error - throw(ArgumentError("Missing value found in column number $i")) + col = _names(df)[i] + throw(ArgumentError("Missing value found in column :$col " * + "in row $row")) else y = x end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 7fec8d3080..512821a7d6 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1191,11 +1191,14 @@ function disallowmissing! end function disallowmissing!(df::DataFrame, col::ColumnIndex; error::Bool=true) x = df[!, col] if Missing <: eltype(x) - # any(ismissing, x) is cheap so we accept paying it + # finding missing is cheap so we accept paying it # to get a better error message - if any(ismissing, x) + row = findfirst(ismissing, x) + if row !== nothing if error - throw(ArgumentError("Missing value found in column number $col")) + col_name = only(names(df, col)) + throw(ArgumentError("Missing value found in column :$col_name " * + "in row $row")) end else df[!, col] = disallowmissing(x) From 611e127d4f23e4e0fd181999abf80c3c53dab6ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 16 Dec 2021 14:51:59 +0100 Subject: [PATCH 3/4] use try-catch --- src/abstractdataframe/abstractdataframe.jl | 29 ++++++++++++++-------- src/dataframe/dataframe.jl | 24 +++++++++++------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 098e0d474f..38fddb630b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2060,20 +2060,27 @@ function Missings.disallowmissing(df::AbstractDataFrame, for i in axes(df, 2) x = df[!, i] if i in idxcols + local y if Missing <: eltype(x) - # finding missing is cheap so we accept paying it - # to get a better error message - row = findfirst(ismissing, x) - if row !== nothing - if error - col = _names(df)[i] - throw(ArgumentError("Missing value found in column :$col " * - "in row $row")) - else - y = x + if error + try + y = disallowmissing(x) + catch e + row = findfirst(ismissing, x) + if row !== nothing + col = _names(df)[i] + throw(ArgumentError("Missing value found in column " * + ":$col in row $row")) + else + rethrow(e) + end end else - y = disallowmissing(x) + if any(ismissing, x) + y = x + else + y = disallowmissing(x) + end end else y = x diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 512821a7d6..29d0c30891 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1191,17 +1191,23 @@ function disallowmissing! end function disallowmissing!(df::DataFrame, col::ColumnIndex; error::Bool=true) x = df[!, col] if Missing <: eltype(x) - # finding missing is cheap so we accept paying it - # to get a better error message - row = findfirst(ismissing, x) - if row !== nothing - if error - col_name = only(names(df, col)) - throw(ArgumentError("Missing value found in column :$col_name " * - "in row $row")) + if error + try + df[!, col] = disallowmissing(x) + catch e + row = findfirst(ismissing, x) + if row !== nothing + col_name = only(names(df, col)) + throw(ArgumentError("Missing value found in column " * + ":$col_name in row $row")) + else + rethrow(e) + end end else - df[!, col] = disallowmissing(x) + if !any(ismissing, x) + df[!, col] = disallowmissing(x) + end end end return df From 318f4f040a749115673344b682049d0e265facee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 18 Dec 2021 17:30:59 +0100 Subject: [PATCH 4/4] handle error in one place --- src/abstractdataframe/abstractdataframe.jl | 24 ++++++++-------------- src/dataframe/dataframe.jl | 20 ++++++++---------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 38fddb630b..f6f329f5ee 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2060,30 +2060,22 @@ function Missings.disallowmissing(df::AbstractDataFrame, for i in axes(df, 2) x = df[!, i] if i in idxcols - local y + y = x if Missing <: eltype(x) - if error - try - y = disallowmissing(x) - catch e - row = findfirst(ismissing, x) - if row !== nothing + try + y = disallowmissing(x) + catch e + row = findfirst(ismissing, x) + if row !== nothing + if error col = _names(df)[i] throw(ArgumentError("Missing value found in column " * ":$col in row $row")) - else - rethrow(e) end - end - else - if any(ismissing, x) - y = x else - y = disallowmissing(x) + rethrow(e) end end - else - y = x end push!(newcols, y === x ? copy(y) : y) else diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 29d0c30891..dd0ada0370 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1191,22 +1191,18 @@ function disallowmissing! end function disallowmissing!(df::DataFrame, col::ColumnIndex; error::Bool=true) x = df[!, col] if Missing <: eltype(x) - if error - try - df[!, col] = disallowmissing(x) - catch e - row = findfirst(ismissing, x) - if row !== nothing + try + df[!, col] = disallowmissing(x) + catch e + row = findfirst(ismissing, x) + if row !== nothing + if error col_name = only(names(df, col)) throw(ArgumentError("Missing value found in column " * ":$col_name in row $row")) - else - rethrow(e) end - end - else - if !any(ismissing, x) - df[!, col] = disallowmissing(x) + else + rethrow(e) end end end