From 724d1321951dcc8545be9431c8199633693d78dd Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat
Date: Wed, 12 Jun 2019 13:07:53 +0200
Subject: [PATCH] Speed up getindex with AbstractVector{Bool} row selection (#1848)

Computing the integer indices before indexing into the vectors is faster.
The array indexing code uses a LogicalIndex wrapper which computes the number
of true indices and doesn't allocate a vector of integer indices, but it's only
slightly faster when there's a single column, and slower for more than one column.
---
 src/dataframe/dataframe.jl | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index d130a84ea7..345aac547a 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -343,14 +343,16 @@ end
 end
 
 # df[MultiRowIndex, MultiColumnIndex] => DataFrame
-@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector,
-                               col_inds::Union{AbstractVector, Regex})
+@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T},
+                               col_inds::Union{AbstractVector, Regex}) where T
     @boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
         throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
                           "rows at index $row_inds"))
     end
     selected_columns = index(df)[col_inds]
-    new_columns = AbstractVector[dv[row_inds] for dv in _columns(df)[selected_columns]]
+    # Computing integer indices once for all columns is faster
+    selected_rows = T === Bool ? findall(row_inds) : row_inds
+    new_columns = AbstractVector[dv[selected_rows] for dv in _columns(df)[selected_columns]]
     return DataFrame(new_columns, Index(_names(df)[selected_columns]), copycols=false)
 end
 
@@ -368,12 +370,14 @@ function Base.getindex(df::DataFrame, row_ind::Colon, col_inds::Union{AbstractVe
 end
 
 # df[MultiRowIndex, :] => DataFrame
-@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector, ::Colon)
+@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T}, ::Colon) where T
     @boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
         throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
                           "rows at index $row_inds"))
     end
-    new_columns = AbstractVector[dv[row_inds] for dv in _columns(df)]
+    # Computing integer indices once for all columns is faster
+    selected_rows = T === Bool ? findall(row_inds) : row_inds
+    new_columns = AbstractVector[dv[selected_rows] for dv in _columns(df)]
     return DataFrame(new_columns, copy(index(df)), copycols=false)
 end
 