Skip to content

Commit

Permalink
Speed up getindex with AbstractVector{Bool} row selection (#1848)
Browse files Browse the repository at this point in the history
Computing the integer indices once, before indexing into the column vectors, is faster.
The array indexing code uses a LogicalIndex wrapper, which computes the number
of true indices and doesn't allocate a vector of integer indices, but that
approach is only slightly faster when there's a single column, and slower for
more than one column.
  • Loading branch information
nalimilan authored Jun 12, 2019
1 parent c0cfaed commit 724d132
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -343,14 +343,16 @@ end
end

# df[MultiRowIndex, MultiColumnIndex] => DataFrame
# NOTE(review): this is a diff hunk shown without +/- markers, so the method header
# and the `new_columns` assignment each appear twice. Presumably the first of each
# pair is the removed line and the second is the added one — confirm against the
# commit before treating this text as compilable code.
# Header without a type parameter on `row_inds` (presumably the removed version).
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector,
col_inds::Union{AbstractVector, Regex})
# Header with `T` bound to the element type of `row_inds` (presumably the added
# version); the body compares `T` against `Bool`.
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T},
col_inds::Union{AbstractVector, Regex}) where T
# Throw a BoundsError when `row_inds` is not a valid index into the first axis of `df`.
@boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
"rows at index $row_inds"))
end
# Look up `col_inds` in the data frame's index; the result is used below to pick
# both the column vectors and their names.
selected_columns = index(df)[col_inds]
# Each selected column indexed directly with `row_inds` (presumably the removed line).
new_columns = AbstractVector[dv[row_inds] for dv in _columns(df)[selected_columns]]
# Computing integer indices once for all columns is faster
# When the element type is Bool the mask is converted with `findall` a single time;
# any other index vector is used unchanged.
selected_rows = T === Bool ? findall(row_inds) : row_inds
new_columns = AbstractVector[dv[selected_rows] for dv in _columns(df)[selected_columns]]
# Wrap the indexed columns and the matching names in a new DataFrame.
# `copycols=false` is passed to the constructor — presumably because `dv[...]`
# already produced fresh vectors; verify against the DataFrame constructor.
return DataFrame(new_columns, Index(_names(df)[selected_columns]), copycols=false)
end

Expand All @@ -368,12 +370,14 @@ function Base.getindex(df::DataFrame, row_ind::Colon, col_inds::Union{AbstractVe
end

# df[MultiRowIndex, :] => DataFrame
# NOTE(review): this is a diff hunk shown without +/- markers, so the method header
# and the `new_columns` assignment each appear twice. Presumably the first of each
# pair is the removed line and the second is the added one — confirm against the
# commit before treating this text as compilable code.
# Header without a type parameter on `row_inds` (presumably the removed version).
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector, ::Colon)
# Header with `T` bound to the element type of `row_inds` (presumably the added
# version); the body compares `T` against `Bool`.
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T}, ::Colon) where T
# Throw a BoundsError when `row_inds` is not a valid index into the first axis of `df`.
@boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
"rows at index $row_inds"))
end
# Every column indexed directly with `row_inds` (presumably the removed line).
new_columns = AbstractVector[dv[row_inds] for dv in _columns(df)]
# Computing integer indices once for all columns is faster
# When the element type is Bool the mask is converted with `findall` a single time;
# any other index vector is used unchanged.
selected_rows = T === Bool ? findall(row_inds) : row_inds
new_columns = AbstractVector[dv[selected_rows] for dv in _columns(df)]
# All columns are kept, so the result gets a copy of the existing index.
# `copycols=false` is passed to the constructor — presumably because `dv[...]`
# already produced fresh vectors; verify against the DataFrame constructor.
return DataFrame(new_columns, copy(index(df)), copycols=false)
end

Expand Down

0 comments on commit 724d132

Please sign in to comment.