Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Define and implement new search APIs #3229

Merged
merged 23 commits into from
Nov 19, 2019
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9b517ab
search code with broken tester - waiting for column_wrapper
ChuckHastings Oct 17, 2019
26dd539
Merge remote-tracking branch 'jake/fea-ext-new-column-wrapper' into f…
ChuckHastings Oct 17, 2019
661dda4
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Oct 18, 2019
47b8624
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Oct 25, 2019
a5f9cbc
allow access to column 0 in the table device view
ChuckHastings Oct 28, 2019
37abb6e
move bit_is_set to be host callable for use in testing
ChuckHastings Oct 28, 2019
9bffd6b
implement all of the upper_bound and lower_bound tests for search
ChuckHastings Oct 28, 2019
fc83cf9
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Oct 28, 2019
ecf0610
documentation changes from PR
ChuckHastings Oct 28, 2019
34a19c1
Merge remote-tracking branch 'devavret/fea-cudf-scalar' into fea_move…
ChuckHastings Oct 28, 2019
6f30761
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Nov 1, 2019
94c1044
Merge remote-tracking branch 'devavret/fea-cudf-scalar' into fea_move…
ChuckHastings Nov 1, 2019
c5fe14c
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Nov 4, 2019
e14d573
use new fill mechanism to populate column from scalar
ChuckHastings Nov 5, 2019
077d113
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Nov 7, 2019
67d3289
Update search to use latest from branch-0.11
ChuckHastings Nov 7, 2019
ab5d958
update changelog
ChuckHastings Nov 7, 2019
9912a6a
add string support and string tests
ChuckHastings Nov 13, 2019
fddd2ff
Merge branch 'branch-0.11' into fea_move_search_part2
ChuckHastings Nov 13, 2019
69d426d
remove some dead and useless code
ChuckHastings Nov 14, 2019
4e8ac4f
clean up some code in search_test, revert column_utilities to elimina…
ChuckHastings Nov 14, 2019
105d33b
Apply suggestions from code review
ChuckHastings Nov 18, 2019
d06b092
make last changes compile
ChuckHastings Nov 18, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,7 @@ add_library(cudf
src/filling/legacy/repeat.cu
src/filling/legacy/tile.cu
src/search/legacy/search.cu
src/search/search.cu
src/column/column.cu
src/column/column_view.cpp
src/column/column_device_view.cu
Expand Down
132 changes: 132 additions & 0 deletions cpp/include/cudf/search.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/types.hpp>
#include <cudf/column/column.hpp>
#include <cudf/table/table.hpp>

#include <vector>

namespace cudf {
namespace experimental {

/**---------------------------------------------------------------------------*
* @brief Find smallest indices in a sorted table where values should be
* inserted to maintain order
*
* For each row v in @p values, find the first index in @p t where
* inserting the row will maintain the sort order of @p t
*
* Example:
*
* Single Column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 1 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
*
* @param t Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order. When non-empty, its size
* must equal the number of columns in @p t
* @param null_precedence The size of a NULL value in comparison to all other
* values
* @param mr Device memory resource to use for device memory allocation
* @return std::unique_ptr<column> A non-nullable column of INT32 elements
* containing the insertion points, one per row of @p values.
*---------------------------------------------------------------------------**/
std::unique_ptr<column> lower_bound(table_view const& t,
table_view const& values,
std::vector<order> const& column_order,
null_order null_precedence,
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**---------------------------------------------------------------------------*
* @brief Find largest indices in a sorted table where values should be
* inserted to maintain order
*
* For each row v in @p values, find the last index in @p t where
* inserting the row will maintain the sort order of @p t
*
* Example:
*
* Single Column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 3 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
*
* @param t Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order. When non-empty, its size
* must equal the number of columns in @p t
* @param null_precedence The size of a NULL value in comparison to all other
* values
* @param mr Device memory resource to use for device memory allocation
* @return std::unique_ptr<column> A non-nullable column of INT32 elements
* containing the insertion points, one per row of @p values.
*---------------------------------------------------------------------------**/
std::unique_ptr<column> upper_bound(table_view const& t,
table_view const& values,
std::vector<order> const& column_order,
null_order null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**---------------------------------------------------------------------------*
* @brief Find if `value` is present in `col`. The dtypes of `value` and
* `col` should match.
*
* @throws cudf::logic_error
* If the dtypes of `col` and `value` do not match
*
* NOTE(review): the dtype check in the implementation (search.cu) is
* currently commented out, so a mismatch may not actually be reported --
* confirm before relying on this.
*
* Example:
*
* Single Column:
* idx 0 1 2 3 4
* col = { 10, 20, 20, 30, 50 }
* Scalar:
* value = { 20 }
* result = true
*
* @param col A column object
* @param value A scalar value to search for in `col`
*
* @return bool true if `value` is found in `col`, false otherwise.
*---------------------------------------------------------------------------**/
bool contains(column_view const& col, gdf_scalar const& value);
} // namespace experimental
} // namespace cudf


6 changes: 3 additions & 3 deletions cpp/include/cudf/table/table_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ class table_device_view_base {

__device__ ColumnDeviceView const& column(size_type column_index) const
noexcept {
assert(column_index > 0);
assert(column_index >= 0);
assert(column_index < _num_columns);
return _columns[column_index];
}

__device__ ColumnDeviceView& column(size_type column_index) noexcept {
assert(column_index > 0);
assert(column_index >= 0);
assert(column_index < _num_columns);
return _columns[column_index];
}
Expand Down Expand Up @@ -104,4 +104,4 @@ class mutable_table_device_view
mutable_table_view>(source_view,
stream) {}
};
} // namespace cudf
} // namespace cudf
6 changes: 3 additions & 3 deletions cpp/include/cudf/utilities/bit.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -78,22 +78,22 @@ CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask,
~(bitmask_type{1} << intra_word_index(bit_index));
}

#ifdef __CUDACC__

/**---------------------------------------------------------------------------*
* @brief Indicates whether the specified bit is set to `1`
*
* @param bit_index Index of the bit to test
* @return true The specified bit is `1`
* @return false The specified bit is `0`
*---------------------------------------------------------------------------**/
__device__ inline bool bit_is_set(bitmask_type const* bitmask,
CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask,
size_type bit_index) {
assert(nullptr != bitmask);
return bitmask[word_index(bit_index)] &
(bitmask_type{1} << intra_word_index(bit_index));
}

#ifdef __CUDACC__

/**---------------------------------------------------------------------------*
* @brief Sets the specified bit to `1`
*
Expand Down
209 changes: 209 additions & 0 deletions cpp/src/search/search.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/search.hpp>
#include <cudf/legacy/copying.hpp>
#include <cudf/table/row_operators.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>

#include <rmm/thrust_rmm_allocator.h>

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/logical.h>

namespace cudf {
namespace experimental {

namespace {

/**---------------------------------------------------------------------------*
 * @brief Runs a vectorized thrust bound search on the given stream.
 *
 * Searches the range [it_data, it_data + data_size) for every element of
 * [it_vals, it_vals + values_size) and writes one result per searched value
 * through `it_output`.
 *
 * @param it_data     Iterator to the start of the data being searched
 * @param it_vals     Iterator to the start of the values to look up
 * @param data_size   Number of elements reachable from `it_data`
 * @param values_size Number of elements reachable from `it_vals`
 * @param it_output   Iterator receiving one result per value
 * @param comp        Comparator handed to thrust unchanged
 * @param find_first  true dispatches to `thrust::lower_bound`, false to
 *                    `thrust::upper_bound`
 * @param stream      CUDA stream used for the execution policy
 *---------------------------------------------------------------------------**/
template <typename DataIterator, typename ValuesIterator,
          typename OutputIterator, typename Comparator>
void launch_search(DataIterator it_data,
                   ValuesIterator it_vals,
                   size_type data_size,
                   size_type values_size,
                   OutputIterator it_output,
                   Comparator comp,
                   bool find_first,
                   cudaStream_t stream)
{
  auto const data_end   = it_data + data_size;
  auto const values_end = it_vals + values_size;

  if (not find_first) {
    thrust::upper_bound(rmm::exec_policy(stream)->on(stream),
                        it_data, data_end,
                        it_vals, values_end,
                        it_output, comp);
    return;
  }

  thrust::lower_bound(rmm::exec_policy(stream)->on(stream),
                      it_data, data_end,
                      it_vals, values_end,
                      it_output, comp);
}

} // namespace

namespace detail {

/**---------------------------------------------------------------------------*
 * @brief Common implementation behind `lower_bound` and `upper_bound`.
 *
 * Allocates an INT32 result column with one element per row of `values` and
 * fills it through `launch_search`, which compares row indices of `t` and
 * `values` using a `row_lexicographic_comparator`.
 *
 * If `t` has no rows, every result element is set to 0 and the function
 * returns before `column_order` is validated.
 *
 * @param t               Table to search
 * @param values          Rows whose insertion points are looked up
 * @param find_first      true selects the lower-bound search, false the
 *                        upper-bound search
 * @param column_order    Per-column sort order; when non-empty, its size must
 *                        equal the number of columns in `t` (enforced with
 *                        CUDF_EXPECTS)
 * @param null_precedence Forwarded unchanged to the row comparator
 * @param mr              Memory resource used to allocate the result column
 * @param stream          CUDA stream used for the allocation, the memset, the
 *                        device views and the search
 * @return Column of INT32 insertion indices, allocated without a null mask
 *---------------------------------------------------------------------------**/
std::unique_ptr<column> search_ordered(table_view const& t,
                                       table_view const& values,
                                       bool find_first,
                                       std::vector<order> const& column_order,
                                       null_order null_precedence,
                                       rmm::mr::device_memory_resource *mr,
                                       cudaStream_t stream = 0)
{
  // Allocate result column
  std::unique_ptr<column> result = make_numeric_column(data_type{INT32}, values.num_rows(),
                                                       mask_state::UNALLOCATED, stream, mr);

  mutable_column_view result_view = result->mutable_view();

  // Handle empty inputs: nothing to search, so every insertion point is 0.
  // Issued on `stream` so it is ordered with the rest of the caller's work
  // instead of silently going to the default stream.
  if (t.num_rows() == 0) {
    CUDA_TRY(cudaMemsetAsync(result_view.data<int32_t>(), 0,
                             values.num_rows() * sizeof(int32_t), stream));
    return result;
  }

  if (not column_order.empty()) {
    CUDF_EXPECTS(
      static_cast<std::size_t>(t.num_columns()) == column_order.size(),
      "Mismatch between number of columns and column order.");
  }

  auto d_t      = table_device_view::create(t, stream);
  auto d_values = table_device_view::create(values, stream);
  auto count_it = thrust::make_counting_iterator<int32_t>(0);

  // The comparator needs a device-accessible order*
  rmm::device_vector<order> d_column_order(column_order.begin(), column_order.end());

  // thrust::lower_bound invokes comp(data, value) whereas thrust::upper_bound
  // invokes comp(value, data), hence the swapped table arguments below.
  //
  // The null-aware comparator is needed as soon as either side of the
  // comparison may contain nulls, not only the table being searched.
  if (has_nulls(t) or has_nulls(values)) {
    auto ineq_op = (find_first)
      ? row_lexicographic_comparator<true>(*d_t, *d_values, null_precedence, d_column_order.data().get())
      : row_lexicographic_comparator<true>(*d_values, *d_t, null_precedence, d_column_order.data().get());

    launch_search(count_it, count_it, t.num_rows(), values.num_rows(),
                  result_view.data<int32_t>(), ineq_op, find_first, stream);
  } else {
    auto ineq_op = (find_first)
      ? row_lexicographic_comparator<false>(*d_t, *d_values, null_precedence, d_column_order.data().get())
      : row_lexicographic_comparator<false>(*d_values, *d_t, null_precedence, d_column_order.data().get());

    launch_search(count_it, count_it, t.num_rows(), values.num_rows(),
                  result_view.data<int32_t>(), ineq_op, find_first, stream);
  }

  return result;
}

/**---------------------------------------------------------------------------*
 * @brief Unary device predicate wrapping a `row_equality_comparator`.
 *
 * Calling the predicate with index `i` evaluates `compare(i, 0)`, i.e. index
 * `i` is checked against index 0 of the comparator's second table
 * (presumably row `i` of `t` against the single row of `val` -- confirm
 * against `row_equality_comparator`).
 *
 * @tparam nullable Forwarded to `row_equality_comparator`
 *---------------------------------------------------------------------------**/
template <bool nullable = true>
struct compare_with_value {
  // Row comparator built once on the host and copied to the device with the
  // predicate.
  row_equality_comparator<nullable> compare;

  compare_with_value(table_device_view t, table_device_view val, bool nulls_are_equal = true)
    : compare(t, val, nulls_are_equal) {}

  __device__ bool operator()(int32_t row_index) {
    bool const matches = compare(row_index, 0);
    return matches;
  }
};

/**---------------------------------------------------------------------------*
 * @brief Checks whether `value` occurs in `col`.
 *
 * Returns false for an empty column. For an invalid (null) `value` the result
 * is `col.has_nulls()`. Otherwise the scalar is copied into a one-element
 * device column and every row of `col` is compared against it with
 * `thrust::any_of`.
 *
 * NOTE(review): the dtype of `value` is not validated against `col.type()`,
 * and exactly sizeof(int32_t) bytes of `value.data` are copied regardless of
 * the column type. This looks correct only for 4-byte element types --
 * confirm and rework once the new scalar type is available.
 *
 * @param col    Column to search
 * @param value  Scalar to look for
 * @param stream CUDA stream used for the copy, the device views and the search
 * @return true if a matching row was found (or, for a null `value`, if `col`
 *         has nulls), false otherwise
 *---------------------------------------------------------------------------**/
bool contains(column_view const& col,
              gdf_scalar const& value,
              cudaStream_t stream = 0)
{
  // TODO: Rework for cudf::exp::scalar
  //       Any reference to value is suspect

  // TODO: Validate that the dtype of `value` matches `col.type()`.

  // No element to compare against
  if (col.size() == 0) {
    return false;
  }

  // If value is invalid and there are any nulls, return true
  if (value.is_valid == false) {
    return col.has_nulls();
  }

  std::unique_ptr<column> scalar_as_column = make_numeric_column(col.type(), 1);

  // See NOTE(review) above: the copy size is fixed at sizeof(int32_t).
  CUDA_TRY(cudaMemcpyAsync(scalar_as_column->mutable_view().data<int32_t>(),
                           &value.data,
                           sizeof(int32_t),
                           cudaMemcpyHostToDevice, stream));

  // Build the device views on the same stream as the copy and the search so
  // all device work for this call is ordered on `stream`.
  auto d_t = cudf::table_device_view::create(table_view{{col}}, stream);
  // Kept as a named local: the table_view is brace-constructed from an lvalue
  // view, exactly as in the original code.
  auto scalar_view = scalar_as_column->view();
  auto d_value = cudf::table_device_view::create(table_view{{scalar_view}}, stream);

  // Row indices handed to compare_with_value::operator()(int32_t).
  auto data_it = thrust::make_counting_iterator<int32_t>(0);

  if (col.has_nulls()) {
    auto eq_op = compare_with_value<true>(*d_t, *d_value, true);

    return thrust::any_of(rmm::exec_policy(stream)->on(stream),
                          data_it, data_it + col.size(),
                          eq_op);
  }
  else {
    auto eq_op = compare_with_value<false>(*d_t, *d_value, true);

    return thrust::any_of(rmm::exec_policy(stream)->on(stream),
                          data_it, data_it + col.size(),
                          eq_op);
  }
}
} // namespace detail

/**---------------------------------------------------------------------------*
 * @brief Public entry point for the lower-bound search.
 *
 * Forwards all arguments to `detail::search_ordered` with `find_first` set
 * to true.
 *---------------------------------------------------------------------------**/
std::unique_ptr<column> lower_bound(table_view const& t,
                                    table_view const& values,
                                    std::vector<order> const& column_order,
                                    null_order null_precedence,
                                    rmm::mr::device_memory_resource *mr)
{
  constexpr bool find_first = true;
  return detail::search_ordered(t, values, find_first, column_order, null_precedence, mr);
}

/**---------------------------------------------------------------------------*
 * @brief Public entry point for the upper-bound search.
 *
 * Forwards all arguments to `detail::search_ordered` with `find_first` set
 * to false.
 *---------------------------------------------------------------------------**/
std::unique_ptr<column> upper_bound(table_view const& t,
                                    table_view const& values,
                                    std::vector<order> const& column_order,
                                    null_order null_precedence,
                                    rmm::mr::device_memory_resource *mr)
{
  constexpr bool find_first = false;
  return detail::search_ordered(t, values, find_first, column_order, null_precedence, mr);
}

bool contains(column const& col, gdf_scalar const& value)
{
return detail::contains(col, value);
}

} // namespace experimental
} // namespace cudf
Loading