Skip to content

Commit

Permalink
Merge branch 'branch-22.06' into regex-rework-optimizers
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed May 11, 2022
2 parents e060a6f + 2aaa863 commit 25aab2d
Show file tree
Hide file tree
Showing 29 changed files with 931 additions and 506 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/jni-docker-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: JNI Docker Build

on:
workflow_dispatch: # manual trigger only

concurrency:
group: jni-docker-build-${{ github.ref }}
cancel-in-progress: true

jobs:
docker-build:
if: github.repository == 'rapidsai/cudf'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2

- name: Set up QEMU
uses: docker/setup-qemu-action@v2

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}

- name: Set ENVs
run: |
echo "IMAGE_NAME=rapidsai/cudf-jni-build" >> $GITHUB_ENV
echo "IMAGE_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV
- name: Build and Push
uses: docker/build-push-action@v3
with:
push: true
file: java/ci/Dockerfile.centos7
tags: "${{ env.IMAGE_NAME }}:${{ env.IMAGE_REF }}"
5 changes: 5 additions & 0 deletions conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ dependencies:
- pydata-sphinx-theme
- librdkafka=1.7.0
- python-confluent-kafka=1.7.0
- moto>=3.1.6
- boto3>=1.21.21
- botocore>=1.24.21
- aiobotocore>=2.2.0
- s3fs>=2022.3.0
- pip:
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
Expand Down
26 changes: 11 additions & 15 deletions cpp/include/cudf/detail/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,11 +33,11 @@ namespace detail {
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> lower_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -46,33 +46,29 @@ std::unique_ptr<column> lower_bound(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> upper_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::contains(column_view const&, scalar const&,
* rmm::mr::device_memory_resource*)
* @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
bool contains(column_view const& col,
scalar const& value,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream);

/**
* @copydoc cudf::contains(column_view const&, column_view const&,
* rmm::mr::device_memory_resource*)
* @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> contains(
column_view const& haystack,
column_view const& needles,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
Expand Down
141 changes: 65 additions & 76 deletions cpp/include/cudf/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,134 +32,123 @@ namespace cudf {
*/

/**
* @brief Find smallest indices in a sorted table where values should be
* inserted to maintain order
* @brief Find smallest indices in a sorted table where values should be inserted to maintain order.
*
* For each row v in @p values, find the first index in @p t where
* inserting the row will maintain the sort order of @p t
* For each row in `needles`, find the first index in `haystack` where inserting the row still
* maintains its sort order.
*
* @code{.pseudo}
* Example:
*
* Single column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 1 }
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needles = { 20 }
* result = { 1 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
* idx 0 1 2 3 4
* haystack = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* needles = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
* @endcode
*
* @param t Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order
* @param null_precedence Vector of null_precedence enums values
* @param mr Device memory resource used to allocate the returned column's device
* memory
* @param haystack The table containing search space.
* @param needles Values for which to find the insert locations in the search space.
* @param column_order Vector of column sort order.
* @param null_precedence Vector of null_precedence enums needles.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A non-nullable column of cudf::size_type elements containing the insertion points.
*/
std::unique_ptr<column> lower_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Find largest indices in a sorted table where values should be
* inserted to maintain order
* @brief Find largest indices in a sorted table where values should be inserted to maintain order.
*
* For each row v in @p values, find the last index in @p t where
* inserting the row will maintain the sort order of @p t
* For each row in `needles`, find the last index in `haystack` where inserting the row still
* maintains its sort order.
*
* @code{.pseudo}
* Example:
*
* Single Column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 3 }
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needles = { 20 }
* result = { 3 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
* idx 0 1 2 3 4
* haystack = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* needles = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
* @endcode
*
* @param search_table Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order
* @param null_precedence Vector of null_precedence enums values
* @param mr Device memory resource used to allocate the returned column's device
* memory
* @param haystack The table containing search space.
* @param needles Values for which to find the insert locations in the search space.
* @param column_order Vector of column sort order.
* @param null_precedence Vector of null_precedence enums needles.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A non-nullable column of cudf::size_type elements containing the insertion points.
*/
std::unique_ptr<column> upper_bound(
table_view const& search_table,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Find if the `value` is present in the `col`
* @brief Check if the given `needle` value exists in the `haystack` column.
*
* @throws cudf::logic_error
* If `col.type() != values.type()`
* @throws cudf::logic_error If `haystack.type() != needle.type()`.
*
* @code{.pseudo}
* Single Column:
* idx 0 1 2 3 4
* col = { 10, 20, 20, 30, 50 }
* Scalar:
* value = { 20 }
* result = true
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needle = { 20 }
* result = true
* @endcode
*
* @param col A column object
* @param value A scalar value to search for in `col`
*
* @return bool If `value` is found in `column` true, else false.
* @param haystack The column containing search space.
* @param needle A scalar value to check for existence in the search space.
* @return true if the given `needle` value exists in the `haystack` column.
*/
bool contains(column_view const& col, scalar const& value);
bool contains(column_view const& haystack, scalar const& needle);

/**
* @brief Returns a new column of type bool identifying for each element of @p haystack column,
* if that element is contained in @p needles column.
* @brief Check if the given `needles` values exists in the `haystack` column.
*
* The new column will have the same dimension and null status as the @p haystack column. That is,
* any element that is invalid in the @p haystack column will be invalid in the returned column.
* The new column will have type BOOL and have the same size and null mask as the input `needles`
* column. That is, any null row in the `needles` column will result in a nul row in the output
* column.
*
* @throws cudf::logic_error
* If `haystack.type() != needles.type()`
* @throws cudf::logic_error If `haystack.type() != needles.type()`
*
* @code{.pseudo}
* haystack = { 10, 20, 30, 40, 50 }
* needles = { 20, 40, 60, 80 }
*
* result = { false, true, false, true, false }
* result = { true, true, false, false }
* @endcode
*
* @param haystack A column object
* @param needles A column of values to search for in `col`
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @return A column of bool elements containing true if the corresponding entry in haystack
* appears in needles and false if it does not.
* @param haystack The column containing search space.
* @param needles A column of values to check for existence in the search space.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A BOOL column indicating if each element in `needles` exists in the search space.
*/
std::unique_ptr<column> contains(
column_view const& haystack,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/remove_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ std::unique_ptr<column> remove_keys(
CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match");

// locate keys to remove by searching the keys column
auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr);
auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr);
auto d_matches = matches->view().data<bool>();
// call common utility method to keep the keys not matched to keys_to_remove
auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; };
Expand All @@ -181,7 +181,7 @@ std::unique_ptr<column> remove_unused_keys(
thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end());
// wrap the indices for comparison in contains()
column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data());
return cudf::detail::contains(keys_positions_view, indices_view, stream, mr);
return cudf::detail::contains(indices_view, keys_positions_view, stream, mr);
}();
auto d_matches = matches->view().data<bool>();

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/set_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ std::unique_ptr<column> set_keys(
std::unique_ptr<column> keys_column(std::move(sorted_keys.front()));

// compute the new nulls
auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr);
auto matches = cudf::detail::contains(keys_column->view(), keys, stream, mr);
auto d_matches = matches->view().data<bool>();
auto indices_itr =
cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices());
Expand Down
Loading

0 comments on commit 25aab2d

Please sign in to comment.