From 34b8fb419e45143bcf9a93ab1b1031297322776f Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 1 Aug 2023 12:26:54 -0700 Subject: [PATCH 01/29] first draft example --- cpp/tests/CMakeLists.txt | 2 + cpp/tests/blog/example.cpp | 141 +++++++++++++++++++++++++ cpp/tests/blog/example.json | 5 + cpp/tests/blog/max_greater_filter.json | 2 + cpp/tests/blog/sort.json | 2 + cpp/tests/blog/unique_filter.json | 3 + 6 files changed, 155 insertions(+) create mode 100644 cpp/tests/blog/example.cpp create mode 100644 cpp/tests/blog/example.json create mode 100644 cpp/tests/blog/max_greater_filter.json create mode 100644 cpp/tests/blog/sort.json create mode 100644 cpp/tests/blog/unique_filter.json diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9e34c739ea..cb781607732 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -74,6 +74,8 @@ function(ConfigureTest CMAKE_TEST_NAME) ) endfunction() +ConfigureTest(BLOG blog/example.cpp) + # ################################################################################################## # test sources ################################################################################## # ################################################################################################## diff --git a/cpp/tests/blog/example.cpp b/cpp/tests/blog/example.cpp new file mode 100644 index 00000000000..d0b855a5cb1 --- /dev/null +++ b/cpp/tests/blog/example.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct BlogExample : public cudf::test::BaseFixture {}; + +void write_json(cudf::table_view tbl, std::string path) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(path); + auto builder2 = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); + auto options2 = builder2.build(); + cudf::io::write_json(options2); +} + +std::unique_ptr nunique_func(cudf::table_view tbl) // group nunique + filter > 1 +{ + // do the nunique aggregation + auto keys = cudf::table_view{{tbl.column(0)}}; + auto val = tbl.column(1); + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_nunique_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + std::vector columns{result_key->get_column(0), *result_val}; + auto agg_v = cudf::table_view(columns); + + // filter out the keys with nunique > 1 + auto const op = cudf::ast::ast_operator::EQUAL; + auto literal_value = cudf::numeric_scalar(1); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::operation(op, col_ref_1, literal); + auto boolean_mask = cudf::compute_column(agg_v, expression); + auto filtered = cudf::apply_boolean_mask(agg_v, boolean_mask->view()); + + // semi join to gather only those keys from the original table + auto join_indices = cudf::left_semi_join(cudf::table_view{{tbl.column(0)}}, + cudf::table_view{{filtered->view().column(0)}}); + auto left_indices_span = cudf::device_span{*join_indices}; + auto left_indices_col = cudf::column_view{left_indices_span}; + auto filtered2 = cudf::gather(tbl, left_indices_col); + + write_json(*filtered2, "/home/nfs/dgala/cudf/cpp/tests/blog/unique_filter.json"); + + return filtered2; +} + +std::unique_ptr max_func(cudf::table_view tbl) // groupby max + filter >= 0.8 +{ + // do the groupbymax aggregation + auto keys = cudf::table_view{{tbl.column(0)}}; + auto val = tbl.column(2); + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_max_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + std::vector columns{result_key->get_column(0), *result_val}; + auto agg_v = cudf::table_view(columns); + + // filter out the keys with nunique > 1 + auto const op = cudf::ast::ast_operator::GREATER_EQUAL; + auto literal_value = cudf::numeric_scalar(0.8); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::operation(op, col_ref_1, literal); + auto boolean_mask = cudf::compute_column(agg_v, expression); + auto filtered = cudf::apply_boolean_mask(agg_v, boolean_mask->view()); + + // semi join to gather only those keys from the original table + auto join_indices = cudf::left_semi_join(cudf::table_view{{tbl.column(0)}}, + cudf::table_view{{filtered->view().column(0)}}); + auto left_indices_span = cudf::device_span{*join_indices}; + auto left_indices_col = cudf::column_view{left_indices_span}; + auto filtered2 = cudf::gather(tbl, left_indices_col); + + // write the data for inspection + write_json(*filtered2, "/home/nfs/dgala/cudf/cpp/tests/blog/max_greater_filter.json"); + + return filtered2; +} + +void sort_func(cudf::table_view tbl) +{ + auto sorted_tbl = cudf::sort(tbl); + + write_json(*sorted_tbl, "/home/nfs/dgala/cudf/cpp/tests/blog/sort.json"); +} + +TEST_F(BlogExample, Test) +{ + // load the json from the example + auto source_info = cudf::io::source_info("/home/nfs/dgala/cudf/cpp/tests/blog/example.json"); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + auto json = cudf::io::read_json(options); + auto tbl = json.tbl->view(); + + auto nunique_result = nunique_func(tbl); + + auto max_result = max_func(nunique_result->view()); + + sort_func(max_result->view()); +} diff --git a/cpp/tests/blog/example.json b/cpp/tests/blog/example.json new file mode 100644 index 00000000000..4f1dec6497e --- /dev/null +++ b/cpp/tests/blog/example.json @@ -0,0 +1,5 @@ +{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key":"a2", "value": [{"info":"message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key":"a3", "value": [{"info":"message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key":"a4", "value": [{"info":"message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} diff --git a/cpp/tests/blog/max_greater_filter.json b/cpp/tests/blog/max_greater_filter.json new file mode 100644 index 00000000000..a63e172203b --- /dev/null +++ b/cpp/tests/blog/max_greater_filter.json @@ -0,0 +1,2 @@ +{"0":{"0":"a3","1":[{"0":"message_3","1":"device_a","2":1688750003}]},"1":"network_b","2":0.8} +{"0":{"0":"a4","1":[{"0":"message_4","1":"device_a","2":1688750004}]},"1":"network_b","2":0.9} diff --git a/cpp/tests/blog/sort.json b/cpp/tests/blog/sort.json new file mode 100644 index 00000000000..a63e172203b --- /dev/null +++ b/cpp/tests/blog/sort.json @@ -0,0 +1,2 @@ +{"0":{"0":"a3","1":[{"0":"message_3","1":"device_a","2":1688750003}]},"1":"network_b","2":0.8} +{"0":{"0":"a4","1":[{"0":"message_4","1":"device_a","2":1688750004}]},"1":"network_b","2":0.9} diff --git a/cpp/tests/blog/unique_filter.json b/cpp/tests/blog/unique_filter.json new file mode 100644 index 00000000000..6096d8707b2 --- /dev/null +++ b/cpp/tests/blog/unique_filter.json @@ -0,0 +1,3 @@ +{"0":{"0":"a2","1":[{"0":"message_2","1":"device_a","2":1688750002}]},"1":"network_a","2":0.7} +{"0":{"0":"a3","1":[{"0":"message_3","1":"device_a","2":1688750003}]},"1":"network_b","2":0.8} +{"0":{"0":"a4","1":[{"0":"message_4","1":"device_a","2":1688750004}]},"1":"network_b","2":0.9} From 9059909f762d2379484237b59997ae2f81547dc7 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Aug 2023 16:15:57 -0700 Subject: [PATCH 02/29] add dedup example --- cpp/examples/build.sh | 10 +- cpp/examples/nested_types/CMakeLists.txt | 32 +++++ cpp/examples/nested_types/deduplication.cpp | 138 ++++++++++++++++++++ cpp/examples/nested_types/example.json | 5 + 4 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 cpp/examples/nested_types/CMakeLists.txt create mode 100644 cpp/examples/nested_types/deduplication.cpp create mode 100644 cpp/examples/nested_types/example.json diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 7d389cd318d..244eaa0f8f7 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # libcudf examples build script @@ -29,3 +29,11 @@ STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" # Build cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} + +# Nested Types example +NESTED_TYPES_EXAMPLE_DIR=${EXAMPLES_DIR}/nested_types +NESTED_TYPES_EXAMPLE_BUILD_DIR=${NESTED_TYPES_EXAMPLE_DIR}/build +# Configure +cmake -S ${NESTED_TYPES_EXAMPLE_DIR} -B ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" +# Build +cmake --build ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt new file mode 100644 index 00000000000..8e21617d0be --- /dev/null +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +project( + nested_types + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.10) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) + +# Configure your project here +add_executable(deduplication deduplication.cpp) +target_link_libraries(deduplication PRIVATE cudf::cudf) +target_compile_features(deduplication PRIVATE cxx_std_17) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp new file mode 100644 index 00000000000..e68dfccd899 --- /dev/null +++ b/cpp/examples/nested_types/deduplication.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +std::unique_ptr read_json(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + auto json = cudf::io::read_json(options); + return std::move(json.tbl); +} + +void write_json(cudf::table_view tbl, std::string filepath) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder2 = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); + auto options2 = builder2.build(); + cudf::io::write_json(options2); +} + +std::unique_ptr deduplication_hash(cudf::column_view col) +{ + auto tbl = cudf::table_view{{col}}; + + // Get count for each key + auto keys = cudf::table_view{{tbl.column(0)}}; + auto val = tbl.column(0); + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + std::vector columns{result_key->get_column(0), *result_val}; + auto agg_v = cudf::table_view(columns); + + // Join on keys to get + return std::make_unique(agg_v); +} + +std::unique_ptr deduplication_sort(cudf::column_view col) +{ + auto tbl = cudf::table_view{{col}}; + + // Get count for each key + auto keys = cudf::table_view{{tbl.column(0)}}; + auto val = tbl.column(0); + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_nunique_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + std::vector columns{result_key->get_column(0), *result_val}; + auto agg_v = cudf::table_view(columns); + + // Join on keys to get + return std::make_unique(agg_v); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. JSON input file name/path (default: "example.json") + * 2. `hash` for hash based deduplication or `sort` for sort based deduplication (default: "hash") + * 3. JSON output file name/path (default: "hash_output.json") + * + * The stdout includes the number of rows in the input and the output size in bytes. + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string algorithm; + std::string output_filepath; + if (argc < 2) { + input_filepath = "example.json"; + algorithm = "hash"; + output_filepath = "hash_output.json"; + } else if (argc == 4) { + input_filepath = argv[1]; + algorithm = argv[2]; + output_filepath = argv[3]; + } else { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + + // read input file + auto tbl = read_json(input_filepath); + + auto st = std::chrono::steady_clock::now(); + + // alg here + std::unique_ptr result; + if (algorithm == "hash") { + result = deduplication_hash(tbl->view().column(0)); + } else { + result = deduplication_sort(tbl->view().column(0)); + } + + std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << elapsed.count() << " seconds\n"; + + write_json(result->view(), output_filepath); + + return 0; +} diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json new file mode 100644 index 00000000000..4f1dec6497e --- /dev/null +++ b/cpp/examples/nested_types/example.json @@ -0,0 +1,5 @@ +{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key":"a2", "value": [{"info":"message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key":"a3", "value": [{"info":"message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key":"a4", "value": [{"info":"message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} From 5f3b0dad3a7a4b1da2bcb40f7b73d71148ebbdcd Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 18 Aug 2023 05:55:05 -0700 Subject: [PATCH 03/29] write new example --- cpp/examples/README.md | 1 + cpp/examples/nested_types/deduplication.cpp | 69 ++++++++++----------- cpp/tests/blog/example.json | 5 -- 3 files changed, 34 insertions(+), 41 deletions(-) delete mode 100644 cpp/tests/blog/example.json diff --git a/cpp/examples/README.md b/cpp/examples/README.md index b2e8dd399d0..7f2b769f4a5 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -7,3 +7,4 @@ Current examples: - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings +- Nested Types: demonstrates using libcudf for some operations on nested types diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index e68dfccd899..9c4fe290c0c 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -14,11 +14,15 @@ * limitations under the License. */ +#include +#include +#include #include #include #include #include #include +#include #include #include @@ -42,10 +46,8 @@ void write_json(cudf::table_view tbl, std::string filepath) cudf::io::write_json(options2); } -std::unique_ptr deduplication_hash(cudf::column_view col) +std::unique_ptr count_aggregate(cudf::table_view tbl) { - auto tbl = cudf::table_view{{col}}; - // Get count for each key auto keys = cudf::table_view{{tbl.column(0)}}; auto val = tbl.column(0); @@ -65,27 +67,30 @@ std::unique_ptr deduplication_hash(cudf::column_view col) return std::make_unique(agg_v); } -std::unique_ptr deduplication_sort(cudf::column_view col) +std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) { - auto tbl = cudf::table_view{{col}}; + auto [left_indices, right_indices] = + cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); - // Get count for each key - auto keys = cudf::table_view{{tbl.column(0)}}; - auto val = tbl.column(0); - cudf::groupby::groupby grpby_obj(keys); - std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - auto agg = cudf::make_nunique_aggregation(); - requests[0].aggregations.push_back(std::move(agg)); - requests[0].values = val; - auto agg_results = grpby_obj.aggregate(requests); - auto result_key = std::move(agg_results.first); - auto result_val = std::move(agg_results.second[0].results[0]); - std::vector columns{result_key->get_column(0), *result_val}; - auto agg_v = cudf::table_view(columns); + auto left_cols = new_left->release(); + auto right_cols = new_right->release(); + left_cols.push_back(std::move(right_cols[1])); - // Join on keys to get - return std::make_unique(agg_v); + return std::make_unique(std::move(left_cols)); +} + +std::unique_ptr filter_duplicates(cudf::table_view tbl) +{ + auto const op = cudf::ast::ast_operator::EQUAL; + auto literal_value = cudf::numeric_scalar(1); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_1 = cudf::ast::column_reference(3); + auto expression = cudf::ast::operation(op, col_ref_1, literal); + auto boolean_mask = cudf::compute_column(tbl, expression); + auto filtered = cudf::apply_boolean_mask(tbl, boolean_mask->view()); + return filtered; } /** @@ -93,24 +98,20 @@ std::unique_ptr deduplication_sort(cudf::column_view col) * * Command line parameters: * 1. JSON input file name/path (default: "example.json") - * 2. `hash` for hash based deduplication or `sort` for sort based deduplication (default: "hash") - * 3. JSON output file name/path (default: "hash_output.json") + * 3. JSON output file name/path (default: "output.json") * * The stdout includes the number of rows in the input and the output size in bytes. */ int main(int argc, char const** argv) { std::string input_filepath; - std::string algorithm; std::string output_filepath; if (argc < 2) { input_filepath = "example.json"; - algorithm = "hash"; - output_filepath = "hash_output.json"; + output_filepath = "output.json"; } else if (argc == 4) { input_filepath = argv[1]; - algorithm = argv[2]; - output_filepath = argv[3]; + output_filepath = argv[2]; } else { std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; return 1; @@ -121,18 +122,14 @@ int main(int argc, char const** argv) auto st = std::chrono::steady_clock::now(); - // alg here - std::unique_ptr result; - if (algorithm == "hash") { - result = deduplication_hash(tbl->view().column(0)); - } else { - result = deduplication_sort(tbl->view().column(0)); - } + auto count = count_aggregate(tbl->view()); + auto combined = join_count(tbl->view(), count->view()); + auto filtered = filter_duplicates(combined->view()); std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; - write_json(result->view(), output_filepath); + write_json(filtered->view(), output_filepath); return 0; } diff --git a/cpp/tests/blog/example.json b/cpp/tests/blog/example.json deleted file mode 100644 index 4f1dec6497e..00000000000 --- a/cpp/tests/blog/example.json +++ /dev/null @@ -1,5 +0,0 @@ -{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} -{"features": {"key":"a2", "value": [{"info":"message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} -{"features": {"key":"a3", "value": [{"info":"message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} -{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} -{"features": {"key":"a4", "value": [{"info":"message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} From a15dfef9e7bf1f8b6a1c764898693e2565ad1280 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 18 Aug 2023 05:56:01 -0700 Subject: [PATCH 04/29] delete old example --- cpp/tests/CMakeLists.txt | 2 - cpp/tests/blog/example.cpp | 141 ------------------------- cpp/tests/blog/max_greater_filter.json | 2 - cpp/tests/blog/sort.json | 2 - cpp/tests/blog/unique_filter.json | 3 - 5 files changed, 150 deletions(-) delete mode 100644 cpp/tests/blog/example.cpp delete mode 100644 cpp/tests/blog/max_greater_filter.json delete mode 100644 cpp/tests/blog/sort.json delete mode 100644 cpp/tests/blog/unique_filter.json diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cb781607732..d9e34c739ea 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -74,8 +74,6 @@ function(ConfigureTest CMAKE_TEST_NAME) ) endfunction() -ConfigureTest(BLOG blog/example.cpp) - # ################################################################################################## # test sources ################################################################################## # ################################################################################################## diff --git a/cpp/tests/blog/example.cpp b/cpp/tests/blog/example.cpp deleted file mode 100644 index d0b855a5cb1..00000000000 --- a/cpp/tests/blog/example.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct BlogExample : public cudf::test::BaseFixture {}; - -void write_json(cudf::table_view tbl, std::string path) -{ - // write the data for inspection - auto sink_info = cudf::io::sink_info(path); - auto builder2 = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); - auto options2 = builder2.build(); - cudf::io::write_json(options2); -} - -std::unique_ptr nunique_func(cudf::table_view tbl) // group nunique + filter > 1 -{ - // do the nunique aggregation - auto keys = cudf::table_view{{tbl.column(0)}}; - auto val = tbl.column(1); - cudf::groupby::groupby grpby_obj(keys); - std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - auto agg = cudf::make_nunique_aggregation(); - requests[0].aggregations.push_back(std::move(agg)); - requests[0].values = val; - auto agg_results = grpby_obj.aggregate(requests); - auto result_key = std::move(agg_results.first); - auto result_val = std::move(agg_results.second[0].results[0]); - std::vector columns{result_key->get_column(0), *result_val}; - auto agg_v = cudf::table_view(columns); - - // filter out the keys with nunique > 1 - auto const op = cudf::ast::ast_operator::EQUAL; - auto literal_value = cudf::numeric_scalar(1); - auto literal = cudf::ast::literal(literal_value); - auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::operation(op, col_ref_1, literal); - auto boolean_mask = cudf::compute_column(agg_v, expression); - auto filtered = cudf::apply_boolean_mask(agg_v, boolean_mask->view()); - - // semi join to gather only those keys from the original table - auto join_indices = cudf::left_semi_join(cudf::table_view{{tbl.column(0)}}, - cudf::table_view{{filtered->view().column(0)}}); - auto left_indices_span = cudf::device_span{*join_indices}; - auto left_indices_col = cudf::column_view{left_indices_span}; - auto filtered2 = cudf::gather(tbl, left_indices_col); - - write_json(*filtered2, "/home/nfs/dgala/cudf/cpp/tests/blog/unique_filter.json"); - - return filtered2; -} - -std::unique_ptr max_func(cudf::table_view tbl) // groupby max + filter >= 0.8 -{ - // do the groupbymax aggregation - auto keys = cudf::table_view{{tbl.column(0)}}; - auto val = tbl.column(2); - cudf::groupby::groupby grpby_obj(keys); - std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - auto agg = cudf::make_max_aggregation(); - requests[0].aggregations.push_back(std::move(agg)); - requests[0].values = val; - auto agg_results = grpby_obj.aggregate(requests); - auto result_key = std::move(agg_results.first); - auto result_val = std::move(agg_results.second[0].results[0]); - std::vector columns{result_key->get_column(0), *result_val}; - auto agg_v = cudf::table_view(columns); - - // filter out the keys with nunique > 1 - auto const op = cudf::ast::ast_operator::GREATER_EQUAL; - auto literal_value = cudf::numeric_scalar(0.8); - auto literal = cudf::ast::literal(literal_value); - auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::operation(op, col_ref_1, literal); - auto boolean_mask = cudf::compute_column(agg_v, expression); - auto filtered = cudf::apply_boolean_mask(agg_v, boolean_mask->view()); - - // semi join to gather only those keys from the original table - auto join_indices = cudf::left_semi_join(cudf::table_view{{tbl.column(0)}}, - cudf::table_view{{filtered->view().column(0)}}); - auto left_indices_span = cudf::device_span{*join_indices}; - auto left_indices_col = cudf::column_view{left_indices_span}; - auto filtered2 = cudf::gather(tbl, left_indices_col); - - // write the data for inspection - write_json(*filtered2, "/home/nfs/dgala/cudf/cpp/tests/blog/max_greater_filter.json"); - - return filtered2; -} - -void sort_func(cudf::table_view tbl) -{ - auto sorted_tbl = cudf::sort(tbl); - - write_json(*sorted_tbl, "/home/nfs/dgala/cudf/cpp/tests/blog/sort.json"); -} - -TEST_F(BlogExample, Test) -{ - // load the json from the example - auto source_info = cudf::io::source_info("/home/nfs/dgala/cudf/cpp/tests/blog/example.json"); - auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); - auto options = builder.build(); - auto json = cudf::io::read_json(options); - auto tbl = json.tbl->view(); - - auto nunique_result = nunique_func(tbl); - - auto max_result = max_func(nunique_result->view()); - - sort_func(max_result->view()); -} diff --git a/cpp/tests/blog/max_greater_filter.json b/cpp/tests/blog/max_greater_filter.json deleted file mode 100644 index a63e172203b..00000000000 --- a/cpp/tests/blog/max_greater_filter.json +++ /dev/null @@ -1,2 +0,0 @@ -{"0":{"0":"a3","1":[{"0":"message_3","1":"device_a","2":1688750003}]},"1":"network_b","2":0.8} -{"0":{"0":"a4","1":[{"0":"message_4","1":"device_a","2":1688750004}]},"1":"network_b","2":0.9} diff --git a/cpp/tests/blog/sort.json b/cpp/tests/blog/sort.json deleted file mode 100644 index a63e172203b..00000000000 --- a/cpp/tests/blog/sort.json +++ /dev/null @@ -1,2 +0,0 @@ -{"0":{"0":"a3","1":[{"0":"message_3","1":"device_a","2":1688750003}]},"1":"network_b","2":0.8} -{"0":{"0":"a4","1":[{"0":"message_4","1":"device_a","2":1688750004}]},"1":"network_b","2":0.9} diff --git a/cpp/tests/blog/unique_filter.json b/cpp/tests/blog/unique_filter.json deleted file mode 100644 index 6096d8707b2..00000000000 --- a/cpp/tests/blog/unique_filter.json +++ /dev/null @@ -1,3 +0,0 @@ -{"0":{"0":"a2","1":[{"0":"message_2","1":"device_a","2":1688750002}]},"1":"network_a","2":0.7} -{"0":{"0":"a3","1":[{"0":"message_3","1":"device_a","2":1688750003}]},"1":"network_b","2":0.8} -{"0":{"0":"a4","1":[{"0":"message_4","1":"device_a","2":1688750004}]},"1":"network_b","2":0.9} From 776b10d1314b07bef4d3df588ed1167b4c78da04 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 18 Aug 2023 23:46:14 +0530 Subject: [PATCH 05/29] add metadata to json writer --- cpp/examples/nested_types/deduplication.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 9c4fe290c0c..5a45938c856 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "cudf/io/types.hpp" #include #include #include @@ -28,21 +29,21 @@ #include #include -std::unique_ptr read_json(std::string filepath) +cudf::io::table_with_metadata read_json(std::string filepath) { auto source_info = cudf::io::source_info(filepath); auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); auto options = builder.build(); - auto json = cudf::io::read_json(options); - return std::move(json.tbl); + return cudf::io::read_json(options); } -void write_json(cudf::table_view tbl, std::string filepath) +void write_json(cudf::table_view tbl, cudf::io::table_metadata metadata, std::string filepath) { // write the data for inspection auto sink_info = cudf::io::sink_info(filepath); auto builder2 = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); - auto options2 = builder2.build(); + builder2.metadata(metadata); + auto options2 = builder2.build(); cudf::io::write_json(options2); } @@ -118,7 +119,7 @@ int main(int argc, char const** argv) } // read input file - auto tbl = read_json(input_filepath); + auto [tbl, metadata] = read_json(input_filepath); auto st = std::chrono::steady_clock::now(); @@ -129,7 +130,7 @@ int main(int argc, char const** argv) std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; - write_json(filtered->view(), output_filepath); + write_json(filtered->view(), metadata, output_filepath); return 0; } From fa6f928a5b787b3b1ff43a888ce2e796120c2c5d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 23 Aug 2023 23:08:37 +0530 Subject: [PATCH 06/29] add new column name --- cpp/examples/nested_types/deduplication.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 5a45938c856..b78de2f47ab 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -126,6 +126,7 @@ int main(int argc, char const** argv) auto count = count_aggregate(tbl->view()); auto combined = join_count(tbl->view(), count->view()); auto filtered = filter_duplicates(combined->view()); + metadata.schema_info.emplace_back("count"); std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; From e1f6c7cb3fa628e4f03458c21dbb64ed341a130b Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 23 Aug 2023 11:14:33 -0700 Subject: [PATCH 07/29] add sort, drop filtering --- cpp/examples/nested_types/deduplication.cpp | 35 ++++++++++----------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index b78de2f47ab..529d0236563 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -14,16 +14,15 @@ * limitations under the License. */ -#include "cudf/io/types.hpp" -#include #include #include #include #include +#include #include +#include #include #include -#include #include #include @@ -82,16 +81,10 @@ std::unique_ptr join_count(cudf::table_view left, cudf::table_view return std::make_unique(std::move(left_cols)); } -std::unique_ptr filter_duplicates(cudf::table_view tbl) +std::unique_ptr sort_keys(cudf::table_view tbl) { - auto const op = cudf::ast::ast_operator::EQUAL; - auto literal_value = cudf::numeric_scalar(1); - auto literal = cudf::ast::literal(literal_value); - auto col_ref_1 = cudf::ast::column_reference(3); - auto expression = cudf::ast::operation(op, col_ref_1, literal); - auto boolean_mask = cudf::compute_column(tbl, expression); - auto filtered = cudf::apply_boolean_mask(tbl, boolean_mask->view()); - return filtered; + auto sort_order = cudf::sorted_order(cudf::table_view{{tbl.column(0)}}); + return cudf::gather(tbl, *sort_order); } /** @@ -99,7 +92,7 @@ std::unique_ptr filter_duplicates(cudf::table_view tbl) * * Command line parameters: * 1. JSON input file name/path (default: "example.json") - * 3. JSON output file name/path (default: "output.json") + * 2. JSON output file name/path (default: "output.json") * * The stdout includes the number of rows in the input and the output size in bytes. */ @@ -110,7 +103,7 @@ int main(int argc, char const** argv) if (argc < 2) { input_filepath = "example.json"; output_filepath = "output.json"; - } else if (argc == 4) { + } else if (argc == 3) { input_filepath = argv[1]; output_filepath = argv[2]; } else { @@ -123,15 +116,21 @@ int main(int argc, char const** argv) auto st = std::chrono::steady_clock::now(); - auto count = count_aggregate(tbl->view()); - auto combined = join_count(tbl->view(), count->view()); - auto filtered = filter_duplicates(combined->view()); + auto count = count_aggregate(tbl->view()); + std::chrono::duration count_time = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << count_time.count() << " seconds\n"; + auto combined = join_count(tbl->view(), count->view()); + std::chrono::duration combined_time = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << combined_time.count() << " seconds\n"; + auto sorted = sort_keys(combined->view()); + std::chrono::duration sorted_time = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << sorted_time.count() << " seconds\n"; metadata.schema_info.emplace_back("count"); std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; - write_json(filtered->view(), metadata, output_filepath); + write_json(sorted->view(), metadata, output_filepath); return 0; } From df45de25aa997e14887962124ad5c647403418cf Mon Sep 17 00:00:00 2001 From: divyegala Date: Thu, 21 Sep 2023 12:50:54 -0700 Subject: [PATCH 08/29] add pool mr, reduce unnecessary table_view copy --- cpp/examples/nested_types/deduplication.cpp | 45 +++++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 529d0236563..4f21a3d4826 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -24,10 +24,37 @@ #include #include +#include +#include +#include +#include + #include #include #include +/** + * @brief Create CUDA memory resource + */ +auto make_cuda_mr() { return std::make_shared(); } + +/** + * @brief Create a pool device memory resource + */ +auto make_pool_mr() +{ + return rmm::mr::make_owning_wrapper(make_cuda_mr()); +} + +/** + * @brief Create memory resource for libcudf functions + */ +std::shared_ptr create_memory_resource(std::string const& name) +{ + if (name == "pool") { return make_pool_mr(); } + return make_cuda_mr(); +} + cudf::io::table_with_metadata read_json(std::string filepath) { auto source_info = cudf::io::source_info(filepath); @@ -60,11 +87,14 @@ std::unique_ptr count_aggregate(cudf::table_view tbl) auto agg_results = grpby_obj.aggregate(requests); auto result_key = std::move(agg_results.first); auto result_val = std::move(agg_results.second[0].results[0]); - std::vector columns{result_key->get_column(0), *result_val}; - auto agg_v = cudf::table_view(columns); + + auto left_cols = result_key->release(); + left_cols.push_back(std::move(result_val)); + // std::vector columns{result_key->get_column(0), *result_val}; + // auto agg_v = cudf::table_view(columns); // Join on keys to get - return std::make_unique(agg_v); + return std::make_unique(std::move(left_cols)); } std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) @@ -93,6 +123,7 @@ std::unique_ptr sort_keys(cudf::table_view tbl) * Command line parameters: * 1. JSON input file name/path (default: "example.json") * 2. JSON output file name/path (default: "output.json") + * 3. Memory resource (optional): "pool" or "cuda" (default: "pool") * * The stdout includes the number of rows in the input and the output size in bytes. */ @@ -100,17 +131,23 @@ int main(int argc, char const** argv) { std::string input_filepath; std::string output_filepath; - if (argc < 2) { + std::string mr_name; + if (argc < 3) { input_filepath = "example.json"; output_filepath = "output.json"; + mr_name = "pool"; } else if (argc == 3) { input_filepath = argv[1]; output_filepath = argv[2]; + mr_name = argv[3]; } else { std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; return 1; } + auto resource = create_memory_resource(mr_name); + rmm::mr::set_current_device_resource(resource.get()); + // read input file auto [tbl, metadata] = read_json(input_filepath); From 496c28a3fa5cb750040b8373dbf1f8b1786b852f Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 22 Sep 2023 12:37:20 -0400 Subject: [PATCH 09/29] Update cpp/examples/nested_types/deduplication.cpp Co-authored-by: Gregory Kimball --- cpp/examples/nested_types/deduplication.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 4f21a3d4826..a166d3a3610 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -132,11 +132,11 @@ int main(int argc, char const** argv) std::string input_filepath; std::string output_filepath; std::string mr_name; - if (argc < 3) { + if (argc < 4) { input_filepath = "example.json"; output_filepath = "output.json"; mr_name = "pool"; - } else if (argc == 3) { + } else if (argc == 4) { input_filepath = argv[1]; output_filepath = argv[2]; mr_name = argv[3]; From 647604d97ee06787f21af264fd88b0c00261b272 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 26 Sep 2023 13:57:45 -0700 Subject: [PATCH 10/29] use placeholder column for count aggregation --- cpp/examples/nested_types/deduplication.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index a166d3a3610..eb9deaa8c67 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -77,21 +78,20 @@ std::unique_ptr count_aggregate(cudf::table_view tbl) { // Get count for each key auto keys = cudf::table_view{{tbl.column(0)}}; - auto val = tbl.column(0); + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + cudf::groupby::groupby grpby_obj(keys); std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); auto agg = cudf::make_count_aggregation(); requests[0].aggregations.push_back(std::move(agg)); - requests[0].values = val; + requests[0].values = *val; auto agg_results = grpby_obj.aggregate(requests); auto result_key = std::move(agg_results.first); auto result_val = std::move(agg_results.second[0].results[0]); auto left_cols = result_key->release(); left_cols.push_back(std::move(result_val)); - // std::vector columns{result_key->get_column(0), *result_val}; - // auto agg_v = cudf::table_view(columns); // Join on keys to get return std::make_unique(std::move(left_cols)); From 345ed7725769d9b66e71f9551b578d1e646bf216 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 10 Oct 2023 12:07:04 -0700 Subject: [PATCH 11/29] fix timing counters --- cpp/examples/nested_types/deduplication.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index eb9deaa8c67..d058bf8c4d1 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -153,15 +153,21 @@ int main(int argc, char const** argv) auto st = std::chrono::steady_clock::now(); + auto count_st = std::chrono::steady_clock::now(); auto count = count_aggregate(tbl->view()); - std::chrono::duration count_time = std::chrono::steady_clock::now() - st; + std::chrono::duration count_time = std::chrono::steady_clock::now() - count_st; std::cout << "Wall time: " << count_time.count() << " seconds\n"; + + auto combined_st = std::chrono::steady_clock::now(); auto combined = join_count(tbl->view(), count->view()); - std::chrono::duration combined_time = std::chrono::steady_clock::now() - st; + std::chrono::duration combined_time = std::chrono::steady_clock::now() - combined_st; std::cout << "Wall time: " << combined_time.count() << " seconds\n"; + + auto sorted_st = std::chrono::steady_clock::now(); auto sorted = sort_keys(combined->view()); - std::chrono::duration sorted_time = std::chrono::steady_clock::now() - st; + std::chrono::duration sorted_time = std::chrono::steady_clock::now() - sorted_st; std::cout << "Wall time: " << sorted_time.count() << " seconds\n"; + metadata.schema_info.emplace_back("count"); std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; From 4fa8e3f1287313cc9187cb08796ccd7458e585d1 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 13 Oct 2023 12:51:35 -0700 Subject: [PATCH 12/29] ranges --- cpp/include/cudf/detail/null_mask.cuh | 3 +++ cpp/src/structs/utilities.cpp | 4 +++- cpp/src/table/row_operators.cu | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 78cd3d7bcb7..2d57c8a7eeb 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -42,6 +42,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -156,6 +158,7 @@ size_type inplace_bitmask_binop(Binop op, size_type mask_size_bits, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); CUDF_EXPECTS( std::all_of(masks_begin_bits.begin(), masks_begin_bits.end(), [](auto b) { return b >= 0; }), "Invalid range."); diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index acb153f28d6..82fc31c3d0c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -30,6 +30,8 @@ #include #include +#include + namespace cudf::structs::detail { /** @@ -397,6 +399,7 @@ std::pair push_down_nulls(column_view cons rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); auto output = push_down_nulls_no_sanitize(input, stream, mr); if (auto const output_view = output.first; @@ -410,7 +413,6 @@ std::pair push_down_nulls(column_view cons // must be done after calling it. output.second.new_null_masks.clear(); } - return output; } diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 770a7c775b4..aff08664881 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -34,6 +34,8 @@ #include +#include + namespace cudf { namespace experimental { @@ -73,6 +75,7 @@ std::vector unslice_children(column_view const& c) */ table_view remove_struct_child_offsets(table_view table) { + CUDF_FUNC_RANGE(); std::vector cols; cols.reserve(table.num_columns()); std::transform(table.begin(), table.end(), std::back_inserter(cols), [&](column_view const& c) { @@ -185,6 +188,7 @@ auto decompose_structs(table_view table, host_span column_order = {}, host_span null_precedence = {}) { + CUDF_FUNC_RANGE(); auto linked_columns = detail::table_to_linked_columns(table); std::vector verticalized_columns; From 3668decea8ffe2e981695438f2e2b36ef9a33980 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 13 Oct 2023 13:07:36 -0700 Subject: [PATCH 13/29] add count aggregation copy --- cpp/examples/nested_types/deduplication.cpp | 35 +++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index d058bf8c4d1..80381cc4fdf 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -97,6 +97,29 @@ std::unique_ptr count_aggregate(cudf::table_view tbl) return std::make_unique(std::move(left_cols)); } +std::unique_ptr count_aggregate_with_copy(cudf::table_view tbl) +{ + // Get count for each key + auto keys = cudf::table_view{{tbl.column(0)}}; + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = *val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + + std::vector columns{result_key->get_column(0), *result_val}; + auto agg_v = cudf::table_view(columns); + + // Join on keys to get + return std::make_unique(agg_v); +} + std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) { auto [left_indices, right_indices] = @@ -156,17 +179,23 @@ int main(int argc, char const** argv) auto count_st = std::chrono::steady_clock::now(); auto count = count_aggregate(tbl->view()); std::chrono::duration count_time = std::chrono::steady_clock::now() - count_st; - std::cout << "Wall time: " << count_time.count() << " seconds\n"; + std::cout << "count_aggregate time: " << count_time.count() << " seconds\n"; + + auto count_w_copy_st = std::chrono::steady_clock::now(); + auto count_w_copy = count_aggregate(tbl->view()); + std::chrono::duration count_w_copy_time = + std::chrono::steady_clock::now() - count_w_copy_st; + std::cout << "count_aggregate_with_copy time: " << count_w_copy_time.count() << " seconds\n"; auto combined_st = std::chrono::steady_clock::now(); auto combined = join_count(tbl->view(), count->view()); std::chrono::duration combined_time = std::chrono::steady_clock::now() - combined_st; - std::cout << "Wall time: " << combined_time.count() << " seconds\n"; + std::cout << "join_count time: " << combined_time.count() << " seconds\n"; auto sorted_st = std::chrono::steady_clock::now(); auto sorted = sort_keys(combined->view()); std::chrono::duration sorted_time = std::chrono::steady_clock::now() - sorted_st; - std::cout << "Wall time: " << sorted_time.count() << " seconds\n"; + std::cout << "sort_keys time: " << sorted_time.count() << " seconds\n"; metadata.schema_info.emplace_back("count"); From 86751901330406e5d4b37f3e695a3f1b2f7550b8 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 13 Oct 2023 16:57:43 -0400 Subject: [PATCH 14/29] Update deduplication.cpp Co-authored-by: Gregory Kimball --- cpp/examples/nested_types/deduplication.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 80381cc4fdf..c182ce6378c 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -182,7 +182,7 @@ int main(int argc, char const** argv) std::cout << "count_aggregate time: " << count_time.count() << " seconds\n"; auto count_w_copy_st = std::chrono::steady_clock::now(); - auto count_w_copy = count_aggregate(tbl->view()); + auto count_w_copy = count_aggregate_with_copy(tbl->view()); std::chrono::duration count_w_copy_time = std::chrono::steady_clock::now() - count_w_copy_st; std::cout << "count_aggregate_with_copy time: " << count_w_copy_time.count() << " seconds\n"; From 2b2ebdd7e406cc6181e7796559ba7c2c298b2343 Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 13 Oct 2023 15:08:09 -0700 Subject: [PATCH 15/29] try has_nulls() instead of nullable() in gather --- cpp/include/cudf/detail/gather.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 955f9914632..32d0e75afcc 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -675,7 +675,7 @@ std::unique_ptr gather(table_view const& source_table, auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.nullable(); + return col.has_nulls(); }); if (nullable) { auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY From 3ba32ae2e00380ac21dfdfed2b9435190ad307af Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 20 Oct 2023 11:23:46 -0700 Subject: [PATCH 16/29] address reviews --- cpp/examples/nested_types/deduplication.cpp | 89 ++++++++++----------- cpp/include/cudf/detail/gather.cuh | 8 +- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index c182ce6378c..af94ebc7e10 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -34,6 +34,23 @@ #include #include +/** + * @file deduplication.cpp + * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables. + * + * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three + * kinds: + * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type + * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing + * to determine equality for nested types + * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types + * so as to enable sorting + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. + * + */ + /** * @brief Create CUDA memory resource */ @@ -50,7 +67,7 @@ auto make_pool_mr() /** * @brief Create memory resource for libcudf functions */ -std::shared_ptr create_memory_resource(std::string const& name) +std::shared_ptr create_memory_resource(std::string_view name) { if (name == "pool") { return make_pool_mr(); } return make_cuda_mr(); @@ -68,12 +85,15 @@ void write_json(cudf::table_view tbl, cudf::io::table_metadata metadata, std::st { // write the data for inspection auto sink_info = cudf::io::sink_info(filepath); - auto builder2 = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); - builder2.metadata(metadata); - auto options2 = builder2.build(); - cudf::io::write_json(options2); + auto builder = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); + builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_json(options); } +/** + * @brief Aggregate count of duplicate rows in nested-type column + */ std::unique_ptr count_aggregate(cudf::table_view tbl) { // Get count for each key @@ -93,33 +113,12 @@ std::unique_ptr count_aggregate(cudf::table_view tbl) auto left_cols = result_key->release(); left_cols.push_back(std::move(result_val)); - // Join on keys to get return std::make_unique(std::move(left_cols)); } -std::unique_ptr count_aggregate_with_copy(cudf::table_view tbl) -{ - // Get count for each key - auto keys = cudf::table_view{{tbl.column(0)}}; - auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); - - cudf::groupby::groupby grpby_obj(keys); - std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - auto agg = cudf::make_count_aggregation(); - requests[0].aggregations.push_back(std::move(agg)); - requests[0].values = *val; - auto agg_results = grpby_obj.aggregate(requests); - auto result_key = std::move(agg_results.first); - auto result_val = std::move(agg_results.second[0].results[0]); - - std::vector columns{result_key->get_column(0), *result_val}; - auto agg_v = cudf::table_view(columns); - - // Join on keys to get - return std::make_unique(agg_v); -} - +/** + * @brief Join each row with its duplicate counts + */ std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) { auto [left_indices, right_indices] = @@ -134,6 +133,9 @@ std::unique_ptr join_count(cudf::table_view left, cudf::table_view return std::make_unique(std::move(left_cols)); } +/** + * @brief Sort nested-type column + */ std::unique_ptr sort_keys(cudf::table_view tbl) { auto sort_order = cudf::sorted_order(cudf::table_view{{tbl.column(0)}}); @@ -174,32 +176,27 @@ int main(int argc, char const** argv) // read input file auto [tbl, metadata] = read_json(input_filepath); - auto st = std::chrono::steady_clock::now(); + auto const st = std::chrono::steady_clock::now(); - auto count_st = std::chrono::steady_clock::now(); - auto count = count_aggregate(tbl->view()); - std::chrono::duration count_time = std::chrono::steady_clock::now() - count_st; + auto const count_st = std::chrono::steady_clock::now(); + auto count = count_aggregate(tbl->view()); + std::chrono::duration const count_time = std::chrono::steady_clock::now() - count_st; std::cout << "count_aggregate time: " << count_time.count() << " seconds\n"; - auto count_w_copy_st = std::chrono::steady_clock::now(); - auto count_w_copy = count_aggregate_with_copy(tbl->view()); - std::chrono::duration count_w_copy_time = - std::chrono::steady_clock::now() - count_w_copy_st; - std::cout << "count_aggregate_with_copy time: " << count_w_copy_time.count() << " seconds\n"; - - auto combined_st = std::chrono::steady_clock::now(); - auto combined = join_count(tbl->view(), count->view()); - std::chrono::duration combined_time = std::chrono::steady_clock::now() - combined_st; + auto const combined_st = std::chrono::steady_clock::now(); + auto combined = join_count(tbl->view(), count->view()); + std::chrono::duration const combined_time = + std::chrono::steady_clock::now() - combined_st; std::cout << "join_count time: " << combined_time.count() << " seconds\n"; - auto sorted_st = std::chrono::steady_clock::now(); - auto sorted = sort_keys(combined->view()); - std::chrono::duration sorted_time = std::chrono::steady_clock::now() - sorted_st; + auto const sorted_st = std::chrono::steady_clock::now(); + auto sorted = sort_keys(combined->view()); + std::chrono::duration const sorted_time = std::chrono::steady_clock::now() - sorted_st; std::cout << "sort_keys time: " << sorted_time.count() << " seconds\n"; metadata.schema_info.emplace_back("count"); - std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; + std::chrono::duration const elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; write_json(sorted->view(), metadata, output_filepath); diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 32d0e75afcc..5a926f5bfca 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -483,7 +483,7 @@ struct column_gatherer_impl { auto const nullable = nullify_out_of_bounds || std::any_of(sliced_children.begin(), sliced_children.end(), - [](auto const& col) { return col.nullable(); }); + [](auto const& col) { return col.has_nulls(); }); if (nullable) { gather_bitmask( @@ -569,8 +569,8 @@ void gather_bitmask(table_view const& source, // Create null mask if source is nullable but target is not for (size_t i = 0; i < target.size(); ++i) { - if ((source.column(i).nullable() or op == gather_bitmask_op::NULLIFY) and - not target[i]->nullable()) { + if ((source.column(i).has_nulls() or op == gather_bitmask_op::NULLIFY) and + not target[i]->has_nulls()) { auto const state = op == gather_bitmask_op::PASSTHROUGH ? mask_state::ALL_VALID : mask_state::UNINITIALIZED; auto mask = detail::create_null_mask(target[i]->size(), state, stream, mr); @@ -613,7 +613,7 @@ void gather_bitmask(table_view const& source, // Copy the valid counts into each column auto const valid_counts = make_std_vector_sync(d_valid_counts, stream); for (size_t i = 0; i < target.size(); ++i) { - if (target[i]->nullable()) { + if (target[i]->has_nulls()) { auto const null_count = target_rows - valid_counts[i]; target[i]->set_null_count(null_count); } From 2f0f0e8c3ab3b4c6018747812f698348c1ae823c Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 6 Nov 2023 09:49:56 -0800 Subject: [PATCH 17/29] some review comments --- cpp/examples/nested_types/CMakeLists.txt | 2 +- cpp/examples/nested_types/deduplication.cpp | 3 --- cpp/examples/nested_types/output.json | 5 +++++ 3 files changed, 6 insertions(+), 4 deletions(-) create mode 100644 cpp/examples/nested_types/output.json diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt index 8e21617d0be..9d7bb7ecccb 100644 --- a/cpp/examples/nested_types/CMakeLists.txt +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.10) +set(CUDF_TAG branch-23.12) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index af94ebc7e10..98044acc755 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -46,9 +46,6 @@ * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types * so as to enable sorting * - * @note This example is for demonstration purposes only. It is not intended to show the most - * performant way to do the example algorithm. - * */ /** diff --git a/cpp/examples/nested_types/output.json b/cpp/examples/nested_types/output.json new file mode 100644 index 00000000000..7440a81ce76 --- /dev/null +++ b/cpp/examples/nested_types/output.json @@ -0,0 +1,5 @@ +{"features":{"key":"a1","value":[{"info":"message_1","type":"device_a","dt":1688750001}]},"source":"network_a","quality":0.7,"count":2} +{"features":{"key":"a1","value":[{"info":"message_1","type":"device_a","dt":1688750001}]},"source":"network_b","quality":0.9,"count":2} +{"features":{"key":"a2","value":[{"info":"message_2","type":"device_a","dt":1688750002}]},"source":"network_a","quality":0.7,"count":1} +{"features":{"key":"a3","value":[{"info":"message_3","type":"device_a","dt":1688750003}]},"source":"network_b","quality":0.8,"count":1} +{"features":{"key":"a4","value":[{"info":"message_4","type":"device_a","dt":1688750004}]},"source":"network_b","quality":0.9,"count":1} From f2c42be2c8868a23375892921f7f9aae0544568b Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 6 Nov 2023 10:11:12 -0800 Subject: [PATCH 18/29] Revert "ranges" This reverts commit 4fa8e3f1287313cc9187cb08796ccd7458e585d1. --- cpp/include/cudf/detail/null_mask.cuh | 3 --- cpp/src/structs/utilities.cpp | 4 +--- cpp/src/table/row_operators.cu | 4 ---- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 2d57c8a7eeb..78cd3d7bcb7 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -42,8 +42,6 @@ #include #include -#include - namespace cudf { namespace detail { /** @@ -158,7 +156,6 @@ size_type inplace_bitmask_binop(Binop op, size_type mask_size_bits, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); CUDF_EXPECTS( std::all_of(masks_begin_bits.begin(), masks_begin_bits.end(), [](auto b) { return b >= 0; }), "Invalid range."); diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 82fc31c3d0c..acb153f28d6 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -30,8 +30,6 @@ #include #include -#include - namespace cudf::structs::detail { /** @@ -399,7 +397,6 @@ std::pair push_down_nulls(column_view cons rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); auto output = push_down_nulls_no_sanitize(input, stream, mr); if (auto const output_view = output.first; @@ -413,6 +410,7 @@ std::pair push_down_nulls(column_view cons // must be done after calling it. output.second.new_null_masks.clear(); } + return output; } diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index aff08664881..770a7c775b4 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -34,8 +34,6 @@ #include -#include - namespace cudf { namespace experimental { @@ -75,7 +73,6 @@ std::vector unslice_children(column_view const& c) */ table_view remove_struct_child_offsets(table_view table) { - CUDF_FUNC_RANGE(); std::vector cols; cols.reserve(table.num_columns()); std::transform(table.begin(), table.end(), std::back_inserter(cols), [&](column_view const& c) { @@ -188,7 +185,6 @@ auto decompose_structs(table_view table, host_span column_order = {}, host_span null_precedence = {}) { - CUDF_FUNC_RANGE(); auto linked_columns = detail::table_to_linked_columns(table); std::vector verticalized_columns; From 27db5c2ebb777225a81ea42212d04d727c8c82e3 Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 6 Nov 2023 10:11:45 -0800 Subject: [PATCH 19/29] Revert "try has_nulls() instead of nullable() in gather" This reverts commit 2b2ebdd7e406cc6181e7796559ba7c2c298b2343. --- cpp/include/cudf/detail/gather.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5a926f5bfca..9278e25951f 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -675,7 +675,7 @@ std::unique_ptr
gather(table_view const& source_table, auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.has_nulls(); + return col.nullable(); }); if (nullable) { auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY From 9a3c852e65e58af2fcdcefd5ea3756a26db7871f Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 6 Nov 2023 10:12:43 -0800 Subject: [PATCH 20/29] address more review comments --- cpp/examples/nested_types/deduplication.cpp | 86 +++++++++------------ cpp/include/cudf/detail/gather.cuh | 2 +- 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 98044acc755..a417eca9d97 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -42,34 +42,27 @@ * kinds: * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing - * to determine equality for nested types + * to determine equality for nested types * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types * so as to enable sorting * */ -/** - * @brief Create CUDA memory resource - */ -auto make_cuda_mr() { return std::make_shared(); } - -/** - * @brief Create a pool device memory resource - */ -auto make_pool_mr() -{ - return rmm::mr::make_owning_wrapper(make_cuda_mr()); -} - /** * @brief Create memory resource for libcudf functions */ std::shared_ptr create_memory_resource(std::string_view name) { - if (name == "pool") { return make_pool_mr(); } - return make_cuda_mr(); + auto cuda_mr = std::make_shared(); + if (name == "pool") { + return rmm::mr::make_owning_wrapper(cuda_mr); + } + return cuda_mr; } +/** + * @brief Read JSON input from file + */ cudf::io::table_with_metadata read_json(std::string filepath) { auto source_info = cudf::io::source_info(filepath); @@ -78,11 +71,14 @@ cudf::io::table_with_metadata read_json(std::string filepath) return cudf::io::read_json(options); } -void write_json(cudf::table_view tbl, cudf::io::table_metadata metadata, std::string filepath) +/** + * @brief Write JSON output to file + */ +void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) { // write the data for inspection auto sink_info = cudf::io::sink_info(filepath); - auto builder = cudf::io::json_writer_options::builder(sink_info, tbl).lines(true); + auto builder = cudf::io::json_writer_options::builder(sink_info, input).lines(true); builder.metadata(metadata); auto options = builder.build(); cudf::io::write_json(options); @@ -91,10 +87,10 @@ void write_json(cudf::table_view tbl, cudf::io::table_metadata metadata, std::st /** * @brief Aggregate count of duplicate rows in nested-type column */ -std::unique_ptr count_aggregate(cudf::table_view tbl) +std::unique_ptr count_aggregate(cudf::table_view input) { // Get count for each key - auto keys = cudf::table_view{{tbl.column(0)}}; + auto keys = cudf::table_view{{input.column(0)}}; auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); cudf::groupby::groupby grpby_obj(keys); @@ -120,8 +116,8 @@ std::unique_ptr join_count(cudf::table_view left, cudf::table_view { auto [left_indices, right_indices] = cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); - auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); - auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); auto left_cols = new_left->release(); auto right_cols = new_right->release(); @@ -132,11 +128,13 @@ std::unique_ptr join_count(cudf::table_view left, cudf::table_view /** * @brief Sort nested-type column + * + * @note if stability is desired, use `cudf::stable_sorted_order` */ -std::unique_ptr sort_keys(cudf::table_view tbl) +std::unique_ptr sort_keys(cudf::table_view input) { - auto sort_order = cudf::sorted_order(cudf::table_view{{tbl.column(0)}}); - return cudf::gather(tbl, *sort_order); + auto sort_order = cudf::sorted_order(cudf::table_view{{input.column(0)}}); + return cudf::gather(input, *sort_order); } /** @@ -147,55 +145,43 @@ std::unique_ptr sort_keys(cudf::table_view tbl) * 2. JSON output file name/path (default: "output.json") * 3. Memory resource (optional): "pool" or "cuda" (default: "pool") * - * The stdout includes the number of rows in the input and the output size in bytes. + * Example invocation from directory `cudf/cpp/examples/nested_types`: + * ./build/deduplication example.json output.json pool + * */ int main(int argc, char const** argv) { std::string input_filepath; std::string output_filepath; std::string mr_name; - if (argc < 4) { + if (argc != 4 && argc != 1) { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + if (argc == 1) { input_filepath = "example.json"; output_filepath = "output.json"; mr_name = "pool"; - } else if (argc == 4) { + } else { input_filepath = argv[1]; output_filepath = argv[2]; mr_name = argv[3]; - } else { - std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; - return 1; } auto resource = create_memory_resource(mr_name); rmm::mr::set_current_device_resource(resource.get()); // read input file - auto [tbl, metadata] = read_json(input_filepath); - - auto const st = std::chrono::steady_clock::now(); + auto [input, metadata] = read_json(input_filepath); - auto const count_st = std::chrono::steady_clock::now(); - auto count = count_aggregate(tbl->view()); - std::chrono::duration const count_time = std::chrono::steady_clock::now() - count_st; - std::cout << "count_aggregate time: " << count_time.count() << " seconds\n"; + auto count = count_aggregate(input->view()); - auto const combined_st = std::chrono::steady_clock::now(); - auto combined = join_count(tbl->view(), count->view()); - std::chrono::duration const combined_time = - std::chrono::steady_clock::now() - combined_st; - std::cout << "join_count time: " << combined_time.count() << " seconds\n"; + auto combined = join_count(input->view(), count->view()); - auto const sorted_st = std::chrono::steady_clock::now(); - auto sorted = sort_keys(combined->view()); - std::chrono::duration const sorted_time = std::chrono::steady_clock::now() - sorted_st; - std::cout << "sort_keys time: " << sorted_time.count() << " seconds\n"; + auto sorted = sort_keys(combined->view()); metadata.schema_info.emplace_back("count"); - std::chrono::duration const elapsed = std::chrono::steady_clock::now() - st; - std::cout << "Wall time: " << elapsed.count() << " seconds\n"; - write_json(sorted->view(), metadata, output_filepath); return 0; diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 9278e25951f..5a926f5bfca 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -675,7 +675,7 @@ std::unique_ptr
gather(table_view const& source_table, auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.nullable(); + return col.has_nulls(); }); if (nullable) { auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY From a6cf0eeaa00f9428f65641b6338188daa9ab2339 Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 6 Nov 2023 10:13:41 -0800 Subject: [PATCH 21/29] revert to nullable() in gather --- cpp/include/cudf/detail/gather.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5a926f5bfca..955f9914632 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -483,7 +483,7 @@ struct column_gatherer_impl { auto const nullable = nullify_out_of_bounds || std::any_of(sliced_children.begin(), sliced_children.end(), - [](auto const& col) { return col.has_nulls(); }); + [](auto const& col) { return col.nullable(); }); if (nullable) { gather_bitmask( @@ -569,8 +569,8 @@ void gather_bitmask(table_view const& source, // Create null mask if source is nullable but target is not for (size_t i = 0; i < target.size(); ++i) { - if ((source.column(i).has_nulls() or op == gather_bitmask_op::NULLIFY) and - not target[i]->has_nulls()) { + if ((source.column(i).nullable() or op == gather_bitmask_op::NULLIFY) and + not target[i]->nullable()) { auto const state = op == gather_bitmask_op::PASSTHROUGH ? mask_state::ALL_VALID : mask_state::UNINITIALIZED; auto mask = detail::create_null_mask(target[i]->size(), state, stream, mr); @@ -613,7 +613,7 @@ void gather_bitmask(table_view const& source, // Copy the valid counts into each column auto const valid_counts = make_std_vector_sync(d_valid_counts, stream); for (size_t i = 0; i < target.size(); ++i) { - if (target[i]->has_nulls()) { + if (target[i]->nullable()) { auto const null_count = target_rows - valid_counts[i]; target[i]->set_null_count(null_count); } @@ -675,7 +675,7 @@ std::unique_ptr
gather(table_view const& source_table, auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.has_nulls(); + return col.nullable(); }); if (nullable) { auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY From 1948b1092a645bd9d792f6ab333cadc513e74e14 Mon Sep 17 00:00:00 2001 From: divyegala Date: Mon, 6 Nov 2023 10:18:10 -0800 Subject: [PATCH 22/29] delete output file included by mistake --- cpp/examples/nested_types/output.json | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 cpp/examples/nested_types/output.json diff --git a/cpp/examples/nested_types/output.json b/cpp/examples/nested_types/output.json deleted file mode 100644 index 7440a81ce76..00000000000 --- a/cpp/examples/nested_types/output.json +++ /dev/null @@ -1,5 +0,0 @@ -{"features":{"key":"a1","value":[{"info":"message_1","type":"device_a","dt":1688750001}]},"source":"network_a","quality":0.7,"count":2} -{"features":{"key":"a1","value":[{"info":"message_1","type":"device_a","dt":1688750001}]},"source":"network_b","quality":0.9,"count":2} -{"features":{"key":"a2","value":[{"info":"message_2","type":"device_a","dt":1688750002}]},"source":"network_a","quality":0.7,"count":1} -{"features":{"key":"a3","value":[{"info":"message_3","type":"device_a","dt":1688750003}]},"source":"network_b","quality":0.8,"count":1} -{"features":{"key":"a4","value":[{"info":"message_4","type":"device_a","dt":1688750004}]},"source":"network_b","quality":0.9,"count":1} From f7e96a99716a383f43d6066653f1558af247cdd1 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Nov 2023 10:10:44 -0800 Subject: [PATCH 23/29] bool for creating mr --- cpp/examples/nested_types/deduplication.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index a417eca9d97..c7ed7f27327 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -51,12 +51,10 @@ /** * @brief Create memory resource for libcudf functions */ -std::shared_ptr create_memory_resource(std::string_view name) +std::shared_ptr create_memory_resource(bool pool) { auto cuda_mr = std::make_shared(); - if (name == "pool") { - return rmm::mr::make_owning_wrapper(cuda_mr); - } + if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } return cuda_mr; } @@ -168,7 +166,8 @@ int main(int argc, char const** argv) mr_name = argv[3]; } - auto resource = create_memory_resource(mr_name); + auto pool = mr_name == "pool"; + auto resource = create_memory_resource(pool); rmm::mr::set_current_device_resource(resource.get()); // read input file From 9e2020da3212c1f3140ba4606c31948ba45e14b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 14 Nov 2023 10:49:37 -0800 Subject: [PATCH 24/29] Add function for building examples --- cpp/examples/build.sh | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 244eaa0f8f7..001cdeec694 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -14,26 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ # Add individual libcudf examples build scripts down below -# Basic example -BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic -BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build -# Configure -cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Strings example -STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings -STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build -# Configure -cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Nested Types example -NESTED_TYPES_EXAMPLE_DIR=${EXAMPLES_DIR}/nested_types -NESTED_TYPES_EXAMPLE_BUILD_DIR=${NESTED_TYPES_EXAMPLE_DIR}/build -# Configure -cmake -S ${NESTED_TYPES_EXAMPLE_DIR} -B ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} +build_example() { + example_dir=${1} + example_dir="${EXAMPLES_DIR}/${example_dir}" + build_dir="${example_dir}/build" + + # Configure + cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" + # Build + cmake --build ${build_dir} -j${PARALLEL_LEVEL} +} + +build_example basic +build_example strings +build_example nested_types From 45d64960eaf38b93d29bd4742f3e91c081d8366d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 14 Nov 2023 11:00:31 -0800 Subject: [PATCH 25/29] Centralized dependency fetching --- ci/release/update-version.sh | 3 +-- cpp/examples/basic/CMakeLists.txt | 18 +------------- cpp/examples/fetch_dependencies.cmake | 30 ++++++++++++++++++++++++ cpp/examples/nested_types/CMakeLists.txt | 18 +------------- cpp/examples/strings/CMakeLists.txt | 18 +------------- 5 files changed, 34 insertions(+), 53 deletions(-) create mode 100644 cpp/examples/fetch_dependencies.cmake diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 4f1cbc47d1d..16742465c32 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -101,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake # CI files for FILE in .github/workflows/*.yaml; do diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 9ff716f41e4..759a43b5627 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(basic_example src/process_csv.cpp) diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake new file mode 100644 index 00000000000..dc86c6a9aa5 --- /dev/null +++ b/cpp/examples/fetch_dependencies.cmake @@ -0,0 +1,30 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt index 9d7bb7ecccb..cb9430db237 100644 --- a/cpp/examples/nested_types/CMakeLists.txt +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(deduplication deduplication.cpp) diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 4b500d3a92e..c90fa9dde16 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) From 57ccb511e863ad85017e3e0d1d24b414600f0be1 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Tue, 14 Nov 2023 14:50:02 -0500 Subject: [PATCH 26/29] Update cpp/examples/nested_types/example.json Co-authored-by: Bradley Dice --- cpp/examples/nested_types/example.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json index 4f1dec6497e..efaa37817d6 100644 --- a/cpp/examples/nested_types/example.json +++ b/cpp/examples/nested_types/example.json @@ -1,5 +1,5 @@ -{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} -{"features": {"key":"a2", "value": [{"info":"message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} -{"features": {"key":"a3", "value": [{"info":"message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} -{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} -{"features": {"key":"a4", "value": [{"info":"message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} From 4d6ac257efa3f231f4ba41e90ea2df8e62413a03 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Nov 2023 14:21:40 -0800 Subject: [PATCH 27/29] add cout --- cpp/examples/nested_types/deduplication.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index c7ed7f27327..08d54d48695 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -170,6 +170,7 @@ int main(int argc, char const** argv) auto resource = create_memory_resource(pool); rmm::mr::set_current_device_resource(resource.get()); + std::cout << "Reading " << input_filepath << "..." << std::endl; // read input file auto [input, metadata] = read_json(input_filepath); @@ -181,6 +182,7 @@ int main(int argc, char const** argv) metadata.schema_info.emplace_back("count"); + std::cout << "Writing " << output_filepath << "..." << std::endl; write_json(sorted->view(), metadata, output_filepath); return 0; From 02438d1444b0b1122964e834468258114ac94a4d Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 14 Nov 2023 14:35:24 -0800 Subject: [PATCH 28/29] add param in docs --- cpp/examples/nested_types/deduplication.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 08d54d48695..62588729a39 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -60,6 +60,9 @@ std::shared_ptr create_memory_resource(bool poo /** * @brief Read JSON input from file + * + * @param filepath path to input JSON file + * @return cudf::io::table_with_metadata */ cudf::io::table_with_metadata read_json(std::string filepath) { @@ -71,6 +74,10 @@ cudf::io::table_with_metadata read_json(std::string filepath) /** * @brief Write JSON output to file + * + * @param input table to write + * @param metadata metadata of input table read by JSON reader + * @param filepath path to output JSON file */ void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) { @@ -84,6 +91,9 @@ void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std:: /** * @brief Aggregate count of duplicate rows in nested-type column + * + * @param input table to aggregate + * @return std::unique_ptr */ std::unique_ptr count_aggregate(cudf::table_view input) { @@ -109,6 +119,10 @@ std::unique_ptr count_aggregate(cudf::table_view input) /** * @brief Join each row with its duplicate counts + * + * @param left left table + * @param right right table + * @return std::unique_ptr */ std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) { @@ -127,6 +141,9 @@ std::unique_ptr join_count(cudf::table_view left, cudf::table_view /** * @brief Sort nested-type column * + * @param input table to sort + * @return std::unique_ptr + * * @note if stability is desired, use `cudf::stable_sorted_order` */ std::unique_ptr sort_keys(cudf::table_view input) From 57d68bd9c75f41c0dc5810a912794a70931f7cea Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 15 Nov 2023 12:34:27 -0500 Subject: [PATCH 29/29] Update cpp/examples/nested_types/deduplication.cpp Co-authored-by: Bradley Dice --- cpp/examples/nested_types/deduplication.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 62588729a39..5969985cc72 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -50,6 +50,9 @@ /** * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. + * @return Memory resource instance */ std::shared_ptr create_memory_resource(bool pool) {