diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 4f1cbc47d1d..16742465c32 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -101,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake # CI files for FILE in .github/workflows/*.yaml; do diff --git a/cpp/examples/README.md b/cpp/examples/README.md index b2e8dd399d0..7f2b769f4a5 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -7,3 +7,4 @@ Current examples: - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings +- Nested Types: demonstrates using libcudf for some operations on nested types diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 9ff716f41e4..759a43b5627 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(basic_example src/process_csv.cpp) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 7d389cd318d..001cdeec694 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # libcudf examples build script @@ -14,18 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ # Add individual libcudf examples build scripts down below -# Basic example -BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic -BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build -# Configure -cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Strings example -STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings -STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build -# Configure -cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} +build_example() { + example_dir=${1} + example_dir="${EXAMPLES_DIR}/${example_dir}" + build_dir="${example_dir}/build" + + # Configure + cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" + # Build + cmake --build ${build_dir} -j${PARALLEL_LEVEL} +} + +build_example basic +build_example strings +build_example nested_types diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake new file mode 100644 index 00000000000..dc86c6a9aa5 --- /dev/null +++ b/cpp/examples/fetch_dependencies.cmake @@ -0,0 +1,30 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt new file mode 100644 index 00000000000..cb9430db237 --- /dev/null +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +project( + nested_types + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +# Configure your project here +add_executable(deduplication deduplication.cpp) +target_link_libraries(deduplication PRIVATE cudf::cudf) +target_compile_features(deduplication PRIVATE cxx_std_17) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp new file mode 100644 index 00000000000..5969985cc72 --- /dev/null +++ b/cpp/examples/nested_types/deduplication.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * @file deduplication.cpp + * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables. + * + * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three + * kinds: + * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type + * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing + * to determine equality for nested types + * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types + * so as to enable sorting + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. + * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool pool) +{ + auto cuda_mr = std::make_shared(); + if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } + return cuda_mr; +} + +/** + * @brief Read JSON input from file + * + * @param filepath path to input JSON file + * @return cudf::io::table_with_metadata + */ +cudf::io::table_with_metadata read_json(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + return cudf::io::read_json(options); +} + +/** + * @brief Write JSON output to file + * + * @param input table to write + * @param metadata metadata of input table read by JSON reader + * @param filepath path to output JSON file + */ +void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder = cudf::io::json_writer_options::builder(sink_info, input).lines(true); + builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_json(options); +} + +/** + * @brief Aggregate count of duplicate rows in nested-type column + * + * @param input table to aggregate + * @return std::unique_ptr + */ +std::unique_ptr count_aggregate(cudf::table_view input) +{ + // Get count for each key + auto keys = cudf::table_view{{input.column(0)}}; + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = *val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + + auto left_cols = result_key->release(); + left_cols.push_back(std::move(result_val)); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Join each row with its duplicate counts + * + * @param left left table + * @param right right table + * @return std::unique_ptr + */ +std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) +{ + auto [left_indices, right_indices] = + cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); + + auto left_cols = new_left->release(); + auto right_cols = new_right->release(); + left_cols.push_back(std::move(right_cols[1])); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Sort nested-type column + * + * @param input table to sort + * @return std::unique_ptr + * + * @note if stability is desired, use `cudf::stable_sorted_order` + */ +std::unique_ptr sort_keys(cudf::table_view input) +{ + auto sort_order = cudf::sorted_order(cudf::table_view{{input.column(0)}}); + return cudf::gather(input, *sort_order); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. JSON input file name/path (default: "example.json") + * 2. JSON output file name/path (default: "output.json") + * 3. Memory resource (optional): "pool" or "cuda" (default: "pool") + * + * Example invocation from directory `cudf/cpp/examples/nested_types`: + * ./build/deduplication example.json output.json pool + * + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string output_filepath; + std::string mr_name; + if (argc != 4 && argc != 1) { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + if (argc == 1) { + input_filepath = "example.json"; + output_filepath = "output.json"; + mr_name = "pool"; + } else { + input_filepath = argv[1]; + output_filepath = argv[2]; + mr_name = argv[3]; + } + + auto pool = mr_name == "pool"; + auto resource = create_memory_resource(pool); + rmm::mr::set_current_device_resource(resource.get()); + + std::cout << "Reading " << input_filepath << "..." << std::endl; + // read input file + auto [input, metadata] = read_json(input_filepath); + + auto count = count_aggregate(input->view()); + + auto combined = join_count(input->view(), count->view()); + + auto sorted = sort_keys(combined->view()); + + metadata.schema_info.emplace_back("count"); + + std::cout << "Writing " << output_filepath << "..." << std::endl; + write_json(sorted->view(), metadata, output_filepath); + + return 0; +} diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json new file mode 100644 index 00000000000..efaa37817d6 --- /dev/null +++ b/cpp/examples/nested_types/example.json @@ -0,0 +1,5 @@ +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 4b500d3a92e..c90fa9dde16 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)