Skip to content
/ cudf Public
forked from rapidsai/cudf

Commit

Permalink
Example code for blog on new row comparators (rapidsai#13795)
Browse files Browse the repository at this point in the history
Example code using a few libcudf APIs to demonstrate nested-type usage.

Authors:
  - Divye Gala (https://github.com/divyegala)
  - Karthikeyan (https://github.com/karthikeyann)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)
  - Ray Douglass (https://github.com/raydouglass)

URL: rapidsai#13795
  • Loading branch information
divyegala authored Nov 16, 2023
1 parent f9c586d commit afd7d18
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 52 deletions.
3 changes: 1 addition & 2 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE
sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md

# Libcudf examples update
sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt
sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt
sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake

# CI files
for FILE in .github/workflows/*.yaml; do
Expand Down
1 change: 1 addition & 0 deletions cpp/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ Current examples:

- Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf
- Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings
- Nested Types: demonstrates using libcudf for some operations on nested types
18 changes: 1 addition & 17 deletions cpp/examples/basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,7 @@ project(
LANGUAGES CXX CUDA
)

set(CPM_DOWNLOAD_VERSION v0.35.3)
file(
DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake
${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake
)
include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)

set(CUDF_TAG branch-23.12)
CPMFindPackage(
NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf
GIT_TAG ${CUDF_TAG}
GIT_SHALLOW
TRUE
SOURCE_SUBDIR
cpp
)
include(../fetch_dependencies.cmake)

# Configure your project here
add_executable(basic_example src/process_csv.cpp)
Expand Down
31 changes: 15 additions & 16 deletions cpp/examples/build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

# libcudf examples build script

Expand All @@ -14,18 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")}
################################################################################
# Add individual libcudf examples build scripts down below

# Basic example
BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic
BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build
# Configure
cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
# Build
cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}

# Strings example
STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings
STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build
# Configure
cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
# Build
cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}
# Configure and build one libcudf example.
#
# Arguments:
#   $1 - name of the example's directory, relative to ${EXAMPLES_DIR}
#
# Reads the variables EXAMPLES_DIR, LIB_BUILD_DIR and PARALLEL_LEVEL.
# The example is built in a `build` directory inside the example's own directory.
build_example() {
  # `local` keeps the helper variables from leaking into the rest of the script.
  local example_dir="${EXAMPLES_DIR}/${1}"
  local build_dir="${example_dir}/build"

  # Every expansion is quoted so that paths containing spaces or glob characters
  # are passed to cmake as a single argument.
  # Configure
  cmake -S "${example_dir}" -B "${build_dir}" -Dcudf_ROOT="${LIB_BUILD_DIR}"
  # Build
  cmake --build "${build_dir}" -j"${PARALLEL_LEVEL}"
}

build_example basic
build_example strings
build_example nested_types
30 changes: 30 additions & 0 deletions cpp/examples/fetch_dependencies.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# =============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================
# Shared dependency bootstrap for the libcudf examples: downloads the CPM.cmake bootstrap script
# into the build tree, loads it, and then uses CPMFindPackage to locate (or fetch) cudf.
set(CPM_DOWNLOAD_VERSION v0.35.3)
set(cudf_examples_get_cpm "${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake")
file(
  DOWNLOAD
  "https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake"
  "${cudf_examples_get_cpm}"
  STATUS cudf_examples_get_cpm_status
)

# STATUS is a two-element list: numeric code (0 means success) and a message. Fail loudly here
# instead of including a missing/empty script and hitting an unrelated error further down.
list(GET cudf_examples_get_cpm_status 0 cudf_examples_get_cpm_code)
if(NOT cudf_examples_get_cpm_code EQUAL 0)
  list(GET cudf_examples_get_cpm_status 1 cudf_examples_get_cpm_message)
  message(
    FATAL_ERROR
      "Failed to download get_cpm.cmake ${CPM_DOWNLOAD_VERSION}: ${cudf_examples_get_cpm_message}"
  )
endif()
include("${cudf_examples_get_cpm}")

# This file is include()d, so its variables land in the caller's scope; drop the temporaries.
unset(cudf_examples_get_cpm_code)
unset(cudf_examples_get_cpm_status)
unset(cudf_examples_get_cpm)

# NOTE: ci/release/update-version.sh rewrites the literal text `CUDF_TAG branch-<version>` on the
# next line with sed; keep that exact spelling when editing.
set(CUDF_TAG branch-23.12)
CPMFindPackage(
  NAME cudf
  GIT_REPOSITORY https://github.com/rapidsai/cudf
  GIT_TAG ${CUDF_TAG}
  GIT_SHALLOW TRUE
  SOURCE_SUBDIR cpp
)
16 changes: 16 additions & 0 deletions cpp/examples/nested_types/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

cmake_minimum_required(VERSION 3.26.4)

project(nested_types VERSION 0.0.1 LANGUAGES CXX CUDA)

# Shared bootstrap used by the examples: sets up CPM and finds (or fetches) cudf.
include(../fetch_dependencies.cmake)

# Configure your project here
add_executable(deduplication deduplication.cpp)
target_compile_features(deduplication PRIVATE cxx_std_17)
target_link_libraries(deduplication PRIVATE cudf::cudf)
209 changes: 209 additions & 0 deletions cpp/examples/nested_types/deduplication.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/groupby.hpp>
#include <cudf/io/json.hpp>
#include <cudf/io/types.hpp>
#include <cudf/join.hpp>
#include <cudf/sorting.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <chrono>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

/**
* @file deduplication.cpp
* @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables.
*
* The example exercises three kinds of nested-type row operators:
* 1. hashing: used by `count_aggregate` and `join_count` to hash rows of any type
* 2. equality: used by `count_aggregate` and `join_count`, together with hashing, to decide
* whether two nested-type rows are equal
* 3. lexicographic: used by `sort_keys` to impose a lexicographical order on nested-type rows
* so that they can be sorted
*
*/

/**
 * @brief Build the device memory resource used by this example
 *
 * A CUDA memory resource is always created. When `pool` is true it is wrapped in a pool memory
 * resource through an owning wrapper, and the wrapper is returned instead.
 *
 * @param pool Whether to wrap the CUDA memory resource in a pool memory resource.
 * @return Shared pointer to the selected memory resource
 */
std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool pool)
{
  auto upstream = std::make_shared<rmm::mr::cuda_memory_resource>();
  if (!pool) { return upstream; }
  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(upstream);
}

/**
 * @brief Load a JSON file into a table
 *
 * The reader options are built with `lines(true)`.
 *
 * @param filepath path of the JSON file to read
 * @return cudf::io::table_with_metadata as returned by `cudf::io::read_json`
 */
cudf::io::table_with_metadata read_json(std::string filepath)
{
  auto const options =
    cudf::io::json_reader_options::builder(cudf::io::source_info(filepath)).lines(true).build();
  return cudf::io::read_json(options);
}

/**
 * @brief Write a table to a JSON file
 *
 * The writer options are built with `lines(true)` and carry the supplied metadata.
 *
 * @param input table to write
 * @param metadata metadata handed to the JSON writer options
 * @param filepath path of the JSON file to write
 */
void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath)
{
  // write the data for inspection
  auto const destination = cudf::io::sink_info(filepath);
  auto const options     = cudf::io::json_writer_options::builder(destination, input)
                         .lines(true)
                         .metadata(metadata)
                         .build();
  cudf::io::write_json(options);
}

/**
 * @brief Count how many times each distinct value of the first column occurs
 *
 * Runs a groupby keyed on column 0 of `input` with a single count aggregation.
 *
 * @param input table whose first column is used as the groupby key
 * @return Table holding the groupby key column(s) followed by the count column
 */
std::unique_ptr<cudf::table> count_aggregate(cudf::table_view input)
{
  // Only the first column takes part in the grouping.
  cudf::table_view const keys{{input.column(0)}};

  // Values column with one row per key row, fed to the count aggregation.
  // NOTE(review): the column is created but never filled here; presumably only its length matters
  // for a count -- confirm.
  auto const counted =
    cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows());

  cudf::groupby::groupby grouper(keys);

  // A single request: one values column with one count aggregation attached.
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests.front().aggregations.push_back(
    cudf::make_count_aggregation<cudf::groupby_aggregation>());
  requests.front().values = counted->view();

  auto [unique_keys, aggregated] = grouper.aggregate(requests);

  // Append the count column to the key column(s) to form the output table.
  auto columns = unique_keys->release();
  columns.push_back(std::move(aggregated.front().results.front()));
  return std::make_unique<cudf::table>(std::move(columns));
}

/**
 * @brief Attach column 1 of `right` to the rows of `left` that match on column 0
 *
 * Performs an inner join between the first column of each table, gathers both tables with the
 * resulting row indices, and appends column 1 of the gathered right table to the gathered left
 * table.
 *
 * @param left left table; all of its columns appear in the result
 * @param right right table; column 0 is the join key and column 1 is appended to the result
 * @return Table with the gathered left columns followed by the gathered right column 1
 */
std::unique_ptr<cudf::table> join_count(cudf::table_view left, cudf::table_view right)
{
  using index_span = cudf::device_span<cudf::size_type const>;

  cudf::table_view const left_keys{{left.column(0)}};
  cudf::table_view const right_keys{{right.column(0)}};
  auto const matches = cudf::inner_join(left_keys, right_keys);

  // matches.first / matches.second are the row indices into left / right respectively.
  auto gathered_left  = cudf::gather(left, index_span{*matches.first});
  auto gathered_right = cudf::gather(right, index_span{*matches.second});

  auto columns       = gathered_left->release();
  auto right_columns = gathered_right->release();
  columns.push_back(std::move(right_columns[1]));

  return std::make_unique<cudf::table>(std::move(columns));
}

/**
 * @brief Reorder all rows of a table by the sorted order of its first column
 *
 * @param input table to sort; column 0 is the sort key
 * @return Table with every column of `input` gathered into the sorted row order
 *
 * @note if stability is desired, use `cudf::stable_sorted_order`
 */
std::unique_ptr<cudf::table> sort_keys(cudf::table_view input)
{
  cudf::table_view const key_view{{input.column(0)}};
  auto const row_order = cudf::sorted_order(key_view);
  return cudf::gather(input, row_order->view());
}

/**
 * @brief Main for nested_types examples
 *
 * Reads a JSON file, counts how often each value of the first column occurs, joins that count
 * back onto every input row, sorts the rows by the first column, and writes the result with an
 * extra "count" column.
 *
 * Command line parameters (provide all three, or none to use the defaults):
 * 1. JSON input file name/path (default: "example.json")
 * 2. JSON output file name/path (default: "output.json")
 * 3. Memory resource: "pool" or "cuda" (default: "pool"); any other value is treated as "cuda"
 *    and a warning is printed
 *
 * Example invocation from directory `cudf/cpp/examples/nested_types`:
 * ./build/deduplication example.json output.json pool
 *
 * @return 0 on success, 1 if the number of command line arguments is invalid
 */
int main(int argc, char const** argv)
{
  if (argc != 4 && argc != 1) {
    // Usage errors belong on stderr so they do not mix with regular output.
    std::cerr << "Either provide all command-line arguments, or none to use defaults" << std::endl;
    std::cerr << "Usage: deduplication [<input.json> <output.json> <pool|cuda>]" << std::endl;
    return 1;
  }

  bool const use_defaults           = (argc == 1);
  std::string const input_filepath  = use_defaults ? "example.json" : argv[1];
  std::string const output_filepath = use_defaults ? "output.json" : argv[2];
  std::string const mr_name         = use_defaults ? "pool" : argv[3];

  // Anything other than "pool" selects the plain CUDA resource; say so instead of doing it
  // silently when the name is not one of the documented values.
  if (mr_name != "pool" && mr_name != "cuda") {
    std::cerr << "Unrecognized memory resource \"" << mr_name << "\", using \"cuda\"" << std::endl;
  }

  // `resource` is declared before every table below, so it is destroyed after them.
  auto pool     = mr_name == "pool";
  auto resource = create_memory_resource(pool);
  rmm::mr::set_current_device_resource(resource.get());

  std::cout << "Reading " << input_filepath << "..." << std::endl;
  // read input file
  auto [input, metadata] = read_json(input_filepath);

  auto count = count_aggregate(input->view());

  auto combined = join_count(input->view(), count->view());

  auto sorted = sort_keys(combined->view());

  // Name the column that join_count appended so the writer has a name for it.
  metadata.schema_info.emplace_back("count");

  std::cout << "Writing " << output_filepath << "..." << std::endl;
  write_json(sorted->view(), metadata, output_filepath);

  return 0;
}
5 changes: 5 additions & 0 deletions cpp/examples/nested_types/example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7}
{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7}
{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8}
{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9}
{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9}
18 changes: 1 addition & 17 deletions cpp/examples/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,7 @@ project(
LANGUAGES CXX CUDA
)

set(CPM_DOWNLOAD_VERSION v0.35.3)
file(
DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake
${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake
)
include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)

set(CUDF_TAG branch-23.12)
CPMFindPackage(
NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf
GIT_TAG ${CUDF_TAG}
GIT_SHALLOW
TRUE
SOURCE_SUBDIR
cpp
)
include(../fetch_dependencies.cmake)

list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

Expand Down

0 comments on commit afd7d18

Please sign in to comment.