rapidsai · rapids-bot · Nov 16, 2023 · Aug 1, 2023 · Aug 8, 2023 · Aug 18, 2023
@@ -7,3 +7,4 @@ Current examples:
 
 - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf
 - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings
+- Nested Types: demonstrates using libcudf to count duplicate rows (groupby), join, and sort on nested-type columns
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 
 # libcudf examples build script
 
@@ -29,3 +29,11 @@ STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build
 cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
 # Build
 cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}
+
+# Nested Types example
+NESTED_TYPES_EXAMPLE_DIR=${EXAMPLES_DIR}/nested_types
+NESTED_TYPES_EXAMPLE_BUILD_DIR=${NESTED_TYPES_EXAMPLE_DIR}/build
+# Configure
+cmake -S ${NESTED_TYPES_EXAMPLE_DIR} -B ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
+# Build
+cmake --build ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}
@@ -0,0 +1,32 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.26.4)
+
+# Standalone example project; both the C++ and CUDA languages are enabled
+project(
+  nested_types
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+# Fetch the CPM.cmake bootstrap script for the pinned release into the build tree, then load it
+set(CPM_DOWNLOAD_VERSION v0.35.3)
+file(
+  DOWNLOAD
+  https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake
+  ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake
+)
+include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
+
+# Locate cudf through CPM; the repository and tag below tell CPM where to get the sources
+# (shallow clone, project rooted at the `cpp` subdirectory) when it has to download them
+set(CUDF_TAG branch-23.12)
+CPMFindPackage(
+  NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf
+  GIT_TAG ${CUDF_TAG}
+  GIT_SHALLOW
+    TRUE
+    SOURCE_SUBDIR
+    cpp
+)
+
+# Configure your project here
+# Single executable built from deduplication.cpp, linked against cudf and compiled as C++17
+add_executable(deduplication deduplication.cpp)
+target_link_libraries(deduplication PRIVATE cudf::cudf)
+target_compile_features(deduplication PRIVATE cxx_std_17)
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/join.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+
+/**
+ * @file deduplication.cpp
+ * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables.
+ *
+ * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three
+ * kinds:
+ * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type
+ * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing
+ * to determine equality for nested types
+ * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types
+ * so as to enable sorting
+ *
+ */
+
+/**
+ * @brief Build the device memory resource requested by name
+ *
+ * @param name "pool" selects a pool resource layered on top of a CUDA memory resource; any other
+ * value selects the plain CUDA memory resource
+ * @return Shared ownership of the selected memory resource
+ */
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string_view name)
+{
+  auto upstream = std::make_shared<rmm::mr::cuda_memory_resource>();
+  // Anything other than "pool" gets the plain CUDA resource
+  if (name != "pool") { return upstream; }
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(upstream);
+}
+
+/**
+ * @brief Load a JSON file into a libcudf table, with the reader's `lines` option enabled
+ *
+ * @param filepath Path of the JSON file to read
+ * @return The table produced by the reader together with its metadata
+ */
+cudf::io::table_with_metadata read_json(std::string filepath)
+{
+  auto reader_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info(filepath)).lines(true).build();
+  return cudf::io::read_json(reader_options);
+}
+
+/**
+ * @brief Write a table to a JSON file, with the writer's `lines` option enabled
+ *
+ * @param input Table to write
+ * @param metadata Table metadata handed to the writer
+ * @param filepath Path of the output file
+ */
+void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath)
+{
+  // write the data for inspection
+  auto writer_builder =
+    cudf::io::json_writer_options::builder(cudf::io::sink_info(filepath), input);
+  writer_builder.lines(true).metadata(metadata);
+  cudf::io::write_json(writer_builder.build());
+}
+
+/**
+ * @brief Aggregate count of duplicate rows in nested-type column
+ *
+ * Groups the rows of `input` by its first column and requests a count aggregation per group.
+ *
+ * @param input Table whose first column holds the keys to group by
+ * @return Table made of the group-by key column(s) followed by one column of per-group counts
+ */
+std::unique_ptr<cudf::table> count_aggregate(cudf::table_view input)
+{
+  // Get count for each key
+  auto keys = cudf::table_view{{input.column(0)}};
+  // The aggregation request needs a values column with one row per key row; this one is only
+  // allocated here and never filled in.
+  auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows());
+
+  cudf::groupby::groupby grpby_obj(keys);
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back(cudf::groupby::aggregation_request());
+  auto agg = cudf::make_count_aggregation<cudf::groupby_aggregation>();
+  requests[0].aggregations.push_back(std::move(agg));
+  requests[0].values = *val;
+  auto agg_results   = grpby_obj.aggregate(requests);
+  auto result_key    = std::move(agg_results.first);
+  // One request with one aggregation was submitted, so the single result is at [0].results[0]
+  auto result_val    = std::move(agg_results.second[0].results[0]);
+
+  // Append the count column after the key column(s) to form the output table
+  auto left_cols = result_key->release();
+  left_cols.push_back(std::move(result_val));
+
+  return std::make_unique<cudf::table>(std::move(left_cols));
+}
+
+/**
+ * @brief Join each row with its duplicate counts
+ *
+ * Inner-joins `left` and `right` on their first columns, then appends column 1 of the matched
+ * `right` rows to the matched `left` rows.
+ *
+ * @param left Table whose first column is the join key
+ * @param right Table whose first column is the join key and whose column 1 is appended
+ * @return The gathered `left` columns followed by the gathered column 1 of `right`
+ */
+std::unique_ptr<cudf::table> join_count(cudf::table_view left, cudf::table_view right)
+{
+  auto const left_keys  = cudf::table_view{{left.column(0)}};
+  auto const right_keys = cudf::table_view{{right.column(0)}};
+  auto [left_map, right_map] = cudf::inner_join(left_keys, right_keys);
+
+  // Materialize the matched rows of each side and take ownership of their columns
+  auto joined_cols =
+    cudf::gather(left, cudf::device_span<cudf::size_type const>{*left_map})->release();
+  auto matched_right_cols =
+    cudf::gather(right, cudf::device_span<cudf::size_type const>{*right_map})->release();
+
+  joined_cols.push_back(std::move(matched_right_cols[1]));
+  return std::make_unique<cudf::table>(std::move(joined_cols));
+}
+
+/**
+ * @brief Reorder a table by the sorted order of its first (nested-type) column
+ *
+ * @note if stability is desired, use `cudf::stable_sorted_order`
+ *
+ * @param input Table to sort; only its first column is used as the sort key
+ * @return Copy of `input` with all rows gathered into the computed order
+ */
+std::unique_ptr<cudf::table> sort_keys(cudf::table_view input)
+{
+  auto const key_view  = cudf::table_view{{input.column(0)}};
+  auto const row_order = cudf::sorted_order(key_view);
+  return cudf::gather(input, row_order->view());
+}
+
+/**
+ * @brief Main for nested_types examples
+ *
+ * Reads a JSON file, counts duplicates of its first column, joins the counts back onto every
+ * input row, sorts by the first column and writes the result, with an extra "count" column, as
+ * JSON.
+ *
+ * Command line parameters (pass all three, or none to use the defaults):
+ * 1. JSON input file name/path (default: "example.json")
+ * 2. JSON output file name/path (default: "output.json")
+ * 3. Memory resource: "pool" or "cuda" (default: "pool")
+ *
+ * Example invocation from directory `cudf/cpp/examples/nested_types`:
+ * ./build/deduplication example.json output.json pool
+ *
+ * @return 0 on success, 1 when the number of command-line arguments is invalid
+ */
+int main(int argc, char const** argv)
+{
+  if (argc != 4 && argc != 1) {
+    // Usage errors go to stderr so they are not mixed with regular program output
+    std::cerr << "Either provide all command-line arguments, or none to use defaults\n";
+    return 1;
+  }
+
+  // Start from the defaults and override them only when all three arguments were given
+  std::string input_filepath  = "example.json";
+  std::string output_filepath = "output.json";
+  std::string mr_name         = "pool";
+  if (argc == 4) {
+    input_filepath  = argv[1];
+    output_filepath = argv[2];
+    mr_name         = argv[3];
+  }
+
+  // `resource` is declared before every table below, so it is destroyed after them
+  auto resource = create_memory_resource(mr_name);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  // read input file
+  auto [input, metadata] = read_json(input_filepath);
+
+  auto count = count_aggregate(input->view());
+
+  auto combined = join_count(input->view(), count->view());
+
+  auto sorted = sort_keys(combined->view());
+
+  // Name the column that join_count appended after the input columns
+  metadata.schema_info.emplace_back("count");
+
+  write_json(sorted->view(), metadata, output_filepath);
+
+  return 0;
+}
@@ -0,0 +1,5 @@
+{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7}
+{"features": {"key":"a2", "value": [{"info":"message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7}
+{"features": {"key":"a3", "value": [{"info":"message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8}
+{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9}
+{"features": {"key":"a4", "value": [{"info":"message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -7,3 +7,4 @@ Current examples:

		- Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf
		- Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings
		- Nested Types: demonstrates using libcudf for some operations on nested types