Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example code for blog on new row comparators #13795

Merged
merged 42 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
34b8fb4
first draft example
divyegala Aug 1, 2023
9059909
add dedup example
divyegala Aug 8, 2023
5f3b0da
write new example
divyegala Aug 18, 2023
a15dfef
delete old example
divyegala Aug 18, 2023
776b10d
add metadata to json writer
karthikeyann Aug 18, 2023
3c2c0ee
Merge branch 'branch-23.10' into blog-example
divyegala Aug 18, 2023
fa6f928
add new column name
karthikeyann Aug 23, 2023
d0da110
Merge branch 'branch-23.10' into blog-example
karthikeyann Aug 23, 2023
e1f6c7c
add sort, drop filtering
divyegala Aug 23, 2023
df45de2
add pool mr, reduce unnecessary table_view copy
divyegala Sep 21, 2023
c12bc71
Merge remote-tracking branch 'upstream/branch-23.10' into blog-example
divyegala Sep 21, 2023
496c28a
Update cpp/examples/nested_types/deduplication.cpp
divyegala Sep 22, 2023
647604d
use placeholder column for count aggregation
divyegala Sep 26, 2023
345ed77
fix timing counters
divyegala Oct 10, 2023
4fa8e3f
ranges
divyegala Oct 13, 2023
3668dec
add count aggregation copy
divyegala Oct 13, 2023
8675190
Update deduplication.cpp
divyegala Oct 13, 2023
2b2ebdd
try has_nulls() instead of nullable() in gather
divyegala Oct 13, 2023
3ba32ae
address reviews
divyegala Oct 20, 2023
a15e993
Merge remote-tracking branch 'upstream/branch-23.12' into blog-example
divyegala Oct 20, 2023
391140c
Merge branch 'branch-23.12' into blog-example
divyegala Oct 20, 2023
8d8d1ac
Merge branch 'branch-23.12' into blog-example
divyegala Oct 26, 2023
2f0f0e8
some review comments
divyegala Nov 6, 2023
e113913
Merge remote-tracking branch 'upstream/branch-23.12' into blog-example
divyegala Nov 6, 2023
f2c42be
Revert "ranges"
divyegala Nov 6, 2023
27db5c2
Revert "try has_nulls() instead of nullable() in gather"
divyegala Nov 6, 2023
9a3c852
address more review comments
divyegala Nov 6, 2023
a6cf0ee
revert to nullable() in gather
divyegala Nov 6, 2023
1948b10
delete output file included by mistake
divyegala Nov 6, 2023
f7e96a9
bool for creating mr
divyegala Nov 14, 2023
2bfd415
Merge remote-tracking branch 'upstream/branch-23.12' into blog-example
divyegala Nov 14, 2023
9e2020d
Add function for building examples
vyasr Nov 14, 2023
45d6496
Centralized dependency fetching
vyasr Nov 14, 2023
57ccb51
Update cpp/examples/nested_types/example.json
divyegala Nov 14, 2023
4d6ac25
add cout
divyegala Nov 14, 2023
b3fee13
Merge remote-tracking branch 'origin/blog-example' into blog-example
divyegala Nov 14, 2023
6cf9f58
Merge branch 'branch-23.12' into blog-example
divyegala Nov 14, 2023
02438d1
add param in docs
divyegala Nov 14, 2023
26ed48c
Merge remote-tracking branch 'origin/blog-example' into blog-example
divyegala Nov 14, 2023
57d68bd
Update cpp/examples/nested_types/deduplication.cpp
divyegala Nov 15, 2023
43c0fc3
Merge branch 'branch-23.12' into blog-example
divyegala Nov 15, 2023
9625037
Merge branch 'branch-23.12' into blog-example
bdice Nov 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ Current examples:

- Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf
- Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings
- Nested Types: demonstrates using libcudf for some operations on nested types
10 changes: 9 additions & 1 deletion cpp/examples/build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

# libcudf examples build script

Expand Down Expand Up @@ -29,3 +29,11 @@ STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build
cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
# Build
cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}

# Nested Types example
NESTED_TYPES_EXAMPLE_DIR=${EXAMPLES_DIR}/nested_types
NESTED_TYPES_EXAMPLE_BUILD_DIR=${NESTED_TYPES_EXAMPLE_DIR}/build
# Configure
cmake -S ${NESTED_TYPES_EXAMPLE_DIR} -B ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
# Build
cmake --build ${NESTED_TYPES_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}
vyasr marked this conversation as resolved.
Show resolved Hide resolved
32 changes: 32 additions & 0 deletions cpp/examples/nested_types/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

# Build configuration for the nested_types example: it produces a single executable,
# `deduplication`, linked against libcudf.

cmake_minimum_required(VERSION 3.26.4)

# Both CXX and CUDA are enabled for this project.
project(
nested_types
VERSION 0.0.1
LANGUAGES CXX CUDA
)

# Fetch the CPM.cmake bootstrap script for the pinned release into the build tree and include it,
# which makes CPMFindPackage available below.
# NOTE(review): the download has no EXPECTED_HASH and its STATUS is not checked, so a failed or
# altered download is only noticed when the include() below runs - consider verifying it.
set(CPM_DOWNLOAD_VERSION v0.35.3)
file(
DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake
${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake
)
include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)

# Resolve the cudf package through CPM. The source location is the rapidsai/cudf repository at
# CUDF_TAG, cloned shallowly, with the CMake project living in the `cpp` subdirectory.
# Per the CPM documentation, CPMFindPackage tries find_package first and only fetches the sources
# when no installed package is found - TODO confirm this is the intended lookup order here.
set(CUDF_TAG branch-23.12)
CPMFindPackage(
NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf
GIT_TAG ${CUDF_TAG}
GIT_SHALLOW
TRUE
SOURCE_SUBDIR
cpp
)

# Configure your project here
# The example is one translation unit, linked privately against cudf::cudf and built as C++17.
add_executable(deduplication deduplication.cpp)
target_link_libraries(deduplication PRIVATE cudf::cudf)
target_compile_features(deduplication PRIVATE cxx_std_17)
bdice marked this conversation as resolved.
Show resolved Hide resolved
188 changes: 188 additions & 0 deletions cpp/examples/nested_types/deduplication.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/groupby.hpp>
#include <cudf/io/json.hpp>
#include <cudf/io/types.hpp>
#include <cudf/join.hpp>
#include <cudf/sorting.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <chrono>
#include <iostream>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

divyegala marked this conversation as resolved.
Show resolved Hide resolved
/**
* @file deduplication.cpp
* @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables.
*
* The algorithms demonstrated here were chosen to showcase the three kinds of nested-type row
* operators:
* 1. hashing: used by `count_aggregate` and `join_count` to hash inputs of any type
* 2. equality: used by `count_aggregate` and `join_count`, in conjunction with hashing, to
* determine equality for nested types
* 3. lexicographic: used by `sort_keys` to establish a lexicographical order for nested types
* so that they can be sorted
*
*/

/**
* @brief Create memory resource for libcudf functions
divyegala marked this conversation as resolved.
Show resolved Hide resolved
*/
std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string_view name)
divyegala marked this conversation as resolved.
Show resolved Hide resolved
{
auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>();
if (name == "pool") {
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(cuda_mr);
}
return cuda_mr;
}

/**
 * @brief Read a JSON file into a table, with the reader's `lines` option enabled
 *
 * @param filepath Path of the JSON file to read
 * @return The table that was read, together with its metadata
 */
cudf::io::table_with_metadata read_json(std::string filepath)
{
  auto reader_options =
    cudf::io::json_reader_options::builder(cudf::io::source_info(filepath)).lines(true).build();
  return cudf::io::read_json(reader_options);
}

/**
 * @brief Write a table to a JSON file, with the writer's `lines` option enabled
 *
 * @param input Table to write
 * @param metadata Table metadata attached to the writer options
 * @param filepath Path of the output file
 */
void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath)
{
  // `filepath` and `metadata` are by-value sink parameters that are not used again in this
  // function, so they are moved onward instead of being copied a second time.
  auto sink_info = cudf::io::sink_info(std::move(filepath));
  auto builder   = cudf::io::json_writer_options::builder(sink_info, input).lines(true);
  builder.metadata(std::move(metadata));

  // write the data for inspection
  auto options = builder.build();
  cudf::io::write_json(options);
}

/**
* @brief Aggregate count of duplicate rows in nested-type column
*
* Groups the rows of `input` by its first column and requests a single count aggregation, then
* returns the group keys with the aggregation result appended as the last column.
*
* @param input Table whose column 0 holds the keys to count
* @return Table made of the group-by key column(s) followed by the count column
*/
std::unique_ptr<cudf::table> count_aggregate(cudf::table_view input)
{
// Get count for each key
auto keys = cudf::table_view{{input.column(0)}};
// Placeholder INT32 values column with one row per key row. Its contents are never written here;
// per the review discussion below it exists only so that the groupby takes the hash-based path
// instead of the sort-based one, and it does not hold the returned counts.
auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do count aggregations return int32_t or size_type? I think it's supposed to be size_type but not 100% sure.

Suggested change
auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows());
auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()}, keys.num_rows());

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's just a dummy column to force usage of hash aggregation instead of sort aggregation, it does not hold the return values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I understand. We usually want to rely on hash aggregation anytime we can, since it tends to be faster than sort aggregations. Can you point to the behavior that you're avoiding, and why this is the right solution (as opposed to fixing the aggregation dispatch to use hashing instead of sorting)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// Currently, input values (not keys) of STRUCT and LIST types are not supported in any of
// hash-based aggregations. For those situations, we fallback to sort-based aggregations.
if (v_type.id() == type_id::STRUCT or v_type.id() == type_id::LIST) { return false; }

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't the right solution. It should definitely be fixed at the core of the problem.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't force hash aggregations do we still have at least some? As long as the example is demonstrating the all the comparator functionality that we want I think it's totally fine if it isn't the most performant right now (and we've removed timings anyway). If there are improvements to be made to libcudf, we should open a separate issue for that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get it, what do you mean by "at least have some"?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. We need to file an issue that makes this use hash-based aggregations without any "forcing"
  2. I think Vyas is asking, if you don't add this column to "force" hash aggregations, are some of the aggregations hash-based and others sort-based? My understanding of libcudf's behavior is that if any aggregation is sort-based, all the aggregations fall back to using sort-based implementations. Is that true?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's right. If an aggregation only has a sort-based implementation then it will fall back to a sort aggregation.

In this particular case though, a hash-based aggregation is available and it needs to be "forced" due to a limitation.

Here's the issue: #14412

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bdice are you OK with marking this thread as resolved for now? The issue looks like it has a reasonable plan for how to move forward, but I don't think we should block this PR on getting that enabled.


cudf::groupby::groupby grpby_obj(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
auto agg = cudf::make_count_aggregation<cudf::groupby_aggregation>();
requests[0].aggregations.push_back(std::move(agg));
requests[0].values = *val;
auto agg_results = grpby_obj.aggregate(requests);
auto result_key = std::move(agg_results.first);
auto result_val = std::move(agg_results.second[0].results[0]);

auto left_cols = result_key->release();
left_cols.push_back(std::move(result_val));
divyegala marked this conversation as resolved.
Show resolved Hide resolved

return std::make_unique<cudf::table>(std::move(left_cols));
}

/**
 * @brief Attach to every row the count that belongs to its key
 *
 * Inner-joins column 0 of `left` with column 0 of `right`, gathers the matching rows of both
 * tables, and appends column 1 of the gathered `right` table to the gathered `left` columns.
 *
 * @param left Table whose column 0 is the join key
 * @param right Table whose column 0 is the join key and whose column 1 is appended to the result
 * @return Table holding the gathered `left` columns followed by the gathered column 1 of `right`
 */
std::unique_ptr<cudf::table> join_count(cudf::table_view left, cudf::table_view right)
{
  auto const left_keys  = cudf::table_view{{left.column(0)}};
  auto const right_keys = cudf::table_view{{right.column(0)}};

  // The join yields one gather map per side; matching entries identify a joined pair of rows.
  auto const gather_maps = cudf::inner_join(left_keys, right_keys);

  auto gathered_left =
    cudf::gather(left, cudf::device_span<cudf::size_type const>{*gather_maps.first});
  auto gathered_right =
    cudf::gather(right, cudf::device_span<cudf::size_type const>{*gather_maps.second});

  auto result_columns = gathered_left->release();
  auto count_columns  = gathered_right->release();
  result_columns.push_back(std::move(count_columns[1]));

  return std::make_unique<cudf::table>(std::move(result_columns));
}

/**
 * @brief Reorder the rows of a table by the sorted order of its first (nested-type) column
 *
 * @note if stability is desired, use `cudf::stable_sorted_order`
 *
 * @param input Table to sort; column 0 is the sort key
 * @return Copy of `input` with its rows gathered into sorted key order
 */
std::unique_ptr<cudf::table> sort_keys(cudf::table_view input)
{
  auto const key_view  = cudf::table_view{{input.column(0)}};
  auto const row_order = cudf::sorted_order(key_view);
  return cudf::gather(input, row_order->view());
}

/**
 * @brief Main for nested_types examples
 *
 * Command line parameters (supply all three, or none to use the defaults):
 * 1. JSON input file name/path (default: "example.json")
 * 2. JSON output file name/path (default: "output.json")
 * 3. Memory resource: "pool" or "cuda" (default: "pool")
 *
 * Example invocation from directory `cudf/cpp/examples/nested_types`:
 * ./build/deduplication example.json output.json pool
 *
 * @param argc Number of command-line arguments, including the program name
 * @param argv Command-line argument strings
 * @return 0 on success, 1 when the number of command-line arguments is invalid
 */
int main(int argc, char const** argv)
{
  // Start from the defaults; they are overridden only when all three arguments are supplied.
  std::string input_filepath{"example.json"};
  std::string output_filepath{"output.json"};
  std::string mr_name{"pool"};

  if (argc == 4) {
    input_filepath  = argv[1];
    output_filepath = argv[2];
    mr_name         = argv[3];
  } else if (argc != 1) {
    // Diagnostics go to stderr so that they are not mixed into regular program output.
    std::cerr << "Either provide all command-line arguments, or none to use defaults\n"
              << "Usage: deduplication [<input.json> <output.json> <pool|cuda>]\n";
    return 1;
  }

  // `resource` is declared before every table below, so it is destroyed after all of them.
  auto resource = create_memory_resource(mr_name);
  rmm::mr::set_current_device_resource(resource.get());

  // read input file
  auto [input, metadata] = read_json(input_filepath);

  // count duplicates per key, attach the counts to the input rows, then sort by key
  auto count    = count_aggregate(input->view());
  auto combined = join_count(input->view(), count->view());
  auto sorted   = sort_keys(combined->view());

  // the joined table has one extra column, so give it a name in the output
  metadata.schema_info.emplace_back("count");

  write_json(sorted->view(), metadata, output_filepath);

  return 0;
}
5 changes: 5 additions & 0 deletions cpp/examples/nested_types/example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7}
{"features": {"key":"a2", "value": [{"info":"message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7}
{"features": {"key":"a3", "value": [{"info":"message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8}
{"features": {"key":"a1", "value": [{"info":"message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9}
{"features": {"key":"a4", "value": [{"info":"message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9}
divyegala marked this conversation as resolved.
Show resolved Hide resolved
Loading