diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 8aed7089dc5..ed600000135 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -576,7 +576,7 @@ add_library(cudftestutil STATIC
             tests/utilities/column_utilities.cu
             tests/utilities/table_utilities.cu
             tests/io/metadata_utilities.cpp
-            tests/strings/utilities.cu)
+            tests/strings/utilities.cpp)
 
 set_target_properties(cudftestutil
                       PROPERTIES BUILD_RPATH "\$ORIGIN"
diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp
index ba2bc245484..0ec2590bdb5 100644
--- a/cpp/benchmarks/common/generate_benchmark_input.cpp
+++ b/cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -297,12 +297,21 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
     }
   }
 
+  // cudf expects the null mask buffer to be padded to a multiple of 64 bytes,
+  // so allocate the proper size and copy what we have.
+  rmm::device_buffer result_bitmask{cudf::bitmask_allocation_size_bytes(num_rows),
+                                    rmm::cuda_stream_default};
+  cudaMemcpyAsync(result_bitmask.data(),
+                  null_mask.data(),
+                  null_mask.size() * sizeof(cudf::bitmask_type),
+                  cudaMemcpyHostToDevice,
+                  rmm::cuda_stream_default);
+
   return std::make_unique<cudf::column>(
     cudf::data_type{cudf::type_to_id<T>()},
     num_rows,
     rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
-    rmm::device_buffer(
-      null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
+    std::move(result_bitmask));
 }
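For context on the padding rule the benchmark now honors: `cudf::bitmask_allocation_size_bytes` rounds the raw one-bit-per-row mask size up to a multiple of 64 bytes, which is what `cudf::column` expects for its null-mask buffer. A minimal host-only sketch of the sizing rule, using nothing beyond the public `cudf/null_mask.hpp` API:

```cpp
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>

#include <cstdio>

int main()
{
  // One bit per row, rounded up to a multiple of 64 bytes: 1..512 rows all
  // map to 64 bytes, 513 rows to 128 bytes, and so on.
  for (cudf::size_type num_rows : {1, 512, 513, 1024, 1025}) {
    std::printf("rows=%5d -> padded null-mask bytes=%zu\n",
                num_rows,
                cudf::bitmask_allocation_size_bytes(num_rows));
  }
  return 0;
}
```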
 
 /**
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 6661f518639..374af536dc5 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -503,9 +503,12 @@ std::unique_ptr<Base> make_merge_m2_aggregation();
  *
  * Compute covariance between two columns.
  * The input columns are child columns of a non-nullable struct column.
+ * @param min_periods Minimum number of non-null observations required to produce a result.
+ * @param ddof Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is
+ * the number of non-null observations.
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_covariance_aggregation();
+std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
 
 /**
  * @brief Factory to create a CORRELATION aggregation
  *
  * Compute correlation coefficient between two columns.
  * The input columns are child columns of a non-nullable struct column.
  *
- * @param[in] type: correlation_type
+ * @param type correlation_type
+ * @param min_periods Minimum number of non-null observations required to produce a result.
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_correlation_aggregation(correlation_type type);
+std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,
+                                                   size_type min_periods = 1);
 
 /**
  * @brief Factory to create a TDIGEST aggregation
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index 19df8d8e7b6..cffefcaf9cd 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -192,6 +192,15 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op
     case ast_operator::NOT:
       f.template operator()<ast_operator::NOT>(std::forward<Ts>(args)...);
       break;
+    case ast_operator::CAST_TO_INT64:
+      f.template operator()<ast_operator::CAST_TO_INT64>(std::forward<Ts>(args)...);
+      break;
+    case ast_operator::CAST_TO_UINT64:
+      f.template operator()<ast_operator::CAST_TO_UINT64>(std::forward<Ts>(args)...);
+      break;
+    case ast_operator::CAST_TO_FLOAT64:
+      f.template operator()<ast_operator::CAST_TO_FLOAT64>(std::forward<Ts>(args)...);
+      break;
     default:
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Invalid operator.");
@@ -780,6 +789,26 @@ struct operator_functor {
   }
 };
 
+template <typename To>
+struct cast {
+  static constexpr auto arity{1};
+  template <typename From>
+  CUDA_DEVICE_CALLABLE auto operator()(From f) -> decltype(static_cast<To>(f))
+  {
+    return static_cast<To>(f);
+  }
+};
+
+template <>
+struct operator_functor<ast_operator::CAST_TO_INT64, false> : cast<int64_t> {
+};
+template <>
+struct operator_functor<ast_operator::CAST_TO_UINT64, false> : cast<uint64_t> {
+};
+template <>
+struct operator_functor<ast_operator::CAST_TO_FLOAT64, false> : cast<double> {
+};
+
 /*
  * The default specialization of nullable operators is to fall back to the non-nullable
  * implementation
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp
index 5454f9a2b95..7ae40a7d65f 100644
--- a/cpp/include/cudf/ast/expressions.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -88,29 +88,32 @@ enum class ast_operator {
            ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) ==
            ///< LOGICAL_OR(valid, valid)
   // Unary operators
-  IDENTITY,    ///< Identity function
-  SIN,         ///< Trigonometric sine
-  COS,         ///< Trigonometric cosine
-  TAN,         ///< Trigonometric tangent
-  ARCSIN,      ///< Trigonometric sine inverse
-  ARCCOS,      ///< Trigonometric cosine inverse
-  ARCTAN,      ///< Trigonometric tangent inverse
-  SINH,        ///< Hyperbolic sine
-  COSH,        ///< Hyperbolic cosine
-  TANH,        ///< Hyperbolic tangent
-  ARCSINH,     ///< Hyperbolic sine inverse
-  ARCCOSH,     ///< Hyperbolic cosine inverse
-  ARCTANH,     ///< Hyperbolic tangent inverse
-  EXP,         ///< Exponential (base e, Euler number)
-  LOG,         ///< Natural Logarithm (base e)
-  SQRT,        ///< Square-root (x^0.5)
-  CBRT,        ///< Cube-root (x^(1.0/3))
-  CEIL,        ///< Smallest integer value not less than arg
-  FLOOR,       ///< largest integer value not greater than arg
-  ABS,         ///< Absolute value
-  RINT,        ///< Rounds the floating-point argument arg to an integer value
-  BIT_INVERT,  ///< Bitwise Not (~)
-  NOT          ///< Logical Not (!)
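The replacement enum block below adds the three cast operators to the public AST grammar. A hedged sketch of how they can be exercised end to end: `input_table` is a placeholder `cudf::table_view`, and `cudf::compute_column` from `cudf/transform.hpp` is assumed as the evaluation entry point.

```cpp
#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <memory>

// Sketch: materialize column 0 of `input_table` (e.g. a FLOAT64 column)
// as INT64 using the new unary CAST_TO_INT64 operator.
std::unique_ptr<cudf::column> cast_first_column_to_int64(cudf::table_view const& input_table)
{
  auto const col0 = cudf::ast::column_reference(0);
  auto const expr = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, col0);
  return cudf::compute_column(input_table, expr);
}
```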
+ IDENTITY, ///< Identity function + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) + CAST_TO_INT64, ///< Cast value to int64_t + CAST_TO_UINT64, ///< Cast value to uint64_t + CAST_TO_FLOAT64 ///< Cast value to double }; /** diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index e12ed3f521e..69bde7f57fd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -901,7 +901,14 @@ class merge_m2_aggregation final : public groupby_aggregation { */ class covariance_aggregation final : public groupby_aggregation { public: - explicit covariance_aggregation() : aggregation{COVARIANCE} {} + explicit covariance_aggregation(size_type min_periods, size_type ddof) + : aggregation{COVARIANCE}, _min_periods{min_periods}, _ddof(ddof) + { + } + size_type _min_periods; + size_type _ddof; + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } std::unique_ptr clone() const override { @@ -913,6 +920,12 @@ class covariance_aggregation final : public groupby_aggregation { return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const + { + return std::hash{}(_min_periods) ^ std::hash{}(_ddof); + } }; /** @@ -920,8 +933,12 @@ class covariance_aggregation final : public groupby_aggregation { */ class correlation_aggregation final : public groupby_aggregation { public: - explicit correlation_aggregation(correlation_type type) : aggregation{CORRELATION}, _type{type} {} + explicit correlation_aggregation(correlation_type type, size_type min_periods) + : aggregation{CORRELATION}, _type{type}, _min_periods{min_periods} + { + } correlation_type _type; + size_type _min_periods; bool is_equal(aggregation const& _other) const override { @@ -944,7 +961,10 @@ class correlation_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } protected: - size_t hash_impl() const { return std::hash{}(static_cast(_type)); } + size_t hash_impl() const + { + return std::hash{}(static_cast(_type)) ^ std::hash{}(_min_periods); + } }; /** diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 89e589d306a..aac44bed50e 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -24,55 +24,21 @@ namespace cudf { namespace io { namespace detail { namespace csv { + /** - * @brief Class to read CSV dataset data into columns. + * @brief Reads the entire dataset. 
+ *
+ * @param source Input `datasource` object to read the dataset from
+ * @param options Settings for controlling reading behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ *
+ * @return The set of columns along with table metadata
  */
-class reader {
- private:
-  class impl;
-  std::unique_ptr<impl> _impl;
-
- public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  csv_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Constructor from an array of datasources
-   *
-   * @param sources Input `datasource` objects to read the dataset from
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::unique_ptr<datasource>>&& sources,
-                  csv_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Destructor explicitly-declared to avoid inlined in header
-   */
-  ~reader();
-
-  /**
-   * @brief Reads the entire dataset.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return The set of columns along with table metadata
-   */
-  table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
-};
+table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
+                             csv_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr);
 
 class writer {
  public:
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index eee2d396d9f..5fa07fd5568 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -24,7 +24,9 @@
 #include
 #include
+#include <rmm/mr/device/arena_memory_resource.hpp>
 #include
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include
 #include
 #include
@@ -217,6 +219,8 @@ class TempDirTestEnvironment : public ::testing::Environment {
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
+inline auto make_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
+
 inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
 
 inline auto make_pool()
@@ -224,6 +228,11 @@ inline auto make_pool()
 {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
 }
 
+inline auto make_arena()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
+}
+
 inline auto make_binning()
 {
   auto pool = make_pool();
@@ -253,7 +262,9 @@ inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
 {
   if (allocation_mode == "binning") return make_binning();
   if (allocation_mode == "cuda") return make_cuda();
+  if (allocation_mode == "async") return make_async();
   if (allocation_mode == "pool") return make_pool();
+  if (allocation_mode == "arena") return make_arena();
   if (allocation_mode == "managed") return make_managed();
   CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode);
 }
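The two new factories plug additional RMM resources into the test harness's `--rmm_mode` switch. A hedged sketch of selecting one programmatically, assuming only the `cudf_test` fixture header shown above:

```cpp
#include <cudf_test/base_fixture.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Same effect as launching a test binary with `--rmm_mode=arena`:
  // wrap the plain CUDA resource in an arena sub-allocator and make it
  // the default device resource for subsequent RMM allocations.
  auto resource = cudf::test::create_memory_resource("arena");
  rmm::mr::set_current_device_resource(resource.get());
  // ... run code that allocates device memory through RMM ...
  return 0;
}
```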
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 3c6ab157d46..31bf9d65d56 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -713,23 +713,25 @@ template std::unique_ptr make_merge_m2_aggregation
 template <typename Base>
-std::unique_ptr<Base> make_covariance_aggregation()
+std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods, size_type ddof)
 {
-  return std::make_unique<detail::covariance_aggregation>();
+  return std::make_unique<detail::covariance_aggregation>(min_periods, ddof);
 }
-template std::unique_ptr<aggregation> make_covariance_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_covariance_aggregation<groupby_aggregation>();
+template std::unique_ptr<aggregation> make_covariance_aggregation<aggregation>(
+  size_type min_periods, size_type ddof);
+template std::unique_ptr<groupby_aggregation> make_covariance_aggregation<groupby_aggregation>(
+  size_type min_periods, size_type ddof);
 
 /// Factory to create a CORRELATION aggregation
 template <typename Base>
-std::unique_ptr<Base> make_correlation_aggregation(correlation_type type)
+std::unique_ptr<Base> make_correlation_aggregation(correlation_type type, size_type min_periods)
 {
-  return std::make_unique<detail::correlation_aggregation>(type);
+  return std::make_unique<detail::correlation_aggregation>(type, min_periods);
 }
 template std::unique_ptr<aggregation> make_correlation_aggregation<aggregation>(
-  correlation_type type);
+  correlation_type type, size_type min_periods);
 template std::unique_ptr<groupby_aggregation> make_correlation_aggregation<groupby_aggregation>(
-  correlation_type type);
+  correlation_type type, size_type min_periods);
 
 template <typename Base>
 std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids)
diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp
index 1889ae67ee3..ea6894b5ed3 100644
--- a/cpp/src/aggregation/result_cache.cpp
+++ b/cpp/src/aggregation/result_cache.cpp
@@ -30,10 +30,10 @@ void result_cache::add_result(column_view const& input,
 {
   // We can't guarantee that agg will outlive the cache, so we need to take ownership of a copy.
   // To allow lookup by reference, make the key a reference and keep the owner in the value pair.
-  auto owned_agg = agg.clone();
-  auto const& key = *owned_agg;
-  auto value = std::make_pair(std::move(owned_agg), std::move(col));
-  _cache[{input, key}] = std::move(value);
+  auto owned_agg  = agg.clone();
+  auto const& key = *owned_agg;
+  // try_emplace does nothing if the key is already present
+  _cache.try_emplace({input, key}, std::move(owned_agg), std::move(col));
 }
 
 column_view result_cache::get_result(column_view const& input, aggregation const& agg) const
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index e471fccda07..83c6c1bca57 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -573,6 +573,7 @@ void aggregate_result_functor::operator()<aggregation::COVARIANCE>(aggregation c
   CUDF_EXPECTS(values.num_children() == 2,
                "Input to `groupby covariance` must be a structs column having 2 children columns.");
 
+  auto const& cov_agg = dynamic_cast<cudf::detail::covariance_aggregation const&>(agg);
   // Covariance only for valid values in both columns.
   // in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT.
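With the factory definitions above in place, a hedged usage sketch of the new knobs: `keys` and `xy` are placeholder views, where `xy` must be the non-nullable struct column holding the two numeric children these aggregations expect.

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

// Sketch: one grouped request computing COVARIANCE and Pearson CORRELATION
// over the paired values in `xy`, requiring at least 3 valid pairs per group
// and dividing by N - ddof with ddof = 1 (sample covariance).
std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::groupby::aggregation_result>>
grouped_cov_and_corr(cudf::table_view const& keys, cudf::column_view const& xy)
{
  cudf::groupby::groupby gb(keys);
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = xy;
  requests[0].aggregations.push_back(
    cudf::make_covariance_aggregation<cudf::groupby_aggregation>(/*min_periods=*/3, /*ddof=*/1));
  requests[0].aggregations.push_back(cudf::make_correlation_aggregation<cudf::groupby_aggregation>(
    cudf::correlation_type::PEARSON, /*min_periods=*/3));
  return gb.aggregate(requests);
}
```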
auto [_, values_child0, values_child1] = @@ -596,6 +597,8 @@ void aggregate_result_functor::operator()(aggregation c count, mean0, mean1, + cov_agg._min_periods, + cov_agg._ddof, stream, mr)); }; @@ -629,28 +632,33 @@ void aggregate_result_functor::operator()(aggregation aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*std_agg); aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*std_agg); - auto const stddev0 = cache.get_result(values_child0, *std_agg); - auto const stddev1 = cache.get_result(values_child1, *std_agg); - - auto mean_agg = make_mean_aggregation(); - auto const mean0 = cache.get_result(values_child0, *mean_agg); - auto const mean1 = cache.get_result(values_child1, *mean_agg); - auto count_agg = make_count_aggregation(); - auto const count = cache.get_result(values_child0, *count_agg); - // Compute covariance here to avoid repeated computation of mean & count - auto cov_agg = make_covariance_aggregation(); - cache.add_result(values, - *cov_agg, - detail::group_covariance(get_grouped_values().child(0), - get_grouped_values().child(1), - helper.group_labels(stream), - helper.num_groups(stream), - count, - mean0, - mean1, - stream, - mr)); + auto cov_agg = make_covariance_aggregation(corr_agg._min_periods); + if (not cache.has_result(values, *cov_agg)) { + auto mean_agg = make_mean_aggregation(); + auto const mean0 = cache.get_result(values_child0, *mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); + + auto const& cov_agg_obj = dynamic_cast(*cov_agg); + cache.add_result(values, + *cov_agg, + detail::group_covariance(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + cov_agg_obj._min_periods, + cov_agg_obj._ddof, + stream, + mr)); + } + + auto const stddev0 = cache.get_result(values_child0, *std_agg); + auto const stddev1 = cache.get_result(values_child1, *std_agg); auto const covariance = cache.get_result(values, *cov_agg); cache.add_result( values, agg, detail::group_correlation(covariance, stddev0, stddev1, stream, mr)); diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu index e43d0185e93..cdcf4311be7 100644 --- a/cpp/src/groupby/sort/group_correlation.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -113,6 +113,8 @@ std::unique_ptr group_covariance(column_view const& values_0, column_view const& count, column_view const& mean_0, column_view const& mean_1, + size_type min_periods, + size_type ddof, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -140,8 +142,13 @@ std::unique_ptr group_covariance(column_view const& values_0, auto d_values_0 = column_device_view::create(values_0, stream); auto d_values_1 = column_device_view::create(values_1, stream); - covariance_transform covariance_transform_op{ - *d_values_0, *d_values_1, mean0_ptr, mean1_ptr, count.data(), group_labels.begin()}; + covariance_transform covariance_transform_op{*d_values_0, + *d_values_1, + mean0_ptr, + mean1_ptr, + count.data(), + group_labels.begin(), + ddof}; auto result = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); @@ -157,8 +164,8 @@ std::unique_ptr group_covariance(column_view const& values_0, thrust::make_discard_iterator(), d_result); - auto is_null = [ddof = 
covariance_transform_op.ddof] __device__(size_type group_size) { - return not(group_size == 0 or group_size - ddof <= 0); + auto is_null = [ddof, min_periods] __device__(size_type group_size) { + return not(group_size == 0 or group_size - ddof <= 0 or group_size < min_periods); }; auto [new_nullmask, null_count] = cudf::detail::valid_if(count.begin(), count.end(), is_null, stream, mr); diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index df9f5b391fb..935ef9554a9 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -52,14 +52,12 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); - table_view const order_table{{std::get<0>(superimposed)}}; auto const flattened = cudf::structs::detail::flatten_nested_columns( - order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); auto const d_flat_order = table_device_view::create(flattened, stream); row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); auto ranks = make_fixed_width_column(data_type{type_to_id()}, - order_table.num_rows(), + flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 789a289a07e..75708c7b01c 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -451,6 +451,8 @@ std::unique_ptr group_merge_m2(column_view const& values, * @param count The count of valid rows of the grouped values of both columns * @param mean_0 The mean of the first grouped values column * @param mean_1 The mean of the second grouped values column + * @param min_periods The minimum number of non-null rows required to consider the covariance + * @param ddof The delta degrees of freedom used in the calculation of the variance * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory */ @@ -461,6 +463,8 @@ std::unique_ptr group_covariance(column_view const& values_0, column_view const& count, column_view const& mean_0, column_view const& mean_1, + size_type min_periods, + size_type ddof, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index d0e47d93bc6..b9915da90b9 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -238,6 +238,7 @@ struct HasherDispatcher { std::is_same_v) { hasher->process(input_col.element(row_index)); } else { + (void)row_index; cudf_assert(false && "Unsupported type for hash function."); } } @@ -263,6 +264,8 @@ struct ListHasherDispatcher { if (input_col.is_valid(i)) { hasher->process(input_col.element(i)); } } } else { + (void)offset_begin; + (void)offset_end; cudf_assert(false && "Unsupported type for hash function."); } } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 99b593c99b9..7f032b6987c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -19,14 +19,21 @@ * @brief cuDF-IO CSV reader class implementation */ -#include "reader_impl.hpp" +#include "csv_common.h" +#include "csv_gpu.h" #include +#include +#include #include #include #include #include +#include +#include +#include +#include #include #include #include @@ -37,10 +44,14 @@ #include #include +#include #include +#include #include #include #include +#include +#include using std::string; using std::vector; @@ -56,27 +67,40 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; +namespace { + /** - * @brief Translates a dtype string and returns its dtype enumeration and any - * extended dtype flags that are supported by cuIO. Often, this is a column - * with the same underlying dtype the basic types, but with different parsing - * interpretations. - * - * @param[in] dtype String containing the basic or extended dtype + * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. * - * @return Tuple of data_type and flags + * Row offsets are stored this way to avoid reallocation/copies when discarding front or back + * elements. 
*/ -std::tuple get_dtype_info(const std::string& dtype) -{ - if (dtype == "hex" || dtype == "hex64") { - return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal); - } - if (dtype == "hex32") { - return std::make_tuple(data_type{cudf::type_id::INT32}, column_parse::as_hexadecimal); +class selected_rows_offsets { + rmm::device_uvector all; + device_span selected; + + public: + selected_rows_offsets(rmm::device_uvector&& data, + device_span selected_span) + : all{std::move(data)}, selected{selected_span} + { } + selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - return std::make_tuple(convert_string_to_dtype(dtype), column_parse::as_default); -} + operator device_span() const { return selected; } + void shrink(size_t size) + { + CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); + selected = selected.subspan(0, size); + } + void erase_first_n(size_t n) + { + CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); + selected = selected.subspan(n, selected.size() - n); + } + auto size() const { return selected.size(); } + auto data() const { return selected.data(); } +}; /** * @brief Removes the first and Last quote in the string @@ -96,10 +120,10 @@ string removeQuotes(string str, char quotechar) * @brief Parse the first row to set the column names in the raw_csv parameter. * The first row can be either the header row, or the first data row */ -std::vector setColumnNames(std::vector const& header, - parse_options_view const& opts, - int header_row, - std::string prefix) +std::vector get_column_names(std::vector const& header, + parse_options_view const& parse_opts, + int header_row, + std::string prefix) { std::vector col_names; @@ -112,35 +136,36 @@ std::vector setColumnNames(std::vector const& header, bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts.terminator) || - (!quotation && first_row[pos] == opts.delimiter)) { + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.terminator) || + (!quotation && first_row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == opts.delimiter || first_row[pos] == opts.terminator) { + if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && opts.terminator == '\n' && first_row[pos] == '\n' && + if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && first_row[pos - 1] == '\r') { --col_name_len; } const string new_col_name(first_row.data() + prev, col_name_len); - col_names.push_back(removeQuotes(new_col_name, opts.quotechar)); + col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); // Stop parsing when we hit the line terminator; relevant when there is // a blank line following the header. 
In this case, first_row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == opts.terminator) { break; } + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name col_names.push_back(prefix + std::to_string(num_cols)); @@ -148,8 +173,8 @@ std::vector setColumnNames(std::vector const& header, num_cols++; // Skip adjacent delimiters if delim_whitespace is set - while (opts.multi_delimiter && pos < first_row.size() && first_row[pos] == opts.delimiter && - first_row[pos + 1] == opts.delimiter) { + while (parse_opts.multi_delimiter && pos < first_row.size() && + first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; @@ -170,277 +195,43 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) -{ - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto range_size_padded = opts_.get_byte_range_size_with_padding(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); - - if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, - "Reading compressed data using `byte range` is unsupported"); - } - - // Transfer source data to GPU - if (!source_->is_empty()) { - auto const data_size = (range_size_padded != 0) ? range_size_padded : source_->size(); - auto const buffer = source_->host_read(range_offset, data_size); - - auto h_data = host_span( // - reinterpret_cast(buffer->data()), - buffer->size()); - - std::vector h_uncomp_data_owner; - - if (opts_.get_compression() != compression_type::NONE) { - h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); - h_data = h_uncomp_data_owner; - } - - // None of the parameters for row selection is used, we are parsing the entire file - const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && - skip_end_rows <= 0 && num_rows == -1; - - // With byte range, find the start of the first data row - size_t const data_start_offset = (range_offset != 0) ? find_first_row_start(h_data) : 0; - - // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || opts_.get_header() < 0), - "byte_range offset with header not supported"); - - // Gather row offsets - auto data_row_offsets = - load_data_and_gather_row_offsets(h_data, - data_start_offset, - (range_size) ? range_size : h_data.size(), - (skip_rows > 0) ? 
skip_rows : 0, - num_rows, - load_whole_file, - stream); - auto& row_offsets = data_row_offsets.second; - // Exclude the rows that are to be skipped from the end - if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { - row_offsets.shrink(row_offsets.size() - skip_end_rows); - } - return data_row_offsets; - } - return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; -} - -std::vector reader::impl::select_data_types( - std::map const& col_type_map) -{ - std::vector selected_dtypes; - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - auto const col_type_it = col_type_map.find(col_names_[col]); - CUDF_EXPECTS(col_type_it != col_type_map.end(), - "Must specify data types for all active columns"); - selected_dtypes.emplace_back(col_type_it->second); - } - } - return selected_dtypes; -} - -std::vector reader::impl::select_data_types(std::vector const& dtypes) -{ - std::vector selected_dtypes; - - if (dtypes.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - selected_dtypes.resize(num_active_cols_, dtypes.front()); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, - "Must specify data types for all columns"); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } - } - } - return selected_dtypes; -} - -table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) -{ - auto const data_row_offsets = select_data_and_row_offsets(stream); - auto const& data = data_row_offsets.first; - auto const& row_offsets = data_row_offsets.second; - - // Exclude the end-of-data row from number of rows with actual data - num_records_ = std::max(row_offsets.size(), 1ul) - 1; - - // Check if the user gave us a list of column names - if (not opts_.get_names().empty()) { - column_flags_.resize(opts_.get_names().size(), column_parse::enabled); - col_names_ = opts_.get_names(); - } else { - col_names_ = setColumnNames(header_, opts.view(), opts_.get_header(), opts_.get_prefix()); - - num_actual_cols_ = num_active_cols_ = col_names_.size(); - - column_flags_.resize(num_actual_cols_, column_parse::enabled); - - // Rename empty column names to "Unnamed: col_index" - for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) { - if (col_names_[col_idx].empty()) { - col_names_[col_idx] = string("Unnamed: ") + std::to_string(col_idx); - } - } - - // Looking for duplicates - std::unordered_map col_names_histogram; - for (auto& col_name : col_names_) { - // Operator [] inserts a default-initialized value if the given key is not - // present - if (++col_names_histogram[col_name] > 1) { - if (opts_.is_enabled_mangle_dupe_cols()) { - // Rename duplicates of column X as X.1, X.2, ...; First appearance - // stays as X - do { - col_name += "." 
+ std::to_string(col_names_histogram[col_name] - 1); - } while (col_names_histogram[col_name]++); - } else { - // All duplicate columns will be ignored; First appearance is parsed - const auto idx = &col_name - col_names_.data(); - column_flags_[idx] = column_parse::disabled; - } - } - } - - // Update the number of columns to be processed, if some might have been - // removed - if (!opts_.is_enabled_mangle_dupe_cols()) { num_active_cols_ = col_names_histogram.size(); } - } - - // User can specify which columns should be parsed - if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) { - std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); - - for (const auto index : opts_.get_use_cols_indexes()) { - column_flags_[index] = column_parse::enabled; - } - num_active_cols_ = std::unordered_set(opts_.get_use_cols_indexes().begin(), - opts_.get_use_cols_indexes().end()) - .size(); - - for (const auto& name : opts_.get_use_cols_names()) { - const auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - auto curr_it = it - col_names_.begin(); - if (column_flags_[curr_it] == column_parse::disabled) { - column_flags_[curr_it] = column_parse::enabled; - num_active_cols_++; - } - } - } - } - - // User can specify which columns should be read as datetime - if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { - for (const auto index : opts_.get_parse_dates_indexes()) { - column_flags_[index] |= column_parse::as_datetime; - } - - for (const auto& name : opts_.get_parse_dates_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; - } - } - } - - // User can specify which columns should be parsed as hexadecimal - if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { - for (const auto index : opts_.get_parse_hex_indexes()) { - column_flags_[index] |= column_parse::as_hexadecimal; - } - - for (const auto& name : opts_.get_parse_hex_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; - } - } - } - - // Return empty table rather than exception if nothing to load - if (num_active_cols_ == 0) { return {std::make_unique(), {}}; } - - auto metadata = table_metadata{}; - auto out_columns = std::vector>(); - - bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes()); - - std::vector column_types; - if (has_to_infer_column_types) { - column_types = infer_column_types(data, row_offsets, stream); - } else { - column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, - opts_.get_dtypes()); - } - - out_columns.reserve(column_types.size()); - - if (num_records_ != 0) { - auto out_buffers = decode_data(data, row_offsets, column_types, stream); - for (size_t i = 0; i < column_types.size(); ++i) { - metadata.column_names.emplace_back(out_buffers[i].name); - if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' && - opts.doublequote == true) { - // PANDAS' default behavior of enabling doublequote for two consecutive - // quotechars in quoted fields results in reduction to a single quotechar - // TODO: Would be much more efficient to perform this operation in-place - // during the conversion stage - 
const std::string quotechar(1, opts.quotechar); - const std::string dblquotechar(2, opts.quotechar); - std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); - out_columns.emplace_back( - cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); - } else { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_)); - } - } - } else { - // Create empty columns - for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_empty_column(column_types[i])); - } - // Handle empty metadata - for (int col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { - metadata.column_names.emplace_back(col_names_[col]); - } - } - } - return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; -} - -size_t reader::impl::find_first_row_start(host_span data) +size_t find_first_row_start(char row_terminator, host_span data) { // For now, look for the first terminator (assume the first terminator isn't within a quote) // TODO: Attempt to infer this from the data size_t pos = 0; - while (pos < data.size() && data[pos] != opts.terminator) { + while (pos < data.size() && data[pos] != row_terminator) { ++pos; } return std::min(pos + 1, data.size()); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream) +/** + * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. + * + * This function scans the input data to record the row offsets (relative to the start of the + * input data). A row is actually the data/offset between two termination symbols. + * + * @param data Uncompressed input data in host memory + * @param range_begin Only include rows starting after this position + * @param range_end Only include rows starting before this position + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read; -1: all remaining data + * @param load_whole_file Hint that the entire data will be needed on gpu + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory + */ +std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + std::vector& header, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -449,7 +240,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, hostdevice_vector row_ctx(max_blocks); size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (opts_.get_header() >= 0) ? opts_.get_header() + 1 : 0; + size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; uint64_t ctx = 0; // For compatibility with the previous parser, a row is considered in-range if the @@ -475,7 +266,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
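As context for the two-pass structure here: this row-offset machinery backs the public byte-range read path. A hedged sketch of that path from the user side; `"data.csv"` is a placeholder, and the header restriction mirrors the `CUDF_EXPECTS` enforced in this file (a non-zero byte range cannot also parse a header).

```cpp
#include <cudf/io/csv.hpp>

#include <cstddef>

// Sketch: read the rows whose start offset falls inside a 1 MiB window.
// find_first_row_start() is what lets a non-zero offset skip the partial
// row at the front of the window.
cudf::io::table_with_metadata read_csv_slice(std::size_t offset)
{
  auto const opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{"data.csv"})
                      .byte_range_offset(offset)
                      .byte_range_size(1024 * 1024)
                      .header(offset == 0 ? 0 : -1)  // header only allowed with offset == 0
                      .build();
  return cudf::io::read_csv(opts);
}
```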
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(opts.view(), + uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), device_span(), d_data, @@ -514,7 +305,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, stream.value())); // Pass 2: Output row offsets - cudf::io::csv::gpu::gather_row_offsets(opts.view(), + cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), all_row_offsets, d_data, @@ -551,8 +342,8 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, // num_rows does not include blank rows if (num_rows >= 0) { if (all_row_offsets.size() > header_rows + static_cast(num_rows)) { - size_t num_blanks = - cudf::io::csv::gpu::count_blank_rows(opts.view(), d_data, all_row_offsets, stream); + size_t num_blanks = cudf::io::csv::gpu::count_blank_rows( + parse_opts.view(), d_data, all_row_offsets, stream); if (all_row_offsets.size() - num_blanks > header_rows + static_cast(num_rows)) { // Got the desired number of rows break; @@ -571,7 +362,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, } while (pos < data.size()); auto const non_blank_row_offsets = - io::csv::gpu::remove_blank_rows(opts.view(), d_data, all_row_offsets, stream); + io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header @@ -588,7 +379,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, const auto header_end = buffer_pos + row_ctx[1]; CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), "Invalid csv header location"); - header_.assign(data.begin() + header_start, data.begin() + header_end); + header.assign(data.begin() + header_start, data.begin() + header_end); if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -598,30 +389,145 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader::impl::infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream) +std::pair, selected_rows_offsets> select_data_and_row_offsets( + cudf::io::datasource* source, + csv_reader_options const& reader_opts, + std::vector& header, + parse_options const& parse_opts, + rmm::cuda_stream_view stream) +{ + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); + auto skip_rows = reader_opts.get_skiprows(); + auto skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); + + if (range_offset > 0 || range_size > 0) { + CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, + "Reading compressed data using `byte range` is unsupported"); + } + + // Transfer source data to GPU + if (!source->is_empty()) { + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); + auto buffer = source->host_read(range_offset, data_size); + + auto h_data = host_span( // + reinterpret_cast(buffer->data()), + buffer->size()); + + std::vector h_uncomp_data_owner; + + if (reader_opts.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, reader_opts.get_compression()); + h_data = h_uncomp_data_owner; + } + // None of the parameters for row selection is used, we are parsing the entire file + const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && + skip_end_rows <= 0 && num_rows == -1; + + // With byte range, find the start of the first data row + size_t const data_start_offset = + (range_offset != 0) ? find_first_row_start(parse_opts.terminator, h_data) : 0; + + // TODO: Allow parsing the header outside the mapped range + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), + "byte_range offset with header not supported"); + + // Gather row offsets + auto data_row_offsets = + load_data_and_gather_row_offsets(reader_opts, + parse_opts, + header, + h_data, + data_start_offset, + (range_size) ? range_size : h_data.size(), + (skip_rows > 0) ? skip_rows : 0, + num_rows, + load_whole_file, + stream); + auto& row_offsets = data_row_offsets.second; + // Exclude the rows that are to be skipped from the end + if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { + row_offsets.shrink(row_offsets.size() - skip_end_rows); + } + return data_row_offsets; + } + return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; +} + +std::vector select_data_types(std::vector const& column_flags, + std::vector const& dtypes, + int32_t num_actual_columns, + int32_t num_active_columns) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_columns, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_columns, + "Must specify data types for all columns"); + + for (int i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[i]); } + } + } + return selected_dtypes; +} + +std::vector get_data_types_from_column_names( + std::vector const& column_flags, + std::map const& column_type_map, + std::vector const& column_names, + int32_t num_actual_columns) +{ + std::vector selected_dtypes; + + for (int32_t i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { + auto const col_type_it = column_type_map.find(column_names[i]); + CUDF_EXPECTS(col_type_it != column_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + + return selected_dtypes; +} + +std::vector infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + int32_t num_records, + int32_t num_active_columns, + data_type timestamp_type, + rmm::cuda_stream_view stream) { std::vector dtypes; - if (num_records_ == 0) { - dtypes.resize(num_active_cols_, data_type{type_id::EMPTY}); + if (num_records == 0) { + dtypes.resize(num_active_columns, data_type{type_id::EMPTY}); } else { auto column_stats = - cudf::io::csv::gpu::detect_column_types(opts.view(), + cudf::io::csv::gpu::detect_column_types(parse_opts.view(), data, - 
make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, - num_active_cols_, + num_active_columns, stream); stream.synchronize(); - for (int col = 0; col < num_active_cols_; col++) { + for (int col = 0; col < num_active_columns; col++) { unsigned long long int_count_total = column_stats[col].big_int_count + column_stats[col].negative_small_int_count + column_stats[col].positive_small_int_count; - if (column_stats[col].null_count == num_records_) { + if (column_stats[col].null_count == num_records) { // Entire column is NULL; allocate the smallest amount of memory dtypes.emplace_back(cudf::type_id::INT8); } else if (column_stats[col].string_count > 0L) { @@ -649,9 +555,9 @@ std::vector reader::impl::infer_column_types(device_span } } - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { + if (timestamp_type.id() != cudf::type_id::EMPTY) { for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } + if (cudf::is_timestamp(type)) { type = timestamp_type; } } } @@ -663,43 +569,50 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream) +std::vector decode_data(parse_options const& parse_opts, + std::vector const& column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; out_buffers.reserve(column_types.size()); - for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { + for (int col = 0, active_col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; auto out_buffer = column_buffer(column_types[active_col], - num_records_, + num_records, true, stream, - is_final_allocation ? mr_ : rmm::mr::get_current_device_resource()); + is_final_allocation ? 
mr : rmm::mr::get_current_device_resource()); - out_buffer.name = col_names_[col]; + out_buffer.name = column_names[col]; out_buffer.null_count() = UNKNOWN_NULL_COUNT; out_buffers.emplace_back(std::move(out_buffer)); active_col++; } } - thrust::host_vector h_data(num_active_cols_); - thrust::host_vector h_valid(num_active_cols_); + thrust::host_vector h_data(num_active_columns); + thrust::host_vector h_valid(num_active_columns); - for (int i = 0; i < num_active_cols_; ++i) { + for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } - cudf::io::csv::gpu::decode_row_column_data(opts.view(), + cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), data, - make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, make_device_uvector_async(column_types, stream), make_device_uvector_async(h_data, stream), @@ -709,6 +622,209 @@ std::vector reader::impl::decode_data(device_span dat return out_buffers; } +table_with_metadata read_csv(cudf::io::datasource* source, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector header; + + auto const data_row_offsets = + select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream); + + auto const& data = data_row_offsets.first; + auto const& row_offsets = data_row_offsets.second; + + // Exclude the end-of-data row from number of rows with actual data + auto num_records = std::max(row_offsets.size(), 1ul) - 1; + auto column_flags = std::vector(); + auto column_names = std::vector(); + auto num_actual_columns = static_cast(reader_opts.get_names().size()); + auto num_active_columns = num_actual_columns; + + // Check if the user gave us a list of column names + if (not reader_opts.get_names().empty()) { + column_flags.resize(reader_opts.get_names().size(), column_parse::enabled); + column_names = reader_opts.get_names(); + } else { + column_names = get_column_names( + header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + + num_actual_columns = num_active_columns = column_names.size(); + + column_flags.resize(num_actual_columns, column_parse::enabled); + + // Rename empty column names to "Unnamed: col_index" + for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { + if (column_names[col_idx].empty()) { + column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx); + } + } + + // Looking for duplicates + std::unordered_map col_names_histogram; + for (auto& col_name : column_names) { + // Operator [] inserts a default-initialized value if the given key is not + // present + if (++col_names_histogram[col_name] > 1) { + if (reader_opts.is_enabled_mangle_dupe_cols()) { + // Rename duplicates of column X as X.1, X.2, ...; First appearance + // stays as X + do { + col_name += "." 
+ std::to_string(col_names_histogram[col_name] - 1); + } while (col_names_histogram[col_name]++); + } else { + // All duplicate columns will be ignored; First appearance is parsed + const auto idx = &col_name - column_names.data(); + column_flags[idx] = column_parse::disabled; + } + } + } + + // Update the number of columns to be processed, if some might have been + // removed + if (!reader_opts.is_enabled_mangle_dupe_cols()) { + num_active_columns = col_names_histogram.size(); + } + } + + // User can specify which columns should be parsed + if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { + std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); + + for (const auto index : reader_opts.get_use_cols_indexes()) { + column_flags[index] = column_parse::enabled; + } + num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), + reader_opts.get_use_cols_indexes().end()) + .size(); + + for (const auto& name : reader_opts.get_use_cols_names()) { + const auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + auto curr_it = it - column_names.begin(); + if (column_flags[curr_it] == column_parse::disabled) { + column_flags[curr_it] = column_parse::enabled; + num_active_columns++; + } + } + } + } + + // User can specify which columns should be read as datetime + if (!reader_opts.get_parse_dates_indexes().empty() || + !reader_opts.get_parse_dates_names().empty()) { + for (const auto index : reader_opts.get_parse_dates_indexes()) { + column_flags[index] |= column_parse::as_datetime; + } + + for (const auto& name : reader_opts.get_parse_dates_names()) { + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_datetime; + } + } + } + + // User can specify which columns should be parsed as hexadecimal + if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) { + for (const auto index : reader_opts.get_parse_hex_indexes()) { + column_flags[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : reader_opts.get_parse_hex_names()) { + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_hexadecimal; + } + } + } + + // Return empty table rather than exception if nothing to load + if (num_active_columns == 0) { return {std::make_unique
(), {}}; } + + auto metadata = table_metadata{}; + auto out_columns = std::vector>(); + + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + + std::vector column_types; + if (has_to_infer_column_types) { + column_types = infer_column_types( // + parse_opts, + column_flags, + data, + row_offsets, + num_records, + num_active_columns, + reader_opts.get_timestamp_type(), + stream); + } else { + column_types = + std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& data_types) { + return select_data_types( + column_flags, data_types, num_actual_columns, num_active_columns); + }, + [&](const std::map& data_types) { + return get_data_types_from_column_names( // + column_flags, + data_types, + column_names, + num_actual_columns); + }}, + reader_opts.get_dtypes()); + } + + out_columns.reserve(column_types.size()); + + if (num_records != 0) { + auto out_buffers = decode_data( // + parse_opts, + column_flags, + column_names, + data, + row_offsets, + column_types, + num_records, + num_actual_columns, + num_active_columns, + stream, + mr); + for (size_t i = 0; i < column_types.size(); ++i) { + metadata.column_names.emplace_back(out_buffers[i].name); + if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && + parse_opts.doublequote == true) { + // PANDAS' default behavior of enabling doublequote for two consecutive + // quotechars in quoted fields results in reduction to a single quotechar + // TODO: Would be much more efficient to perform this operation in-place + // during the conversion stage + const std::string quotechar(1, parse_opts.quotechar); + const std::string dblquotechar(2, parse_opts.quotechar); + std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); + out_columns.emplace_back( + cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr)); + } else { + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr)); + } + } + } else { + // Create empty columns + for (size_t i = 0; i < column_types.size(); ++i) { + out_columns.emplace_back(make_empty_column(column_types[i])); + } + // Handle empty metadata + for (int col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { + metadata.column_names.emplace_back(column_names[col]); + } + } + } + return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; +} + /** * @brief Create a serialized trie for N/A value matching, based on the options. */ @@ -807,33 +923,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader::impl::impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), opts_(options) -{ - num_actual_cols_ = opts_.get_names().size(); - num_active_cols_ = num_actual_cols_; - - opts = make_parse_options(options, stream); -} +} // namespace -// Forward to implementation -reader::reader(std::vector>&& sources, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(std::unique_ptr&& source, + csv_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), options, stream, mr); -} + auto parse_options = make_parse_options(options, stream); -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); } + return read_csv(source.get(), options, parse_options, stream, mr); +} } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp deleted file mode 100644 index de363a46ffe..00000000000 --- a/cpp/src/io/csv/reader_impl.hpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf { -namespace io { -namespace detail { -namespace csv { -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV reader - * - * The CSV reader is implemented in 4 stages: - * Stage 1: read and optionally decompress the input data in host memory - * (may be a memory-mapped view of the data on disk) - * - * Stage 2: gather the offset of each data row within the csv data. - * Since the number of rows in a given character block may depend on the - * initial parser state (like whether the block starts in a middle of a - * quote or not), a separate row count and output parser state is computed - * for every possible input parser state per 16KB character block. - * The result is then used to infer the parser state and starting row at - * the beginning of every character block. 
- * A second pass can then output the location of every row (which is needed - * for the subsequent parallel conversion of every row from csv text - * to cudf binary form) - * - * Stage 3: Optional stage to infer the data type of each CSV column. - * - * Stage 4: Convert every row from csv text form to cudf binary form. - */ -class reader::impl { - public: - /** - * @brief Constructor from a dataset source with reader options. - * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream); - - private: - /** - * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. - * - * Row offsets are stored this way to avoid reallocation/copies when discarding front or back - * elements. - */ - class selected_rows_offsets { - rmm::device_uvector all; - device_span selected; - - public: - selected_rows_offsets(rmm::device_uvector&& data, - device_span selected_span) - : all{std::move(data)}, selected{selected_span} - { - } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - - operator device_span() const { return selected; } - void shrink(size_t size) - { - CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); - selected = selected.subspan(0, size); - } - void erase_first_n(size_t n) - { - CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); - selected = selected.subspan(n, selected.size() - n); - } - auto size() const { return selected.size(); } - auto data() const { return selected.data(); } - }; - - /** - * @brief Selectively loads data on the GPU and gathers offsets of rows to read. - * - * Selection is based on read options. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, reader::impl::selected_rows_offsets> - select_data_and_row_offsets(rmm::cuda_stream_view stream); - - /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. 
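The shrinkable-span helper above is easiest to see at a call site. The following is a hypothetical sketch only: the member functions (`size`, `erase_first_n`, `shrink`) are the ones defined in the class above, but the surrounding function and parameter names are invented for illustration.

#include <cstddef>

// Hypothetical call site for the selected_rows_offsets class shown above:
// skipped rows and row limits only move the span's endpoints; the underlying
// rmm::device_uvector is never reallocated or copied.
void apply_row_bounds(selected_rows_offsets& row_offsets,
                      std::size_t rows_to_skip,
                      std::size_t max_rows)
{
  row_offsets.erase_first_n(rows_to_skip);  // drop leading rows (e.g. the header)
  if (row_offsets.size() > max_rows) {
    row_offsets.shrink(max_rows);           // enforce the requested row count
  }
}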
- * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory - */ - std::pair, reader::impl::selected_rows_offsets> - load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - - /** - * @brief Find the start position of the first data row - * - * @param h_data Uncompressed input data in host memory - * - * @return Byte position of the first row - */ - size_t find_first_row_start(host_span data); - - /** - * @brief Automatically infers each column's data type based on the CSV's data within that column. - * - * @param data The CSV data from which to infer the columns' data types - * @param row_offsets The row offsets into the CSV's data - * @param stream The stream to which the type inference-kernel will be dispatched - * @return The columns' inferred data types - */ - std::vector infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream); - - /** - * @brief Selects the columns' data types from the map of dtypes. - * - * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::map const& col_type_map); - - /** - * @brief Selects the columns' data types from the list of dtypes. - * - * @param dtypes Vector of data types specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::vector const& dtypes); - - /** - * @brief Converts the row-column data and outputs to column bufferrs. - * - * @param column_types Column types - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return list of column buffers of decoded data, or ptr/size in the case of strings. 
- */ - std::vector decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream); - - private: - rmm::mr::device_memory_resource* mr_ = nullptr; - std::unique_ptr source_; - const csv_reader_options opts_; - - cudf::size_type num_records_ = 0; // Number of rows with actual data - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset - - // Parsing options - parse_options opts{}; - std::vector column_flags_; - - // Intermediate data - std::vector col_names_; - std::vector header_; -}; - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 511a1a22ee7..5ae5d77be1d 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -201,8 +201,6 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { - namespace csv = cudf::io::detail::csv; - CUDF_FUNC_RANGE(); options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); @@ -211,10 +209,13 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); - return reader->read(); + return cudf::io::detail::csv::read_csv( // + std::move(datasources[0]), + options, + rmm::cuda_stream_default, + mr); } // Freeform API wraps the detail writer class API diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 19533c9fbdd..10fc1015528 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -68,7 +68,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - parse_options_view view() + parse_options_view view() const { return {delimiter, terminator, diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index bb6a85094f5..e7f1e867a41 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -50,14 +50,12 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); - table_view const order_table{{std::get<0>(superimposed)}}; auto const flattened = cudf::structs::detail::flatten_nested_columns( - order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); auto const d_flat_order = table_device_view::create(flattened, stream); row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); auto ranks = make_fixed_width_column(data_type{type_to_id()}, - order_table.num_rows(), + flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 0e00221dabf..96ce14644c9 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -567,11 +567,11 @@ class regex_compiler { case LBRA: /* must have been RBRA */ op1 = popand('('); id_inst2 = m_prog.add_inst(RBRA); - m_prog.inst_at(id_inst2).u1.subid = ator.subid; // subidstack[subidstack.size()-1]; + m_prog.inst_at(id_inst2).u1.subid = ator.subid; m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; id_inst1 = m_prog.add_inst(LBRA); - m_prog.inst_at(id_inst1).u1.subid = ator.subid; // subidstack[subidstack.size() - 1]; - m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; + m_prog.inst_at(id_inst1).u1.subid = ator.subid; + m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; pushand(id_inst1, id_inst2); return; case OR: @@ -826,7 +826,8 @@ reprog reprog::create_from(const char32_t* pattern) { reprog rtn; regex_compiler compiler(pattern, ANY, rtn); // future feature: ANYNL - // rtn->print(); + // for debugging, it can be helpful to call rtn.print() here to dump + // out the instructions that have been created from the given pattern return rtn; } @@ -912,6 +913,7 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } +#ifndef NDEBUG void reprog::print() { printf("Instructions:\n"); @@ -992,6 +994,7 @@ void reprog::print() } if (_num_capturing_groups) printf("Number of capturing groups: %d\n", _num_capturing_groups); } +#endif } // namespace detail } // namespace strings diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 854fce15fd4..66e99756615 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -198,10 +198,10 @@ __device__ inline int32_t reprog_device::regexec( { int32_t match = 0; auto checkstart = jnk.starttype; - auto txtlen = dstr.length(); auto pos = begin; auto eos = end; char32_t c = 0; + auto last_character = false; string_view::const_iterator itr = string_view::const_iterator(dstr, pos); jnk.list1->reset(); @@ -235,7 +235,9 @@ __device__ inline int32_t reprog_device::regexec( jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); } - c = static_cast(pos >= txtlen ? 0 : *itr); + last_character = (pos >= dstr.length()); + + c = static_cast(last_character ? 0 : *itr); // expand LBRA, RBRA, BOL, EOL, BOW, NBOW, and OR bool expanded = false; @@ -274,7 +276,7 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if ((c == 0) || (inst->u1.c == '$' && c == '\n')) { + if (last_character || (inst->u1.c == '$' && c == '\n')) { id_activate = inst->u2.next_id; expanded = true; } @@ -360,7 +362,7 @@ __device__ inline int32_t reprog_device::regexec( ++itr; swaplist(jnk.list1, jnk.list2); checkstart = jnk.list1->size > 0 ? 
0 : 1; - } while (c && (jnk.list1->size > 0 || match == 0)); + } while (!last_character && (jnk.list1->size > 0 || match == 0)); return match; } diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 175918a0846..8cfd6d24fae 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -109,6 +109,23 @@ TEST_F(TransformTest, BasicAddition) cudf::test::expect_columns_equal(expected, result->view(), verbosity); } +TEST_F(TransformTest, BasicAdditionCast) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast); + + auto expected = column_wrapper{13, 27, 21, 50}; + auto result = cudf::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + TEST_F(TransformTest, BasicEquality) { auto c_0 = column_wrapper{3, 20, 1, 50}; diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index 90d230ef1eb..4aa4ef236f0 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -32,7 +32,8 @@ using namespace cudf::test::iterators; namespace cudf { namespace test { -using structs = structs_column_wrapper; +constexpr auto nan = std::numeric_limits::quiet_NaN(); +using structs = structs_column_wrapper; template struct groupby_correlation_test : public cudf::test::BaseFixture { @@ -54,8 +55,7 @@ TYPED_TEST(groupby_correlation_test, basic) auto vals = structs{{member_0, member_1}}; fixed_width_column_wrapper expect_keys{1, 2, 3}; - fixed_width_column_wrapper expect_vals{ - {1.0, 0.6, std::numeric_limits::quiet_NaN()}}; + fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}}; auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); @@ -129,8 +129,7 @@ TYPED_TEST(groupby_correlation_test, null_keys_and_values) auto vals = structs{{val0, val1}}; fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - fixed_width_column_wrapper expect_vals( - {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); + fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0}); auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); @@ -153,8 +152,7 @@ TYPED_TEST(groupby_correlation_test, null_values_same) auto vals = structs{{val0, val1}}; fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - fixed_width_column_wrapper expect_vals( - {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); + fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0}); auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); @@ -181,14 +179,41 @@ TYPED_TEST(groupby_correlation_test, null_values_different) auto vals = structs{{val0, val1}}; fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - fixed_width_column_wrapper expect_vals({1.0, 0., std::numeric_limits::quiet_NaN(), 0.}, - {1, 1, 1, 0}); + fixed_width_column_wrapper expect_vals({1.0, 0., nan, 0.}, {1, 1, 1, 0}); auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } 
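A note on the fixtures used by the new min_periods and ddof tests below: the keys column {1, 2, 3, 1, 2, 2, 1, 3, 3, 2} yields group sizes of 3, 4, and 3, so min_periods = 4 leaves only group 2 valid and min_periods = 5 invalidates every group. For covariance the divisor is N - ddof, which explains the ddof expectations: a size-3 group with ddof = 3 divides by zero, hence the masked/infinite entries. A small host-side reference for the recurring 0.6 Pearson value of group 2 follows (not cuDF API, just a check of the arithmetic):

#include <cmath>
#include <vector>

// Host-side reference for Pearson correlation; the (N - ddof) divisor terms
// cancel between the covariance and the two standard deviations.
double pearson(std::vector<double> const& x, std::vector<double> const& y)
{
  double const n = static_cast<double>(x.size());
  double mx = 0.0, my = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    mx += x[i];
    my += y[i];
  }
  mx /= n;
  my /= n;
  double cov = 0.0, vx = 0.0, vy = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    cov += (x[i] - mx) * (y[i] - my);
    vx += (x[i] - mx) * (x[i] - mx);
    vy += (y[i] - my) * (y[i] - my);
  }
  return cov / std::sqrt(vx * vy);
}

// Group 2 of the fixtures below is x = {1, 2, 3, 4}, y = {1, 0, 3, 2}:
// cov = 3.0/3 = 1.0 (ddof = 1), var_x = var_y = 5.0/3, so r = 1.0/(5.0/3) = 0.6.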
+TYPED_TEST(groupby_correlation_test, min_periods) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{1.0, 0.6, nan}}; + auto agg1 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 3); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals2{{1.0, 0.6, nan}, {0, 1, 0}}; + auto agg2 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 4); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals3{{1.0, 0.6, nan}, {0, 0, 0}}; + auto agg3 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 5); + test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES); +} + struct groupby_dictionary_correlation_test : public cudf::test::BaseFixture { }; @@ -203,8 +228,7 @@ TEST_F(groupby_dictionary_correlation_test, basic) auto vals = structs{{member_0, member_1}}; fixed_width_column_wrapper expect_keys{1, 2, 3}; - fixed_width_column_wrapper expect_vals{ - {1.0, 0.6, std::numeric_limits::quiet_NaN()}}; + fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}}; auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp index 039fce16222..3a4fbf92387 100644 --- a/cpp/tests/groupby/covariance_tests.cpp +++ b/cpp/tests/groupby/covariance_tests.cpp @@ -175,6 +175,53 @@ TYPED_TEST(groupby_covariance_test, null_values_different) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } +TYPED_TEST(groupby_covariance_test, min_periods) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{1.0, 1.0, 0.0}}; + auto agg1 = cudf::make_covariance_aggregation(3); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals2{{1.0, 1.0, 0.0}, {0, 1, 0}}; + auto agg2 = cudf::make_covariance_aggregation(4); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals3{{1.0, 1.0, 0.0}, {0, 0, 0}}; + auto agg3 = cudf::make_covariance_aggregation(5); + test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, ddof) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 
1, 2}};
+  auto vals = structs{{member_0, member_1}};
+
+  fixed_width_column_wrapper expect_keys{1, 2, 3};
+
+  fixed_width_column_wrapper expect_vals1{{2.0, 1.5, 0.0}};
+  auto agg1 = cudf::make_covariance_aggregation(1, 2);
+  test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES);
+
+  auto const inf = std::numeric_limits::infinity();
+  fixed_width_column_wrapper expect_vals2{{inf, 3.0, 0.0}, {0, 1, 0}};
+  auto agg2 = cudf::make_covariance_aggregation(1, 3);
+  test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES);
+}
+
 struct groupby_dictionary_covariance_test : public cudf::test::BaseFixture {
 };
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 683eeb7eb01..6bbf25646d0 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -289,5 +289,66 @@ TEST_F(groupby_dictionary_keys_test, basic)
                   force_use_sort_impl::YES);
 }
 
+struct groupby_cache_test : public cudf::test::BaseFixture {
+};
+
+// Verify that the cache does not insert the same aggregation on the same column more than once
+// within a single request.
+// If it did, the key stored in the cache map would become a dangling reference, and any later
+// lookup with an equal aggregation would fail.
+TEST_F(groupby_cache_test, duplicate_aggregations)
+{
+  using K = int32_t;
+  using V = int32_t;
+
+  fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  groupby::groupby gb_obj(table_view({keys}));
+
+  std::vector requests;
+  requests.emplace_back(groupby::aggregation_request());
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation());
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation());
+
+  // hash groupby
+  EXPECT_NO_THROW(gb_obj.aggregate(requests));
+
+  // sort groupby
+  // WAR to force groupby to use sort implementation
+  requests[0].aggregations.push_back(make_nth_element_aggregation(0));
+  EXPECT_NO_THROW(gb_obj.aggregate(requests));
+}
+
+// Verify that the cache does not insert the same aggregation on the same column more than once
+// across different requests.
+// If it did, the key stored in the cache map would become a dangling reference, and any later
+// lookup with an equal aggregation would fail.
+TEST_F(groupby_cache_test, duplicate_columns) +{ + using K = int32_t; + using V = int32_t; + + fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + groupby::groupby gb_obj(table_view({keys})); + + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = vals; + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests.emplace_back(groupby::aggregation_request()); + requests[1].values = vals; + requests[1].aggregations.push_back(cudf::make_sum_aggregation()); + + // hash groupby + EXPECT_NO_THROW(gb_obj.aggregate(requests)); + + // sort groupby + // WAR to force groupby to use sort implementation + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + EXPECT_NO_THROW(gb_obj.aggregate(requests)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 17e08bd21c5..ff9f79ea87f 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -28,12 +28,11 @@ struct StringsCharsTest : public cudf::test::BaseFixture { }; -class StringsCharsTestTypes - : public StringsCharsTest, - public testing::WithParamInterface { +class CharsTypes : public StringsCharsTest, + public testing::WithParamInterface { }; -TEST_P(StringsCharsTestTypes, AllTypes) +TEST_P(CharsTypes, AllTypes) { std::vector h_strings{"Héllo", "thesé", @@ -84,8 +83,8 @@ TEST_P(StringsCharsTestTypes, AllTypes) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(StringsCharsTestAllTypes, - StringsCharsTestTypes, +INSTANTIATE_TEST_CASE_P(StringsCharsTest, + CharsTypes, testing::ValuesIn(std::array{ cudf::strings::string_character_types::DECIMAL, cudf::strings::string_character_types::NUMERIC, diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index ddd6fc9e1dc..bb9a4c9cf5e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture { @@ -236,6 +237,30 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) } } +TEST_F(StringsContainsTests, EmbeddedNullCharacter) +{ + std::vector data(10); + std::generate(data.begin(), data.end(), [n = 0]() mutable { + char first = static_cast('A' + n++); + char raw_data[] = {first, '\0', 'B'}; + return std::string{raw_data, 3}; + }); + cudf::test::strings_column_wrapper input(data.begin(), data.end()); + auto strings_view = cudf::strings_column_view(input); + + auto results = cudf::strings::contains_re(strings_view, "A"); + auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = cudf::strings::contains_re(strings_view, "B"); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = cudf::strings::contains_re(strings_view, "J\\0B"); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsContainsTests, CountTest) { std::vector h_strings{ diff --git a/cpp/tests/strings/pad_tests.cpp b/cpp/tests/strings/pad_tests.cpp index a64304d1027..f344b5432a2 100644 --- a/cpp/tests/strings/pad_tests.cpp +++ 
b/cpp/tests/strings/pad_tests.cpp @@ -104,11 +104,10 @@ TEST_F(StringsPadTest, ZeroSizeStringsColumn) cudf::test::expect_strings_empty(results->view()); } -class StringsPadParmsTest : public StringsPadTest, - public testing::WithParamInterface { +class PadParameters : public StringsPadTest, public testing::WithParamInterface { }; -TEST_P(StringsPadParmsTest, Padding) +TEST_P(PadParameters, Padding) { std::vector h_strings{"eee ddd", "bb cc", "aa", "bbb", "fff", "", "o"}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); @@ -128,8 +127,8 @@ TEST_P(StringsPadParmsTest, Padding) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(StringsPadParmWidthTest, - StringsPadParmsTest, +INSTANTIATE_TEST_CASE_P(StringsPadTest, + PadParameters, testing::ValuesIn(std::array{5, 6, 7})); TEST_F(StringsPadTest, ZFill) diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 1f01f0f1429..fc1c20d8719 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ #include -struct StringsReplaceTests : public cudf::test::BaseFixture { +struct StringsReplaceRegexTest : public cudf::test::BaseFixture { }; -TEST_F(StringsReplaceTests, ReplaceRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -59,7 +59,7 @@ TEST_F(StringsReplaceTests, ReplaceRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceMultiRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -95,7 +95,7 @@ TEST_F(StringsReplaceTests, ReplaceMultiRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, InvalidRegex) +TEST_F(StringsReplaceRegexTest, InvalidRegex) { cudf::test::strings_column_wrapper strings( {"abc*def|ghi+jkl", ""}); // these do not really matter @@ -116,7 +116,7 @@ TEST_F(StringsReplaceTests, InvalidRegex) cudf::logic_error); } -TEST_F(StringsReplaceTests, WithEmptyPattern) +TEST_F(StringsReplaceRegexTest, WithEmptyPattern) { std::vector h_strings{"asd", "xcv"}; cudf::test::strings_column_wrapper strings( @@ -133,7 +133,7 @@ TEST_F(StringsReplaceTests, WithEmptyPattern) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -167,7 +167,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexAltIndexPatternTest) { cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); auto strings_view = cudf::strings_column_view(strings); @@ -181,7 +181,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( {"A543", "Z756", "", "tést-string", "two-thréé four-fivé", "abcd-éfgh", "tést-string-again"}); @@ -200,7 +200,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) +TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) { cudf::test::strings_column_wrapper input( {"
<h1>title</h1><h2>ABC</h2>", "<h1>1234567</h1><h2>XYZ</h2>
"}); @@ -217,7 +217,7 @@ TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) { cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); auto view = cudf::strings_column_view(strings); @@ -228,7 +228,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); } -TEST_F(StringsReplaceTests, MediumReplaceRegex) +TEST_F(StringsReplaceRegexTest, MediumReplaceRegex) { // This results in 95 regex instructions and falls in the 'medium' range. std::string medium_regex = @@ -256,7 +256,7 @@ TEST_F(StringsReplaceTests, MediumReplaceRegex) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, LargeReplaceRegex) +TEST_F(StringsReplaceRegexTest, LargeReplaceRegex) { // This results in 117 regex instructions and falls in the 'large' range. std::string large_regex = diff --git a/cpp/tests/strings/substring_tests.cpp b/cpp/tests/strings/substring_tests.cpp index f9a71407a0d..448b61300fd 100644 --- a/cpp/tests/strings/substring_tests.cpp +++ b/cpp/tests/strings/substring_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include #include @@ -28,8 +28,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - struct StringsSubstringsTest : public cudf::test::BaseFixture { }; @@ -51,11 +49,11 @@ TEST_F(StringsSubstringsTest, Substring) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -class SubstringParmsTest : public StringsSubstringsTest, - public testing::WithParamInterface { +class Parameters : public StringsSubstringsTest, + public testing::WithParamInterface { }; -TEST_P(SubstringParmsTest, Substring) +TEST_P(Parameters, Substring) { std::vector h_strings{"basic strings", "that can", "be used", "with STL"}; cudf::size_type start = GetParam(); @@ -72,7 +70,7 @@ TEST_P(SubstringParmsTest, Substring) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, Substring_From) +TEST_P(Parameters, Substring_From) { std::vector h_strings{"basic strings", "that can", "be used", "with STL"}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); @@ -96,7 +94,7 @@ TEST_P(SubstringParmsTest, Substring_From) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, AllEmpty) +TEST_P(Parameters, AllEmpty) { std::vector h_strings{"", "", "", ""}; cudf::size_type start = GetParam(); @@ -118,7 +116,7 @@ TEST_P(SubstringParmsTest, AllEmpty) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, AllNulls) +TEST_P(Parameters, AllNulls) { std::vector h_strings{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; cudf::test::strings_column_wrapper strings( @@ -147,23 +145,9 @@ TEST_P(SubstringParmsTest, AllNulls) } INSTANTIATE_TEST_CASE_P(StringsSubstringsTest, - SubstringParmsTest, + Parameters, testing::ValuesIn(std::array{1, 2, 3})); -TEST_F(StringsSubstringsTest, ZeroSizeStringsColumn) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_column = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::slice_strings(strings_column, 1, 2); - 
cudf::test::expect_strings_empty(results->view()); - - cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - results = cudf::strings::slice_strings(strings_column, starts_column, stops_column); - cudf::test::expect_strings_empty(results->view()); -} - TEST_F(StringsSubstringsTest, NegativePositions) { cudf::test::strings_column_wrapper strings{ @@ -270,34 +254,59 @@ TEST_F(StringsSubstringsTest, MaxPositions) TEST_F(StringsSubstringsTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; - auto strings_column = cudf::strings_column_view(strings); - EXPECT_THROW(cudf::strings::slice_strings(strings_column, 0, 0, 0), cudf::logic_error); -} + auto strings_view = cudf::strings_column_view(strings); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, 0, 0, 0), cudf::logic_error); -struct StringsSubstringsScalarDelimiterTest : public cudf::test::BaseFixture { -}; + auto delim_col = cudf::test::strings_column_wrapper({"", ""}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); + + auto indexes = cudf::test::fixed_width_column_wrapper({1, 2}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes, indexes), cudf::logic_error); -TEST_F(StringsSubstringsScalarDelimiterTest, ZeroSizeStringsColumn) + auto indexes_null = cudf::test::fixed_width_column_wrapper({1}, {0}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_null, indexes_null), + cudf::logic_error); + + auto indexes_bad = cudf::test::fixed_width_column_wrapper({1}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_bad, indexes_bad), + cudf::logic_error); +} + +TEST_F(StringsSubstringsTest, ZeroSizeStringsColumn) { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); + cudf::column_view zero_size_strings_column( + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); + + auto results = cudf::strings::slice_strings(strings_view, 1, 2); + cudf::test::expect_strings_empty(results->view()); + + results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + cudf::test::expect_strings_empty(results->view()); + + cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + results = cudf::strings::slice_strings(strings_view, starts_column, stops_column); + cudf::test::expect_strings_empty(results->view()); - auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + results = cudf::strings::slice_strings(strings_view, strings_view, 1); cudf::test::expect_strings_empty(results->view()); } -TEST_F(StringsSubstringsScalarDelimiterTest, AllEmpty) +TEST_F(StringsSubstringsTest, AllEmpty) { auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); auto strings_view = cudf::strings_column_view(strings_col); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto exp_results = cudf::column_view(strings_col); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("e"), -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, 
exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + results = cudf::strings::slice_strings(strings_view, strings_view, -1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) +TEST_F(StringsSubstringsTest, EmptyDelimiter) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -306,11 +315,18 @@ TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {true, true, false, true, true, true}); - auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) +TEST_F(StringsSubstringsTest, ZeroCount) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -321,10 +337,16 @@ TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) +TEST_F(StringsSubstringsTest, SearchScalarDelimiter) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -336,7 +358,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -344,17 +366,17 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_view.parent(), verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col); } { auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_view.parent(), verbosity); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col); } { @@ -367,7 +389,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -380,7 +402,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -394,7 +416,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -408,7 +430,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -424,7 +446,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -441,76 +463,11 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } } -struct StringsSubstringsColumnDelimiterTest : public cudf::test::BaseFixture { -}; - -TEST_F(StringsSubstringsColumnDelimiterTest, ZeroSizeStringsColumn) -{ - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); - - auto results = cudf::strings::slice_strings(strings_view, strings_view, 1); - // Check empty column - cudf::test::expect_strings_empty(results->view()); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, GenerateExceptions) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); - - EXPECT_THROW(cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), - cudf::logic_error); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, ColumnAllEmpty) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, DelimiterAllEmptyAndInvalid) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto 
delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, ZeroDelimiterCount) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) +TEST_F(StringsSubstringsTest, SearchColumnDelimiter) { { auto col0 = cudf::test::strings_column_wrapper( @@ -523,7 +480,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -541,7 +498,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -565,7 +522,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -587,6 +544,6 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } } diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cpp similarity index 100% rename from cpp/tests/strings/utilities.cu rename to cpp/tests/strings/utilities.cpp diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 0f10d6efe4a..6cf36208f0b 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -809,7 +809,7 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt std::vector bitmask_to_host(cudf::column_view const& c) { if (c.nullable()) { - auto num_bitmasks = bitmask_allocation_size_bytes(c.size()) / sizeof(bitmask_type); + auto num_bitmasks = num_bitmask_words(c.size()); std::vector host_bitmask(num_bitmasks); if (c.offset() == 0) { CUDA_TRY(cudaMemcpy(host_bitmask.data(), diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp 
index 0dc10f9d717..82a22eb1dbc 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -133,7 +133,7 @@ TYPED_TEST(ColumnUtilitiesTest, NullableToHostAllValid) auto masks = cudf::test::detail::make_null_mask_vector(all_valid, all_valid + size); - EXPECT_TRUE(std::equal(masks.begin(), masks.end(), host_data.second.begin())); + EXPECT_TRUE(cudf::test::validate_host_masks(masks, host_data.second, size)); } struct ColumnUtilitiesEquivalenceTest : public cudf::test::BaseFixture { diff --git a/docs/cudf/source/basics/io-supported-types.rst b/docs/cudf/source/basics/io-supported-types.rst index 544acb9c683..0962113eb25 100644 --- a/docs/cudf/source/basics/io-supported-types.rst +++ b/docs/cudf/source/basics/io-supported-types.rst @@ -56,7 +56,7 @@ The following table lists are compatible cudf types for each supported IO format +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | datetime64[ns] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | + | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | decimal32 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ diff --git a/java/src/main/java/ai/rapids/cudf/CloseableArray.java b/java/src/main/java/ai/rapids/cudf/CloseableArray.java new file mode 100644 index 00000000000..5c75f2378e8 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CloseableArray.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +/** Utility class that wraps an array of closeable instances and can be closed */ +public class CloseableArray implements AutoCloseable { + private T[] array; + + public static CloseableArray wrap(T[] array) { + return new CloseableArray(array); + } + + CloseableArray(T[] array) { + this.array = array; + } + + public int size() { + return array.length; + } + + public T get(int i) { + return array[i]; + } + + public T set(int i, T obj) { + array[i] = obj; + return obj; + } + + public T[] getArray() { + return array; + } + + public T[] release() { + T[] result = array; + array = null; + return result; + } + + public void closeAt(int i) { + try { + T toClose = array[i]; + array[i] = null; + toClose.close(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() { + close(null); + } + + public void close(Exception pendingError) { + if (array == null) { + return; + } + T[] toClose = array; + array = null; + RuntimeException error = null; + if (pendingError instanceof RuntimeException) { + error = (RuntimeException) pendingError; + } else if (pendingError != null) { + error = new RuntimeException(pendingError); + } + for (T obj: toClose) { + if (obj != null) { + try { + obj.close(); + } catch (RuntimeException e) { + if (error != null) { + error.addSuppressed(e); + } else { + error = e; + } + } catch (Exception e) { + if (error != null) { + error.addSuppressed(e); + } else { + error = new RuntimeException(e); + } + } + } + } + if (error != null) { + throw error; + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index 02e4d32617d..bb2d6dbde7d 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -15,6 +15,9 @@ */ package ai.rapids.cudf; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -521,4 +524,27 @@ public static synchronized boolean isEnvCompatibleForTesting() { * Whether per-thread default stream is enabled. */ public static native boolean isPtdsEnabled(); + + /** + * Copy data from multiple device buffer sources to multiple device buffer destinations. + * For each buffer to copy there is a corresponding entry in the destination address, source + * address, and copy size vectors. 
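+   * <p>
+   * Example (illustrative only; {@code dstAddr0}, {@code srcAddr0}, etc. stand in for
+   * real device addresses obtained elsewhere):
+   * <pre>
+   *   long[] dsts  = new long[]{dstAddr0, dstAddr1};
+   *   long[] srcs  = new long[]{srcAddr0, srcAddr1};
+   *   long[] sizes = new long[]{1024L, 2048L};
+   *   Cuda.multiBufferCopyAsync(dsts, srcs, sizes, Cuda.DEFAULT_STREAM);
+   * </pre>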
+ * @param destAddrs vector of device destination addresses + * @param srcAddrs vector of device source addresses + * @param copySizes vector of copy sizes + * @param stream CUDA stream to use for the copy + */ + public static void multiBufferCopyAsync(long [] destAddrs, + long [] srcAddrs, + long [] copySizes, + Stream stream) { + // Temporary sub-par stand-in for a multi-buffer copy CUDA kernel + assert(destAddrs.length == srcAddrs.length); + assert(copySizes.length == destAddrs.length); + try (NvtxRange copyRange = new NvtxRange("multiBufferCopyAsync", NvtxColor.CYAN)){ + for (int i = 0; i < destAddrs.length; i++) { + asyncMemcpy(destAddrs[i], srcAddrs[i], copySizes[i], CudaMemcpyKind.DEVICE_TO_DEVICE, stream); + } + } + } } diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java index a936d4830ee..05545807bb6 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java @@ -19,8 +19,6 @@ package ai.rapids.cudf; import ai.rapids.cudf.ast.CompiledExpression; -import ai.rapids.cudf.nvcomp.BatchedLZ4Decompressor; -import ai.rapids.cudf.nvcomp.Decompressor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -248,16 +246,6 @@ static void register(Cuda.Event event, Cleaner cleaner) { all.add(new CleanerWeakReference(event, cleaner, collected, false)); } - public static void register(Decompressor.Metadata metadata, Cleaner cleaner) { - // It is now registered... - all.add(new CleanerWeakReference(metadata, cleaner, collected, false)); - } - - public static void register(BatchedLZ4Decompressor.BatchedMetadata metadata, Cleaner cleaner) { - // It is now registered... - all.add(new CleanerWeakReference(metadata, cleaner, collected, false)); - } - static void register(CuFileDriver driver, Cleaner cleaner) { // It is now registered... all.add(new CleanerWeakReference(driver, cleaner, collected, false)); @@ -324,4 +312,4 @@ public String toString() { + "\n"; } } -} \ No newline at end of file +} diff --git a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java index 07983f77aad..408c93ff0a1 100644 --- a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java @@ -82,6 +82,19 @@ public static RollingAggregation max() { return new RollingAggregation(Aggregation.max()); } + /** + * Rolling Window Standard Deviation with 1 as delta degrees of freedom(DDOF). + */ + public static RollingAggregation standardDeviation() { + return new RollingAggregation(Aggregation.standardDeviation()); + } + + /** + * Rolling Window Standard Deviation with configurable delta degrees of freedom(DDOF). + */ + public static RollingAggregation standardDeviation(int ddof) { + return new RollingAggregation(Aggregation.standardDeviation(ddof)); + } /** * Count number of valid, a.k.a. non-null, elements. diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java index 9ef18dbd75d..6fb5a16d888 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java @@ -23,29 +23,32 @@ * NOTE: This must be kept in sync with `jni_to_unary_operator` in CompiledExpression.cpp! 
*/ public enum UnaryOperator { - IDENTITY(0), // Identity function - SIN(1), // Trigonometric sine - COS(2), // Trigonometric cosine - TAN(3), // Trigonometric tangent - ARCSIN(4), // Trigonometric sine inverse - ARCCOS(5), // Trigonometric cosine inverse - ARCTAN(6), // Trigonometric tangent inverse - SINH(7), // Hyperbolic sine - COSH(8), // Hyperbolic cosine - TANH(9), // Hyperbolic tangent - ARCSINH(10), // Hyperbolic sine inverse - ARCCOSH(11), // Hyperbolic cosine inverse - ARCTANH(12), // Hyperbolic tangent inverse - EXP(13), // Exponential (base e, Euler number) - LOG(14), // Natural Logarithm (base e) - SQRT(15), // Square-root (x^0.5) - CBRT(16), // Cube-root (x^(1.0/3)) - CEIL(17), // Smallest integer value not less than arg - FLOOR(18), // largest integer value not greater than arg - ABS(19), // Absolute value - RINT(20), // Rounds the floating-point argument arg to an integer value - BIT_INVERT(21), // Bitwise Not (~) - NOT(22); // Logical Not (!) + IDENTITY(0), // Identity function + SIN(1), // Trigonometric sine + COS(2), // Trigonometric cosine + TAN(3), // Trigonometric tangent + ARCSIN(4), // Trigonometric sine inverse + ARCCOS(5), // Trigonometric cosine inverse + ARCTAN(6), // Trigonometric tangent inverse + SINH(7), // Hyperbolic sine + COSH(8), // Hyperbolic cosine + TANH(9), // Hyperbolic tangent + ARCSINH(10), // Hyperbolic sine inverse + ARCCOSH(11), // Hyperbolic cosine inverse + ARCTANH(12), // Hyperbolic tangent inverse + EXP(13), // Exponential (base e, Euler number) + LOG(14), // Natural Logarithm (base e) + SQRT(15), // Square-root (x^0.5) + CBRT(16), // Cube-root (x^(1.0/3)) + CEIL(17), // Smallest integer value not less than arg + FLOOR(18), // largest integer value not greater than arg + ABS(19), // Absolute value + RINT(20), // Rounds the floating-point argument arg to an integer value + BIT_INVERT(21), // Bitwise Not (~) + NOT(22), // Logical Not (!) + CAST_TO_INT64(23), // Cast value to int64_t + CAST_TO_UINT64(24), // Cast value to uint64_t + CAST_TO_FLOAT64(25); // Cast value to double private final byte nativeId; diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java index 88b20414b0c..1ab3b97945d 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,199 +17,302 @@ package ai.rapids.cudf.nvcomp; import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.CloseableArray; import ai.rapids.cudf.Cuda; import ai.rapids.cudf.DeviceMemoryBuffer; import ai.rapids.cudf.HostMemoryBuffer; +import ai.rapids.cudf.MemoryBuffer; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + +import java.util.Arrays; /** Multi-buffer LZ4 compressor */ public class BatchedLZ4Compressor { - /** Describes a batched compression result */ - public static class BatchedCompressionResult { - private final DeviceMemoryBuffer[] compressedBuffers; - private final long[] compressedSizes; - - BatchedCompressionResult(DeviceMemoryBuffer[] buffers, long[] sizes) { - this.compressedBuffers = buffers; - this.compressedSizes = sizes; - } + static final long MAX_CHUNK_SIZE = 16777216; // in bytes + // each chunk has a 64-bit integer value as metadata containing the compressed size + static final long METADATA_BYTES_PER_CHUNK = 8; - /** - * Get the output compressed buffers corresponding to the input buffers. - * Note that the buffers are likely larger than required to store the compressed data. - */ - public DeviceMemoryBuffer[] getCompressedBuffers() { - return compressedBuffers; - } - - /** Get the corresponding amount of compressed data in each output buffer. */ - public long[] getCompressedSizes() { - return compressedSizes; - } - } + private final long chunkSize; + private final long targetIntermediateBufferSize; + private final long maxOutputChunkSize; /** - * Get the amount of temporary storage space required to compress a batch of buffers. - * @param inputs batch of data buffers to be individually compressed - * @param chunkSize compression chunk size to use - * @return amount in bytes of temporary storage space required to compress the batch + * Construct a batched LZ4 compressor instance + * @param chunkSize maximum amount of uncompressed data to compress as a single chunk. Inputs + * larger than this will be compressed in multiple chunks. + * @param targetIntermediateBufferSize desired maximum size of intermediate device buffers + * used during compression. */ - public static long getTempSize(BaseDeviceMemoryBuffer[] inputs, long chunkSize) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - return NvcompJni.batchedLZ4CompressGetTempSize(inputAddrs, inputSizes, chunkSize); + public BatchedLZ4Compressor(long chunkSize, long targetIntermediateBufferSize) { + validateChunkSize(chunkSize); + this.chunkSize = chunkSize; + this.maxOutputChunkSize = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(chunkSize); + assert maxOutputChunkSize < Integer.MAX_VALUE; + this.targetIntermediateBufferSize = Math.max(targetIntermediateBufferSize, maxOutputChunkSize); } /** - * Get the amount of output storage space required to compress a batch of buffers. - * @param inputs batch of data buffers to be individually compressed - * @param chunkSize compression chunk size to use - * @param tempBuffer temporary storage space - * @return amount in bytes of output storage space corresponding to each input buffer in the batch + * Compress a batch of buffers with LZ4. The input buffers will be closed. 
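+   * Each output buffer is laid out as per-chunk compressed-size metadata
+   * (8 bytes per chunk) followed by the compressed chunks themselves.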
+ * @param origInputs buffers to compress + * @param stream CUDA stream to use + * @return compressed buffers corresponding to the input buffers */ - public static long[] getOutputSizes(BaseDeviceMemoryBuffer[] inputs, long chunkSize, - BaseDeviceMemoryBuffer tempBuffer) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + public DeviceMemoryBuffer[] compress(BaseDeviceMemoryBuffer[] origInputs, Cuda.Stream stream) { + try (CloseableArray inputs = CloseableArray.wrap(origInputs)) { + if (chunkSize <= 0) { + throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + } + final int numInputs = inputs.size(); + if (numInputs == 0) { + return new DeviceMemoryBuffer[0]; + } + + // Each buffer is broken up into chunkSize chunks for compression. Calculate how many + // chunks are needed for each input buffer. + int[] chunksPerInput = new int[numInputs]; + int numChunks = 0; + for (int i = 0; i < numInputs; i++) { + BaseDeviceMemoryBuffer buffer = inputs.get(i); + int numBufferChunks = getNumChunksInBuffer(buffer); + chunksPerInput[i] = numBufferChunks; + numChunks += numBufferChunks; + } + + // Allocate buffers for each chunk and generate parallel lists of chunk source addresses, + // chunk destination addresses, and sizes. + try (CloseableArray compressedBuffers = + allocCompressedBuffers(numChunks, stream); + DeviceMemoryBuffer compressedChunkSizes = + DeviceMemoryBuffer.allocate(numChunks * 8L, stream)) { + long[] inputChunkAddrs = new long[numChunks]; + long[] inputChunkSizes = new long[numChunks]; + long[] outputChunkAddrs = new long[numChunks]; + buildAddrsAndSizes(inputs, inputChunkAddrs, inputChunkSizes, + compressedBuffers, outputChunkAddrs); + + long[] outputChunkSizes; + final long tempBufferSize = NvcompJni.batchedLZ4CompressGetTempSize(numChunks, chunkSize); + try (DeviceMemoryBuffer addrsAndSizes = + putAddrsAndSizesOnDevice(inputChunkAddrs, inputChunkSizes, outputChunkAddrs, stream); + DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempBufferSize, stream)) { + final long devOutputAddrsPtr = addrsAndSizes.getAddress() + numChunks * 8L; + final long devInputSizesPtr = devOutputAddrsPtr + numChunks * 8L; + NvcompJni.batchedLZ4CompressAsync( + addrsAndSizes.getAddress(), + devInputSizesPtr, + chunkSize, + numChunks, + tempBuffer.getAddress(), + tempBufferSize, + devOutputAddrsPtr, + compressedChunkSizes.getAddress(), + stream.getStream()); + } + + // Synchronously copy the resulting compressed sizes per chunk. 
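+        // The per-chunk sizes are needed on the host to compute the stitched
+        // output buffer lengths, so this copy must complete before stitching.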
+ outputChunkSizes = getOutputChunkSizes(compressedChunkSizes, stream); + + // inputs are no longer needed at this point, so free them early + inputs.close(); + + // Combine compressed chunks into output buffers corresponding to each original input + return stitchOutput(chunksPerInput, compressedChunkSizes, outputChunkAddrs, + outputChunkSizes, stream); + } } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + } + + static void validateChunkSize(long chunkSize) { + if (chunkSize <= 0 || chunkSize > MAX_CHUNK_SIZE) { + throw new IllegalArgumentException("Invalid chunk size: " + chunkSize + " Max chunk size is: " + + MAX_CHUNK_SIZE + " bytes"); } - return NvcompJni.batchedLZ4CompressGetOutputSize(inputAddrs, inputSizes, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength()); } - /** - * Calculates the minimum size in bytes necessary to store the compressed output sizes - * when performing an asynchronous batch compression. - * @param numBuffers number of buffers in the batch - * @return minimum size of the compressed output sizes buffer needed - */ - public static long getCompressedSizesBufferSize(int numBuffers) { - // Each compressed size value is a 64-bit long - return numBuffers * 8; + private static long ceilingDivide(long x, long y) { + return (x + y - 1) / y; } - /** - * Asynchronously compress a batch of input buffers. The compressed size output buffer must be - * pinned memory for this operation to be truly asynchronous. Note that the caller must - * synchronize on the specified CUDA stream in order to safely examine the compressed output - * sizes! 
- * @param compressedSizesOutputBuffer host memory where the compressed output size will be stored - * @param inputs buffers to compress - * @param chunkSize type of data within each buffer - * @param tempBuffer compression chunk size to use - * @param outputs output buffers that will contain the compressed results - * @param stream CUDA stream to use - */ - public static void compressAsync(HostMemoryBuffer compressedSizesOutputBuffer, - BaseDeviceMemoryBuffer[] inputs, long chunkSize, - BaseDeviceMemoryBuffer tempBuffer, - BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - int numBuffers = inputs.length; - if (outputs.length != numBuffers) { - throw new IllegalArgumentException("buffer count mismatch, " + numBuffers + " inputs and " + - outputs.length + " outputs"); - } - if (compressedSizesOutputBuffer.getLength() < getCompressedSizesBufferSize(numBuffers)) { - throw new IllegalArgumentException("compressed output size buffer must be able to hold " + - "at least 8 bytes per buffer, size is only " + compressedSizesOutputBuffer.getLength()); - } + private int getNumChunksInBuffer(MemoryBuffer buffer) { + return (int) ceilingDivide(buffer.getLength(), chunkSize); + } - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + private CloseableArray allocCompressedBuffers(long numChunks, + Cuda.Stream stream) { + final long chunksPerBuffer = targetIntermediateBufferSize / maxOutputChunkSize; + final long numBuffers = ceilingDivide(numChunks, chunksPerBuffer); + if (numBuffers > Integer.MAX_VALUE) { + throw new IllegalStateException("Too many chunks"); } - - long[] outputAddrs = new long[numBuffers]; - long[] outputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = outputs[i]; - outputAddrs[i] = buffer.getAddress(); - outputSizes[i] = buffer.getLength(); + try (NvtxRange range = new NvtxRange("allocCompressedBuffers", NvtxColor.YELLOW)) { + CloseableArray buffers = CloseableArray.wrap( + new DeviceMemoryBuffer[(int) numBuffers]); + try { + // allocate all of the max-chunks intermediate compressed buffers + for (int i = 0; i < buffers.size() - 1; ++i) { + buffers.set(i, DeviceMemoryBuffer.allocate(chunksPerBuffer * maxOutputChunkSize, stream)); + } + // allocate the tail intermediate compressed buffer that may be smaller than the others + buffers.set(buffers.size() - 1, DeviceMemoryBuffer.allocate( + (numChunks - chunksPerBuffer * (buffers.size() - 1)) * maxOutputChunkSize, stream)); + return buffers; + } catch (Exception e) { + buffers.close(e); + throw e; + } } - - NvcompJni.batchedLZ4CompressAsync(compressedSizesOutputBuffer.getAddress(), - inputAddrs, inputSizes, chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), - outputAddrs, outputSizes, stream.getStream()); } - /** - * Compress a batch of buffers with LZ4 - * @param inputs buffers to compress - * @param chunkSize compression chunk size to use - * @param stream CUDA stream to use - * @return compression results containing the corresponding output buffer and compressed data size - * for each input buffer - */ - public static BatchedCompressionResult compress(BaseDeviceMemoryBuffer[] inputs, long chunkSize, - Cuda.Stream stream) { - if (chunkSize <= 0) { - throw new 
IllegalArgumentException("Illegal chunk size: " + chunkSize); + // Fill in the inputChunkAddrs, inputChunkSizes, and outputChunkAddrs arrays to point + // into the chunks in the input and output buffers. + private void buildAddrsAndSizes(CloseableArray inputs, + long[] inputChunkAddrs, + long[] inputChunkSizes, + CloseableArray compressedBuffers, + long[] outputChunkAddrs) { + // setup the input addresses and sizes + int chunkIdx = 0; + for (BaseDeviceMemoryBuffer input : inputs.getArray()) { + final int numChunksInBuffer = getNumChunksInBuffer(input); + for (int i = 0; i < numChunksInBuffer; i++) { + inputChunkAddrs[chunkIdx] = input.getAddress() + i * chunkSize; + inputChunkSizes[chunkIdx] = (i != numChunksInBuffer - 1) ? chunkSize + : (input.getLength() - (long) i * chunkSize); + ++chunkIdx; + } } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + assert chunkIdx == inputChunkAddrs.length; + assert chunkIdx == inputChunkSizes.length; + + // setup output addresses + chunkIdx = 0; + for (DeviceMemoryBuffer buffer : compressedBuffers.getArray()) { + assert buffer.getLength() % maxOutputChunkSize == 0; + long numChunksInBuffer = buffer.getLength() / maxOutputChunkSize; + long baseAddr = buffer.getAddress(); + for (int i = 0; i < numChunksInBuffer; i++) { + outputChunkAddrs[chunkIdx++] = baseAddr + i * maxOutputChunkSize; + } } + assert chunkIdx == outputChunkAddrs.length; + } - DeviceMemoryBuffer[] outputBuffers = new DeviceMemoryBuffer[numBuffers]; - try { - long tempSize = NvcompJni.batchedLZ4CompressGetTempSize(inputAddrs, inputSizes, chunkSize); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - long[] outputSizes = NvcompJni.batchedLZ4CompressGetOutputSize(inputAddrs, inputSizes, - chunkSize, tempBuffer.getAddress(), tempBuffer.getLength()); - long[] outputAddrs = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(outputSizes[i]); - outputBuffers[i] = buffer; - outputAddrs[i] = buffer.getAddress(); + // Write input addresses, output addresses and sizes contiguously into a DeviceMemoryBuffer. 
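+  // Layout: input chunk addresses, then output chunk addresses, then input
+  // chunk sizes, 8 bytes per entry; compress() derives the device-side
+  // pointers for each vector from these fixed offsets.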
+ private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs, + long[] inputSizes, + long[] outputAddrs, + Cuda.Stream stream) { + final long totalSize = inputAddrs.length * 8L * 3; // space for input, output, and size arrays + final long outputAddrsOffset = inputAddrs.length * 8L; + final long sizesOffset = outputAddrsOffset + inputAddrs.length * 8L; + try (NvtxRange range = new NvtxRange("putAddrsAndSizesOnDevice", NvtxColor.YELLOW)) { + try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(totalSize); + DeviceMemoryBuffer result = DeviceMemoryBuffer.allocate(totalSize)) { + hostbuf.setLongs(0, inputAddrs, 0, inputAddrs.length); + hostbuf.setLongs(outputAddrsOffset, outputAddrs, 0, outputAddrs.length); + for (int i = 0; i < inputSizes.length; i++) { + hostbuf.setLong(sizesOffset + i * 8L, inputSizes[i]); } + result.copyFromHostBuffer(hostbuf, stream); + result.incRefCount(); + return result; + } + } + } - long compressedSizesBufferSize = getCompressedSizesBufferSize(numBuffers); - try (HostMemoryBuffer compressedSizesBuffer = - HostMemoryBuffer.allocate(compressedSizesBufferSize)) { - NvcompJni.batchedLZ4CompressAsync(compressedSizesBuffer.getAddress(), - inputAddrs, inputSizes, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength(), - outputAddrs, outputSizes, stream.getStream()); - stream.sync(); - long[] compressedSizes = new long[numBuffers]; - compressedSizesBuffer.getLongs(compressedSizes, 0, 0, numBuffers); - return new BatchedCompressionResult(outputBuffers, compressedSizes); + // Synchronously copy the resulting compressed sizes from device memory to host memory. + private long[] getOutputChunkSizes(BaseDeviceMemoryBuffer devChunkSizes, Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("getOutputChunkSizes", NvtxColor.YELLOW)) { + try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(devChunkSizes.getLength())) { + hostbuf.copyFromDeviceBuffer(devChunkSizes, stream); + int numChunks = (int) (devChunkSizes.getLength() / 8); + long[] result = new long[numChunks]; + for (int i = 0; i < numChunks; i++) { + long size = hostbuf.getLong(i * 8L); + assert size < Integer.MAX_VALUE : "output size is too big"; + result[i] = size; } + return result; } - } catch (Throwable t) { - for (DeviceMemoryBuffer buffer : outputBuffers) { - if (buffer != null) { - buffer.close(); + } + } + + // Stitch together the individual chunks into the result buffers. + // Each result buffer has metadata at the beginning, followed by compressed chunks. + // This is done by building up parallel lists of source addr, dest addr and size and + // then calling multiBufferCopyAsync() + private DeviceMemoryBuffer[] stitchOutput(int[] chunksPerInput, + DeviceMemoryBuffer compressedChunkSizes, + long[] outputChunkAddrs, + long[] outputChunkSizes, + Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("stitchOutput", NvtxColor.YELLOW)) { + final int numOutputs = chunksPerInput.length; + final long chunkSizesAddr = compressedChunkSizes.getAddress(); + long[] outputBufferSizes = calcOutputBufferSizes(chunksPerInput, outputChunkSizes); + try (CloseableArray outputs = + CloseableArray.wrap(new DeviceMemoryBuffer[numOutputs])) { + // Each chunk needs to be copied, and each output needs a copy of the + // compressed chunk size vector representing the metadata. 
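+      // The parallel dest/src/size arrays below are built so that all copies
+      // can be issued with a single multiBufferCopyAsync call on the stream.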
+ final int totalBuffersToCopy = numOutputs + outputChunkAddrs.length; + long[] destAddrs = new long[totalBuffersToCopy]; + long[] srcAddrs = new long[totalBuffersToCopy]; + long[] sizes = new long[totalBuffersToCopy]; + int copyBufferIdx = 0; + int chunkIdx = 0; + for (int outputIdx = 0; outputIdx < numOutputs; outputIdx++) { + DeviceMemoryBuffer outputBuffer = DeviceMemoryBuffer.allocate(outputBufferSizes[outputIdx]); + final long outputBufferAddr = outputBuffer.getAddress(); + outputs.set(outputIdx, outputBuffer); + final long numChunks = chunksPerInput[outputIdx]; + final long metadataSize = numChunks * METADATA_BYTES_PER_CHUNK; + + // setup a copy of the metadata at the front of the output buffer + srcAddrs[copyBufferIdx] = chunkSizesAddr + chunkIdx * 8; + destAddrs[copyBufferIdx] = outputBufferAddr; + sizes[copyBufferIdx] = metadataSize; + ++copyBufferIdx; + + // setup copies of the compressed chunks for this output buffer + long nextChunkAddr = outputBufferAddr + metadataSize; + for (int i = 0; i < numChunks; ++i) { + srcAddrs[copyBufferIdx] = outputChunkAddrs[chunkIdx]; + destAddrs[copyBufferIdx] = nextChunkAddr; + final long chunkSize = outputChunkSizes[chunkIdx]; + sizes[copyBufferIdx] = chunkSize; + copyBufferIdx++; + chunkIdx++; + nextChunkAddr += chunkSize; + } } + assert copyBufferIdx == totalBuffersToCopy; + assert chunkIdx == outputChunkAddrs.length; + assert chunkIdx == outputChunkSizes.length; + + Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream); + return outputs.release(); } - throw t; } } - + // Calculate the list of sizes for each output buffer (metadata plus size of compressed chunks) + private long[] calcOutputBufferSizes(int[] chunksPerInput, + long[] outputChunkSizes) { + long[] sizes = new long[chunksPerInput.length]; + int chunkIdx = 0; + for (int i = 0; i < sizes.length; i++) { + final int chunksInBuffer = chunksPerInput[i]; + final int chunkEndIdx = chunkIdx + chunksInBuffer; + // metadata stored in front of compressed data + long bufferSize = METADATA_BYTES_PER_CHUNK * chunksInBuffer; + // add in the compressed chunk sizes to get the total size + while (chunkIdx < chunkEndIdx) { + bufferSize += outputChunkSizes[chunkIdx++]; + } + sizes[i] = bufferSize; + } + assert chunkIdx == outputChunkSizes.length; + return sizes; + } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java index 61969db4fb4..40ad4d5e9ed 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,214 +16,183 @@ package ai.rapids.cudf.nvcomp;
+import ai.rapids.cudf.CloseableArray;
 import ai.rapids.cudf.Cuda;
 import ai.rapids.cudf.BaseDeviceMemoryBuffer;
 import ai.rapids.cudf.DeviceMemoryBuffer;
-import ai.rapids.cudf.MemoryCleaner;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import ai.rapids.cudf.HostMemoryBuffer;
+import ai.rapids.cudf.NvtxColor;
+import ai.rapids.cudf.NvtxRange;
+
+import java.util.Arrays;
 
 /** LZ4 decompressor that operates on multiple input buffers in a batch */
 public class BatchedLZ4Decompressor {
-  private static final Logger log = LoggerFactory.getLogger(Decompressor.class);
-
-  /**
-   * Get the metadata associated with a batch of compressed buffers
-   * @param inputs compressed buffers that will be decompressed
-   * @param stream CUDA stream to use
-   * @return opaque metadata object
-   */
-  public static BatchedMetadata getMetadata(BaseDeviceMemoryBuffer[] inputs, Cuda.Stream stream) {
-    long[] inputAddrs = new long[inputs.length];
-    long[] inputSizes = new long[inputs.length];
-    for (int i = 0; i < inputs.length; ++i) {
-      BaseDeviceMemoryBuffer buffer = inputs[i];
-      inputAddrs[i] = buffer.getAddress();
-      inputSizes[i] = buffer.getLength();
-    }
-    return new BatchedMetadata(NvcompJni.batchedLZ4DecompressGetMetadata(
-        inputAddrs, inputSizes, stream.getStream()));
-  }
-
-  /**
-   * Get the amount of temporary storage required to decompress a batch of buffers
-   * @param metadata metadata retrieved from the compressed buffers
-   * @return amount in bytes of temporary storage space required to decompress the buffer batch
-   */
-  public static long getTempSize(BatchedMetadata metadata) {
-    return NvcompJni.batchedLZ4DecompressGetTempSize(metadata.getMetadata());
-  }
-
-  /**
-   * Get the amount of ouptut storage required to decopmress a batch of buffers
-   * @param metadata metadata retrieved from the compressed buffers
-   * @param numOutputs number of buffers in the batch
-   * @return amount in bytes of temporary storage space required to decompress the buffer batch
-   */
-  public static long[] getOutputSizes(BatchedMetadata metadata, int numOutputs) {
-    return NvcompJni.batchedLZ4DecompressGetOutputSize(metadata.getMetadata(), numOutputs);
-  }
-
   /**
    * Asynchronously decompress a batch of buffers
-   * @param inputs buffers to decompress
-   * @param tempBuffer temporary buffer
-   * @param metadata metadata retrieved from the compressed buffers
-   * @param outputs output buffers that will contain the compressed results
-   * @param stream CUDA stream to use
+   * @param chunkSize maximum uncompressed block size, must match value used during compression
+   * @param origInputs buffers to decompress, will be closed by this operation
+   * @param outputs output buffers that will contain the uncompressed results, each must be sized
+   *                to the exact decompressed size of the corresponding input
+   * @param stream CUDA stream to use
    */
-  public static void decompressAsync(BaseDeviceMemoryBuffer[] inputs,
-      BaseDeviceMemoryBuffer tempBuffer, BatchedMetadata metadata,
-      BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) {
-    int numBuffers = inputs.length;
-    if (outputs.length != numBuffers) {
-      throw new IllegalArgumentException("buffer count mismatch, " + numBuffers + " inputs and " +
-          outputs.length + " outputs");
-    }
+  public static void decompressAsync(long chunkSize,
+                                     BaseDeviceMemoryBuffer[] origInputs,
+                                     BaseDeviceMemoryBuffer[] outputs,
+                                     Cuda.Stream stream) {
+    try (CloseableArray inputs =
+        CloseableArray.wrap(Arrays.copyOf(origInputs, origInputs.length))) {
+
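+      // A copy of the input array is wrapped so that every input buffer is
+      // closed on exit, even if validation or setup below throws.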
BatchedLZ4Compressor.validateChunkSize(chunkSize); + if (origInputs.length != outputs.length) { + throw new IllegalArgumentException("number of inputs must match number of outputs"); + } + final int numInputs = inputs.size(); + if (numInputs == 0) { + return; + } - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } + int[] chunksPerInput = new int[numInputs]; + long totalChunks = 0; + for (int i = 0; i < numInputs; i++) { + // use output size to determine number of chunks in the input, as the output buffer + // must be exactly sized to the uncompressed data + BaseDeviceMemoryBuffer buffer = outputs[i]; + int numBufferChunks = getNumChunksInBuffer(chunkSize, buffer); + chunksPerInput[i] = numBufferChunks; + totalChunks += numBufferChunks; + } - long[] outputAddrs = new long[numBuffers]; - long[] outputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = outputs[i]; - outputAddrs[i] = buffer.getAddress(); - outputSizes[i] = buffer.getLength(); + final long tempBufferSize = NvcompJni.batchedLZ4DecompressGetTempSize(totalChunks, chunkSize); + try (DeviceMemoryBuffer devAddrsSizes = + buildAddrsSizesBuffer(chunkSize, totalChunks, inputs.getArray(), chunksPerInput, + outputs, stream); + DeviceMemoryBuffer devTemp = DeviceMemoryBuffer.allocate(tempBufferSize)) { + // buffer containing addresses and sizes contains four vectors of longs in this order: + // - compressed chunk input addresses + // - chunk output buffer addresses + // - compressed chunk sizes + // - uncompressed chunk sizes + final long inputAddrsPtr = devAddrsSizes.getAddress(); + final long outputAddrsPtr = inputAddrsPtr + totalChunks * 8; + final long inputSizesPtr = outputAddrsPtr + totalChunks * 8; + final long outputSizesPtr = inputSizesPtr + totalChunks * 8; + NvcompJni.batchedLZ4DecompressAsync( + inputAddrsPtr, + inputSizesPtr, + outputSizesPtr, + totalChunks, + devTemp.getAddress(), + devTemp.getLength(), + outputAddrsPtr, + stream.getStream()); + } } + } - NvcompJni.batchedLZ4DecompressAsync(inputAddrs, inputSizes, - tempBuffer.getAddress(), tempBuffer.getLength(), metadata.getMetadata(), - outputAddrs, outputSizes, stream.getStream()); + private static int getNumChunksInBuffer(long chunkSize, BaseDeviceMemoryBuffer buffer) { + return (int) ((buffer.getLength() + chunkSize - 1) / chunkSize); } /** - * Asynchronously decompress a batch of buffers - * @param inputs buffers to decompress + * Build a device memory buffer containing four vectors of longs in the following order: + *
+   * <ul>
+   * <li>compressed chunk input addresses</li>
+   * <li>uncompressed chunk output addresses</li>
+   * <li>compressed chunk sizes</li>
+   * <li>uncompressed chunk sizes</li>
+   * </ul>
+ * Each vector contains as many longs as the number of chunks being decompressed + * @param chunkSize maximum uncompressed size of a chunk + * @param totalChunks total number of chunks to be decompressed + * @param inputs device buffers containing the compressed data + * @param chunksPerInput number of compressed chunks per input buffer + * @param outputs device buffers that will hold the uncompressed output * @param stream CUDA stream to use - * @return output buffers containing compressed data corresponding to the input buffers + * @return device buffer containing address and size vectors */ - public static DeviceMemoryBuffer[] decompressAsync(BaseDeviceMemoryBuffer[] inputs, - Cuda.Stream stream) { - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - - long metadata = NvcompJni.batchedLZ4DecompressGetMetadata(inputAddrs, inputSizes, - stream.getStream()); - try { - long[] outputSizes = NvcompJni.batchedLZ4DecompressGetOutputSize(metadata, numBuffers); - long[] outputAddrs = new long[numBuffers]; - DeviceMemoryBuffer[] outputs = new DeviceMemoryBuffer[numBuffers]; - try { - for (int i = 0; i < numBuffers; ++i) { - DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(outputSizes[i]); - outputs[i] = buffer; - outputAddrs[i] = buffer.getAddress(); - } - - long tempSize = NvcompJni.batchedLZ4DecompressGetTempSize(metadata); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - NvcompJni.batchedLZ4DecompressAsync(inputAddrs, inputSizes, - tempBuffer.getAddress(), tempBuffer.getLength(), metadata, - outputAddrs, outputSizes, stream.getStream()); - } - } catch (Throwable t) { - for (DeviceMemoryBuffer buffer : outputs) { - if (buffer != null) { - buffer.close(); + private static DeviceMemoryBuffer buildAddrsSizesBuffer(long chunkSize, + long totalChunks, + BaseDeviceMemoryBuffer[] inputs, + int[] chunksPerInput, + BaseDeviceMemoryBuffer[] outputs, + Cuda.Stream stream) { + final long totalBufferSize = totalChunks * 8L * 4L; + try (NvtxRange range = new NvtxRange("buildAddrSizesBuffer", NvtxColor.YELLOW)) { + try (HostMemoryBuffer metadata = fetchMetadata(totalChunks, inputs, chunksPerInput, stream); + HostMemoryBuffer hostAddrsSizes = HostMemoryBuffer.allocate(totalBufferSize); + DeviceMemoryBuffer devAddrsSizes = DeviceMemoryBuffer.allocate(totalBufferSize)) { + // Build four long vectors in the AddrsSizes buffer: + // - compressed input address (one per chunk) + // - uncompressed output address (one per chunk) + // - compressed input size (one per chunk) + // - uncompressed input size (one per chunk) + final long srcAddrsOffset = 0; + final long destAddrsOffset = srcAddrsOffset + totalChunks * 8L; + final long srcSizesOffset = destAddrsOffset + totalChunks * 8L; + final long destSizesOffset = srcSizesOffset + totalChunks * 8L; + long chunkIdx = 0; + for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) { + final BaseDeviceMemoryBuffer input = inputs[inputIdx]; + final BaseDeviceMemoryBuffer output = outputs[inputIdx]; + final int numChunksInInput = chunksPerInput[inputIdx]; + long srcAddr = input.getAddress() + + BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK * numChunksInInput; + long destAddr = output.getAddress(); + final long chunkIdxEnd = chunkIdx + numChunksInInput; + while (chunkIdx < chunkIdxEnd) { + final long 
srcChunkSize = metadata.getLong(chunkIdx * 8); + final long destChunkSize = (chunkIdx < chunkIdxEnd - 1) ? chunkSize + : output.getAddress() + output.getLength() - destAddr; + hostAddrsSizes.setLong(srcAddrsOffset + chunkIdx * 8, srcAddr); + hostAddrsSizes.setLong(destAddrsOffset + chunkIdx * 8, destAddr); + hostAddrsSizes.setLong(srcSizesOffset + chunkIdx * 8, srcChunkSize); + hostAddrsSizes.setLong(destSizesOffset + chunkIdx * 8, destChunkSize); + srcAddr += srcChunkSize; + destAddr += destChunkSize; + ++chunkIdx; } } - throw t; + devAddrsSizes.copyFromHostBuffer(hostAddrsSizes, stream); + devAddrsSizes.incRefCount(); + return devAddrsSizes; } - - return outputs; - } finally { - NvcompJni.batchedLZ4DecompressDestroyMetadata(metadata); } } - /** Opaque metadata object for batched LZ4 decompression */ - public static class BatchedMetadata implements AutoCloseable { - private final BatchedMetadataCleaner cleaner; - private final long id; - private boolean closed = false; - - BatchedMetadata(long metadata) { - this.cleaner = new BatchedMetadataCleaner(metadata); - this.id = cleaner.id; - MemoryCleaner.register(this, cleaner); - cleaner.addRef(); - } - - long getMetadata() { - return cleaner.metadata; - } - - public boolean isLZ4Metadata() { - return NvcompJni.isLZ4Metadata(getMetadata()); - } - - @Override - public synchronized void close() { - if (!closed) { - cleaner.delRef(); - cleaner.clean(false); - closed = true; - } else { - cleaner.logRefCountDebug("double free " + this); - throw new IllegalStateException("Close called too many times " + this); - } - } - - @Override - public String toString() { - return "LZ4 BATCHED METADATA (ID: " + id + " " + - Long.toHexString(cleaner.metadata) + ")"; - } - - private static class BatchedMetadataCleaner extends MemoryCleaner.Cleaner { - private long metadata; - - BatchedMetadataCleaner(long metadata) { - this.metadata = metadata; - } - - @Override - protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { - boolean neededCleanup = false; - long address = metadata; - if (metadata != 0) { - try { - NvcompJni.batchedLZ4DecompressDestroyMetadata(metadata); - } finally { - // Always mark the resource as freed even if an exception is thrown. - // We cannot know how far it progressed before the exception, and - // therefore it is unsafe to retry. - metadata = 0; - } - neededCleanup = true; - } - if (neededCleanup && logErrorIfNotClean) { - log.error("LZ4 BATCHED METADATA WAS LEAKED (Address: " + Long.toHexString(address) + ")"); - logRefCountDebug("Leaked event"); + /** + * Fetch the metadata at the front of each input in a single, contiguous host buffer. + * @param totalChunks total number of compressed chunks + * @param inputs buffers containing the compressed data + * @param chunksPerInput number of compressed chunks for the corresponding input + * @param stream CUDA stream to use + * @return host buffer containing all of the metadata + */ + private static HostMemoryBuffer fetchMetadata(long totalChunks, + BaseDeviceMemoryBuffer[] inputs, + int[] chunksPerInput, + Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("fetchMetadata", NvtxColor.PURPLE)) { + // one long per chunk containing the compressed size + final long totalMetadataSize = totalChunks * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK; + // Build corresponding vectors of destination addresses, source addresses and sizes. 
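+      // One copy per input: each input's leading metadata block is gathered into
+      // a single contiguous device buffer before being fetched to the host.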
+ long[] destAddrs = new long[inputs.length]; + long[] srcAddrs = new long[inputs.length]; + long[] sizes = new long[inputs.length]; + try (HostMemoryBuffer hostMetadata = HostMemoryBuffer.allocate(totalMetadataSize); + DeviceMemoryBuffer devMetadata = DeviceMemoryBuffer.allocate(totalMetadataSize)) { + long destCopyAddr = devMetadata.getAddress(); + for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) { + final BaseDeviceMemoryBuffer input = inputs[inputIdx]; + final long copySize = chunksPerInput[inputIdx] * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK; + destAddrs[inputIdx] = destCopyAddr; + srcAddrs[inputIdx] = input.getAddress(); + sizes[inputIdx] = copySize; + destCopyAddr += copySize; } - return neededCleanup; - } - - @Override - public boolean isClean() { - return metadata != 0; + Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream); + hostMetadata.copyFromDeviceBuffer(devMetadata, stream); + hostMetadata.incRefCount(); + return hostMetadata; } } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java b/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java index 5a133acbf7c..70f0a021a4d 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,10 @@ public enum CompressionType { INT(4), UINT(5), LONGLONG(6), - ULONGLONG(7); + ULONGLONG(7), + BITS(0xff); + + private static final CompressionType[] types = CompressionType.values(); final int nativeId; @@ -33,6 +36,17 @@ public enum CompressionType { this.nativeId = nativeId; } + /** Lookup the CompressionType that corresponds to the specified native identifier */ + public static CompressionType fromNativeId(int id) { + for (CompressionType type : types) { + if (type.nativeId == id) { + return type; + } + } + throw new IllegalArgumentException("Unknown compression type ID: " + id); + } + + /** Get the native code identifier for the type */ public final int toNativeId() { return nativeId; } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java deleted file mode 100644 index 90dabfbcf8e..00000000000 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ai.rapids.cudf.nvcomp; - -import ai.rapids.cudf.Cuda; -import ai.rapids.cudf.BaseDeviceMemoryBuffer; -import ai.rapids.cudf.MemoryCleaner; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Generic single-buffer decompressor interface */ -public class Decompressor { - private static final Logger log = LoggerFactory.getLogger(Decompressor.class); - - /** - * Get the metadata associated with a compressed buffer - * @param buffer compressed data buffer - * @param stream CUDA stream to use - * @return opaque metadata object - */ - public static Metadata getMetadata(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) { - long metadata = NvcompJni.decompressGetMetadata(buffer.getAddress(), buffer.getLength(), - stream.getStream()); - return new Metadata(metadata); - } - - /** - * Get the amount of temporary storage space required to decompress a buffer. - * @param metadata metadata retrieved from the compressed data - * @return amount in bytes of temporary storage space required to decompress - */ - public static long getTempSize(Metadata metadata) { - return NvcompJni.decompressGetTempSize(metadata.getMetadata()); - } - - /** - * Get the amount of output storage space required to hold the uncompressed data. - * @param metadata metadata retrieved from the compressed data - * @return amount in bytes of output storage space required to decompress - */ - public static long getOutputSize(Metadata metadata) { - return NvcompJni.decompressGetOutputSize(metadata.getMetadata()); - } - - /** - * Asynchronously decompress a buffer. - * @param input compressed data buffer - * @param tempBuffer temporary storage buffer - * @param metadata metadata retrieved from compressed data - * @param output output storage buffer - * @param stream CUDA stream to use - */ - public static void decompressAsync(BaseDeviceMemoryBuffer input, BaseDeviceMemoryBuffer tempBuffer, - Metadata metadata, BaseDeviceMemoryBuffer output, Cuda.Stream stream) { - NvcompJni.decompressAsync( - input.getAddress(), input.getLength(), - tempBuffer.getAddress(), tempBuffer.getLength(), - metadata.getMetadata(), - output.getAddress(), output.getLength(), - stream.getStream()); - } - - /** - * Determine if a buffer is data compressed with LZ4. 
- * @param buffer data to examine - * @return true if the data is LZ4 compressed - */ - public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer) { - return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength()); - } - - - /** Opaque metadata object for single-buffer decompression */ - public static class Metadata implements AutoCloseable { - private final MetadataCleaner cleaner; - private final long id; - private boolean closed = false; - - Metadata(long metadata) { - this.cleaner = new MetadataCleaner(metadata); - this.id = cleaner.id; - MemoryCleaner.register(this, cleaner); - cleaner.addRef(); - } - - long getMetadata() { - return cleaner.metadata; - } - - /** - * Determine if this metadata is associated with LZ4-compressed data - * @return true if the metadata is associated with LZ4-compressed data - */ - public boolean isLZ4Metadata() { - return NvcompJni.isLZ4Metadata(getMetadata()); - } - - @Override - public synchronized void close() { - if (!closed) { - cleaner.delRef(); - cleaner.clean(false); - closed = true; - } else { - cleaner.logRefCountDebug("double free " + this); - throw new IllegalStateException("Close called too many times " + this); - } - } - - @Override - public String toString() { - return "DECOMPRESSOR METADATA (ID: " + id + " " + - Long.toHexString(cleaner.metadata) + ")"; - } - - private static class MetadataCleaner extends MemoryCleaner.Cleaner { - private long metadata; - - MetadataCleaner(long metadata) { - this.metadata = metadata; - } - - @Override - protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { - boolean neededCleanup = false; - long address = metadata; - if (metadata != 0) { - try { - NvcompJni.decompressDestroyMetadata(metadata); - } finally { - // Always mark the resource as freed even if an exception is thrown. - // We cannot know how far it progressed before the exception, and - // therefore it is unsafe to retry. - metadata = 0; - } - neededCleanup = true; - } - if (neededCleanup && logErrorIfNotClean) { - log.error("DECOMPRESSOR METADATA WAS LEAKED (Address: " + - Long.toHexString(address) + ")"); - logRefCountDebug("Leaked event"); - } - return neededCleanup; - } - - @Override - public boolean isClean() { - return metadata != 0; - } - } - } -} diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java index ce7012a3bee..67a770f1346 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,46 +18,54 @@ import ai.rapids.cudf.Cuda; import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.DeviceMemoryBuffer; import ai.rapids.cudf.HostMemoryBuffer; /** Single-buffer compressor implementing LZ4 */ public class LZ4Compressor { - /** - * Calculate the amount of temporary storage space required to compress a buffer. 
- * @param input buffer to compress - * @param inputType type of data within the buffer - * @param chunkSize compression chunk size to use - * @return amount in bytes of temporary storage space required to compress the buffer - */ - public static long getTempSize(BaseDeviceMemoryBuffer input, CompressionType inputType, - long chunkSize) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + /** LZ4 compression settings corresponding to a chunk size */ + public static final class Configuration { + private final long metadataBytes; + private final long tempBytes; + private final long maxCompressedBytes; + + Configuration(long metadataBytes, long tempBytes, long maxCompressedBytes) { + this.metadataBytes = metadataBytes; + this.tempBytes = tempBytes; + this.maxCompressedBytes = maxCompressedBytes; + } + + /** Get the size of the metadata information in bytes */ + public long getMetadataBytes() { + return metadataBytes; + } + + /** Get the size of the temporary storage in bytes needed to compress */ + public long getTempBytes() { + return tempBytes; + } + + /** Get the maximum compressed output size in bytes */ + public long getMaxCompressedBytes() { + return maxCompressedBytes; } - return NvcompJni.lz4CompressGetTempSize(input.getAddress(), input.getLength(), - inputType.nativeId, chunkSize); } /** - * Calculate the amount of output storage space required to compress a buffer. - * @param input buffer to compress - * @param inputType type of data within the buffer - * @param chunkSize compression chunk size to use - * @param tempBuffer temporary storage space - * @return amount in bytes of output storage space required to compress the buffer + * Get the compression configuration necessary for a particular chunk size. + * @param chunkSize size of an LZ4 chunk in bytes + * @param uncompressedSize total size of the uncompressed data + * @return compression configuration for the specified chunk size */ - public static long getOutputSize(BaseDeviceMemoryBuffer input, CompressionType inputType, - long chunkSize, BaseDeviceMemoryBuffer tempBuffer) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - return NvcompJni.lz4CompressGetOutputSize(input.getAddress(), input.getLength(), - inputType.nativeId, chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), false); + public static Configuration configure(long chunkSize, long uncompressedSize) { + long[] configs = NvcompJni.lz4CompressConfigure(chunkSize, uncompressedSize); + assert configs.length == 3; + return new Configuration(configs[0], configs[1], configs[2]); } /** - * Compress a buffer with LZ4. + * Synchronously compress a buffer with LZ4. 
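+   * This blocks on the given stream to read back the compressed size.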
* @param input buffer to compress * @param inputType type of data within the buffer * @param chunkSize compression chunk size to use @@ -72,16 +80,19 @@ public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputT if (chunkSize <= 0) { throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); } - return NvcompJni.lz4Compress(input.getAddress(), input.getLength(), inputType.nativeId, - chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), - output.getAddress(), output.getLength(), stream.getStream()); + try (DeviceMemoryBuffer devOutputSizeBuffer = DeviceMemoryBuffer.allocate(Long.BYTES); + HostMemoryBuffer hostOutputSizeBuffer = HostMemoryBuffer.allocate(Long.BYTES)) { + compressAsync(devOutputSizeBuffer, input, inputType, chunkSize, tempBuffer, output, stream); + hostOutputSizeBuffer.copyFromDeviceBuffer(devOutputSizeBuffer, stream); + return hostOutputSizeBuffer.getLong(0); + } } /** * Asynchronously compress a buffer with LZ4. The compressed size output buffer must be pinned * memory for this operation to be truly asynchronous. Note that the caller must synchronize * on the specified CUDA stream in order to safely examine the compressed output size! - * @param compressedSizeOutputBuffer host memory where the compressed output size will be stored + * @param compressedSizeOutputBuffer device memory where the compressed output size will be stored * @param input buffer to compress * @param inputType type of data within the buffer * @param chunkSize compression chunk size to use @@ -89,7 +100,7 @@ public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputT * @param output buffer that will contain the compressed result * @param stream CUDA stream to use */ - public static void compressAsync(HostMemoryBuffer compressedSizeOutputBuffer, + public static void compressAsync(DeviceMemoryBuffer compressedSizeOutputBuffer, BaseDeviceMemoryBuffer input, CompressionType inputType, long chunkSize, BaseDeviceMemoryBuffer tempBuffer, BaseDeviceMemoryBuffer output, Cuda.Stream stream) { @@ -100,9 +111,16 @@ public static void compressAsync(HostMemoryBuffer compressedSizeOutputBuffer, throw new IllegalArgumentException("compressed output size buffer must be able to hold " + "at least 8 bytes, size is only " + compressedSizeOutputBuffer.getLength()); } - NvcompJni.lz4CompressAsync(compressedSizeOutputBuffer.getAddress(), - input.getAddress(), input.getLength(), inputType.nativeId, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength(), output.getAddress(), output.getLength(), + NvcompJni.lz4CompressAsync( + compressedSizeOutputBuffer.getAddress(), + input.getAddress(), + input.getLength(), + inputType.nativeId, + chunkSize, + tempBuffer.getAddress(), + tempBuffer.getLength(), + output.getAddress(), + output.getLength(), stream.getStream()); } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java new file mode 100644 index 00000000000..46b3127581b --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf.nvcomp; + +import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.Cuda; + +/** Single-buffer decompression using LZ4 */ +public class LZ4Decompressor { + + /** + * LZ4 decompression settings corresponding to an LZ4 compressed input. + * NOTE: Each instance must be closed to avoid a native memory leak. + */ + public static final class Configuration implements AutoCloseable { + private final long metadataPtr; + private final long metadataSize; + private final long tempBytes; + private final long uncompressedBytes; + + Configuration(long metadataPtr, long metadataSize, long tempBytes, + long uncompressedBytes) { + this.metadataPtr = metadataPtr; + this.metadataSize = metadataSize; + this.tempBytes = tempBytes; + this.uncompressedBytes = uncompressedBytes; + } + + /** Get the host address of the metadata */ + public long getMetadataPtr() { + return metadataPtr; + } + + /** Get the size of the metadata in bytes */ + public long getMetadataSize() { + return metadataSize; + } + + /** Get the size of the temporary buffer in bytes needed to decompress */ + public long getTempBytes() { + return tempBytes; + } + + /** Get the size of the uncompressed data in bytes */ + public long getUncompressedBytes() { + return uncompressedBytes; + } + + @Override + public void close() { + NvcompJni.lz4DestroyMetadata(metadataPtr); + } + } + + /** + * Determine if a buffer is data compressed with LZ4. + * @param buffer data to examine + * @param stream CUDA stream to use + * @return true if the data is LZ4 compressed + */ + public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) { + return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength(), stream.getStream()); + } + + /** + * Get the decompression configuration from compressed data. + * NOTE: The resulting configuration object must be closed to avoid a native memory leak. + * @param compressed data that has been compressed by the LZ4 compressor + * @param stream CUDA stream to use + * @return decompression configuration for the specified input + */ + public static Configuration configure(BaseDeviceMemoryBuffer compressed, Cuda.Stream stream) { + long[] configs = NvcompJni.lz4DecompressConfigure(compressed.getAddress(), + compressed.getLength(), stream.getStream()); + assert configs.length == 4; + return new Configuration(configs[0], configs[1], configs[2], configs[3]); + } + + /** + * Asynchronously decompress data compressed with the LZ4 compressor. 
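+   * Note: the caller must synchronize on the stream before examining the
+   * uncompressed output.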
+ * @param compressed buffer containing LZ4-compressed data + * @param config decompression configuration + * @param temp temporary storage buffer + * @param outputBuffer buffer that will be written with the uncompressed output + * @param stream CUDA stream to use + */ + public static void decompressAsync( + BaseDeviceMemoryBuffer compressed, + Configuration config, + BaseDeviceMemoryBuffer temp, + BaseDeviceMemoryBuffer outputBuffer, + Cuda.Stream stream) { + NvcompJni.lz4DecompressAsync( + compressed.getAddress(), + compressed.getLength(), + config.getMetadataPtr(), + config.getMetadataSize(), + temp.getAddress(), + temp.getLength(), + outputBuffer.getAddress(), + outputBuffer.getLength(), + stream.getStream()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java index 5ce0a8d815d..58f8390d0eb 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,69 +24,14 @@ class NvcompJni { NativeDepsLoader.loadNativeDeps(); } - /** - * Extracts the metadata from the input on the device and copies - * it to the host. Note that the result must be released with a - * call to decompressDestroyMetadata - * @param inPtr device address of the compressed data - * @param inSize size of the compressed data in bytes - * @param stream address of CUDA stream that will be used for synchronization - * @return address of the metadata on the host - */ - static native long decompressGetMetadata(long inPtr, long inSize, long stream); - - /** - * Destroys the metadata object and frees the associated memory. - * @param metadataPtr address of the metadata object - */ - static native void decompressDestroyMetadata(long metadataPtr); - - /** - * Computes the temporary storage size needed to decompress. - * This over-estimates the needed storage considerably. - * @param metadataPtr address of the metadata object - * @return the number of temporary storage bytes needed to decompress - */ - static native long decompressGetTempSize(long metadataPtr); - - /** - * Computes the decompressed size of the data. Gets this from the - * metadata contained in the compressed data. - * @param metadataPtr address of the metadata object - * @return the size of the decompressed data in bytes - */ - static native long decompressGetOutputSize(long metadataPtr); - - /** - * Perform asynchronous decompression using the specified CUDA stream. - * The input, temporary, and output buffers must all be in GPU-accessible - * memory. 
- * @param inPtr device address of the compressed buffer - * @param inSize size of the compressed data in bytes - * @param tempPtr device address of the temporary decompression storage buffer - * @param tempSize size of the temporary decompression storage buffer - * @param metadataPtr address of the metadata object - * @param outPtr device address of the buffer to use for uncompressed output - * @param outSize size of the uncompressed output buffer in bytes - * @param stream CUDA stream to use - */ - static native void decompressAsync( - long inPtr, - long inSize, - long tempPtr, - long tempSize, - long metadataPtr, - long outPtr, - long outSize, - long stream); - /** * Determine if data is compressed with the nvcomp LZ4 compressor. * @param inPtr device address of the compressed data * @param inSize size of the compressed data in bytes + * @param stream CUDA stream to use * @return true if the data is compressed with the nvcomp LZ4 compressor */ - static native boolean isLZ4Data(long inPtr, long inSize); + static native boolean isLZ4Data(long inPtr, long inSize, long stream); /** * Determine if the metadata corresponds to data compressed with the nvcomp LZ4 compressor. @@ -96,45 +41,21 @@ static native void decompressAsync( static native boolean isLZ4Metadata(long metadataPtr); /** - * Calculate the temporary buffer size needed for LZ4 compression. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes - * @return number of temporary storage bytes needed to compress + * Return the LZ4 compression configuration necessary for a particular chunk size. + * @param chunkSize maximum size of an uncompressed chunk in bytes + * @param uncompressedSize total size of the uncompressed data + * @return array of three longs containing metadata size, temp storage size, + * and output buffer size */ - static native long lz4CompressGetTempSize( - long inPtr, - long inSize, - int inputType, - long chunkSize); + static native long[] lz4CompressConfigure(long chunkSize, long uncompressedSize); /** - * Calculate the output buffer size for LZ4 compression. The output - * size can be an estimated upper bound or the exact value. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes - * @param tempPtr device address of the temporary storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param computeExactSize set to true to compute the exact output size - * @return output buffer size in bytes. If computeExactSize is true then - * this is an exact size otherwise it is a maximum, worst-case size of the - * compressed data. - */ - static native long lz4CompressGetOutputSize( - long inPtr, - long inSize, - int inputType, - long chunkSize, - long tempPtr, - long tempSize, - boolean computeExactSize); - - /** - * Perform LZ4 compression synchronously using the specified CUDA - * stream. + * Perform LZ4 compression asynchronously using the specified CUDA stream. + * @param compressedSizeOutputPtr host address of a 64-bit integer to update + * with the resulting compressed size of the + * data. For the operation to be truly + * asynchronous this should point to pinned + * host memory. 
 * @param inPtr device address of the uncompressed data
 * @param inSize size of the uncompressed data in bytes
 * @param inputType type of uncompressed data
@@ -144,9 +65,9 @@ static native long lz4CompressGetOutputSize(
 * @param outPtr device address of the output buffer
 * @param outSize size of the output buffer in bytes
 * @param stream CUDA stream to use
- * @return size of the compressed output in bytes
 */
- static native long lz4Compress(
+ static native void lz4CompressAsync(
+ long compressedSizeOutputPtr,
 long inPtr,
 long inSize,
 int inputType,
@@ -158,29 +79,33 @@ static native long lz4Compress(
 long stream);

 /**
- * Perform LZ4 compression synchronously using the specified CUDA
- * stream.
- * @param compressedSizeOutputPtr host address of a 64-bit integer to update
- * with the resulting compressed size of the
- * data. For the operation to be truly
- * asynchronous this should point to pinned
- * host memory.
+ * Return the decompression configuration for a compressed input.
+ * NOTE: The resulting configuration object must be closed to destroy the corresponding
+ * host-side metadata created by this method to avoid a native memory leak.
+ * @param inPtr device address of the compressed data
+ * @param inSize size of the compressed data in bytes
+ * @param stream CUDA stream to use
+ * @return array of four longs containing metadata address, metadata size, temp storage size,
+ * and output buffer size
+ */
+ static native long[] lz4DecompressConfigure(long inPtr, long inSize, long stream);
+
+ /**
+ * Perform LZ4 decompression asynchronously using the specified CUDA stream.
- * @param inPtr device address of the uncompressed data
- * @param inSize size of the uncompressed data in bytes
- * @param inputType type of uncompressed data
- * @param chunkSize size of an LZ4 chunk in bytes
- * @param tempPtr device address of the temporary compression storage buffer
+ * @param inPtr device address of the compressed data
+ * @param inSize size of the compressed data in bytes
+ * @param metadataPtr host address of the metadata
+ * @param metadataSize size of the metadata in bytes
+ * @param tempPtr device address of the temporary decompression storage buffer
 * @param tempSize size of the temporary storage buffer in bytes
 * @param outPtr device address of the output buffer
 * @param outSize size of the output buffer in bytes
 * @param stream CUDA stream to use
 */
- static native void lz4CompressAsync(
- long compressedSizeOutputPtr,
+ static native void lz4DecompressAsync(
 long inPtr,
 long inSize,
- int inputType,
- long chunkSize,
+ long metadataPtr,
+ long metadataSize,
 long tempPtr,
 long tempSize,
 long outPtr,
@@ -188,229 +113,99 @@ static native void lz4CompressAsync(
 long stream);

 /**
- * Extracts the metadata from the batch of inputs on the device and
- * copies them to the host. This synchronizes on the stream.
- * @param inPtrs device addresses of the compressed buffers in the batch
- * @param inSizes corresponding byte sizes of the compressed buffers
- * @param stream CUDA stream to use
- * @return handle to the batched decompress metadata on the host
- */
- static native long batchedLZ4DecompressGetMetadata(
- long[] inPtrs,
- long[] inSizes,
- long stream);
-
- /**
- * Destroys batched metadata and frees the underlying host memory.
- * @param batchedMetadata handle to the batched decompress metadata
+ * Destroy host-side metadata created by {@link NvcompJni#lz4DecompressConfigure(long, long, long)}
+ * @param metadataPtr host address of metadata
 */
- static native void batchedLZ4DecompressDestroyMetadata(long batchedMetadata);
+ static native void lz4DestroyMetadata(long metadataPtr);

 /**
- * Computes the temporary storage size in bytes needed to decompress
- * the compressed batch.
- * @param batchedMetadata handle to the batched metadata
- * @return number of temporary storage bytes needed to decompress the batch
- */
- static native long batchedLZ4DecompressGetTempSize(long batchedMetadata);
-
- /**
- * Computes the decompressed size of each chunk in the batch.
- * @param batchedMetadata handle to the batched metadata
- * @param numOutputs number of output buffers in the batch
- * @return Array of corresponding output sizes needed to decompress
- * each buffer in the batch.
- */
- static native long[] batchedLZ4DecompressGetOutputSize(
- long batchedMetadata,
- long numOutputs);
-
- /**
- * Asynchronously decompress a batch of compressed data buffers.
- * @param inPtrs device addresses of the compressed buffers
- * @param inSizes corresponding byte sizes of the compressed buffers
- * @param tempPtr device address of the temporary decompression space
- * @param tempSize size of the temporary decompression space in bytes
- * @param batchedMetadata handle to the batched metadata
- * @param outPtrs device addresses of the uncompressed output buffers
- * @param outSizes corresponding byte sizes of the uncompressed output buffers
- * @param stream CUDA stream to use
- */
- static native void batchedLZ4DecompressAsync(
- long[] inPtrs,
- long[] inSizes,
- long tempPtr,
- long tempSize,
- long batchedMetadata,
- long[] outPtrs,
- long[] outSizes,
- long stream);
-
- /**
- * Get the temporary workspace size required to perform compression of entire batch.
- * @param inPtrs device addresses of the uncompressed buffers
- * @param inSizes corresponding byte sizes of the uncompressed buffers
- * @param chunkSize size of an LZ4 chunk in bytes
+ * Get the temporary workspace size required to perform compression of an entire LZ4 batch.
+ * @param batchSize number of chunks in the batch
+ * @param maxChunkSize maximum size of an uncompressed chunk in bytes
 * @return The size of required temporary workspace in bytes to compress the batch.
 */
- static native long batchedLZ4CompressGetTempSize(
- long[] inPtrs,
- long[] inSizes,
- long chunkSize);
+ static native long batchedLZ4CompressGetTempSize(long batchSize, long maxChunkSize);

 /**
- * Get the required output sizes of each chunk to perform compression.
- * @param inPtrs device addresses of the uncompressed buffers
- * @param inSizes corresponding byte sizes of the uncompressed buffers
- * @param chunkSize size of an LZ4 chunk in bytes
- * @param tempPtr device address of the temporary workspace buffer
- * @param tempSize size of the temporary workspace buffer in bytes
- * @return array of corresponding sizes in bytes of the output buffers needed
- * to compress the buffers in the batch.
+ * Get the maximum size any chunk could compress to in an LZ4 batch. This is the minimum amount of
+ * output memory to allocate per chunk when batch compressing.
+ * @param maxChunkSize maximum size of an uncompressed chunk in bytes
+ * @return maximum compressed output size of a chunk
 */
- static native long[] batchedLZ4CompressGetOutputSize(
- long[] inPtrs,
- long[] inSizes,
- long chunkSize,
- long tempPtr,
- long tempSize);
+ static native long batchedLZ4CompressGetMaxOutputChunkSize(long maxChunkSize);

 /**
- * Asynchronously compress a batch of buffers. Note that
+ * Asynchronously compress a batch of buffers with LZ4. Note that
 * compressedSizesOutPtr must point to pinned memory for this operation
 * to be asynchronous.
- * @param compressedSizesOutPtr host address where to write the sizes of the
+ * @param devInPtrs device address of uncompressed buffer addresses vector
+ * @param devInSizes device address of uncompressed buffer sizes vector
+ * @param chunkSize maximum size of an uncompressed chunk in bytes
+ * @param batchSize number of chunks in the batch
+ * @param tempPtr device address of the temporary workspace buffer
+ * @param tempSize size of the temporary workspace buffer in bytes
+ * @param devOutPtrs device address of output buffer addresses vector
+ * @param compressedSizesOutPtr device address at which to write the sizes of the
 * compressed data written to the corresponding
 * output buffers. Must point to a buffer with
 * at least 8 bytes of memory per output buffer
- * in the batch. For asynchronous operation
- * this must point to pinned host memory.
- * @param inPtrs device addresses of the uncompressed buffers
- * @param inSizes corresponding byte sizes of the uncompressed buffers
- * @param chunkSize size of an LZ4 chunk in bytes
- * @param tempPtr device address of the temporary workspace buffer
- * @param tempSize size of the temporary workspace buffer in bytes
- * @param outPtrs device addresses of the output compressed buffers
- * @param outSizes corresponding sizes in bytes of the output buffers
+ * in the batch.
 * @param stream CUDA stream to use
 */
 static native void batchedLZ4CompressAsync(
- long compressedSizesOutPtr,
- long[] inPtrs,
- long[] inSizes,
+ long devInPtrs,
+ long devInSizes,
 long chunkSize,
+ long batchSize,
 long tempPtr,
 long tempSize,
- long[] outPtrs,
- long[] outSizes,
+ long devOutPtrs,
+ long compressedSizesOutPtr,
 long stream);

 /**
- * Calculate the temporary buffer size needed for cascaded compression.
- * @param inPtr device address of the uncompressed data
- * @param inSize size of the uncompressed data in bytes
- * @param inputType type of uncompressed data
- * @param numRLEs number of Run Length Encoding layers to use
- * @param numDeltas number of delta layers to use
- * @param useBitPacking set to true if bit-packing should be used
- * @return number of temporary storage bytes needed to compress
- */
- static native long cascadedCompressGetTempSize(
- long inPtr,
- long inSize,
- int inputType,
- int numRLEs,
- int numDeltas,
- boolean useBitPacking);
-
- /**
- * Calculate the output buffer size for cascaded compression. The output
- * size can be an estimated upper bound or the exact value.
- * @param inPtr device address of the uncompressed data
- * @param inSize size of the uncompressed data in bytes
- * @param inputType type of uncompressed data
- * @param numRLEs number of Run Length Encoding layers to use
- * @param numDeltas number of delta layers to use
- * @param useBitPacking set to true if bit-packing should be used
- * @param tempPtr device address of the temporary compression storage buffer
- * @param tempSize size of the temporary storage buffer in bytes
- * @param computeExactSize set to true to compute the exact output size
- * @return output buffer size in bytes. If computeExactSize is true then
- * this is an exact size otherwise it is a maximum, worst-case size of the
- * compressed data.
+ * Computes the temporary storage size in bytes needed to decompress an LZ4-compressed batch.
+ * @param numChunks number of chunks in the batch + * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes + * @return number of temporary storage bytes needed to decompress the batch */ - static native long cascadedCompressGetOutputSize( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, - long tempPtr, - long tempSize, - boolean computeExactSize); + static native long batchedLZ4DecompressGetTempSize( + long numChunks, + long maxUncompressedChunkBytes); /** - * Perform cascaded compression synchronously using the specified CUDA - * stream. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param outPtr device address of the output buffer - * @param outSize size of the output buffer in bytes + * Asynchronously decompress a batch of LZ4-compressed data buffers. + * @param devInPtrs device address of compressed input buffer addresses vector + * @param devInSizes device address of compressed input buffer sizes vector + * @param devOutSizes device address of uncompressed buffer sizes vector + * @param batchSize number of buffers in the batch + * @param tempPtr device address of the temporary decompression space + * @param tempSize size of the temporary decompression space in bytes + * @param devOutPtrs device address of uncompressed output buffer addresses vector * @param stream CUDA stream to use - * @return size of the compressed output in bytes */ - static native long cascadedCompress( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, + static native void batchedLZ4DecompressAsync( + long devInPtrs, + long devInSizes, + long devOutSizes, + long batchSize, long tempPtr, long tempSize, - long outPtr, - long outSize, + long devOutPtrs, long stream); /** - * Perform cascaded compression asynchronously using the specified CUDA - * stream. Note if the compressedSizeOutputPtr argument points to paged - * memory then this may synchronize in practice due to limitations of - * copying from the device to paged memory. - * @param compressedSizeOutputPtr address of a 64-bit integer to update with - * the resulting compressed size of the data. - * For the operation to be truly asynchronous - * this should point to pinned host memory. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param outPtr device address of the output buffer - * @param outSize size of the output buffer in bytes + * Asynchronously calculates the decompressed size needed for each chunk. 
+ * @param devInPtrs device address of compressed input buffer addresses vector + * @param devInSizes device address of compressed input buffer sizes vector + * @param devOutSizes device address of calculated decompress sizes vector + * @param batchSize number of buffers in the batch * @param stream CUDA stream to use */ - static native void cascadedCompressAsync( - long compressedSizeOutputPtr, - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, - long tempPtr, - long tempSize, - long outPtr, - long outSize, + static native void batchedLZ4GetDecompressSizeAsync( + long devInPtrs, + long devInSizes, + long devOutSizes, + long batchSize, long stream); } diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 2c95c6eebac..b5c7cfe8e6f 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -184,11 +184,21 @@ endif() ################################################################################################### # - nvcomp ---------------------------------------------------------------------------------------- -include(ConfigureNvcomp) -if(NVCOMP_FOUND) - message(STATUS "nvcomp compression library found in ${NVCOMP_ROOT}") +find_path(NVCOMP_INCLUDE "nvcomp" + HINTS "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-src/include" + "$ENV{CONDA_PREFIX}/include") + +message(STATUS "NVCOMP: NVCOMP_INCLUDE set to ${NVCOMP_INCLUDE}") + +set(CUDF_JNI_NVCOMP_LIBNAME "libnvcomp.a") +find_library(NVCOMP_LIBRARY ${CUDF_JNI_NVCOMP_LIBNAME} REQUIRED + HINTS "${CUDF_CPP_BUILD_DIR}/lib" + "$ENV{CONDA_PREFIX}/lib") + +if(NOT NVCOMP_LIBRARY) + message(FATAL_ERROR "nvcomp static library not found.") else() - message(FATAL_ERROR "nvcomp compression library not found.") + message(STATUS "NVCOMP: NVCOMP_LIBRARY set to ${NVCOMP_LIBRARY}") endif() ################################################################################################### @@ -218,7 +228,8 @@ add_library(cudfjni SHARED src/RmmJni.cpp src/ScalarJni.cpp src/TableJni.cpp - src/map_lookup.cu) + src/map_lookup.cu + src/check_nvcomp_output_sizes.cu) ################################################################################################### # - include paths --------------------------------------------------------------------------------- @@ -229,7 +240,7 @@ target_include_directories(cudfjni "${CUB_INCLUDE}" "${LIBCUDACXX_INCLUDE}" "${CUDAToolkit_INCLUDE_DIRS}" - "${NVCOMP_INCLUDE_DIR}" + "${NVCOMP_INCLUDE}" "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" "${SPDLOG_INCLUDE}" @@ -293,7 +304,7 @@ target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM ################################################################################################### # - link libraries -------------------------------------------------------------------------------- -target_link_libraries(cudfjni PRIVATE nvcomp ${CUDF_LIB} ${ARROW_LIBRARY}) +target_link_libraries(cudfjni PRIVATE ${NVCOMP_LIBRARY} ${CUDF_LIB} ${ARROW_LIBRARY}) ################################################################################################### # - cudart options -------------------------------------------------------------------------------- diff --git a/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake b/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake deleted file mode 100644 index 1a0083e4518..00000000000 --- a/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake +++ /dev/null @@ -1,79 +0,0 @@ 
-#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -set(NVCOMP_ROOT "${CMAKE_BINARY_DIR}/nvcomp") - -if(CUDA_STATIC_RUNTIME) - set(NVCOMP_CUDA_RUNTIME_LIBRARY Static) -else() - set(NVCOMP_CUDA_RUNTIME_LIBRARY Shared) -endif() - -set(NVCOMP_CMAKE_ARGS "-DCMAKE_CUDA_RUNTIME_LIBRARY=${NVCOMP_CUDA_RUNTIME_LIBRARY} -DUSE_RMM=ON -DCUB_DIR=${CUB_INCLUDE}") - -configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Nvcomp.CMakeLists.txt.cmake" - "${NVCOMP_ROOT}/CMakeLists.txt") - -file(MAKE_DIRECTORY "${NVCOMP_ROOT}/build") - -execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} . - RESULT_VARIABLE NVCOMP_CONFIG - WORKING_DIRECTORY ${NVCOMP_ROOT}) - -if(NVCOMP_CONFIG) - message(FATAL_ERROR "Configuring nvcomp failed: " ${NVCOMP_CONFIG}) -endif() - -set(PARALLEL_BUILD -j) -if($ENV{PARALLEL_LEVEL}) - set(NUM_JOBS $ENV{PARALLEL_LEVEL}) - set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") -endif() - -if(${NUM_JOBS}) - if(${NUM_JOBS} EQUAL 1) - message(STATUS "NVCOMP BUILD: Enabling Sequential CMake build") - elseif(${NUM_JOBS} GREATER 1) - message(STATUS "NVCOMP BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") - endif() -else() - message(STATUS "NVCOMP BUILD: Enabling Parallel CMake build with all threads") -endif() - -execute_process(COMMAND ${CMAKE_COMMAND} --build .. -- ${PARALLEL_BUILD} - RESULT_VARIABLE NVCOMP_BUILD - WORKING_DIRECTORY ${NVCOMP_ROOT}/build) - -if(NVCOMP_BUILD) - message(FATAL_ERROR "Building nvcomp failed: " ${NVCOMP_BUILD}) -endif() - -message(STATUS "nvcomp build completed at: " ${NVCOMP_ROOT}/build) - -set(NVCOMP_INCLUDE_DIR "${NVCOMP_ROOT}/build/include") -set(NVCOMP_LIBRARY_DIR "${NVCOMP_ROOT}/build/lib") - -find_library(NVCOMP_LIB nvcomp - NO_DEFAULT_PATH - HINTS "${NVCOMP_LIBRARY_DIR}") - -if(NVCOMP_LIB) - message(STATUS "nvcomp library: " ${NVCOMP_LIB}) - set(NVCOMP_FOUND TRUE) - - add_library(nvcomp STATIC IMPORTED) - set_target_properties(nvcomp PROPERTIES IMPORTED_LOCATION "${NVCOMP_LIB}") -endif() diff --git a/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake b/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake deleted file mode 100644 index 5761ef8fd1b..00000000000 --- a/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake +++ /dev/null @@ -1,33 +0,0 @@ -#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -cmake_minimum_required(VERSION 3.12) - -project(nvcomp) - -include(ExternalProject) - -ExternalProject_Add(nvcomp - GIT_REPOSITORY https://github.com/NVIDIA/nvcomp.git - GIT_TAG v1.2.1 - GIT_SHALLOW true - SOURCE_DIR "${NVCOMP_ROOT}/nvcomp" - BINARY_DIR "${NVCOMP_ROOT}/build" - INSTALL_DIR "${NVCOMP_ROOT}/install" - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/nvcomp.patch || true - CMAKE_ARGS ${NVCOMP_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${NVCOMP_ROOT}/install - BUILD_COMMAND ${CMAKE_COMMAND} --build . --target nvcomp - INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping nvcomp install step.") diff --git a/java/src/main/native/cmake/nvcomp.patch b/java/src/main/native/cmake/nvcomp.patch deleted file mode 100644 index ea1340b7754..00000000000 --- a/java/src/main/native/cmake/nvcomp.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 32f48ef..a2e3125 100644 ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -10,7 +10,9 @@ endif() - file(GLOB CUDA_SOURCES *.cu) - file(GLOB CPP_SOURCES *.cpp) - --add_library(nvcomp SHARED ${CUDA_SOURCES} ${CPP_SOURCES}) -+ -+add_library(nvcomp STATIC ${CUDA_SOURCES} ${CPP_SOURCES}) -+set_property(TARGET nvcomp PROPERTY POSITION_INDEPENDENT_CODE True) - set_property(TARGET nvcomp PROPERTY CUDA_ARCHITECTURES ${GPU_ARCHS}) - target_compile_options(nvcomp PRIVATE - $<$:--expt-extended-lambda -Xcompiler -pthread>) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 4b6696e3911..95444db7dff 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace jni { @@ -741,11 +742,7 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { } #define CATCH_STD_CLASS(env, class_name, ret_val) \ - catch (const std::bad_alloc &e) { \ - /* In some cases a cuda exception can be the cause so peek and clear if needed*/ \ - if (cudaErrorMemoryAllocation == cudaPeekAtLastError()) { \ - cudaGetLastError(); \ - } \ + catch (const rmm::out_of_memory &e) { \ auto what = \ std::string("Could not allocate native memory: ") + (e.what() == nullptr ? 
"" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp index 4b378905a43..a18c88e10dc 100644 --- a/java/src/main/native/src/CompiledExpression.cpp +++ b/java/src/main/native/src/CompiledExpression.cpp @@ -144,6 +144,9 @@ cudf::ast::ast_operator jni_to_unary_operator(jbyte jni_op_value) { case 20: return cudf::ast::ast_operator::RINT; case 21: return cudf::ast::ast_operator::BIT_INVERT; case 22: return cudf::ast::ast_operator::NOT; + case 23: return cudf::ast::ast_operator::CAST_TO_INT64; + case 24: return cudf::ast::ast_operator::CAST_TO_UINT64; + case 25: return cudf::ast::ast_operator::CAST_TO_FLOAT64; default: throw std::invalid_argument("unexpected JNI AST unary operator value"); } } diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp index 0e34d2856f5..d551e9414d1 100644 --- a/java/src/main/native/src/NvcompJni.cpp +++ b/java/src/main/native/src/NvcompJni.cpp @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include -#include #include +#include +#include + +#include "check_nvcomp_output_sizes.hpp" #include "cudf_jni_apis.hpp" namespace { @@ -27,7 +28,7 @@ constexpr char const *NVCOMP_CUDA_ERROR_CLASS = "ai/rapids/cudf/nvcomp/NvcompCud constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; constexpr char const *UNSUPPORTED_CLASS = "java/lang/UnsupportedOperationException"; -void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { +void check_nvcomp_status(JNIEnv *env, nvcompStatus_t status) { switch (status) { case nvcompSuccess: break; case nvcompErrorInvalidValue: @@ -36,9 +37,15 @@ void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { case nvcompErrorNotSupported: cudf::jni::throw_java_exception(env, UNSUPPORTED_CLASS, "nvcomp unsupported"); break; + case nvcompErrorCannotDecompress: + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp cannot decompress"); + break; case nvcompErrorCudaError: cudf::jni::throw_java_exception(env, NVCOMP_CUDA_ERROR_CLASS, "nvcomp CUDA error"); break; + case nvcompErrorInternal: + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp internal error"); + break; default: cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp unknown error"); break; @@ -49,74 +56,16 @@ void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { extern "C" { -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetMetadata( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jlong jstream) { - try { - cudf::jni::auto_set_device(env); - void *metadata_ptr; - auto stream = reinterpret_cast(jstream); - auto status = nvcompDecompressGetMetadata(reinterpret_cast(in_ptr), in_size, - &metadata_ptr, stream); - check_nvcomp_status(env, status); - return reinterpret_cast(metadata_ptr); - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressDestroyMetadata( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - nvcompDecompressDestroyMetadata(reinterpret_cast(metadata_ptr)); - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetTempSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t temp_size; - auto status = 
nvcompDecompressGetTempSize(reinterpret_cast(metadata_ptr), &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetOutputSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t out_size; - auto status = nvcompDecompressGetOutputSize(reinterpret_cast(metadata_ptr), &out_size); - check_nvcomp_status(env, status); - return out_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressAsync( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jlong temp_ptr, jlong temp_size, - jlong metadata_ptr, jlong out_ptr, jlong out_size, jlong jstream) { - try { - cudf::jni::auto_set_device(env); - auto stream = reinterpret_cast(jstream); - auto status = nvcompDecompressAsync(reinterpret_cast(in_ptr), in_size, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(metadata_ptr), - reinterpret_cast(out_ptr), out_size, stream); - check_nvcomp_status(env, status); - } - CATCH_STD(env, ); -} - JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Data(JNIEnv *env, jclass, - jlong in_ptr, - jlong in_size) { + jlong j_in_ptr, + jlong j_in_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - return LZ4IsData(reinterpret_cast(in_ptr), in_size); + auto in_ptr = reinterpret_cast(j_in_ptr); + auto in_size = static_cast(j_in_size); + auto stream = reinterpret_cast(j_stream); + return LZ4IsData(in_ptr, in_size, stream); } CATCH_STD(env, 0) } @@ -130,365 +79,215 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Metadata(JN CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressGetTempSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t temp_size; - auto status = nvcompLZ4CompressGetTempSize(reinterpret_cast(in_ptr), in_size, comp_type, - &opts, &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressGetOutputSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size, - jlong temp_ptr, jlong temp_size, jboolean compute_exact) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t out_size; - auto status = nvcompLZ4CompressGetOutputSize( - reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, &out_size, compute_exact); - check_nvcomp_status(env, status); - return out_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4Compress( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size, - jlong temp_ptr, jlong temp_size, jlong out_ptr, jlong out_size, jlong jstream) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressConfigure( + JNIEnv *env, jclass, jlong j_chunk_size, jlong j_uncompressed_size) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - size_t compressed_size = out_size; - auto status 
= - nvcompLZ4CompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), &compressed_size, stream); + opts.chunk_size = static_cast(j_chunk_size); + auto uncompressed_size = static_cast(j_uncompressed_size); + std::size_t metadata_bytes = 0; + std::size_t temp_bytes = 0; + std::size_t out_bytes = 0; + auto status = nvcompLZ4CompressConfigure(&opts, NVCOMP_TYPE_CHAR, uncompressed_size, + &metadata_bytes, &temp_bytes, &out_bytes); check_nvcomp_status(env, status); - if (cudaStreamSynchronize(stream) != cudaSuccess) { - JNI_THROW_NEW(env, NVCOMP_CUDA_ERROR_CLASS, "Error synchronizing stream", 0); - } - return compressed_size; + cudf::jni::native_jlongArray result(env, 3); + result[0] = static_cast(metadata_bytes); + result[1] = static_cast(temp_bytes); + result[2] = static_cast(out_bytes); + return result.get_jArray(); } CATCH_STD(env, 0); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressAsync( - JNIEnv *env, jclass, jlong compressed_output_ptr, jlong in_ptr, jlong in_size, jint input_type, - jlong chunk_size, jlong temp_ptr, jlong temp_size, jlong out_ptr, jlong out_size, - jlong jstream) { + JNIEnv *env, jclass, jlong j_compressed_size_ptr, jlong j_in_ptr, jlong j_in_size, + jint j_input_type, jlong j_chunk_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, + jlong j_out_size, jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); + auto in_ptr = reinterpret_cast(j_in_ptr); + auto in_size = static_cast(j_in_size); + auto comp_type = static_cast(j_input_type); nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - auto compressed_size_ptr = reinterpret_cast(compressed_output_ptr); - *compressed_size_ptr = out_size; - auto status = - nvcompLZ4CompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), compressed_size_ptr, stream); + opts.chunk_size = static_cast(j_chunk_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto out_ptr = reinterpret_cast(j_out_ptr); + auto compressed_size_ptr = reinterpret_cast(j_compressed_size_ptr); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompLZ4CompressAsync(&opts, comp_type, in_ptr, in_size, temp_ptr, temp_size, + out_ptr, compressed_size_ptr, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetMetadata( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong jstream) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressConfigure( + JNIEnv *env, jclass, jlong j_input_ptr, jlong j_input_size, jlong j_stream) { try { cudf::jni::auto_set_device(env); - - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(sizes), - [](jlong x) -> size_t { return static_cast(x); }); - + auto compressed_ptr = reinterpret_cast(j_input_ptr); + auto compressed_bytes = static_cast(j_input_size); void *metadata_ptr = nullptr; - auto stream = reinterpret_cast(jstream); - 
auto status = nvcompBatchedLZ4DecompressGetMetadata(input_ptrs.data(), sizes.data(), - input_ptrs.size(), &metadata_ptr, stream); - check_nvcomp_status(env, status); - return reinterpret_cast(metadata_ptr); - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressDestroyMetadata( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - nvcompBatchedLZ4DecompressDestroyMetadata(reinterpret_cast(metadata_ptr)); - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t temp_size; + std::size_t metadata_bytes = 0; + std::size_t temp_bytes = 0; + std::size_t uncompressed_bytes = 0; + auto stream = reinterpret_cast(j_stream); auto status = - nvcompBatchedLZ4DecompressGetTempSize(reinterpret_cast(metadata_ptr), &temp_size); + nvcompLZ4DecompressConfigure(compressed_ptr, compressed_bytes, &metadata_ptr, + &metadata_bytes, &temp_bytes, &uncompressed_bytes, stream); check_nvcomp_status(env, status); - return static_cast(temp_size); + cudf::jni::native_jlongArray result(env, 4); + result[0] = reinterpret_cast(metadata_ptr); + result[1] = static_cast(metadata_bytes); + result[2] = static_cast(temp_bytes); + result[3] = static_cast(uncompressed_bytes); + return result.get_jArray(); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetOutputSize( - JNIEnv *env, jclass, jlong metadata_ptr, jint num_outputs) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressAsync( + JNIEnv *env, jclass, jlong j_in_ptr, jlong j_in_size, jlong j_metadata_ptr, + jlong j_metadata_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, jlong j_out_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - std::vector sizes(num_outputs); - auto status = nvcompBatchedLZ4DecompressGetOutputSize(reinterpret_cast(metadata_ptr), - num_outputs, sizes.data()); + auto compressed_ptr = reinterpret_cast(j_in_ptr); + auto compressed_bytes = static_cast(j_in_size); + auto metadata_ptr = reinterpret_cast(j_metadata_ptr); + auto metadata_bytes = static_cast(j_metadata_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_bytes = static_cast(j_temp_size); + auto uncompressed_ptr = reinterpret_cast(j_out_ptr); + auto uncompressed_bytes = static_cast(j_out_size); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompLZ4DecompressAsync(compressed_ptr, compressed_bytes, metadata_ptr, + metadata_bytes, temp_ptr, temp_bytes, uncompressed_ptr, + uncompressed_bytes, stream); check_nvcomp_status(env, status); - cudf::jni::native_jlongArray jsizes(env, num_outputs); - std::transform(sizes.begin(), sizes.end(), jsizes.data(), - [](size_t x) -> jlong { return static_cast(x); }); - return jsizes.get_jArray(); } - CATCH_STD(env, NULL); + CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong temp_ptr, jlong temp_size, - jlong metadata_ptr, jlongArray out_ptrs, jlongArray out_sizes, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DestroyMetadata(JNIEnv *env, jclass, + jlong metadata_ptr) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, 
in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - cudf::jni::native_jpointerArray output_ptrs(env, out_ptrs); - cudf::jni::native_jlongArray output_jsizes(env, out_sizes); - if (output_ptrs.size() != output_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "output array size mismatch"); - } - if (input_ptrs.size() != output_ptrs.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input/output array size mismatch"); - } - std::vector output_sizes; - std::transform(output_jsizes.data(), output_jsizes.data() + output_jsizes.size(), - std::back_inserter(output_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - auto stream = reinterpret_cast(jstream); - auto status = nvcompBatchedLZ4DecompressAsync( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), - reinterpret_cast(temp_ptr), static_cast(temp_size), - reinterpret_cast(metadata_ptr), output_ptrs.data(), output_sizes.data(), stream); - check_nvcomp_status(env, status); + nvcompLZ4DestroyMetadata(reinterpret_cast(metadata_ptr)); } CATCH_STD(env, ); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong chunk_size) { + JNIEnv *env, jclass, jlong j_batch_size, jlong j_max_chunk_size) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t temp_size = 0; - auto status = nvcompBatchedLZ4CompressGetTempSize(input_ptrs.data(), sizes.data(), - input_ptrs.size(), &opts, &temp_size); + auto batch_size = static_cast(j_batch_size); + auto max_chunk_size = static_cast(j_max_chunk_size); + std::size_t temp_size = 0; + auto status = nvcompBatchedLZ4CompressGetTempSize(batch_size, max_chunk_size, + nvcompBatchedLZ4DefaultOpts, &temp_size); check_nvcomp_status(env, status); return static_cast(temp_size); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetOutputSize( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong chunk_size, jlong temp_ptr, - jlong temp_size) { +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetMaxOutputChunkSize( + JNIEnv *env, jclass, jlong j_max_chunk_size) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - 
nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - std::vector output_sizes(input_ptrs.size()); - auto status = nvcompBatchedLZ4CompressGetOutputSize( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), &opts, - reinterpret_cast(temp_ptr), static_cast(temp_size), output_sizes.data()); + auto max_chunk_size = static_cast(j_max_chunk_size); + std::size_t max_output_size = 0; + auto status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( + max_chunk_size, nvcompBatchedLZ4DefaultOpts, &max_output_size); check_nvcomp_status(env, status); - cudf::jni::native_jlongArray jsizes(env, input_ptrs.size()); - std::transform(output_sizes.begin(), output_sizes.end(), jsizes.data(), - [](size_t x) -> jlong { return static_cast(x); }); - return jsizes.get_jArray(); + return static_cast(max_output_size); } - CATCH_STD(env, NULL); + CATCH_STD(env, 0); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync( - JNIEnv *env, jclass, jlong compressed_sizes_out_ptr, jlongArray in_ptrs, jlongArray in_sizes, - jlong chunk_size, jlong temp_ptr, jlong temp_size, jlongArray out_ptrs, jlongArray out_sizes, - jlong jstream) { + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_chunk_size, jlong j_batch_size, + jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_compressed_sizes_out_ptr, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - cudf::jni::native_jpointerArray output_ptrs(env, out_ptrs); - cudf::jni::native_jlongArray output_jsizes(env, out_sizes); - if (output_ptrs.size() != output_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "output array size mismatch"); - } - if (input_ptrs.size() != output_ptrs.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input/output array size mismatch"); - } - - auto output_sizes = reinterpret_cast(compressed_sizes_out_ptr); - std::transform(output_jsizes.data(), output_jsizes.data() + output_jsizes.size(), output_sizes, - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - auto status = nvcompBatchedLZ4CompressAsync( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), &opts, - reinterpret_cast(temp_ptr), static_cast(temp_size), output_ptrs.data(), - output_sizes, // input/output parameter - stream); + auto in_ptrs = reinterpret_cast(j_in_ptrs); + auto in_sizes = reinterpret_cast(j_in_sizes); + auto chunk_size = static_cast(j_chunk_size); + auto batch_size = static_cast(j_batch_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto out_ptrs = reinterpret_cast(j_out_ptrs); + auto compressed_out_sizes = reinterpret_cast(j_compressed_sizes_out_ptr); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompBatchedLZ4CompressAsync(in_ptrs, in_sizes, chunk_size, batch_size, temp_ptr, + temp_size, out_ptrs, compressed_out_sizes, + nvcompBatchedLZ4DefaultOpts, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); } 
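The batched LZ4 entry points above are intended to be driven in a fixed order: query the scratch size and worst-case per-chunk output size, launch the asynchronous compression, then (for the reverse path) recover the per-chunk uncompressed sizes before launching the asynchronous decompression. A minimal host-side sketch follows; it is not part of this patch, and batchSize, maxChunkSize, the dev* pointer/size vector addresses, tempPtr, compressedSizesOutPtr, and stream are placeholder values the caller is assumed to have prepared as the NvcompJni Javadoc describes:

    // Hypothetical driver for the new batched LZ4 JNI bindings (Java).
    // All dev* values are raw device addresses of pointer/size vectors.
    long tempSize = NvcompJni.batchedLZ4CompressGetTempSize(batchSize, maxChunkSize);
    long maxOutChunkSize = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(maxChunkSize);
    // ... allocate tempSize bytes of scratch and maxOutChunkSize bytes per output chunk ...
    NvcompJni.batchedLZ4CompressAsync(devInPtrs, devInSizes, maxChunkSize, batchSize,
        tempPtr, tempSize, devOutPtrs, compressedSizesOutPtr, stream);

    // Decompression mirrors this: recover the per-chunk output sizes first,
    // size the scratch space, then launch the batched decompress.
    NvcompJni.batchedLZ4GetDecompressSizeAsync(devCompPtrs, devCompSizes, devUncompSizes,
        batchSize, stream);
    long decompTempSize = NvcompJni.batchedLZ4DecompressGetTempSize(batchSize, maxChunkSize);
    NvcompJni.batchedLZ4DecompressAsync(devCompPtrs, devCompSizes, devUncompSizes, batchSize,
        decompTempPtr, decompTempSize, devUncompPtrs, stream);

Note that compressedSizesOutPtr must reference device memory with at least 8 bytes per chunk, and the native decompress path below verifies the actual decompressed sizes against the expected ones via check_nvcomp_output_sizes before returning.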
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressGetTempSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - size_t temp_size; - auto status = nvcompCascadedCompressGetTempSize(reinterpret_cast(in_ptr), in_size, - comp_type, &opts, &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressGetOutputSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jboolean compute_exact) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize( + JNIEnv *env, jclass, jlong j_batch_size, jlong j_chunk_size) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - size_t out_size; - auto status = nvcompCascadedCompressGetOutputSize( - reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, &out_size, compute_exact); + auto batch_size = static_cast(j_batch_size); + auto chunk_size = static_cast(j_chunk_size); + std::size_t temp_size = 0; + auto status = nvcompBatchedLZ4DecompressGetTempSize(batch_size, chunk_size, &temp_size); check_nvcomp_status(env, status); - return out_size; + return static_cast(temp_size); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompress( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jlong out_ptr, - jlong out_size, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync( + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size, + jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - auto stream = reinterpret_cast(jstream); - size_t compressed_size = out_size; - auto status = - nvcompCascadedCompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), &compressed_size, stream); + auto compressed_ptrs = reinterpret_cast(j_in_ptrs); + auto compressed_sizes = reinterpret_cast(j_in_sizes); + auto uncompressed_sizes = reinterpret_cast(j_out_sizes); + auto batch_size = static_cast(j_batch_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto uncompressed_ptrs = reinterpret_cast(j_out_ptrs); + auto stream = reinterpret_cast(j_stream); + auto uncompressed_statuses = rmm::device_uvector(batch_size, stream); + auto actual_uncompressed_sizes = rmm::device_uvector(batch_size, stream); + auto status = nvcompBatchedLZ4DecompressAsync( + compressed_ptrs, compressed_sizes, uncompressed_sizes, actual_uncompressed_sizes.data(), + batch_size, temp_ptr, temp_size, 
uncompressed_ptrs, uncompressed_statuses.data(), stream); check_nvcomp_status(env, status); - if (cudaStreamSynchronize(stream) != cudaSuccess) { - JNI_THROW_NEW(env, NVCOMP_CUDA_ERROR_CLASS, "Error synchronizing stream", 0); + if (!cudf::java::check_nvcomp_output_sizes(uncompressed_sizes, actual_uncompressed_sizes.data(), + batch_size, stream)) { + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, + "nvcomp decompress output size mismatch"); } - return compressed_size; } - CATCH_STD(env, 0); + CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressAsync( - JNIEnv *env, jclass, jlong compressed_output_ptr, jlong in_ptr, jlong in_size, jint input_type, - jint num_rles, jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jlong out_ptr, - jlong out_size, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync( + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - auto stream = reinterpret_cast(jstream); - auto compressed_size_ptr = reinterpret_cast(compressed_output_ptr); - *compressed_size_ptr = out_size; - auto status = - nvcompCascadedCompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), compressed_size_ptr, stream); + auto compressed_ptrs = reinterpret_cast(j_in_ptrs); + auto compressed_sizes = reinterpret_cast(j_in_sizes); + auto uncompressed_sizes = reinterpret_cast(j_out_sizes); + auto batch_size = static_cast(j_batch_size); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompBatchedLZ4GetDecompressSizeAsync(compressed_ptrs, compressed_sizes, + uncompressed_sizes, batch_size, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 1ee5345c036..4722db30244 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -210,17 +210,6 @@ class java_event_handler_memory_resource final : public device_memory_resource { } bool on_alloc_fail(std::size_t num_bytes) { - cudaError_t err = cudaPeekAtLastError(); - if (err != cudaSuccess) { - // workaround for RMM pooled mode (CNMEM backend) leaving a CUDA error pending - if (err == cudaErrorMemoryAllocation) { - cudaGetLastError(); - } else { - // let this allocation fail so the application can see the CUDA error - return false; - } - } - JNIEnv *env = cudf::jni::get_jni_env(jvm); jboolean result = env->CallBooleanMethod(handler_obj, on_alloc_fail_method, static_cast(num_bytes)); @@ -256,7 +245,7 @@ class java_event_handler_memory_resource final : public device_memory_resource { total_before = get_total_bytes_allocated(); result = resource->allocate(num_bytes, stream); break; - } catch (std::bad_alloc const &e) { + } catch (rmm::out_of_memory const &e) { if (!on_alloc_fail(num_bytes)) { throw; } diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.cu b/java/src/main/native/src/check_nvcomp_output_sizes.cu new file mode 100644 index 00000000000..944399882b8 --- /dev/null +++ b/java/src/main/native/src/check_nvcomp_output_sizes.cu @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include
+#include
+#include
+
+#include "check_nvcomp_output_sizes.hpp"
+
+namespace {
+
+struct java_domain {
+  static constexpr char const *name{"Java"};
+};
+
+} // anonymous namespace
+
+namespace cudf {
+namespace java {
+
+/**
+ * Check that the vector of expected uncompressed sizes matches the vector of actual
+ * uncompressed sizes. Both vectors are assumed to be in device memory and contain
+ * num_chunks elements.
+ */
+bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes,
+                               std::size_t const *dev_actual_uncompressed_sizes,
+                               std::size_t num_chunks, rmm::cuda_stream_view stream) {
+  NVTX3_FUNC_RANGE_IN(java_domain);
+  return thrust::equal(rmm::exec_policy(stream), dev_uncompressed_sizes,
+                       dev_uncompressed_sizes + num_chunks, dev_actual_uncompressed_sizes);
+}
+
+} // namespace java
+} // namespace cudf
diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.hpp b/java/src/main/native/src/check_nvcomp_output_sizes.hpp
new file mode 100644
index 00000000000..00b36471a85
--- /dev/null
+++ b/java/src/main/native/src/check_nvcomp_output_sizes.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace cudf {
+namespace java {
+
+/**
+ * Check that the vector of expected uncompressed sizes matches the vector of actual
+ * uncompressed sizes. Both vectors are assumed to be in device memory and contain
+ * num_chunks elements.
+ */ +bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes, + std::size_t const *dev_actual_uncompressed_sizes, + std::size_t num_chunks, rmm::cuda_stream_view stream); +} // namespace java +} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1af0d9a2f6..c767a98b342 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3085,6 +3085,17 @@ void testWindowStatic() { ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1, defaultOutput), options)) { assertColumnsAreEqual(expected, result); } + + try (ColumnVector expected = ColumnVector.fromBoxedDoubles(0.7071d, 1.5275d, 1.5275d, 1d, 1.4142); + ColumnVector result = v1.rollingWindow(RollingAggregation.standardDeviation(), options)) { + assertColumnsAreEqual(expected, result); + } + + try (ColumnVector expected = + ColumnVector.fromBoxedDoubles(Double.POSITIVE_INFINITY, 2.1602d, 2.1602d, 1.4142d, Double.POSITIVE_INFINITY); + ColumnVector result = v1.rollingWindow(RollingAggregation.standardDeviation(2), options)) { + assertColumnsAreEqual(expected, result); + } } } } diff --git a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java index a41cc22e9b2..c36d241500a 100644 --- a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java +++ b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,18 +29,20 @@ public class NvcompTest {
   private static final Logger log = LoggerFactory.getLogger(ColumnVector.class);

   @Test
-  void testLZ4RoundTripSync() {
+  void testLZ4RoundTripViaLZ4DecompressorSync() {
     lz4RoundTrip(false);
   }

   @Test
-  void testLZ4RoundTripAsync() {
+  void testLZ4RoundTripViaLZ4DecompressorAsync() {
     lz4RoundTrip(true);
   }

   @Test
   void testBatchedLZ4RoundTripAsync() {
+    final Cuda.Stream stream = Cuda.DEFAULT_STREAM;
     final long chunkSize = 64 * 1024;
+    final long targetIntermediateSize = Long.MAX_VALUE;
     final int maxElements = 1024 * 1024 + 1;
     final int numBuffers = 200;
     long[] data = new long[maxElements];
@@ -48,149 +50,52 @@ void testBatchedLZ4RoundTripAsync() {
       data[i] = i;
     }

-    DeviceMemoryBuffer[] originalBuffers = new DeviceMemoryBuffer[numBuffers];
-    DeviceMemoryBuffer[] uncompressedBuffers = new DeviceMemoryBuffer[numBuffers];
-
-    // compressed data in buffers that are likely oversized
-    DeviceMemoryBuffer[] compressedBuffers = new DeviceMemoryBuffer[numBuffers];
-
-    // compressed data in right-sized buffers
-    DeviceMemoryBuffer[] compressedInputs = new DeviceMemoryBuffer[numBuffers];
-
-    try {
+    try (CloseableArray originalBuffers =
+        CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) {
       // create the batched buffers to compress
-      for (int i = 0; i < numBuffers; ++i) {
-        originalBuffers[i] = initBatchBuffer(data, i);
+      for (int i = 0; i < originalBuffers.size(); i++) {
+        originalBuffers.set(i, initBatchBuffer(data, i));
+        // Increment the refcount since compression will try to close it
+        originalBuffers.get(i).incRefCount();
       }

-      // compress the buffers
-      long[] outputSizes;
-      long[] compressedSizes;
-      long tempSize = BatchedLZ4Compressor.getTempSize(originalBuffers, chunkSize);
-      try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) {
-        outputSizes = BatchedLZ4Compressor.getOutputSizes(originalBuffers, chunkSize, tempBuffer);
-        for (int i = 0; i < numBuffers; ++i) {
-          compressedBuffers[i] = DeviceMemoryBuffer.allocate(outputSizes[i]);
-        }
-        long sizesBufferSize = BatchedLZ4Compressor.getCompressedSizesBufferSize(numBuffers);
-        try (HostMemoryBuffer compressedSizesBuffer = HostMemoryBuffer.allocate(sizesBufferSize)) {
-          BatchedLZ4Compressor.compressAsync(compressedSizesBuffer, originalBuffers, chunkSize,
-              tempBuffer, compressedBuffers, Cuda.DEFAULT_STREAM);
-          Cuda.DEFAULT_STREAM.sync();
-          compressedSizes = new long[numBuffers];
-          for (int i = 0; i < numBuffers; ++i) {
-            compressedSizes[i] = compressedSizesBuffer.getLong(i * 8);
-          }
-        }
-      }
-
-      // right-size the compressed buffers based on reported compressed sizes
-      for (int i = 0; i < numBuffers; ++i) {
-        compressedInputs[i] = compressedBuffers[i].slice(0, compressedSizes[i]);
-      }
-
-      // decompress the buffers
-      try (BatchedLZ4Decompressor.BatchedMetadata metadata =
-          BatchedLZ4Decompressor.getMetadata(compressedInputs, Cuda.DEFAULT_STREAM)) {
-        outputSizes = BatchedLZ4Decompressor.getOutputSizes(metadata, numBuffers);
-        for (int i = 0; i < numBuffers; ++i) {
-          uncompressedBuffers[i] = DeviceMemoryBuffer.allocate(outputSizes[i]);
-        }
-        tempSize = BatchedLZ4Decompressor.getTempSize(metadata);
-        try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) {
-          BatchedLZ4Decompressor.decompressAsync(compressedInputs, tempBuffer, metadata,
-              uncompressedBuffers, Cuda.DEFAULT_STREAM);
-        }
-      }
+      // compress and decompress the buffers
+      BatchedLZ4Compressor compressor = new BatchedLZ4Compressor(chunkSize, targetIntermediateSize);

-      // check the decompressed results against the
original - for (int i = 0; i < numBuffers; ++i) { - try (HostMemoryBuffer expected = HostMemoryBuffer.allocate(originalBuffers[i].getLength()); - HostMemoryBuffer actual = HostMemoryBuffer.allocate(outputSizes[i])) { - Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); - Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); - Assertions.assertEquals(originalBuffers[i].getLength(), uncompressedBuffers[i].getLength(), - "uncompressed size mismatch at buffer " + i); - expected.copyFromDeviceBuffer(originalBuffers[i]); - actual.copyFromDeviceBuffer(uncompressedBuffers[i]); - byte[] expectedBytes = new byte[(int) expected.getLength()]; - expected.getBytes(expectedBytes, 0, 0, expected.getLength()); - byte[] actualBytes = new byte[(int) actual.getLength()]; - actual.getBytes(actualBytes, 0, 0, actual.getLength()); - Assertions.assertArrayEquals(expectedBytes, actualBytes, - "mismatch in batch buffer " + i); + try (CloseableArray<DeviceMemoryBuffer> compressedBuffers = + CloseableArray.wrap(compressor.compress(originalBuffers.getArray(), stream)); + CloseableArray<DeviceMemoryBuffer> uncompressedBuffers = + CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) { + for (int i = 0; i < numBuffers; i++) { + uncompressedBuffers.set(i, + DeviceMemoryBuffer.allocate(originalBuffers.get(i).getLength())); } - } - } finally { - closeBufferArray(originalBuffers); - closeBufferArray(uncompressedBuffers); - closeBufferArray(compressedBuffers); - closeBufferArray(compressedInputs); - } - } - - @Test - void testBatchedLZ4CompressRoundTrip() { - final long chunkSize = 64 * 1024; - final int maxElements = 1024 * 1024 + 1; - final int numBuffers = 200; - long[] data = new long[maxElements]; - for (int i = 0; i < maxElements; ++i) { - data[i] = i; - } - - DeviceMemoryBuffer[] originalBuffers = new DeviceMemoryBuffer[numBuffers]; - DeviceMemoryBuffer[] uncompressedBuffers = new DeviceMemoryBuffer[numBuffers]; - BatchedLZ4Compressor.BatchedCompressionResult compResult = null; - - // compressed data in right-sized buffers - DeviceMemoryBuffer[] compressedInputs = new DeviceMemoryBuffer[numBuffers]; - - try { - // create the batched buffers to compress - for (int i = 0; i < numBuffers; ++i) { - originalBuffers[i] = initBatchBuffer(data, i); - } - // compress the buffers - compResult = BatchedLZ4Compressor.compress(originalBuffers, chunkSize, Cuda.DEFAULT_STREAM); + // decompress takes ownership of the compressed buffers and will close them + BatchedLZ4Decompressor.decompressAsync(chunkSize, compressedBuffers.release(), + uncompressedBuffers.getArray(), stream); - // right-size the compressed buffers based on reported compressed sizes - DeviceMemoryBuffer[] compressedBuffers = compResult.getCompressedBuffers(); - long[] compressedSizes = compResult.getCompressedSizes(); - for (int i = 0; i < numBuffers; ++i) { - compressedInputs[i] = compressedBuffers[i].slice(0, compressedSizes[i]); - } - - // decompress the buffers - uncompressedBuffers = BatchedLZ4Decompressor.decompressAsync(compressedInputs, - Cuda.DEFAULT_STREAM); - - // check the decompressed results against the original - for (int i = 0; i < numBuffers; ++i) { - try (HostMemoryBuffer expected = HostMemoryBuffer.allocate(originalBuffers[i].getLength()); - HostMemoryBuffer actual = HostMemoryBuffer.allocate(uncompressedBuffers[i].getLength())) { - Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); - Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); - Assertions.assertEquals(originalBuffers[i].getLength(), uncompressedBuffers[i].getLength(), -
"uncompressed size mismatch at buffer " + i); - expected.copyFromDeviceBuffer(originalBuffers[i]); - actual.copyFromDeviceBuffer(uncompressedBuffers[i]); - byte[] expectedBytes = new byte[(int) expected.getLength()]; - expected.getBytes(expectedBytes, 0, 0, expected.getLength()); - byte[] actualBytes = new byte[(int) actual.getLength()]; - actual.getBytes(actualBytes, 0, 0, actual.getLength()); - Assertions.assertArrayEquals(expectedBytes, actualBytes, - "mismatch in batch buffer " + i); + // check the decompressed results against the original + for (int i = 0; i < numBuffers; ++i) { + try (HostMemoryBuffer expected = + HostMemoryBuffer.allocate(originalBuffers.get(i).getLength()); + HostMemoryBuffer actual = + HostMemoryBuffer.allocate(uncompressedBuffers.get(i).getLength())) { + Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); + Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); + Assertions.assertEquals(expected.getLength(), actual.getLength(), + "uncompressed size mismatch at buffer " + i); + expected.copyFromDeviceBuffer(originalBuffers.get(i)); + actual.copyFromDeviceBuffer(uncompressedBuffers.get(i)); + byte[] expectedBytes = new byte[(int) expected.getLength()]; + expected.getBytes(expectedBytes, 0, 0, expected.getLength()); + byte[] actualBytes = new byte[(int) actual.getLength()]; + actual.getBytes(actualBytes, 0, 0, actual.getLength()); + Assertions.assertArrayEquals(expectedBytes, actualBytes, + "mismatch in batch buffer " + i); + } } } - } finally { - closeBufferArray(originalBuffers); - closeBufferArray(uncompressedBuffers); - closeBufferArray(compressedInputs); - if (compResult != null) { - closeBufferArray(compResult.getCompressedBuffers()); - } } } @@ -200,14 +105,6 @@ private void closeBuffer(MemoryBuffer buffer) { } } - private void closeBufferArray(MemoryBuffer[] buffers) { - for (MemoryBuffer buffer : buffers) { - if (buffer != null) { - buffer.close(); - } - } - } - private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { // grab a subsection of the data based on buffer ID int dataStart = 0; @@ -239,6 +136,7 @@ private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { } private void lz4RoundTrip(boolean useAsync) { + final Cuda.Stream stream = Cuda.DEFAULT_STREAM; final long chunkSize = 64 * 1024; final int numElements = 10 * 1024 * 1024 + 1; long[] data = new long[numElements]; @@ -251,31 +149,32 @@ private void lz4RoundTrip(boolean useAsync) { DeviceMemoryBuffer uncompressedBuffer = null; try (ColumnVector v = ColumnVector.fromLongs(data)) { BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA); - log.debug("Uncompressed size is {}", inputBuffer.getLength()); - - long tempSize = LZ4Compressor.getTempSize(inputBuffer, CompressionType.CHAR, chunkSize); - - log.debug("Using {} temporary space for lz4 compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); + final long uncompressedSize = inputBuffer.getLength(); + log.debug("Uncompressed size is {}", uncompressedSize); - long outSize = LZ4Compressor.getOutputSize(inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer); - log.debug("lz4 compressed size estimate is {}", outSize); + LZ4Compressor.Configuration compressConf = + LZ4Compressor.configure(chunkSize, uncompressedSize); + Assertions.assertTrue(compressConf.getMetadataBytes() > 0); + log.debug("Using {} temporary space for lz4 compression", compressConf.getTempBytes()); + tempBuffer = DeviceMemoryBuffer.allocate(compressConf.getTempBytes()); + log.debug("lz4 
compressed size estimate is {}", compressConf.getMaxCompressedBytes()); - compressedBuffer = DeviceMemoryBuffer.allocate(outSize); + compressedBuffer = DeviceMemoryBuffer.allocate(compressConf.getMaxCompressedBytes()); long startTime = System.nanoTime(); long compressedSize; if (useAsync) { - try (HostMemoryBuffer tempHostBuffer = HostMemoryBuffer.allocate(8)) { - LZ4Compressor.compressAsync(tempHostBuffer, inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer, compressedBuffer, Cuda.DEFAULT_STREAM); - Cuda.DEFAULT_STREAM.sync(); - compressedSize = tempHostBuffer.getLong(0); + try (DeviceMemoryBuffer devCompressedSizeBuffer = DeviceMemoryBuffer.allocate(8); + HostMemoryBuffer hostCompressedSizeBuffer = HostMemoryBuffer.allocate(8)) { + LZ4Compressor.compressAsync(devCompressedSizeBuffer, inputBuffer, CompressionType.CHAR, + chunkSize, tempBuffer, compressedBuffer, stream); + hostCompressedSizeBuffer.copyFromDeviceBufferAsync(devCompressedSizeBuffer, stream); + stream.sync(); + compressedSize = hostCompressedSizeBuffer.getLong(0); } } else { compressedSize = LZ4Compressor.compress(inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer, compressedBuffer, Cuda.DEFAULT_STREAM); + tempBuffer, compressedBuffer, stream); } double duration = (System.nanoTime() - startTime) / 1000.0; log.info("Compressed with lz4 to {} in {} us", compressedSize, duration); @@ -283,23 +182,20 @@ private void lz4RoundTrip(boolean useAsync) { tempBuffer.close(); tempBuffer = null; - Assertions.assertTrue(Decompressor.isLZ4Data(compressedBuffer)); - - try (Decompressor.Metadata metadata = - Decompressor.getMetadata(compressedBuffer, Cuda.DEFAULT_STREAM)) { - Assertions.assertTrue(metadata.isLZ4Metadata()); - tempSize = Decompressor.getTempSize(metadata); + try (LZ4Decompressor.Configuration decompressConf = + LZ4Decompressor.configure(compressedBuffer, stream)) { + final long tempSize = decompressConf.getTempBytes(); log.debug("Using {} temporary space for lz4 compression", tempSize); tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - outSize = Decompressor.getOutputSize(metadata); + final long outSize = decompressConf.getUncompressedBytes(); Assertions.assertEquals(inputBuffer.getLength(), outSize); uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize); - Decompressor.decompressAsync(compressedBuffer, tempBuffer, metadata, uncompressedBuffer, - Cuda.DEFAULT_STREAM); + LZ4Decompressor.decompressAsync(compressedBuffer, decompressConf, tempBuffer, + uncompressedBuffer, stream); try (ColumnVector v2 = new ColumnVector( DType.INT64, @@ -324,133 +220,4 @@ private void lz4RoundTrip(boolean useAsync) { closeBuffer(uncompressedBuffer); } } - - @Test - void testCascadedRoundTripSync() { - cascadedRoundTrip(false); - } - - @Test - void testCascadedRoundTripAsync() { - cascadedRoundTrip(true); - } - - private void cascadedRoundTrip(boolean useAsync) { - final int numElements = 10 * 1024 * 1024 + 1; - final int numRunLengthEncodings = 2; - final int numDeltas = 1; - final boolean useBitPacking = true; - int[] data = new int[numElements]; - for (int i = 0; i < numElements; ++i) { - data[i] = i; - } - - DeviceMemoryBuffer tempBuffer = null; - DeviceMemoryBuffer compressedBuffer = null; - DeviceMemoryBuffer uncompressedBuffer = null; - try (ColumnVector v = ColumnVector.fromInts(data)) { - BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA); - log.debug("Uncompressed size is " + inputBuffer.getLength()); - - long tempSize = NvcompJni.cascadedCompressGetTempSize( - inputBuffer.getAddress(), - 
inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking); - - log.debug("Using {} temporary space for cascaded compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - - long outSize = NvcompJni.cascadedCompressGetOutputSize( - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - false); - log.debug("Inexact cascaded compressed size estimate is {}", outSize); - - compressedBuffer = DeviceMemoryBuffer.allocate(outSize); - - long startTime = System.nanoTime(); - long compressedSize; - if (useAsync) { - try (HostMemoryBuffer tempHostBuffer = HostMemoryBuffer.allocate(8)) { - NvcompJni.cascadedCompressAsync( - tempHostBuffer.getAddress(), - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - compressedBuffer.getAddress(), - compressedBuffer.getLength(), - 0); - Cuda.DEFAULT_STREAM.sync(); - compressedSize = tempHostBuffer.getLong(0); - } - } else { - compressedSize = NvcompJni.cascadedCompress( - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - compressedBuffer.getAddress(), - compressedBuffer.getLength(), - 0); - } - - double duration = (System.nanoTime() - startTime) / 1000.0; - log.debug("Compressed with cascaded to {} in {} us", compressedSize, duration); - - tempBuffer.close(); - tempBuffer = null; - - try (Decompressor.Metadata metadata = - Decompressor.getMetadata(compressedBuffer, Cuda.DEFAULT_STREAM)) { - tempSize = Decompressor.getTempSize(metadata); - - log.debug("Using {} temporary space for cascaded compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - - outSize = Decompressor.getOutputSize(metadata); - Assertions.assertEquals(inputBuffer.getLength(), outSize); - - uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize); - - Decompressor.decompressAsync(compressedBuffer, tempBuffer, metadata, uncompressedBuffer, - Cuda.DEFAULT_STREAM); - - try (ColumnVector v2 = new ColumnVector( - DType.INT32, - numElements, - Optional.empty(), - uncompressedBuffer, - null, - null)) { - uncompressedBuffer = null; - try (ColumnVector compare = v2.equalTo(v); - Scalar compareAll = compare.all()) { - Assertions.assertTrue(compareAll.getBoolean()); - } - } - } - } finally { - closeBuffer(tempBuffer); - closeBuffer(compressedBuffer); - closeBuffer(uncompressedBuffer); - } - } } diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 0e68f1c71cc..ff5870c50be 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -116,6 +116,8 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): ) elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION + elif dtype == "decimal32": + meta["max_precision"] = cudf.Decimal32Dtype.MAX_PRECISION meta["dtype"] = dtype meta["null_frequency"] = null_frequency diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd index 74efdb08bea..628ffef433b 100644 --- a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd +++ 
b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd @@ -5,6 +5,8 @@ from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: # cython type stub to help resolve to numeric::decimal64 ctypedef int64_t decimal64 + # cython type stub to help resolve to numeric::decimal32 + ctypedef int32_t decimal32 cdef cppclass scale_type: scale_type(int32_t) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 43bc1ac9db7..5b73b1fef10 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -192,7 +192,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column if meta is not None: for col, col_meta in zip(column_names, meta["columns"]): - if isinstance(df._data[col].dtype, cudf.Decimal64Dtype): + if is_decimal_dtype(df._data[col].dtype): df._data[col].dtype.precision = ( col_meta["metadata"]["precision"] ) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index ab53a242db2..21a039dbf78 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -2,7 +2,6 @@ import cudf from cudf.api.types import is_decimal_dtype -from cudf.core.dtypes import Decimal64Dtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column @@ -18,7 +17,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair from cudf._lib.aggregation cimport Aggregation, make_aggregation -from cudf._lib.types cimport dtype_to_data_type, underlying_type_t_type_id +from cudf._lib.types cimport ( + dtype_to_data_type, + is_decimal_type_id, + underlying_type_t_type_id, +) import numpy as np @@ -72,11 +75,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): c_out_dtype )) - if c_result.get()[0].type().id() == libcudf_types.type_id.DECIMAL64: + if is_decimal_type_id(c_result.get()[0].type().id()): scale = -c_result.get()[0].type().scale() precision = _reduce_precision(col_dtype, reduction_op, len(incol)) py_result = DeviceScalar.from_unique_ptr( - move(c_result), dtype=cudf.Decimal64Dtype(precision, scale) + move(c_result), dtype=col_dtype.__class__(precision, scale) ) else: py_result = DeviceScalar.from_unique_ptr(move(c_result)) @@ -157,4 +160,4 @@ def _reduce_precision(dtype, op, nrows): new_p = 2 * p + nrows else: raise NotImplementedError() - return max(min(new_p, cudf.Decimal64Dtype.MAX_PRECISION), 0) + return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2d7f56dc5ce..e4311b356ec 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -204,12 +204,8 @@ def clip(Column input_col, object lo, object hi): and > hi will be replaced by hi """ - lo_scalar = as_device_scalar( - lo, dtype=input_col.dtype if lo is None else None - ) - hi_scalar = as_device_scalar( - hi, dtype=input_col.dtype if hi is None else None - ) + lo_scalar = as_device_scalar(lo, dtype=input_col.dtype) + hi_scalar = as_device_scalar(hi, dtype=input_col.dtype) return clamp(input_col, lo_scalar, hi_scalar) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 4017c60683e..43c0198f80a 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -45,7 +45,7 @@ from cudf._lib.cpp.scalar.scalar cimport ( struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport decimal64,
scale_type +from cudf._lib.cpp.wrappers.decimals cimport decimal32, decimal64, scale_type from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -88,8 +88,8 @@ cdef class DeviceScalar: # IMPORTANT: this should only ever be called from __init__ valid = not _is_null_host_scalar(value) - if isinstance(dtype, cudf.Decimal64Dtype): - _set_decimal64_from_scalar( + if isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): + _set_decimal_from_scalar( self.c_value, value, dtype, valid) elif isinstance(dtype, cudf.ListDtype): _set_list_from_pylist( @@ -118,7 +118,7 @@ cdef class DeviceScalar: ) def _to_host_scalar(self): - if isinstance(self.dtype, cudf.Decimal64Dtype): + if isinstance(self.dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): result = _get_py_decimal_from_fixed_point(self.c_value) elif cudf.api.types.is_struct_dtype(self.dtype): result = _get_py_dict_from_struct(self.c_value) @@ -305,16 +305,25 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): +cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid + if isinstance(dtype, cudf.Decimal64Dtype): + s.reset( + new fixed_point_scalar[decimal64]( + np.int64(value), scale_type(-dtype.scale), valid + ) ) - ) + elif isinstance(dtype, cudf.Decimal32Dtype): + s.reset( + new fixed_point_scalar[decimal32]( + np.int32(value), scale_type(-dtype.scale), valid + ) + ) + else: + raise ValueError(f"dtype not supported: {dtype}") cdef _set_struct_from_pydict(unique_ptr[scalar]& s, object value, @@ -450,6 +459,10 @@ cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): rep_val = int((<fixed_point_scalar[decimal64]*> s_ptr)[0].value()) scale = int((<fixed_point_scalar[decimal64]*> s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) + elif cdtype.id() == libcudf_types.DECIMAL32: + rep_val = int((<fixed_point_scalar[decimal32]*> s_ptr)[0].value()) + scale = int((<fixed_point_scalar[decimal32]*> s_ptr)[0].type().scale()) + return decimal.Decimal(rep_val).scaleb(scale) else: raise ValueError("Could not convert cudf::scalar to numpy scalar") diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index dbbe9b1e05a..58e3221a4ec 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -17,3 +17,4 @@ ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) cdef libcudf_types.data_type dtype_to_data_type(dtype) except * +cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index e798d78d426..1fa389f408c 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -225,10 +225,13 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: ( <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)])) - if tid in ( - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL32 - ): + if is_decimal_type_id(tid): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) + +cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *: + return tid in ( + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL32 + ) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index
85b4bf20e5c..b6089b65aa5 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -129,13 +129,11 @@ def null_raw(self): return self._series._column.mask_array_view def make_series(self): - """Make a Series object out of this node - """ + """Make a Series object out of this node""" return self._series.copy(deep=False) def _make_dictionary_series(self): - """Make a dictionary-encoded series from this node - """ + """Make a dictionary-encoded series from this node""" assert self.is_dictionary return self._series.copy(deep=False) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 23b6c01ca83..b5042305299 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -400,8 +400,7 @@ def valid_count(self) -> int: @property def nullmask(self) -> Buffer: - """The gpu buffer for the null-mask - """ + """The gpu buffer for the null-mask""" if not self.nullable: raise ValueError("Column has no null mask") return self.mask_array_view @@ -630,8 +629,7 @@ def fillna( ) def isnull(self) -> ColumnBase: - """Identify missing values in a Column. - """ + """Identify missing values in a Column.""" result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -642,8 +640,7 @@ def isnull(self) -> ColumnBase: return result def notnull(self) -> ColumnBase: - """Identify non-missing values in a Column. - """ + """Identify non-missing values in a Column.""" result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -697,8 +694,7 @@ def take( keep_index: bool = True, nullify: bool = False, ) -> T: - """Return Column by taking values from the corresponding *indices*. - """ + """Return Column by taking values from the corresponding *indices*.""" # Handle zero size if indices.size == 0: return cast(T, column_empty_like(self, newsize=0)) @@ -1254,8 +1250,7 @@ def column_empty_like( masked: bool = False, newsize: int = None, ) -> ColumnBase: - """Allocate a new column like the given *column* - """ + """Allocate a new column like the given *column*""" if dtype is None: dtype = column.dtype row_count = len(column) if newsize is None else newsize @@ -1297,8 +1292,7 @@ def column_empty_like_same_mask( def column_empty( row_count: int, dtype: Dtype = "object", masked: bool = False ) -> ColumnBase: - """Allocate a new column like the given row_count and dtype. - """ + """Allocate a new column like the given row_count and dtype.""" dtype = cudf.dtype(dtype) children = () # type: Tuple[ColumnBase, ...] 
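The scalar.pyx changes above reduce the decimal32 and decimal64 paths to one pattern: a scalar is stored as an integer rep plus a scale, and recovered as a Python decimal with Decimal.scaleb(). A minimal plain-Python sketch of that round-trip (the helper names are illustrative, not cudf API; note libcudf stores the negative of the dtype's scale):

import decimal

def to_rep(value: decimal.Decimal, dtype_scale: int) -> int:
    # Shift the decimal point right by `dtype_scale` digits; the remaining
    # integer is what a fixed_point_scalar stores (int32-wide for decimal32,
    # int64-wide for decimal64 -- the only difference between the two paths).
    return int(value.scaleb(dtype_scale).to_integral_value())

def from_rep(rep: int, dtype_scale: int) -> decimal.Decimal:
    # Mirror of _get_py_decimal_from_fixed_point: Decimal(rep).scaleb(scale),
    # where the libcudf scale is -dtype_scale.
    return decimal.Decimal(rep).scaleb(-dtype_scale)

v = decimal.Decimal("1.23")
rep = to_rep(v, 2)            # 123
assert from_rep(rep, 2) == v  # Decimal('1.23')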
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b1d69316863..68379002e6b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -347,8 +347,7 @@ def as_string_column( ) def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index b13ad8664dc..6409a9f9196 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -25,8 +25,7 @@ class DecimalBaseColumn(NumericalBaseColumn): - """Base column for decimal64 and decimal32 columns - """ + """Base column for decimal64 and decimal32 columns""" dtype: Union[Decimal32Dtype, Decimal64Dtype] @@ -321,5 +320,5 @@ def _binop_precision(l_dtype, r_dtype, op): result = p1 + p2 + 1 else: raise NotImplementedError() - + # TODO return min(result, cudf.Decimal64Dtype.MAX_PRECISION) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index ca5026c2293..becb303feeb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -21,7 +21,7 @@ column, string, ) -from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype +from cudf.core.dtypes import CategoricalDtype, Decimal32Dtype, Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( NUMERIC_TYPES, @@ -147,6 +147,7 @@ def binary_operator( NumericalColumn, cudf.Scalar, cudf.core.column.Decimal64Column, + cudf.core.column.Decimal32Column, ), ) or np.isscalar(rhs) @@ -158,6 +159,11 @@ def binary_operator( Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) ) return lhs.binary_operator(binop, rhs) + elif isinstance(rhs, cudf.core.column.Decimal32Column): + lhs = self.as_decimal_column( + Decimal32Dtype(Decimal32Dtype.MAX_PRECISION, 0) + ) + return lhs.binary_operator(binop, rhs) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: tmp = self if reflect else rhs @@ -291,8 +297,7 @@ def _process_values_for_isin( return lhs, rhs def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" dkind = self.dtype.kind if dkind == "f": return self.dtype.type(np.nan) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index c26b8b7e09c..853fb360c50 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -194,8 +194,7 @@ def corr(self, other: ColumnBase) -> float: def round( self, decimals: int = 0, how: str = "half_even" ) -> NumericalBaseColumn: - """Round the values in the Column to the given number of decimals. 
- """ + """Round the values in the Column to the given number of decimals.""" return libcudf.round.round(self, decimal_places=decimals, how=how) def _apply_scan_op(self, op: str) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index dba96b9069d..82f1794b200 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -42,7 +42,7 @@ def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column """ + """Takes in string column and returns boolean column""" return ( libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") ).fillna(False) @@ -552,7 +552,7 @@ def _split_by_character(self): def extract( self, pat: str, flags: int = 0, expand: bool = True ) -> SeriesOrIndex: - """ + r""" Extract capture groups in the regex `pat` as columns in a DataFrame. For each subject string in the Series, extract groups from the first @@ -624,7 +624,7 @@ def contains( na=np.nan, regex: bool = True, ) -> SeriesOrIndex: - """ + r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -3270,7 +3270,7 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: return self._return_or_inplace(libstrings.wrap(self._column, width)) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - """ + r""" Count occurrences of pattern in each string of the Series/Index. This function is used to count the number of times a particular @@ -4767,7 +4767,7 @@ def is_consonant(self, position) -> SeriesOrIndex: 0 True 1 False dtype: bool - """ + """ ltype = libstrings.LetterType.CONSONANT if can_convert_to_column(position): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index c7b13903751..4b7a3bcc197 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -305,8 +305,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ) def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" return np.timedelta64("nat", self.time_unit) @property diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 56882f89af8..2411b2a9211 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -244,9 +244,7 @@ def _clear_cache(self): del self._column_length def to_pandas_index(self) -> pd.Index: - """" - Convert the keys of the ColumnAccessor to a Pandas Index object. - """ + """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: # Using `from_frame()` instead of `from_tuples` # prevents coercion of values to a different type diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d2dbac4d155..5239cf9d648 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -923,8 +923,7 @@ def dtypes(self): @property def ndim(self): - """Dimension of the data. DataFrame ndim is always 2. - """ + """Dimension of the data. 
DataFrame ndim is always 2.""" return 2 def __dir__(self): @@ -1023,8 +1022,7 @@ def __getitem__(self, arg): @annotate("DATAFRAME_SETITEM", color="blue", domain="cudf_python") def __setitem__(self, arg, value): - """Add/set column by *arg or DataFrame* - """ + """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): # not handling set_item where arg = df & value = df if isinstance(value, DataFrame): @@ -1143,13 +1141,13 @@ def __sizeof__(self): def _slice(self: T, arg: slice) -> T: """ - _slice : slice the frame as per the arg + _slice : slice the frame as per the arg - Parameters - ---------- - arg : should always be of type slice + Parameters + ---------- + arg : should always be of type slice - """ + """ from cudf.core.index import RangeIndex num_rows = len(self) @@ -1306,7 +1304,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented def _get_numeric_data(self): - """ Return a dataframe with only numeric data types """ + """Return a dataframe with only numeric data types""" columns = [ c for c, dt in self.dtypes.items() @@ -2057,7 +2055,7 @@ def __iter__(self): return iter(self.columns) def iteritems(self): - """ Iterate over column names and series pairs """ + """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) @@ -2078,8 +2076,7 @@ def at(self): @property # type: ignore @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): - """Returns a tuple of columns - """ + """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore @@ -2128,8 +2125,7 @@ def _rename_columns(self, new_names): @property def index(self): - """Returns the index of the DataFrame - """ + """Returns the index of the DataFrame""" return self._index @index.setter @@ -2618,7 +2614,7 @@ def take(self, positions, keep_index=True): @annotate("INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value): - """ Add a column to DataFrame at the index specified by loc. + """Add a column to DataFrame at the index specified by loc. Parameters ---------- @@ -2842,8 +2838,7 @@ def drop( return out def _drop_column(self, name): - """Drop a column by *name* - """ + """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] @@ -2926,8 +2921,7 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) def pop(self, item): - """Return a column and drop it from the DataFrame. - """ + """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped @@ -5612,8 +5606,7 @@ def isin(self, values): # Stats # def _prepare_for_rowwise_op(self, method, skipna): - """Prepare a DataFrame for CuPy-based row-wise operations. - """ + """Prepare a DataFrame for CuPy-based row-wise operations.""" if method not in _cupy_nan_methods_map and any( col.nullable for col in self._columns @@ -6239,8 +6232,7 @@ def cov(self, **kwargs): return df def corr(self): - """Compute the correlation matrix of a DataFrame. 
- """ + """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns) df.columns = self.columns @@ -6766,11 +6758,11 @@ def _setitem_with_dataframe( mask: Optional[cudf.core.column.ColumnBase] = None, ): """ - This function sets item dataframes relevant columns with replacement df - :param input_df: Dataframe to be modified inplace - :param replace_df: Replacement DataFrame to replace values with - :param input_cols: columns to replace in the input dataframe - :param mask: boolean mask in case of masked replacing + This function sets item dataframes relevant columns with replacement df + :param input_df: Dataframe to be modified inplace + :param replace_df: Replacement DataFrame to replace values with + :param input_cols: columns to replace in the input dataframe + :param mask: boolean mask in case of masked replacing """ if input_cols is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 71f910d7bb9..0b895460410 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1765,8 +1765,7 @@ def _fill(self, fill_values, begin, end, inplace): return self def shift(self, periods=1, freq=None, axis=0, fill_value=None): - """Shift values by `periods` positions. - """ + """Shift values by `periods` positions.""" assert axis in (None, 0) and freq is None return self._shift(periods) @@ -5505,7 +5504,7 @@ def multiply(self, other, axis, level=None, fill_value=None): def rmul(self, other, axis, level=None, fill_value=None): """ - Get Multiplication of dataframe or series and other, element-wise + Get Multiplication of dataframe or series and other, element-wise (binary operator `rmul`). Equivalent to ``other * frame``, but with support to substitute a diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ffe0051297b..6ffba8da069 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -820,7 +820,7 @@ def quantile(self, q=0.5, interpolation="linear"): interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". 
- """ + """ def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) @@ -893,8 +893,7 @@ def diff(self, periods=1, axis=0): return result._copy_type_metadata(value_columns) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: - """Internal implementation for `ffill` and `bfill` - """ + """Internal implementation for `ffill` and `bfill`""" value_columns = self.grouping.values result = self.obj.__class__._from_data( self._groupby.replace_nulls( @@ -1365,8 +1364,7 @@ def _handle_by_or_level(self, by=None, level=None): @property def keys(self): - """Return grouping key columns as index - """ + """Return grouping key columns as index""" nkeys = len(self._key_columns) if nkeys == 0: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f858a589614..27edd41ed92 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -656,8 +656,7 @@ def _compute_levels_and_codes(self): self._codes = cudf.DataFrame._from_data(codes) def _compute_validity_mask(self, index, row_tuple, max_length): - """ Computes the valid set of indices of values in the lookup - """ + """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() for name, row in zip(index.names, row_tuple): if isinstance(row, slice) and row == slice(None): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 91fef1df876..b2fac7a6140 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -592,7 +592,7 @@ def get_dummies( drop_first=False, dtype="uint8", ): - """ Returns a dataframe whose columns are the one hot encodings of all + """Returns a dataframe whose columns are the one hot encodings of all columns in `df` Parameters diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f425b650ee7..787b28e213c 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -7,7 +7,7 @@ import cudf from cudf.core.column.column import ColumnBase -from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.index import BaseIndex from cudf.core.series import Series from cudf.utils.dtypes import ( @@ -145,12 +145,12 @@ def _preprocess_host_value(self, value, dtype): else: return NA, dtype - if isinstance(dtype, Decimal64Dtype): + if isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): value = pa.scalar( value, type=pa.decimal128(dtype.precision, dtype.scale) ).as_py() if isinstance(value, decimal.Decimal) and dtype is None: - dtype = Decimal64Dtype._from_decimal(value) + dtype = cudf.Decimal64Dtype._from_decimal(value) value = to_cudf_compatible_scalar(value, dtype=dtype) @@ -171,7 +171,7 @@ def _preprocess_host_value(self, value, dtype): else: dtype = value.dtype - if not isinstance(dtype, Decimal64Dtype): + if not isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): dtype = cudf.dtype(dtype) if not valid: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c8d8837cbaa..2a9adf65283 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -85,7 +85,7 @@ def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): """Append a scalar `value` to the end of `col` inplace. 
- Cast to common type if possible + Cast to common type if possible """ to_type = find_common_type([type(value), col.dtype]) val_col = as_column(value, dtype=to_type) @@ -135,7 +135,11 @@ def __setitem__(self, key, value): if ( not isinstance( self._frame._column.dtype, - (cudf.Decimal64Dtype, cudf.CategoricalDtype), + ( + cudf.Decimal64Dtype, + cudf.Decimal32Dtype, + cudf.CategoricalDtype, + ), ) and hasattr(value, "dtype") and _is_non_decimal_numeric_dtype(value.dtype) @@ -2120,8 +2124,7 @@ def data(self): @property def index(self): - """The index object - """ + """The index object""" return self._index @index.setter @@ -2130,8 +2133,7 @@ def index(self, _index): @property def nullmask(self): - """The gpu buffer for the null-mask - """ + """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) def as_mask(self): diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 8c69b94cc84..5aa72215892 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -226,8 +226,7 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col): - """Handles empty and infinity strings - """ + """Handles empty and infinity strings""" col = libstrings.to_lower(col) col = _proc_empty_strings(col) col = _proc_inf_strings(col) @@ -235,8 +234,7 @@ def _proc_inf_empty_strings(col): def _proc_empty_strings(col): - """Replaces empty strings with NaN - """ + """Replaces empty strings with NaN""" s = cudf.Series(col) s = s.where(s != "", "NaN") return s._column diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index b568c108191..2341a5c23b9 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -18,7 +18,7 @@ def timeseries( nulls_frequency=0, seed=None, ): - """ Create timeseries dataframe with random data + """Create timeseries dataframe with random data Parameters ---------- @@ -81,7 +81,7 @@ def timeseries( def randomdata(nrows=10, dtypes=None, seed=None): - """ Create a dataframe with random data + """Create a dataframe with random data Parameters ---------- diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index b101835e626..cc5aec36853 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -68,7 +68,7 @@ def count_zero(arr): def assert_eq(left, right, **kwargs): - """ Assert that two cudf-like things are equivalent + """Assert that two cudf-like things are equivalent This equality test works for pandas/cudf dataframes/series/indexes/scalars in the same way, and so makes it easier to perform parametrized testing diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index afe21201b7e..f4a80c60ddf 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -368,6 +368,22 @@ def rand_dataframe( dtype=dtype, ) ) + elif dtype == "decimal32": + max_precision = meta.get( + "max_precision", cudf.Decimal32Dtype.MAX_PRECISION + ) + precision = np.random.randint(1, max_precision) + scale = np.random.randint(0, precision) + dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) + column_params.append( + ColumnParameters( + cardinality=cardinality, + null_frequency=null_frequency, + generator=decimal_generator(dtype=dtype, size=cardinality), + is_sorted=False, + dtype=dtype, + ) + ) elif dtype == "category": column_params.append( ColumnParameters( diff --git 
a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4d0dde88ec2..c6cf7c4e6f5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -92,8 +92,7 @@ def test_index_find_label_range_genericindex(): def test_index_find_label_range_rangeindex(): - """Cudf specific - """ + """Cudf specific""" # step > 0 # 3, 8, 13, 18 ridx = RangeIndex(3, 20, 5) @@ -344,8 +343,7 @@ def test_index_copy_string(name, dtype, deep=True): NUMERIC_TYPES + ["datetime64[ns]", "timedelta64[ns]"] + OTHER_TYPES, ) def test_index_copy_integer(name, dtype, deep=True): - """Test for NumericIndex Copy Casts - """ + """Test for NumericIndex Copy Casts""" cidx = cudf.Int64Index([1, 2, 3]) pidx = cidx.to_pandas() @@ -358,8 +356,7 @@ def test_index_copy_integer(name, dtype, deep=True): @pytest.mark.parametrize("name", ["x"]) @pytest.mark.parametrize("dtype", SIGNED_TYPES) def test_index_copy_float(name, dtype, deep=True): - """Test for NumericIndex Copy Casts - """ + """Test for NumericIndex Copy Casts""" cidx = cudf.Float64Index([1.0, 2.0, 3.0]) pidx = cidx.to_pandas() diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a6d0a10ce5d..2ded4925964 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -666,8 +666,7 @@ def test_multiindex_equals(): ) @pytest.mark.parametrize("names", [["X", "Y"]]) def test_multiindex_copy_sem(data, levels, codes, names): - """Test semantic equality for MultiIndex.copy - """ + """Test semantic equality for MultiIndex.copy""" gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -740,8 +739,8 @@ def test_multiindex_copy_sem(data, levels, codes, names): @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): """Test memory idendity for deep copy - Case1: Constructed from GroupBy, StringColumns - Case2: Constrcuted from MultiIndex, NumericColumns + Case1: Constructed from GroupBy, StringColumns + Case2: Constructed from MultiIndex, NumericColumns """ same_ref = not deep diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index f47e87374dc..6543af36dd4 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1103,6 +1103,8 @@ def test_dataframe_exceptions_for_clip(lower, upper): ([1, 2, 3, 4, 5], None, 4), ([1, 2, 3, 4, 5], None, None), ([1, 2, 3, 4, 5], 4, 2), + ([1.0, 2.0, 3.0, 4.0, 5.0], 4, 2), + (pd.Series([1, 2, 3, 4, 5], dtype="int32"), 4, 2), (["a", "b", "c", "d", "e"], "b", "d"), (["a", "b", "c", "d", "e"], "b", None), (["a", "b", "c", "d", "e"], None, "d"), @@ -1112,7 +1114,7 @@ def test_dataframe_exceptions_for_clip(lower, upper): @pytest.mark.parametrize("inplace", [True, False]) def test_series_clip(data, lower, upper, inplace): psr = pd.Series(data) - gsr = cudf.Series.from_pandas(data) + gsr = cudf.from_pandas(psr) expect = psr.clip(lower=lower, upper=upper) got = gsr.clip(lower=lower, upper=upper, inplace=inplace) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index b436825cf69..440dcf527ca 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -44,7 +44,7 @@ ) @pytest.mark.parametrize("to_host", [True, False]) def test_serialize(df, to_host): - """ This should hopefully replace all functions below """ + """This should hopefully replace all functions below""" a = df() if "cudf" not in type(a).__module__:
a = cudf.from_pandas(a) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 222b7c726fc..53676a47046 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -117,8 +117,7 @@ def test_series_sort_index(nelem, asc): @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) @pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7]) def test_series_nlargest(data, n): - """Indirectly tests Series.sort_values() - """ + """Indirectly tests Series.sort_values()""" sr = Series(data) psr = pd.Series(data) assert_eq(sr.nlargest(n), psr.nlargest(n)) @@ -136,8 +135,7 @@ def test_series_nlargest(data, n): @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) @pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9]) def test_series_nsmallest(data, n): - """Indirectly tests Series.sort_values() - """ + """Indirectly tests Series.sort_values()""" sr = Series(data) psr = pd.Series(data) assert_eq(sr.nsmallest(n), psr.nsmallest(n)) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index c8fb7c1a47d..fa5cde76524 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -332,8 +332,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): @functools.wraps(_make_row_wise_kernel) def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``. - """ + """Caching version of ``_make_row_wise_kernel``.""" if cache_key is None: cache_key = func try: @@ -349,8 +348,7 @@ def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): @functools.wraps(_make_chunk_wise_kernel) def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``. - """ + """Caching version of ``_make_chunk_wise_kernel``.""" try: return _cache[func] except KeyError: @@ -360,6 +358,5 @@ def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): def _mangle_user(name): - """Mangle user variable name - """ + """Mangle user variable name""" return "__user_{}".format(name) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c5620bed078..2eb38c0f77e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -121,8 +121,7 @@ def np_to_pa_dtype(dtype): - """Util to convert numpy dtype to PyArrow dtype. - """ + """Util to convert numpy dtype to PyArrow dtype.""" # special case when dtype is np.datetime64 if dtype.kind == "M": time_unit, _ = np.datetime_data(dtype) @@ -153,8 +152,7 @@ def get_numeric_type_info(dtype): def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic - """ + """Cast all args to a common type using numpy promotion logic""" dtype = np.result_type(*[a.dtype for a in args]) return [a.astype(dtype) for a in args] @@ -171,8 +169,8 @@ def _find_common_type_decimal(dtypes): def cudf_dtype_from_pydata_dtype(dtype): - """ Given a numpy or pandas dtype, converts it into the equivalent cuDF - Python dtype. + """Given a numpy or pandas dtype, converts it into the equivalent cuDF + Python dtype. """ if cudf.api.types.is_categorical_dtype(dtype): @@ -188,8 +186,8 @@ def cudf_dtype_from_pydata_dtype(dtype): def cudf_dtype_to_pa_type(dtype): - """ Given a cudf pandas dtype, converts it into the equivalent cuDF - Python dtype.
+ """Given a cudf pandas dtype, converts it into the equivalent cuDF + Python dtype. """ if cudf.api.types.is_categorical_dtype(dtype): raise NotImplementedError() @@ -204,8 +202,8 @@ def cudf_dtype_to_pa_type(dtype): def cudf_dtype_from_pa_type(typ): - """ Given a cuDF pyarrow dtype, converts it into the equivalent - cudf pandas dtype. + """Given a cuDF pyarrow dtype, converts it into the equivalent + cudf pandas dtype. """ if pa.types.is_list(typ): return cudf.core.dtypes.ListDtype.from_arrow(typ) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index 532fa925670..4f84fde5492 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -247,7 +247,7 @@ def hash_vocab( sep_token="[SEP]", ): """ - Write the vocab vocabulary hashtable to the output_path + Write the vocab vocabulary hashtable to the output_path """ np.random.seed(1243342) vocab = _load_vocab_dict(vocab_path) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 3c98d27868d..6746753249c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1115,7 +1115,8 @@ def _is_local_filesystem(fs): def ensure_single_filepath_or_buffer(path_or_data, **kwargs): - """Return False if `path_or_data` resolves to multiple filepaths or buffers + """Return False if `path_or_data` resolves to multiple filepaths or + buffers. """ path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): @@ -1140,8 +1141,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): def is_directory(path_or_data, **kwargs): - """Returns True if the provided filepath is a directory - """ + """Returns True if the provided filepath is a directory""" path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options") diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index c71a6dbccb1..c1c5513c718 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -157,8 +157,7 @@ def {kernelname}(out, {args}): def _wrap_query_expr(name, fn, args): - """Wrap the query expression in a cuda kernel. 
- """ + """Wrap the query expression in a cuda kernel.""" def _add_idx(arg): if arg.startswith(ENVREF_PREFIX): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4995a5f31b5..4f9b23bf6fe 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -383,8 +383,8 @@ def _cast_to_appropriate_cudf_type(val, index=None): def _get_cupy_compatible_args_index(args, ser_index=None): """ - This function returns cupy compatible arguments and output index - if conversion is not possible it returns None + This function returns cupy compatible arguments and output index + if conversion is not possible it returns None """ casted_ls = [] diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 2907753d8d5..bf063918c89 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -33,7 +33,7 @@ class _Frame(dd.core._Frame, OperatorMethodMixin): - """ Superclass for DataFrame and Series + """Superclass for DataFrame and Series Parameters ---------- @@ -263,13 +263,13 @@ def sort_values( return df def to_parquet(self, path, *args, **kwargs): - """ Calls dask.dataframe.io.to_parquet with CudfEngine backend """ + """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) def to_orc(self, path, **kwargs): - """ Calls dask_cudf.io.to_orc """ + """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc return to_orc(self, path, **kwargs) @@ -326,8 +326,7 @@ def repartition(self, *args, **kwargs): return super().repartition(*args, **kwargs) def shuffle(self, *args, **kwargs): - """ Wraps dask.dataframe DataFrame.shuffle method - """ + """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) if shuffle_arg and shuffle_arg != "tasks": raise ValueError("dask_cudf does not support disk-based shuffle.") diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 0cf9d835523..3eb3975c41f 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -180,24 +180,24 @@ def groupby_agg( sort=False, as_index=True, ): - """ Optimized groupby aggregation for Dask-CuDF. - - This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + """Optimized groupby aggregation for Dask-CuDF. + + This aggregation algorithm only supports the following options: + + - "count" + - "mean" + - "std" + - "var" + - "sum" + - "min" + - "max" + - "collect" + - "first" + - "last" + + This "optimized" approach is more performant than the algorithm + in `dask.dataframe`, because it allows the cudf backend to + perform multiple aggregations at once. 
""" # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) @@ -348,8 +348,7 @@ def groupby_agg( def _redirect_aggs(arg): - """ Redirect aggregations to their corresponding name in cuDF - """ + """Redirect aggregations to their corresponding name in cuDF""" redirects = { sum: "sum", max: "max", @@ -375,8 +374,7 @@ def _redirect_aggs(arg): def _is_supported(arg, supported: set): - """ Check that aggregations in `arg` are a subset of `supported` - """ + """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): _global_set: Set[str] = set() @@ -395,8 +393,7 @@ def _is_supported(arg, supported: set): def _make_name(*args, sep="_"): - """ Combine elements of `args` into a new string - """ + """Combine elements of `args` into a new string""" _args = (arg for arg in args if arg != "") return sep.join(_args) @@ -404,15 +401,15 @@ def _make_name(*args, sep="_"): def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): - """ Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. """ # Modify dict for initial (partition-wise) aggregations @@ -459,15 +456,15 @@ def _groupby_partition_agg( def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): - """ Node in groupby-aggregation reduction tree. - - Following the initial `_groupby_partition_agg` tasks, - the `groupby_agg` algorithm will perform a tree reduction - to combine the data from the input partitions into - `split_out` different output partitions. For each node in - the reduction tree, the input DataFrame objects are - concatenated, and "sum", "min" and/or "max" groupby - aggregations are used to combine the necessary statistics. + """Node in groupby-aggregation reduction tree. + + Following the initial `_groupby_partition_agg` tasks, + the `groupby_agg` algorithm will perform a tree reduction + to combine the data from the input partitions into + `split_out` different output partitions. For each node in + the reduction tree, the input DataFrame objects are + concatenated, and "sum", "min" and/or "max" groupby + aggregations are used to combine the necessary statistics. """ df = _concat(dfs, ignore_index=True) @@ -493,8 +490,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """ Calculate variance (given count, sum, and sum-squared columns). 
- """ + """Calculate variance (given count, sum, and sum-squared columns).""" # Select count, sum, and sum-squared n = df[count_name] @@ -526,13 +522,13 @@ def _finalize_gb_agg( str_cols_out, aggs_renames, ): - """ Final aggregation task. + """Final aggregation task. - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. """ # Deal with higher-order aggregations diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 9b7a58a3c59..2e5d55e92d2 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -351,7 +351,7 @@ def read_parquet( row_groups_per_part=None, **kwargs, ): - """ Read parquet files into a Dask DataFrame + """Read parquet files into a Dask DataFrame Calls ``dask.dataframe.read_parquet`` to cordinate the execution of ``cudf.read_parquet``, and ultimately read multiple partitions into diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 21bf3aee7d1..5f2af445170 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -46,7 +46,7 @@ def _quantile(a, q): def merge_quantiles(finalq, qs, vals): - """ Combine several quantile calculations of different data. + """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] """ if isinstance(finalq, Iterator): @@ -223,8 +223,7 @@ def sort_values( ascending=True, na_position="last", ): - """ Sort by the given list/tuple of column names. - """ + """Sort by the given list/tuple of column names.""" if na_position not in ("first", "last"): raise ValueError("na_position must be either 'first' or 'last'")