From cc2f1929c95c53984d439ff6af349f10c2358ff7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 4 Aug 2021 15:38:52 -0700 Subject: [PATCH 01/14] Expand CSV and JSON reader APIs to accept `dtypes` as a vector or map of `data_type` objects (#8856) Goal of the PR is to enable CSV to read columns as decimal, and to replace the string-based `dtype` part of the API. `data_type` based API is needed because we need to specify scale for decimal columns, and doing this via a string that describes the type is :hankey: Changes in the PR: - Added overloads to `dtype` related getters/setters to also take a vector or a map of `data_type` objects. In case of CSV, vector of `data_type`s was already supported. Reworked the implementation to support different use cases that the "dtype-as-string" code path supports. - Fixed naming of compression option setter. - Added `parse_hex` option to make up for the special strings that CSV supported to denote that a column needs to be parsed as hexadecimal (the option to pass strings is to be removed). - Changed naming of `infer_date` option to `parse_dates`. - Updated all CSV and JSON tests to use the new APIs. 
Breaking because API to specify date columns has been renamed to match the new `parse_hex` API; renamed from `infer_date` to `parse_dates` Depends on #8843 Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - AJ Schmidt (https://github.com/ajschmidt8) - Elias Stehle (https://github.com/elstehle) URL: https://github.com/rapidsai/cudf/pull/8856 --- conda/recipes/libcudf/meta.yaml | 1 + .../detail/utilities/visitor_overload.hpp | 30 ++ cpp/include/cudf/io/csv.hpp | 113 +++++- cpp/include/cudf/io/json.hpp | 72 +++- cpp/src/io/csv/reader_impl.cu | 83 +++-- cpp/src/io/csv/reader_impl.hpp | 16 + cpp/src/io/json/reader_impl.cu | 104 ++++-- cpp/src/io/json/reader_impl.hpp | 2 + cpp/tests/io/csv_test.cpp | 344 ++++++++++-------- cpp/tests/io/json_test.cpp | 78 ++-- python/cudf/cudf/_lib/cpp/io/csv.pxd | 23 +- python/cudf/cudf/_lib/cpp/io/json.pxd | 9 + python/cudf/cudf/_lib/csv.pyx | 12 +- 13 files changed, 589 insertions(+), 298 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/visitor_overload.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 75bfe6c34bc..35d444d026c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -102,6 +102,7 @@ test: - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp + - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp - test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp - test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp new file mode 100644 index 
00000000000..a55ca323c50 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::detail { + +/** + * @brief Helper class to support inline-overloading for all of a variant's alternative types + */ +template +struct visitor_overload : Ts... { + using Ts::operator()...; +}; +template +visitor_overload(Ts...) -> visitor_overload; + +} // namespace cudf::detail diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 1dff99735ec..d4a21b2e98c 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -104,14 +104,19 @@ class csv_reader_options { // Whether a quote inside a value is double-quoted bool _doublequote = true; // Names of columns to read as datetime - std::vector _infer_date_names; + std::vector _parse_dates_names; // Indexes of columns to read as datetime - std::vector _infer_date_indexes; + std::vector _parse_dates_indexes; + // Names of columns to parse as hexadecimal + std::vector _parse_hex_names; + // Indexes of columns to parse as hexadecimal + std::vector _parse_hex_indexes; // Conversion settings // Per-column types; disables type inference on those columns - std::variant, std::vector> _dtypes; + std::variant, std::vector, std::map> + _dtypes; // Additional values to recognize as boolean true values std::vector _true_values{"True", 
"TRUE", "true"}; // Additional values to recognize as boolean false values @@ -280,17 +285,30 @@ class csv_reader_options { /** * @brief Returns names of columns to read as datetime. */ - std::vector const& get_infer_date_names() const { return _infer_date_names; } + std::vector const& get_parse_dates_names() const { return _parse_dates_names; } /** * @brief Returns indexes of columns to read as datetime. */ - std::vector const& get_infer_date_indexes() const { return _infer_date_indexes; } + std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } + + /** + * @brief Returns names of columns to read as hexadecimal. + */ + std::vector const& get_parse_hex_names() const { return _parse_hex_names; } + + /** + * @brief Returns indexes of columns to read as hexadecimal. + */ + std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. */ - std::variant, std::vector> const& get_dtypes() const + std::variant, + std::vector, + std::map> const& + get_dtypes() const { return _dtypes; } @@ -547,9 +565,9 @@ class csv_reader_options { * * @param col_names Vector of column names to infer as datetime. */ - void set_infer_date_names(std::vector col_names) + void set_parse_dates(std::vector col_names) { - _infer_date_names = std::move(col_names); + _parse_dates_names = std::move(col_names); } /** @@ -557,11 +575,32 @@ class csv_reader_options { * * @param col_names Vector of column indices to infer as datetime. 
*/ - void set_infer_date_indexes(std::vector col_ind) + void set_parse_dates(std::vector col_ind) { _parse_dates_indexes = std::move(col_ind); } + + /** + * @brief Sets names of columns to parse as hexadecimal + * + * @param col_names Vector of column names to parse as hexadecimal + */ + void set_parse_hex(std::vector col_names) { - _infer_date_indexes = std::move(col_ind); + _parse_hex_names = std::move(col_names); } + /** + * @brief Sets indexes of columns to parse as hexadecimal + * + * @param col_names Vector of column indices to parse as hexadecimal + */ + void set_parse_hex(std::vector col_ind) { _parse_hex_indexes = std::move(col_ind); } + + /** + * @brief Sets per-column types + * + * @param types Column name -> data type map specifying the columns' target data types + */ + void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** * @brief Sets per-column types * @@ -576,7 +615,8 @@ class csv_reader_options { */ [[deprecated( "The string-based interface will be deprecated." - "Use dtypes(std::vector) instead.")]] void + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] void set_dtypes(std::vector types) { _dtypes = std::move(types); @@ -958,24 +998,60 @@ class csv_reader_options_builder { /** * @brief Sets names of columns to read as datetime. * - * @param col_names Vector of column names to infer as datetime. + * @param col_names Vector of column names to read as datetime. * @return this for chaining. */ - csv_reader_options_builder& infer_date_names(std::vector col_names) + csv_reader_options_builder& parse_dates(std::vector col_names) { - options._infer_date_names = std::move(col_names); + options._parse_dates_names = std::move(col_names); return *this; } /** * @brief Sets indexes of columns to read as datetime. * - * @param col_names Vector of column indices to infer as datetime. + * @param col_ind Vector of column indices to read as datetime * @return this for chaining. 
*/ - csv_reader_options_builder& infer_date_indexes(std::vector col_ind) + csv_reader_options_builder& parse_dates(std::vector col_ind) { - options._infer_date_indexes = std::move(col_ind); + options._parse_dates_indexes = std::move(col_ind); + return *this; + } + + /** + * @brief Sets names of columns to parse as hexadecimal. + * + * @param col_names Vector of column names to parse as hexadecimal + * @return this for chaining. + */ + csv_reader_options_builder& parse_hex(std::vector col_names) + { + options._parse_hex_names = std::move(col_names); + return *this; + } + + /** + * @brief Sets indexes of columns to parse as hexadecimal. + * + * @param col_ind Vector of column indices to parse as hexadecimal + * @return this for chaining. + */ + csv_reader_options_builder& parse_hex(std::vector col_ind) + { + options._parse_hex_indexes = std::move(col_ind); + return *this; + } + + /** + * @brief Sets per-column types. + * + * @param types Column name -> data type map specifying the columns' target data types + * @return this for chaining. + */ + csv_reader_options_builder& dtypes(std::map types) + { + options._dtypes = std::move(types); return *this; } @@ -999,7 +1075,8 @@ class csv_reader_options_builder { */ [[deprecated( "The string-based interface will be deprecated." 
- "Use dtypes(std::vector) instead.")]] csv_reader_options_builder& + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] csv_reader_options_builder& dtypes(std::vector types) { options._dtypes = std::move(types); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2f4d0936d8b..8954f7dcab1 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -23,7 +23,9 @@ #include +#include #include +#include #include namespace cudf { @@ -66,7 +68,8 @@ class json_reader_options { source_info _source; // Data types of the column; empty to infer dtypes - std::vector _dtypes; + std::variant, std::vector, std::map> + _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -114,7 +117,13 @@ class json_reader_options { /** * @brief Returns data types of the columns. */ - std::vector const& get_dtypes() const { return _dtypes; } + std::variant, + std::vector, + std::map> const& + get_dtypes() const + { + return _dtypes; + } /** * @brief Returns compression format of the source. @@ -141,19 +150,40 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } + /** + * @brief Set data types for columns to be read. + * + * @param types Vector of dtypes in string format. + */ + [[deprecated( + "The string-based interface will be deprecated." + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] void + set_dtypes(std::vector types) + { + _dtypes = std::move(types); + } + + /** + * @brief Set data types for columns to be read. + * + * @param types Vector of dtypes + */ + void set_dtypes(std::vector types) { _dtypes = std::move(types); } + /** * @brief Set data types for columns to be read. * * @param types Vector dtypes in string format. 
*/ - void dtypes(std::vector types) { _dtypes = std::move(types); } + void set_dtypes(std::map types) { _dtypes = std::move(types); } /** * @brief Set the compression type. * * @param comp_type The compression type used. */ - void compression(compression_type comp_type) { _compression = comp_type; } + void set_compression(compression_type comp_type) { _compression = comp_type; } /** * @brief Set number of bytes to skip from source start. @@ -205,10 +235,38 @@ class json_reader_options_builder { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. - * @return this for chaining. + * @param types Vector of dtypes in string format + * @return this for chaining + */ + [[deprecated( + "The string-based interface will be deprecated." + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] json_reader_options_builder& + dtypes(std::vector types) + { + options._dtypes = std::move(types); + return *this; + } + + /** + * @brief Set data types for columns to be read. + * + * @param types Vector of dtypes + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::vector types) + { + options._dtypes = std::move(types); + return *this; + } + + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> dtype map. 
+ * @return this for chaining */ - json_reader_options_builder& dtypes(std::vector types) + json_reader_options_builder& dtypes(std::map types) { options._dtypes = std::move(types); return *this; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 70ce0fce1cc..549b0474fe1 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -49,18 +50,6 @@ using cudf::device_span; using cudf::host_span; using cudf::detail::make_device_uvector_async; -namespace { -/** - * @brief Helper class to support inline-overloading for all of a variant's alternative types - */ -template -struct VisitorOverload : Ts... { - using Ts::operator()...; -}; -template -VisitorOverload(Ts...) -> VisitorOverload; -} // namespace - namespace cudf { namespace io { namespace detail { @@ -280,6 +269,41 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } +std::vector reader::impl::select_data_types( + std::map const& col_type_map) +{ + std::vector selected_dtypes; + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { + auto const col_type_it = col_type_map.find(col_names_[col]); + CUDF_EXPECTS(col_type_it != col_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + return selected_dtypes; +} + +std::vector reader::impl::select_data_types(std::vector const& dtypes) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_cols_, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, + "Must specify data types for all columns"); + + for (int 
col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } + } + } + return selected_dtypes; +} + table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) { auto const data_row_offsets = select_data_and_row_offsets(stream); @@ -355,13 +379,13 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } - // User can specify which columns should be inferred as datetime - if (!opts_.get_infer_date_indexes().empty() || !opts_.get_infer_date_names().empty()) { - for (const auto index : opts_.get_infer_date_indexes()) { + // User can specify which columns should be read as datetime + if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { + for (const auto index : opts_.get_parse_dates_indexes()) { column_flags_[index] |= column_parse::as_datetime; } - for (const auto& name : opts_.get_infer_date_names()) { + for (const auto& name : opts_.get_parse_dates_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; @@ -369,6 +393,20 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } + // User can specify which columns should be parsed as hexadecimal + if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { + for (const auto index : opts_.get_parse_hex_indexes()) { + column_flags_[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : opts_.get_parse_hex_names()) { + auto it = std::find(col_names_.begin(), col_names_.end(), name); + if (it != col_names_.end()) { + column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; + } + } + } + // Return empty table rather than exception if nothing to load if (num_active_cols_ == 0) { return {std::make_unique(), {}}; } @@ -382,11 +420,14 @@ table_with_metadata 
reader::impl::read(rmm::cuda_stream_view stream) if (has_to_infer_column_types) { column_types = infer_column_types(data, row_offsets, stream); } else { - column_types = - std::visit(VisitorOverload{ - [&](const std::vector& data_types) { return data_types; }, - [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, - opts_.get_dtypes()); + column_types = std::visit( + cudf::detail::visitor_overload{ + [&](const std::vector& data_types) { return select_data_types(data_types); }, + [&](const std::map& data_types) { + return select_data_types(data_types); + }, + [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, + opts_.get_dtypes()); } out_columns.reserve(column_types.size()); diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 29c6b48bc8a..36c2bf4f9e7 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -181,6 +181,22 @@ class reader::impl { device_span row_offsets, rmm::cuda_stream_view stream); + /** + * @brief Selects the columns' data types from the map of dtypes. + * + * @param col_type_map Column name -> data type map specifying the columns' target data types + * @return Sorted list of selected columns' data types + */ + std::vector select_data_types(std::map const& col_type_map); + + /** + * @brief Selects the columns' data types from the list of dtypes. + * + * @param dtypes Vector of data types specifying the columns' target data types + * @return Sorted list of selected columns' data types + */ + std::vector select_data_types(std::vector const& dtypes); + /** * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. 
* diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index b4395d6c965..a8f117c22bf 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -50,7 +51,6 @@ namespace json { using namespace cudf::io; namespace { - /** * @brief Estimates the maximum expected length or a row, based on the number * of columns @@ -236,7 +236,9 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) { size_t map_range_size = 0; if (range_size != 0) { - map_range_size = range_size + calculate_max_row_size(options_.get_dtypes().size()); + auto const dtype_option_size = + std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); + map_range_size = range_size + calculate_max_row_size(dtype_option_size); } // Support delayed opening of the file if using memory mapping datasource @@ -464,47 +466,71 @@ void reader::impl::set_column_names(device_span rec_starts, } } -void reader::impl::set_data_types(device_span rec_starts, - rmm::cuda_stream_view stream) +std::vector reader::impl::parse_data_types( + std::vector const& types_as_strings) { - auto const dtype = options_.get_dtypes(); - if (!dtype.empty()) { - CUDF_EXPECTS(dtype.size() == metadata_.column_names.size(), - "Need to specify the type of each column.\n"); - - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = - std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); + CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), + "Need to specify the type of each column.\n"); + std::vector dtypes; + // Assume that the dtype is in dictionary format only if all elements contain a colon + const bool is_dict = std::all_of( + std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { 
+ return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); + }); + + auto split_on_colon = [](std::string_view s) { + auto const i = s.find(":"); + return std::pair{s.substr(0, i), s.substr(i + 1)}; + }; + + if (is_dict) { + std::map col_type_map; + std::transform( + std::cbegin(types_as_strings), + std::cend(types_as_strings), + std::inserter(col_type_map, col_type_map.end()), + [&](auto const& ts) { + auto const [col_name, type_str] = split_on_colon(ts); + return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; }); - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; + // Using the map here allows O(n log n) complexity + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(dtypes), + [&](auto const& column_name) { return col_type_map[column_name]; }); + } else { + std::transform(std::cbegin(types_as_strings), + std::cend(types_as_strings), + std::back_inserter(dtypes), + [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); + } + return dtypes; +} - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(dtype), - std::cend(dtype), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(dtypes_), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(dtype), - std::cend(dtype), - std::back_inserter(dtypes_), - [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } +void reader::impl::set_data_types(device_span rec_starts, + 
rmm::cuda_stream_view stream) +{ + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + if (!has_to_infer_column_types) { + dtypes_ = std::visit( + cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { return dtypes; }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }, + [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bbda7e9ba74..5cf51369cdf 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -158,6 +158,8 @@ class reader::impl { */ void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); + std::vector parse_data_types(std::vector const& types_as_strings); + /** * @brief Set the data type array data member * diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 94f01fd62f3..43d9bd7b514 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -50,6 +50,16 @@ namespace cudf_io = cudf::io; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional, @@ -80,7 +90,6 @@ struct CsvReaderTest : public cudf::test::BaseFixture { // Typed test fixture for timestamp type tests template struct CsvReaderNumericTypeTest : public CsvReaderTest { - auto 
type() { return cudf::data_type{cudf::type_to_id()}; } }; // Declare typed test cases @@ -93,8 +102,8 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { void run_tests(const std::vector& reference_strings, numeric::scale_type scale) { cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end()); - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id(), scale}); + auto input_column = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), scale}); std::string buffer = std::accumulate(reference_strings.begin(), reference_strings.end(), @@ -105,7 +114,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({cudf::data_type{cudf::type_to_id(), scale}}) + .dtypes({data_type{type_to_id(), scale}}) .header(-1); const auto result = cudf_io::read_csv(in_opts); @@ -389,9 +398,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{-2}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -435,9 +444,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + 
data_type{type_to_id(), numeric::scale_type{3}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -479,11 +488,10 @@ TEST_F(CsvReaderTest, MultiColumn) { std::ostringstream line; for (int i = 0; i < num_rows; ++i) { - line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int16_values[i] - << "," << int32_values[i] << "," << int32_values[i] << "," << int64_values[i] << "," - << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," << uint16_values[i] - << "," << uint32_values[i] << "," << uint64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "\n"; + line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int32_values[i] + << "," << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," + << uint16_values[i] << "," << uint32_values[i] << "," << uint64_values[i] << "," + << float32_values[i] << "," << float64_values[i] << "\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -492,39 +500,29 @@ TEST_F(CsvReaderTest, MultiColumn) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); expect_column_data_equal(int8_values, view.column(0)); expect_column_data_equal(int16_values, view.column(1)); - expect_column_data_equal(int16_values, view.column(2)); - expect_column_data_equal(int32_values, view.column(3)); - expect_column_data_equal(int32_values, view.column(4)); - expect_column_data_equal(int64_values, view.column(5)); - 
expect_column_data_equal(int64_values, view.column(6)); - expect_column_data_equal(uint8_values, view.column(7)); - expect_column_data_equal(uint16_values, view.column(8)); - expect_column_data_equal(uint32_values, view.column(9)); - expect_column_data_equal(uint64_values, view.column(10)); - expect_column_data_equal(float32_values, view.column(11)); - expect_column_data_equal(float32_values, view.column(12)); - expect_column_data_equal(float64_values, view.column(13)); - expect_column_data_equal(float64_values, view.column(14)); + expect_column_data_equal(int32_values, view.column(2)); + expect_column_data_equal(int64_values, view.column(3)); + expect_column_data_equal(uint8_values, view.column(4)); + expect_column_data_equal(uint16_values, view.column(5)); + expect_column_data_equal(uint32_values, view.column(6)); + expect_column_data_equal(uint64_values, view.column(7)); + expect_column_data_equal(float32_values, view.column(8)); + expect_column_data_equal(float64_values, view.column(9)); } TEST_F(CsvReaderTest, RepeatColumn) @@ -549,7 +547,7 @@ TEST_F(CsvReaderTest, RepeatColumn) // repeats column in indexes and names, misses 1 column. 
cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"int16", "int64", "uint64", "float"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .names({"A", "B", "C", "D"}) .use_cols_indexes({1, 0, 0}) .use_cols_names({"D", "B", "B"}) @@ -575,7 +573,7 @@ TEST_F(CsvReaderTest, Booleans) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A", "B", "C", "D"}) - .dtypes(std::vector{"int32", "int32", "short", "bool"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .true_values({"yes", "Yes", "YES", "foo", "FOO"}) .false_values({"no", "No", "NO", "Bar", "bar"}) .header(-1); @@ -584,10 +582,10 @@ TEST_F(CsvReaderTest, Booleans) // Booleans are the same (integer) data type, but valued at 0 or 1 const auto view = result.tbl->view(); EXPECT_EQ(4, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::INT32, view.column(1).type().id()); - ASSERT_EQ(cudf::type_id::INT16, view.column(2).type().id()); - ASSERT_EQ(cudf::type_id::BOOL8, view.column(3).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(1).type().id()); + ASSERT_EQ(type_id::INT16, view.column(2).type().id()); + ASSERT_EQ(type_id::BOOL8, view.column(3).type().id()); expect_column_data_equal(std::vector{1, 0, 0, 0, 1}, view.column(0)); expect_column_data_equal(std::vector{0, 1, 1, 0, 1}, view.column(2)); @@ -607,14 +605,14 @@ TEST_F(CsvReaderTest, Dates) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, 
view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -643,15 +641,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_s{983750400s}, @@ -680,15 +677,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -717,15 +713,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - 
.dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_us{983750400000000us}, @@ -754,15 +749,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal( @@ -795,14 +789,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[s]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace 
cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -824,14 +817,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ms]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -853,14 +845,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[us]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -882,14 +873,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ns]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + 
.dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -907,14 +897,14 @@ TEST_F(CsvReaderTest, FloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .lineterminator(';') .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto ref_vals = std::vector{5.6, 56.79, 12000000000, 0.7, 3.000, 12.34, 0.31, -73.98007199999998}; @@ -940,14 +930,14 @@ TEST_F(CsvReaderTest, Strings) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, @@ -970,14 +960,14 @@ TEST_F(CsvReaderTest, StringsQuotes) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - 
.dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quotechar('`'); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc,\ndef, ghi", "jkl, `mno`, pqr", "stu `vwx` yz"}, view.column(1)); @@ -999,15 +989,15 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE) .doublequote(false); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, @@ -1025,7 +1015,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(1) .skiprows(2) .nrows(2); @@ -1033,7 +1023,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{5, 6}, view.column(0)); } @@ -1049,7 +1039,7 @@ TEST_F(CsvReaderTest, ByteRange) 
cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(11) .byte_range_size(15); @@ -1057,7 +1047,7 @@ TEST_F(CsvReaderTest, ByteRange) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{4000, 5000, 6000}, view.column(0)); } @@ -1068,14 +1058,14 @@ TEST_F(CsvReaderTest, ByteRangeStrings) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{input.c_str(), input.size()}) .names({"A"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(4); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::STRING, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(0).type().id()); expect_column_data_equal(std::vector{"c"}, view.column(0)); } @@ -1091,14 +1081,14 @@ TEST_F(CsvReaderTest, BlanksAndComments) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .comment('#'); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{1, 3, 4, 5, 8, 9}, view.column(0)); } @@ -1166,12 +1156,12 @@ TEST_F(CsvReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes(std::vector{"int8"}); + .dtypes({dtype()}); 
auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); + ASSERT_EQ(type_id::INT8, view.column(0).type().id()); expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } @@ -1187,13 +1177,13 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .header(-1); const auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto col_data = cudf::test::to_host(view.column(0)); // col_data.first contains the column data @@ -1212,7 +1202,7 @@ TEST_F(CsvReaderTest, StringInference) const auto result = cudf_io::read_csv(in_opts); EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).type().id(), type_id::STRING); } TEST_F(CsvReaderTest, TypeInferenceThousands) @@ -1226,9 +1216,9 @@ TEST_F(CsvReaderTest, TypeInferenceThousands) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto tsnd_sep_col = std::vector{1400L, 123456L}; auto int_col = std::vector{123L, 123456L}; @@ -1254,9 +1244,9 @@ TEST_F(CsvReaderTest, TypeInferenceWithDecimal) const auto 
result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::STRING); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::STRING); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto int_col = std::vector{1400L, 123456L}; auto str_col = std::vector{"1.23", "123.456"}; @@ -1296,7 +1286,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_filter(false) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1310,7 +1300,7 @@ TEST_F(CsvReaderTest, nullHandling) { cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1327,7 +1317,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1345,7 +1335,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .keep_default_na(false) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1477,16 +1467,35 @@ TEST_F(CsvReaderTest, HexTest) std::ofstream outfile(filepath, std::ofstream::out); outfile << 
"0x0\n-0x1000\n0xfedcba\n0xABCDEF\n0xaBcDeF\n9512c20b\n"; } + // specify hex columns by name + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex({"A"}); + auto result = cudf_io::read_csv(in_opts); - cudf_io::csv_reader_options in_opts = - cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .names({"A"}) - .dtypes(std::vector{"hex"}) - .header(-1); - auto result = cudf_io::read_csv(in_opts); + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } + + // specify hex columns by index + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex(std::vector{0}); + auto result = cudf_io::read_csv(in_opts); - expect_column_data_equal(std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, - result.tbl->view().column(0)); + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } } TYPED_TEST(CsvReaderNumericTypeTest, SingleColumnWithWriter) @@ -1555,18 +1564,13 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) std::vector input_columns{int8_column, int16_column, - int16_column, - int32_column, int32_column, int64_column, - int64_column, uint8_column, uint16_column, uint32_column, uint64_column, float32_column, - float32_column, - float64_column, float64_column}; cudf::table_view input_table{input_columns}; @@ -1577,26 +1581,21 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - 
"double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); - std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8}; const auto input_sliced_view = input_table.select(non_float64s); const auto result_sliced_view = result_table.select(non_float64s); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_sliced_view, result_sliced_view); @@ -1606,9 +1605,6 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) auto float64_col_idx = non_float64s.size(); check_float_column( input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); - ++float64_col_idx; - check_float_column( - input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); } TEST_F(CsvReaderTest, DatesWithWriter) @@ -1633,7 +1629,7 @@ TEST_F(CsvReaderTest, DatesWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1764,7 +1760,7 @@ TEST_F(CsvReaderTest, FloatingPointWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float64"}) + .dtypes({dtype()}) .header(-1); // in_opts.lineterminator = ';'; auto result = cudf_io::read_csv(in_opts); @@ -1790,7 +1786,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1815,7 
+1811,7 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1839,7 +1835,7 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}); + .dtypes(std::vector{dtype(), dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); @@ -1917,7 +1913,7 @@ TEST_F(CsvReaderTest, UserImplementedSource) TestSource source{csv_data.str()}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&source}) - .dtypes(std::vector{"int8", "int16", "int32"}) + .dtypes({dtype(), dtype(), dtype()}) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1962,8 +1958,11 @@ TEST_F(CsvReaderTest, DurationsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{ - "timedelta[D]", "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]"}); + .dtypes({data_type{type_id::DURATION_DAYS}, + data_type{type_id::DURATION_SECONDS}, + data_type{type_id::DURATION_MILLISECONDS}, + data_type{type_id::DURATION_MICROSECONDS}, + data_type{type_id::DURATION_NANOSECONDS}}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); @@ -2164,4 +2163,35 @@ TEST_F(CsvReaderTest, DefaultWriteChunkSize) } } +TEST_F(CsvReaderTest, DtypesMap) +{ + std::string csv_in{"12,9\n34,8\n56,7"}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + 
.dtypes({{"B", dtype()}, {"A", dtype()}}) + .header(-1); + auto result = cudf_io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(2, result_table.num_columns()); + ASSERT_EQ(type_id::INT32, result_table.column(0).type().id()); + ASSERT_EQ(type_id::INT16, result_table.column(1).type().id()); + expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); + expect_column_data_equal(std::vector{9, 8, 7}, result_table.column(1)); +} + +TEST_F(CsvReaderTest, DtypesMapInvalid) +{ + std::string csv_in{""}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes({{"A", dtype()}}); + + EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 308821489c5..a263fa0fce0 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -42,6 +42,16 @@ using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional, @@ -151,7 +161,7 @@ TEST_F(JsonReaderTest, BasicJsonLines) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"int", "float64"}) + .dtypes(std::vector{dtype(), dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -182,7 +192,7 @@ TEST_F(JsonReaderTest, FloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -206,7 +216,7 @@
TEST_F(JsonReaderTest, JsonLinesStrings) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"2:str", "0:int", "1:float64"}) + .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -245,9 +255,8 @@ TEST_F(JsonReaderTest, MultiColumn) std::ostringstream line; for (int i = 0; i < num_rows; ++i) { line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," - << int16_values[i] << "," << int32_values[i] << "," << int32_values[i] << "," - << int64_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "]\n"; + << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," + << float64_values[i] << "]\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -255,17 +264,12 @@ TEST_F(JsonReaderTest, MultiColumn) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "float", - "float32", - "double", - "float64"}) + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -275,34 +279,21 @@ TEST_F(JsonReaderTest, MultiColumn) EXPECT_EQ(view.column(0).type().id(), cudf::type_id::INT8); EXPECT_EQ(view.column(1).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(4).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(5).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(6).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(7).type().id(), 
cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(8).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(9).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(view.column(10).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT32); + EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT64); + EXPECT_EQ(view.column(4).type().id(), cudf::type_id::FLOAT32); + EXPECT_EQ(view.column(5).type().id(), cudf::type_id::FLOAT64); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(0), int8_wrapper{int8_values.begin(), int8_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(1), int16_wrapper{int16_values.begin(), int16_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(2), - int16_wrapper{int16_values.begin(), int16_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), - int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(4), int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(5), - int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(6), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - check_float_column(view.column(7), float32_values, validity); - check_float_column(view.column(8), float32_values, validity); - check_float_column(view.column(9), float64_values, validity); - check_float_column(view.column(10), float64_values, validity); + check_float_column(view.column(4), float32_values, validity); + check_float_column(view.column(5), float64_values, validity); } TEST_F(JsonReaderTest, Booleans) @@ -315,7 +306,7 @@ TEST_F(JsonReaderTest, Booleans) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"bool"}) + .dtypes({dtype()}) .lines(true); 
cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -342,7 +333,7 @@ TEST_F(JsonReaderTest, Dates) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -379,7 +370,7 @@ TEST_F(JsonReaderTest, Durations) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"timedelta64[ns]"}) + .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -665,13 +656,12 @@ TEST_F(JsonReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes({"int8"}) + .dtypes({dtype()}) .lines(true); ; cudf_io::table_with_metadata result = cudf_io::read_json(in_options); - EXPECT_EQ(result.tbl->num_columns(), - static_cast(in_options.get_dtypes().size())); + EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -690,7 +680,7 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index c5e235b5697..725757121d9 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from 
libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -49,8 +50,10 @@ cdef extern from "cudf/io/csv.hpp" \ cudf_io_types.quote_style get_quoting() except+ char get_quotechar() except+ bool is_enabled_doublequote() except+ - vector[string] get_infer_date_names() except+ - vector[int] get_infer_date_indexes() except+ + vector[string] get_parse_dates_names() except+ + vector[int] get_parse_dates_indexes() except+ + vector[string] get_parse_hex_names() except+ + vector[int] get_parse_hex_indexes() except+ # Conversion settings vector[string] get_dtype() except+ @@ -92,11 +95,15 @@ cdef extern from "cudf/io/csv.hpp" \ void set_quoting(cudf_io_types.quote_style style) except+ void set_quotechar(char val) except+ void set_doublequote(bool val) except+ - void set_infer_date_names(vector[string]) except+ - void set_infer_date_indexes(vector[int]) except+ + void set_parse_dates(vector[string]) except+ + void set_parse_dates(vector[int]) except+ + void set_parse_hex(vector[string]) except+ + void set_parse_hex(vector[int]) except+ # Conversion settings void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_true_values(vector[string] vals) except+ void set_false_values(vector[string] vals) except+ void set_na_values(vector[string] vals) except+ @@ -157,11 +164,15 @@ cdef extern from "cudf/io/csv.hpp" \ ) except+ csv_reader_options_builder& quotechar(char val) except+ csv_reader_options_builder& doublequote(bool val) except+ - csv_reader_options_builder& infer_date_names(vector[string]) except+ - csv_reader_options_builder& infer_date_indexes(vector[int]) except+ + csv_reader_options_builder& parse_dates(vector[string]) except+ + csv_reader_options_builder& parse_dates(vector[int]) except+ # Conversion settings csv_reader_options_builder& dtypes(vector[string] types) except+ + 
csv_reader_options_builder& dtypes(vector[data_type] types) except+ + csv_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ csv_reader_options_builder& true_values(vector[string] vals) except+ csv_reader_options_builder& false_values(vector[string] vals) except+ csv_reader_options_builder& na_values(vector[string] vals) except+ diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 6f20195e87f..4a3792f5023 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -26,6 +27,8 @@ cdef extern from "cudf/io/json.hpp" \ # setter void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_compression( cudf_io_types.compression_type compression ) except+ @@ -47,6 +50,12 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dtypes( vector[string] types ) except+ + json_reader_options_builder& dtypes( + vector[data_type] types + ) except+ + json_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ json_reader_options_builder& compression( cudf_io_types.compression_type compression ) except+ diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 2dfa61ee900..1d9c8fa58e6 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -117,8 +117,8 @@ cdef csv_reader_options make_csv_reader_options( cdef vector[string] c_use_cols_names cdef size_type c_nrows = nrows if nrows is not None else -1 cdef quote_style c_quoting - cdef vector[string] c_infer_date_names - cdef vector[int] c_infer_date_indexes + cdef vector[string] c_parse_dates_names + cdef vector[int] 
c_parse_dates_indexes cdef vector[string] c_dtypes cdef vector[string] c_true_values cdef vector[string] c_false_values @@ -221,14 +221,14 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: non-lists are unsupported") for col in parse_dates: if isinstance(col, str): - c_infer_date_names.push_back(str(col).encode()) + c_parse_dates_names.push_back(str(col).encode()) elif isinstance(col, int): - c_infer_date_indexes.push_back(col) + c_parse_dates_indexes.push_back(col) else: raise NotImplementedError( "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_infer_date_names(c_infer_date_names) - csv_reader_options_c.set_infer_date_indexes(c_infer_date_indexes) + csv_reader_options_c.set_parse_dates(c_parse_dates_names) + csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) if dtype is not None: if isinstance(dtype, abc.Mapping): From 29b5f9ac6d24c64163349f1a5b2b5b5ef049769e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Aug 2021 19:11:03 -0400 Subject: [PATCH 02/14] Move template parameter to function parameter in cudf::detail::left_semi_anti_join (#8914) The `semi_join.cu` takes about 6 minutes to compile on my Linux 18.04 desktop when doing a full build of libcudf. The `join_kind` template parameter used internally in `cudf::detail::left_semi_anti_join` for `left_semi_join` and `left_anti_join` APIs is not used in a `constexpr` or to pass to any other templated function. This PR moves the template parameter to a runtime parameter on the detail functions reducing the compile time for `semi_join.cu` by ~2x. Another improvement includes un-inlining the `is_trivial_join` utility function to reduce the compile time for files that include `join_common_utils.hpp`. Finally, the device vector used as a gather map in `detail::left_semi_anti_join` was wrapped with a `column_view` in order to call `detail::gather` without iterators. This allowed not including the heavy `gather.cuh`. 
This improved the compile time about 10% and reduced the object file `semi_join.cu.o` size by 2x. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8914 --- cpp/CMakeLists.txt | 1 + cpp/src/join/join.cu | 2 +- cpp/src/join/join_common_utils.cuh | 134 ++++--------------------- cpp/src/join/join_common_utils.hpp | 25 +---- cpp/src/join/join_utils.cu | 155 +++++++++++++++++++++++++++++ cpp/src/join/semi_join.cu | 105 +++++++++++-------- 6 files changed, 244 insertions(+), 178 deletions(-) create mode 100644 cpp/src/join/join_utils.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90c17067b55..a6f7a41825d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -305,6 +305,7 @@ add_library(cudf src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu + src/join/join_utils.cu src/join/semi_join.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 526edbf6903..740431b8563 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 2b1c870bea1..d5c23b1d612 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -21,9 +21,7 @@ #include #include -#include -#include #include namespace cudf { @@ -31,7 +29,9 @@ namespace detail { /** * @brief Computes the trivial left join operation for the case when the - * right table is empty. In this case all the valid indices of the left table + * right table is empty. 
+ * + * In this case all the valid indices of the left table * are returned with their corresponding right indices being set to * JoinNoneValue, i.e. -1. * @@ -41,21 +41,12 @@ namespace detail { * * @return Join output indices vector pair */ -inline std::pair>, - std::unique_ptr>> +std::pair>, + std::unique_ptr>> get_trivial_left_join_indices( table_view const& left, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto left_indices = std::make_unique>(left.num_rows(), stream, mr); - thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); - auto right_indices = - std::make_unique>(left.num_rows(), stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_indices), std::move(right_indices)); -} + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Convenient alias for a pair of unique pointers to device uvectors. using VectorPair = std::pair>, @@ -83,47 +74,11 @@ using VectorPair = std::pair>, * * @return A pair of vectors containing the concatenated output. 
*/ -inline VectorPair concatenate_vector_pairs(VectorPair& a, - VectorPair& b, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS((a.first->size() == a.second->size()), - "Mismatch between sizes of vectors in vector pair"); - CUDF_EXPECTS((b.first->size() == b.second->size()), - "Mismatch between sizes of vectors in vector pair"); - if (a.first->is_empty()) { - return std::move(b); - } else if (b.first->is_empty()) { - return std::move(a); - } - auto original_size = a.first->size(); - a.first->resize(a.first->size() + b.first->size(), stream); - a.second->resize(a.second->size() + b.second->size(), stream); - thrust::copy( - rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); - thrust::copy(rmm::exec_policy(stream), - b.second->begin(), - b.second->end(), - a.second->begin() + original_size); - return std::move(a); -} - -/** - * @brief Device functor to determine if an index is contained in a range. - */ -template -struct valid_range { - T start, stop; - __host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {} - - __host__ __device__ __forceinline__ bool operator()(const T index) - { - return ((index >= start) && (index < stop)); - } -}; +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream); /** * @brief Creates a table containing the complement of left join indices. + * * This table has two columns. The first one is filled with JoinNoneValue(-1) * and the second one contains values from 0 to right_table_row_count - 1 * excluding those found in the right_indices column. 
@@ -136,72 +91,27 @@ struct valid_range { * * @return Pair of vectors containing the left join indices complement */ -inline std::pair>, - std::unique_ptr>> +std::pair>, + std::unique_ptr>> get_left_join_indices_complement(std::unique_ptr>& right_indices, size_type left_table_row_count, size_type right_table_row_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Get array of indices that do not appear in right_indices - - // Vector allocated for unmatched result - auto right_indices_complement = - std::make_unique>(right_table_row_count, stream); - - // If left table is empty in a full join call then all rows of the right table - // should be represented in the joined indices. This is an optimization since - // if left table is empty and full join is called all the elements in - // right_indices will be JoinNoneValue, i.e. -1. This if path should - // produce exactly the same result as the else path but will be faster. - if (left_table_row_count == 0) { - thrust::sequence(rmm::exec_policy(stream), - right_indices_complement->begin(), - right_indices_complement->end(), - 0); - } else { - // Assume all the indices in invalid_index_map are invalid - auto invalid_index_map = - std::make_unique>(right_table_row_count, stream); - thrust::uninitialized_fill( - rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); - - // Functor to check for index validity since left joins can create invalid indices - valid_range valid(0, right_table_row_count); + rmm::mr::device_memory_resource* mr); - // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count - // Thus specifying that those locations are valid - thrust::scatter_if(rmm::exec_policy(stream), - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + right_indices->size(), - right_indices->begin(), // Index locations - right_indices->begin(), // Stencil - Check if index location is valid - invalid_index_map->begin(), // 
Output indices - valid); // Stencil Predicate - size_type begin_counter = static_cast(0); - size_type end_counter = static_cast(right_table_row_count); +/** + * @brief Device functor to determine if an index is contained in a range. + */ +template +struct valid_range { + T start, stop; + __host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {} - // Create list of indices that have been marked as invalid - size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(begin_counter), - thrust::make_counting_iterator(end_counter), - invalid_index_map->begin(), - right_indices_complement->begin(), - thrust::identity()) - - right_indices_complement->begin(); - right_indices_complement->resize(indices_count, stream); + __host__ __device__ __forceinline__ bool operator()(const T index) + { + return ((index >= start) && (index < stop)); } - - auto left_invalid_indices = - std::make_unique>(right_indices_complement->size(), stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), - left_invalid_indices->begin(), - left_invalid_indices->end(), - JoinNoneValue); - - return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); -} +}; /** * @brief Adds a pair of indices to the shared memory cache diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index d2337e28ed4..d2541b006a7 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,8 +19,6 @@ #include #include -#include - #include #include @@ -49,26 +47,7 @@ using row_equality = cudf::row_equality_comparator; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; -inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) -{ - // If there is nothing to join, then send empty table with all columns - if (left.is_empty() || right.is_empty()) { return true; } - - // If left join and the left table is empty, return immediately - if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } - - // If Inner Join and either table is empty, return immediately - if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) { - return true; - } - - // If left semi join (contains) and right table is empty, - // return immediately - if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; } - - return false; -} +bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu new file mode 100644 index 00000000000..4aca4b4a9cf --- /dev/null +++ b/cpp/src/join/join_utils.cu @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) +{ + // If there is nothing to join, then send empty table with all columns + if (left.is_empty() || right.is_empty()) { return true; } + + // If left join and the left table is empty, return immediately + if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } + + // If Inner Join and either table is empty, return immediately + if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) { + return true; + } + + // If left semi join (contains) and right table is empty, + // return immediately + if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; } + + return false; +} + +std::pair>, + std::unique_ptr>> +get_trivial_left_join_indices(table_view const& left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto left_indices = std::make_unique>(left.num_rows(), stream, mr); + thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); + auto right_indices = + std::make_unique>(left.num_rows(), stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); + return std::make_pair(std::move(left_indices), std::move(right_indices)); +} + +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS((a.first->size() == a.second->size()), + "Mismatch between sizes of vectors in vector pair"); + CUDF_EXPECTS((b.first->size() == b.second->size()), + "Mismatch between sizes of vectors in vector pair"); + if (a.first->is_empty()) { + return std::move(b); + } else if (b.first->is_empty()) { + return std::move(a); + } + auto original_size = a.first->size(); + a.first->resize(a.first->size() + 
b.first->size(), stream); + a.second->resize(a.second->size() + b.second->size(), stream); + thrust::copy( + rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); + thrust::copy(rmm::exec_policy(stream), + b.second->begin(), + b.second->end(), + a.second->begin() + original_size); + return std::move(a); +} + +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement(std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Get array of indices that do not appear in right_indices + + // Vector allocated for unmatched result + auto right_indices_complement = + std::make_unique>(right_table_row_count, stream); + + // If left table is empty in a full join call then all rows of the right table + // should be represented in the joined indices. This is an optimization since + // if left table is empty and full join is called all the elements in + // right_indices will be JoinNoneValue, i.e. -1. This if path should + // produce exactly the same result as the else path but will be faster. 
+ if (left_table_row_count == 0) { + thrust::sequence(rmm::exec_policy(stream), + right_indices_complement->begin(), + right_indices_complement->end(), + 0); + } else { + // Assume all the indices in invalid_index_map are invalid + auto invalid_index_map = + std::make_unique>(right_table_row_count, stream); + thrust::uninitialized_fill( + rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); + + // Functor to check for index validity since left joins can create invalid indices + valid_range valid(0, right_table_row_count); + + // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count + // Thus specifying that those locations are valid + thrust::scatter_if(rmm::exec_policy(stream), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + right_indices->size(), + right_indices->begin(), // Index locations + right_indices->begin(), // Stencil - Check if index location is valid + invalid_index_map->begin(), // Output indices + valid); // Stencil Predicate + size_type begin_counter = static_cast(0); + size_type end_counter = static_cast(right_table_row_count); + + // Create list of indices that have been marked as invalid + size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(begin_counter), + thrust::make_counting_iterator(end_counter), + invalid_index_map->begin(), + right_indices_complement->begin(), + thrust::identity()) - + right_indices_complement->begin(); + right_indices_complement->resize(indices_count, stream); + } + + auto left_invalid_indices = + std::make_unique>(right_indices_complement->size(), stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), + left_invalid_indices->begin(), + left_invalid_indices->end(), + JoinNoneValue); + + return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/semi_join.cu 
b/cpp/src/join/semi_join.cu index cc34aed33ea..69a7b8c722b 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -18,15 +18,12 @@ #include #include -#include - #include -#include +#include +#include #include -#include #include #include -#include #include #include @@ -34,11 +31,15 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { -template std::unique_ptr> left_semi_anti_join( + join_kind const kind, cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls, @@ -48,13 +49,13 @@ std::unique_ptr> left_semi_anti_join( CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); - if (is_trivial_join(left_keys, right_keys, JoinKind)) { + if (is_trivial_join(left_keys, right_keys, kind)) { return std::make_unique>(0, stream, mr); } - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { + if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right_keys.num_rows())) { auto result = std::make_unique>(left_keys.num_rows(), stream, mr); - thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); + thrust::sequence(rmm::exec_policy(stream), result->begin(), result->end()); return result; } @@ -115,7 +116,7 @@ std::unique_ptr> left_semi_anti_join( // // For semi join we want contains to be true, for anti join we want contains to be false - bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); + bool const join_type_boolean = (kind == join_kind::LEFT_SEMI_JOIN); auto gather_map = std::make_unique>(left_num_rows, stream, mr); @@ -152,27 +153,26 @@ std::unique_ptr> left_semi_anti_join( * @throws cudf::logic_error if number of returned columns is 0 * @throws cudf::logic_error if number of elements in `right_on` and `left_on` are not equal * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column 
indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` - * will be compared against the column from `right` - * indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` - * will be compared against the column from `left` - * indicated by `left_on[i]`. - * @param[in] compare_nulls Controls whether null join-key values should match or not. - * @param[in] mr Device memory resource to used to allocate the returned table's - * device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @tparam join_kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN + * @param kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN + * @param left The left table + * @param right The right table + * @param left_on The column indices from `left` to join on. + * The column from `left` indicated by `left_on[i]` + * will be compared against the column from `right` + * indicated by `right_on[i]`. + * @param right_on The column indices from `right` to join on. + * The column from `right` indicated by `right_on[i]` + * will be compared against the column from `left` + * indicated by `left_on[i]`. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to used to allocate the returned table * - * @returns Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. + * @returns Result of joining `left` and `right` tables on the columns + * specified by `left_on` and `right_on`. 
*/ -template std::unique_ptr left_semi_anti_join( + join_kind const kind, cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -183,11 +183,11 @@ std::unique_ptr left_semi_anti_join( { CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) { + if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, kind)) { return empty_like(left); } - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) { + if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right.num_rows())) { // Everything matches, just copy the proper columns from the left table return std::make_unique
(left, stream, mr); } @@ -202,14 +202,23 @@ std::unique_ptr left_semi_anti_join( auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); - auto gather_map = - left_semi_anti_join(left_selected, right_selected, compare_nulls, stream); + auto gather_vector = + left_semi_anti_join(kind, left_selected, right_selected, compare_nulls, stream); + + // wrapping the device vector with a column view allows calling the non-iterator + // version of detail::gather, improving compile time by 10% and reducing the + // object file size by 2.2x without affecting performance + auto gather_map = column_view(data_type{type_id::INT32}, + static_cast(gather_vector->size()), + gather_vector->data(), + nullptr, + 0); auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated, - gather_map->begin(), - gather_map->end(), + gather_map, out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, stream, mr); } @@ -224,8 +233,14 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join(detail::join_kind::LEFT_SEMI_JOIN, + left, + right, + left_on, + right_on, + compare_nulls, + rmm::cuda_stream_default, + mr); } std::unique_ptr> left_semi_join( @@ -235,8 +250,8 @@ std::unique_ptr> left_semi_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join( + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, @@ -247,8 +262,14 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join(detail::join_kind::LEFT_ANTI_JOIN, + left, + right, + left_on, + right_on, + compare_nulls, + rmm::cuda_stream_default, + mr); } std::unique_ptr> left_anti_join( @@ -258,8 +279,8 @@ std::unique_ptr> left_anti_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join( + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf From e11b05477297df2f7ee19e6c7785a8c5d7486bfc Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 5 Aug 2021 15:55:30 -0400 Subject: [PATCH 03/14] Series string repeat (#8882) Addressing #8732. `str.repeat` function with a scalar value (e.g., `s.str.repeat(2)`) implemented - still working on working with a sequence of integers (e.g., `s.str.repeat([0,2,3])`). Still need to write tests as well. 
Authors: - Sarah Yurick (https://github.com/sarahyurick) Approvers: - Nghia Truong (https://github.com/ttnghia) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8882 --- python/cudf/cudf/_lib/cpp/strings/repeat.pxd | 19 +++++++ python/cudf/cudf/_lib/strings/__init__.py | 1 + python/cudf/cudf/_lib/strings/repeat.pyx | 49 ++++++++++++++++++ python/cudf/cudf/core/column/string.py | 53 ++++++++++++++++++++ python/cudf/cudf/tests/test_string.py | 26 ++++++++++ 5 files changed, 148 insertions(+) create mode 100644 python/cudf/cudf/_lib/cpp/strings/repeat.pxd create mode 100644 python/cudf/cudf/_lib/strings/repeat.pyx diff --git a/python/cudf/cudf/_lib/cpp/strings/repeat.pxd b/python/cudf/cudf/_lib/cpp/strings/repeat.pxd new file mode 100644 index 00000000000..2a6754b9a11 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/repeat.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ + nogil: + + cdef unique_ptr[column] repeat_strings( + column_view strings, + size_type repeat) except + + + cdef unique_ptr[column] repeat_strings( + column_view strings, + column_view repeats) except + diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 866c2861995..598ac804dd6 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -64,6 +64,7 @@ from cudf._lib.strings.findall import findall from cudf._lib.strings.json import get_json_object from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill +from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, replace, diff --git 
a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx new file mode 100644 index 00000000000..49a46f418b1 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -0,0 +1,49 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings cimport repeat as cpp_repeat +from cudf._lib.cpp.types cimport size_type + + +def repeat_scalar(Column source_strings, + size_type repeats): + """ + Returns a Column after repeating + each string in `source_strings` + `repeats` number of times. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_repeat.repeat_strings( + source_view, + repeats + )) + + return Column.from_unique_ptr(move(c_result)) + + +def repeat_sequence(Column source_strings, + Column repeats): + """ + Returns a Column after repeating + each string in `source_strings` + `repeats` number of times. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + cdef column_view repeats_view = repeats.view() + + with nogil: + c_result = move(cpp_repeat.repeat_strings( + source_view, + repeats_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fe231e1def9..7d6afbb4056 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -750,6 +750,59 @@ def contains( ) return self._return_or_inplace(result_col) + def repeat(self, repeats: Union[int, Sequence],) -> SeriesOrIndex: + """ + Duplicate each string in the Series or Index. + Equivalent to `str.repeat() + `_. 
+ + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or Index of object + Series or Index of repeated string objects specified by + input parameter repeats. + + Examples + -------- + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + if can_convert_to_column(repeats): + return self._return_or_inplace( + libstrings.repeat_sequence( + self._column, column.as_column(repeats, dtype="int"), + ), + ) + + return self._return_or_inplace( + libstrings.repeat_scalar(self._column, repeats) + ) + def replace( self, pat: Union[str, Sequence], diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a8c00ce031e..9a7ef4e2099 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -852,6 +852,32 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): assert_eq(expect, got) +@pytest.mark.parametrize( + "data", [["hello", "world", None, "", "!"]], +) +@pytest.mark.parametrize( + "repeats", + [ + 2, + 0, + -3, + [5, 4, 3, 2, 6], + [5, None, 3, 2, 6], + [0, 0, 0, 0, 0], + [-1, -2, -3, -4, -5], + [None, None, None, None, None], + ], +) +def test_string_repeat(data, repeats): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + + expect = ps.str.repeat(repeats) + got = gs.str.repeat(repeats) + + assert_eq(expect, got) + + # Pandas isn't respect the `n` parameter so ignoring it in test parameters @pytest.mark.parametrize( "pat,regex", From db63f6135d7c4cb5834c040096d88fadaf11ffda Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 5 Aug 2021 15:51:03 -0500 Subject: [PATCH 04/14] Fix concatenation of 
`cudf.RangeIndex` (#8970) Fixes: #6872 In cudf, we have been concatenating a collection of `RangeIndex`'s by materializing each one of them, but instead we should rather be introspecting each RangeIndex to decide whether to materialize or not. This PR fixes it. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/8970 --- python/cudf/cudf/core/index.py | 51 +++++++++++++++++-- python/cudf/cudf/tests/test_index.py | 19 +++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 97ee0948209..e9ab3d5797c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,7 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import cupy import numpy as np @@ -588,13 +588,18 @@ def sum(self): @classmethod def _concat(cls, objs): - data = concat_columns([o._values for o in objs]) + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + names = {obj.name for obj in objs} if len(names) == 1: [name] = names else: name = None - result = as_index(data) + result.name = name return result @@ -3032,3 +3037,43 @@ def __new__( ) return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + + +def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: + """ + An internal Utility function to concat RangeIndex objects. + """ + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + if not non_empty_indexes: + # Here all "indexes" had 0 length, i.e.
were empty. + # In this case return an empty range index. + return RangeIndex(0, 0) + + for obj in non_empty_indexes: + if start is None: + # This is set by the first non-empty index + start = obj.start + if step is None and len(obj) > 1: + step = obj.step + elif step is None: + # First non-empty index had only one element + if obj.start == start: + result = as_index(concat_columns([x._values for x in indexes])) + return result + step = obj.start - start + + non_consecutive = (step != obj.step and len(obj) > 1) or ( + next_ is not None and obj.start != next_ + ) + if non_consecutive: + result = as_index(concat_columns([x._values for x in indexes])) + return result + if step is not None: + next_ = obj[-1] + step + + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f03454c479a..3f58eb3d6e7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2316,3 +2316,22 @@ def test_get_loc_multi_string(idx, key, method): got = gi.get_loc(key, method=method) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "objs", + [ + [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)], + [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)], + [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)], + ], +) +def test_range_index_concat(objs): + cudf_objs = [cudf.from_pandas(obj) for obj in objs] + + actual = cudf.concat(cudf_objs) + + expected = objs[0] + for obj in objs[1:]: + expected = expected.append(obj) + assert_eq(expected, actual) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cf5203a22e5..ace9701b677 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx(): ddf = dgd.from_cudf(cdf, npartitions=2) - assert 
isinstance(ddf.index.compute(), cudf.core.index.GenericIndex) + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) From 7816a3d3714b009ec0bf00f48acb477139e30373 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 5 Aug 2021 21:17:56 -0500 Subject: [PATCH 05/14] Add deprecation warning for `Series.set_mask` API (#8943) `Series.set_mask` is more of an internal implementation detail that the end-users will not have knowledge about, hence deprecating the API. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/8943 --- python/cudf/cudf/core/series.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 413fcefc2bc..f786853b3f4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -845,6 +845,11 @@ def set_mask(self, mask, null_count=None): 4 5 dtype: int64 """ + warnings.warn( + "Series.set_mask is deprecated and will be removed " + "in the future.", + DeprecationWarning, + ) col = self._column.set_mask(mask) return self._copy_construct(data=col) From ed3a601111d996c8bf49bbaf4fa16a6a741c6b3d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 5 Aug 2021 22:28:06 -0400 Subject: [PATCH 06/14] Fix debug compile error for csv_test.cpp (#8981) Found a compile error in `csv_test.cpp` that only occurs with a debug build since it exists in an `assert()` statement.
Looks like this was introduced in PR #8856 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/8981 --- cpp/tests/io/csv_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 43d9bd7b514..53e0ab14fd3 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -415,7 +415,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) result_strings.reserve(reference_strings.size()); std::ifstream read_result_file(filepath); - assert(read_result_file.is_open()); + ASSERT_TRUE(read_result_file.is_open()); std::copy(std::istream_iterator(read_result_file), std::istream_iterator(), @@ -461,7 +461,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) result_strings.reserve(reference_strings.size()); std::ifstream read_result_file(filepath); - assert(read_result_file.is_open()); + ASSERT_TRUE(read_result_file.is_open()); std::copy(std::istream_iterator(read_result_file), std::istream_iterator(), @@ -2175,9 +2175,9 @@ TEST_F(CsvReaderTest, DtypesMap) auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); - assert(result_table->num_columns() == 2); - assert(result_table.column(0).type() == data_type{type_id::INT32}); - assert(result_table.column(1).type() == data_type{type_id::INT16}); + ASSERT_EQ(result_table.num_columns(), 2); + ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32}); + ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT16}); expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); expect_column_data_equal(std::vector{9, 8, 7}, result_table.column(1)); } From ffb37c93c295a78338292d42fd21722b152383cb Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Fri, 6 Aug 2021 14:40:03 +0800 Subject: [PATCH 07/14] Fix 
concatenate empty structs (#8947) Closes #8929 Current PR is to fix the illegal memory access problem that occurred when concatenating structs with empty children. Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/8947 --- cpp/src/structs/copying/concatenate.cu | 7 ++++++- cpp/tests/copying/concatenate_tests.cu | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index 6f18c4bcbd4..fe5483b119d 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -28,6 +28,7 @@ #include #include +#include namespace cudf { namespace structs { @@ -53,7 +54,11 @@ std::unique_ptr concatenate(host_span columns, return cudf::detail::concatenate(cols, stream, mr); }); - size_type const total_length = children[0]->size(); + // get total length from concatenated children; if no child exists, we would compute it + auto const acc_size_fn = [](size_type s, column_view const& c) { return s + c.size(); }; + auto const total_length = + !children.empty() ?
children[0]->size() + : std::accumulate(columns.begin(), columns.end(), size_type{0}, acc_size_fn); // if any of the input columns have nulls, construct the output mask bool const has_nulls = diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index 7d3b7beb2cb..5237c75e4d4 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -826,6 +826,22 @@ TEST_F(StructsColumnTest, ConcatenateStructs) cudf::test::expect_columns_equivalent(*result, *expected); } +TEST_F(StructsColumnTest, ConcatenateEmptyStructs) +{ + using namespace cudf::test; + + auto expected = cudf::make_structs_column(10, {}, 0, rmm::device_buffer()); + auto first = cudf::make_structs_column(5, {}, 0, rmm::device_buffer()); + auto second = cudf::make_structs_column(2, {}, 0, rmm::device_buffer()); + auto third = cudf::make_structs_column(0, {}, 0, rmm::device_buffer()); + auto fourth = cudf::make_structs_column(3, {}, 0, rmm::device_buffer()); + + // concatenate + auto result = cudf::concatenate(std::vector({*first, *second, *third, *fourth})); + CUDF_EXPECTS(result->size() == expected->size(), "column size changed after concat"); + cudf::test::expect_columns_equivalent(*result, *expected); +} + TEST_F(StructsColumnTest, ConcatenateSplitStructs) { using namespace cudf::test; From 5de9def62fb9d0d4ab04abb75918157bffedb384 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 6 Aug 2021 08:40:27 -0500 Subject: [PATCH 08/14] Make Java AstNode public (#8953) This makes the Java `AstNode` base class public so applications can construct an AST using a common type for all AST nodes. 
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/8953 --- java/src/main/java/ai/rapids/cudf/ast/AstNode.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java b/java/src/main/java/ai/rapids/cudf/ast/AstNode.java index 78cf39b05d2..8160462de98 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java +++ b/java/src/main/java/ai/rapids/cudf/ast/AstNode.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** Base class of every node in an AST */ -abstract class AstNode { +public abstract class AstNode { /** * Enumeration for the types of AST nodes that can appear in a serialized AST. * NOTE: This must be kept in sync with the `jni_serialized_node_type` in CompiledExpression.cpp! From 0c09f25c377e89327d5669947a5cf02b58b59a15 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 6 Aug 2021 08:40:48 -0500 Subject: [PATCH 09/14] Java conditional joins should not require matching column counts (#8955) The Java conditional join APIs had a copy-n-paste error from the equality join code where it mandated the left and right table column counts matched. Conditional joins use an AST expression for the join condition which does not require a 1-to-1 mapping between left and right table columns, so this check has been removed from the conditional join APIs. 
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/8955 --- java/src/main/java/ai/rapids/cudf/Table.java | 20 ------------------- .../test/java/ai/rapids/cudf/TableTest.java | 20 ++++++++++++++----- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 360bb5c7467..861c6485a5c 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2004,10 +2004,6 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual */ public GatherMap[] leftJoinGatherMaps(Table rightTable, CompiledExpression condition, boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } long[] gatherMapData = conditionalLeftJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); @@ -2049,10 +2045,6 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua */ public GatherMap[] innerJoinGatherMaps(Table rightTable, CompiledExpression condition, boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } long[] gatherMapData = conditionalInnerJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); @@ -2094,10 +2086,6 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual */ public GatherMap[] fullJoinGatherMaps(Table rightTable, CompiledExpression condition, boolean compareNullsEqual) { - if 
(getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } long[] gatherMapData = conditionalFullJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); @@ -2146,10 +2134,6 @@ public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqua */ public GatherMap leftSemiJoinGatherMap(Table rightTable, CompiledExpression condition, boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } long[] gatherMapData = conditionalLeftSemiJoinGatherMap(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); @@ -2191,10 +2175,6 @@ public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqua */ public GatherMap leftAntiJoinGatherMap(Table rightTable, CompiledExpression condition, boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } long[] gatherMapData = conditionalLeftAntiJoinGatherMap(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 1b2ed1ad0b8..6b347897f82 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1496,7 +1496,9 @@ void testConditionalLeftJoinGatherMaps() { new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new 
Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column( 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) .column(inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) @@ -1589,7 +1591,9 @@ void testConditionalInnerJoinGatherMaps() { new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(2, 2, 2, 5, 5, 7, 9, 9) .column(0, 1, 3, 0, 1, 1, 0, 1) @@ -1684,7 +1688,9 @@ void testConditionalFullJoinGatherMaps() { new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(inv, inv, inv, 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) .column( 2, 4, 5, inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) @@ -1763,7 +1769,9 @@ void testConditionalLeftSemiJoinGatherMap() { new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new 
Table.TestBuilder() .column(2, 5, 7, 9) // left .build(); @@ -1827,7 +1835,9 @@ void testConditionalLeftAntiJoinGatherMap() { new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(0, 1, 3, 4, 6, 8) // left .build(); From d45f10860cc0c62c4e5a9452db206426fb41299a Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 6 Aug 2021 11:17:54 -0500 Subject: [PATCH 10/14] A small optimization for JNI copy column view to column vector (#8985) So if the underlying class is a ColumnVector then we don't make a copy of the data, we just increment the reference count. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) - Kuhu Shukla (https://github.com/kuhushukla) - Niranjan Artal (https://github.com/nartal1) URL: https://github.com/rapidsai/cudf/pull/8985 --- java/src/main/java/ai/rapids/cudf/ColumnVector.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e543d0c7b21..6902e2b322b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -152,6 +152,16 @@ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { incRefCountInternal(true); } + + /** + * For a ColumnVector this is really just incrementing the reference count. 
+ * @return this + */ + @Override + public ColumnVector copyToColumnVector() { + return incRefCount(); + } + /** * Retrieves the column_view for a cudf::column and if it fails to do so, the column is deleted * and the exception is thrown to the caller. @@ -803,7 +813,7 @@ private static native long stringConcatenation(long[] columnViews, long separato /** * Native method to concatenate columns of strings together using a separator specified for each row * and returns the result as a string column. - * @param columns array of longs holding the native handles of the column_views to combine. + * @param columnViews array of longs holding the native handles of the column_views to combine. * @param sep_column long holding the native handle of the strings_column_view used as separators. * @param separator_narep string scalar indicating null behavior when a separator is null. * If set to null and the separator is null the resulting string will From f207f98684fd8103ceba9285d66dec0b6242d07e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 6 Aug 2021 14:15:48 -0400 Subject: [PATCH 11/14] Support bracket syntax for cudf::strings::replace_with_backrefs group index values (#8841) Closes #8816 The current `\d` syntax for the replacement template parameter will fail if a number immediately follows the index pattern as described in #8816. This PR adds support for the `${d}` pattern but only if the `\d` pattern is not found in the replacement string. This should minimize breaking any current templates already being used with this API. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Christopher Harris (https://github.com/cwharris) URL: https://github.com/rapidsai/cudf/pull/8841 --- cpp/include/cudf/strings/replace_re.hpp | 10 ++-- cpp/src/strings/replace/backref_re.cu | 60 +++++++++++++++-------- cpp/tests/strings/replace_regex_tests.cpp | 25 ++++++++++ 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 28ab19e53d9..087d1a94603 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,22 +72,24 @@ std::unique_ptr replace_re( /** * @brief For each string, replaces any character sequence matching the given pattern - * using the repl template for back-references. + * using the replacement template for back-references. * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * + * @throw cudf::logic_error if capture index values in `replacement` are not in range 1-99 + * * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. - * @param repl The replacement template for creating the output string. + * @param replacement The replacement template for creating the output string. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. 
*/ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 462efedffe5..5f7b195e8f9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -37,39 +37,57 @@ namespace strings { namespace detail { namespace { +/** + * @brief Return the capturing group index pattern to use with the given replacement string. + * + * Only two patterns are supported at this time `\d` and `${d}` where `d` is an integer in + * the range 1-99. The `\d` pattern is returned by default unless no `\d` pattern is found in + * the `repl` string, + * + * Reference: https://www.regular-expressions.info/refreplacebackref.html + */ +std::string get_backref_pattern(std::string const& repl) +{ + std::string const backslash_pattern = "\\\\(\\d+)"; + std::string const bracket_pattern = "\\$\\{(\\d+)\\}"; + std::smatch m; + return std::regex_search(repl, m, std::regex(backslash_pattern)) ? backslash_pattern + : bracket_pattern; +} /** * @brief Parse the back-ref index and position values from a given replace format. * - * The backref numbers are expected to be 1-based. + * The back-ref numbers are expected to be 1-based. + * + * Returns a modified string without back-ref indicators and a vector of back-ref + * byte position pairs. These are used by the device code to build the output + * string by placing the captured group elements into the replace format. * - * Returns a modified string without back-ref indicators and a vector of backref - * byte position pairs. 
- * ``` - * Example: - * for input string: 'hello \2 and \1' - * the returned pairs: (2,6),(1,11) - * returned string is: 'hello and ' - * ``` + * For example, for input string 'hello \2 and \1' the returned `backref_type` vector + * contains `[(2,6),(1,11)]` and the returned string is 'hello and '. */ std::pair> parse_backrefs(std::string const& repl) { std::vector backrefs; std::string str = repl; // make a modifiable copy std::smatch m; - std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1" - std::string rtn; // result without refs + std::regex ex(get_backref_pattern(repl)); + std::string rtn; size_type byte_offset = 0; - while (std::regex_search(str, m, ex)) { - if (m.size() == 0) break; - std::string const backref = m[0]; - size_type const position = static_cast(m.position(0)); - size_type const length = static_cast(backref.length()); + while (std::regex_search(str, m, ex) && !m.empty()) { + // parse the back-ref index number + size_type const index = static_cast(std::atoi(std::string{m[1]}.c_str())); + CUDF_EXPECTS(index > 0 && index < 100, "Group index numbers must be in the range 1-99"); + + // store the new byte offset and index value + size_type const position = static_cast(m.position(0)); byte_offset += position; - size_type const index = std::atoi(backref.c_str() + 1); // back-ref index number - CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0"); - rtn += str.substr(0, position); - str = str.substr(position + length); backrefs.push_back({index, byte_offset}); + + // update the output string + rtn += str.substr(0, position); + // remove the back-ref pattern to continue parsing + str = str.substr(position + static_cast(m.length(0))); } if (!str.empty()) // add the remainder rtn += str; // of the string @@ -96,7 +114,7 @@ std::unique_ptr replace_with_backrefs( auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); auto const regex_insts = 
d_prog->insts_counts(); - // parse the repl string for backref indicators + // parse the repl string for back-ref indicators auto const parse_result = parse_backrefs(repl); rmm::device_uvector backrefs(parse_result.second.size(), stream); CUDA_TRY(cudaMemcpyAsync(backrefs.data(), diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index a2486d60051..1f01f0f1429 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -167,6 +167,20 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) +{ + cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); + auto strings_view = cudf::strings_column_view(strings); + + std::string pattern = "(\\d+)-(\\d+)"; + std::string repl_template = "${2} X ${1}0"; + auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + + cudf::test::strings_column_wrapper expected( + {"3 X 120 5 X 340 89 X 670", "99 X 00: 888 X 7770:: 0 X 56730"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( @@ -203,6 +217,17 @@ TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) +{ + cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); + auto view = cudf::strings_column_view(strings); + + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\0"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\123"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "", "\\1"), cudf::logic_error); + 
EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); +} + TEST_F(StringsReplaceTests, MediumReplaceRegex) { // This results in 95 regex instructions and falls in the 'medium' range. From 4b5853dde68ea4bfd49cf58cf2358ed01c036d93 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 6 Aug 2021 13:51:26 -0700 Subject: [PATCH 12/14] Expose `days_in_month` function in libcudf and add python bindings (#8892) Closes #8681 This PR adds `days_in_month` function in libcudf and in cudf-python. It extracts the number of days of the month that the date is in. Minor addition: - Adds examples to `is_leap_year` documentation - A previous issue with cuda10.x that precludes use of `ymd.days()` in `add_calendrical_months_functor` is now fixed because we dropped support for cuda10.x. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/8892 --- cpp/include/cudf/datetime.hpp | 15 ++++ cpp/src/datetime/datetime_ops.cu | 73 +++++++++----------- cpp/tests/datetime/datetime_ops_test.cpp | 35 ++++++++++ python/cudf/cudf/_lib/cpp/datetime.pxd | 1 + python/cudf/cudf/_lib/datetime.pyx | 14 ++++ python/cudf/cudf/core/series.py | 87 ++++++++++++++++++++++++ python/cudf/cudf/tests/test_datetime.py | 20 ++++++ 7 files changed, 205 insertions(+), 40 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 3d90ac063e1..2e4ac870969 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -206,6 +206,21 @@ std::unique_ptr is_leap_year( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extract the number of days in the month + * + * output[i] contains the number of days in the month of date `column[i]` + * output[i] is null if 
`column[i]` is null + * + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + * + * @param cudf::column_view of the input datetime values + * @return cudf::column of datatype INT16 of days in month of the corresponding date + */ +std::unique_ptr days_in_month( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the quarter of the date * diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 4d8acb3bd3b..9879a6c5423 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -83,12 +83,6 @@ static __device__ int16_t const days_until_month[2][13] = { {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} // For leap years }; -CUDA_DEVICE_CALLABLE uint8_t days_in_month(cuda::std::chrono::month mon, bool is_leap_year) -{ - return days_until_month[is_leap_year][unsigned{mon}] - - days_until_month[is_leap_year][unsigned{mon} - 1]; -} - // Round up the date to the last day of the month and return the // date only (without the time component) struct extract_last_day_of_month { @@ -96,18 +90,23 @@ struct extract_last_day_of_month { CUDA_DEVICE_CALLABLE timestamp_D operator()(Timestamp const ts) const { using namespace cuda::std::chrono; - // IDEAL: does not work with CUDA10.0 due to nvcc compiler bug - // cannot invoke ym_last_day.day() - // const year_month_day orig_ymd(floor(ts)); - // const year_month_day_last ym_last_day(orig_ymd.year(), month_day_last(orig_ymd.month())); - // return timestamp_D(sys_days(ym_last_day)); - - // Only has the days - time component is chopped off, which is what we want - auto const days_since_epoch = floor(ts); - auto const date = year_month_day(days_since_epoch); - auto const last_day = days_in_month(date.month(), date.year().is_leap()); + const year_month_day ymd(floor(ts)); + auto const ymdl = year_month_day_last{ymd.year() / ymd.month() / last}; + return 
timestamp_D{sys_days{ymdl}}; + } +}; - return timestamp_D(days_since_epoch + days(last_day - static_cast(date.day()))); +// Extract the number of days of the month +// A similar operator to `extract_last_day_of_month`, except this returns +// an integer while the other returns a timestamp. +struct days_in_month_op { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const date = year_month_day(floor(ts)); + auto const ymdl = year_month_day_last(date.year() / date.month() / last); + return static_cast(unsigned{ymdl.day()}); } }; @@ -144,6 +143,7 @@ struct extract_quarter_op { } }; +// Returns true if the year is a leap year struct is_leap_year_op { template CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const @@ -220,22 +220,6 @@ struct add_calendrical_months_functor { { } - // std chrono implementation is copied here due to nvcc bug 2909685 - // https://howardhinnant.github.io/date_algorithms.html#days_from_civil - static CUDA_DEVICE_CALLABLE timestamp_D - compute_sys_days(cuda::std::chrono::year_month_day const& ymd) - { - const int yr = static_cast(ymd.year()) - (ymd.month() <= cuda::std::chrono::month{2}); - const unsigned mth = static_cast(ymd.month()); - const unsigned dy = static_cast(ymd.day()); - - const int era = (yr >= 0 ? yr : yr - 399) / 400; - const unsigned yoe = static_cast(yr - era * 400); // [0, 399] - const unsigned doy = (153 * (mth + (mth > 2 ? -3 : 9)) + 2) / 5 + dy - 1; // [0, 365] - const unsigned doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096] - return timestamp_D{duration_D{era * 146097 + static_cast(doe) - 719468}}; - } - template typename std::enable_if_t::value, void> operator()( rmm::cuda_stream_view stream) const @@ -265,15 +249,10 @@ struct add_calendrical_months_functor { // If the new date isn't valid, scale it back to the last day of the // month. 
- // IDEAL: if (!ymd.ok()) ymd = ymd.year()/ymd.month()/last; - auto month_days = days_in_month(ymd.month(), ymd.year().is_leap()); - if (unsigned{ymd.day()} > month_days) - ymd = ymd.year() / ymd.month() / day{month_days}; + if (!ymd.ok()) ymd = ymd.year() / ymd.month() / last; // Put back the time component to the date - return - // IDEAL: sys_days{ymd} + ... - compute_sys_days(ymd) + (time_val - days_since_epoch); + return sys_days{ymd} + (time_val - days_since_epoch); }); } }; @@ -393,6 +372,13 @@ std::unique_ptr is_leap_year(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr days_in_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + std::unique_ptr extract_quarter(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -476,6 +462,13 @@ std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_ return detail::is_leap_year(column, rmm::cuda_stream_default, mr); } +std::unique_ptr days_in_month(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::days_in_month(column, rmm::cuda_stream_default, mr); +} + std::unique_ptr extract_quarter(column_view const& column, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index c05e95c164e..39ad5f556d4 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -570,6 +570,41 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear) {true, false, true, true, true, true, true, true, false, true, true, false}}); } +TEST_F(BasicDatetimeOpsTest, TestDaysInMonths) + +{ + using namespace cudf::test; + using namespace cudf::datetime; + using namespace cuda::std::chrono; + + auto timestamps_s = + cudf::test::fixed_width_column_wrapper{ + { + 0L, // NULL + 
-1887541682L, // 1910-03-10 10:51:58 + 0L, // NULL + -1251006943L, // 1930-05-11 18:04:17 + -932134638L, // 1940-06-18 09:42:42 + -614354877L, // 1950-07-14 09:52:03 + -296070394L, // 1960-08-14 06:13:26 + 22840404L, // 1970-09-22 08:33:24 + 339817190L, // 1980-10-08 01:39:50 + 657928062L, // 1990-11-06 21:47:42 + 976630837L, // 2000-12-12 14:20:37 + 1294699018L, // 2011-01-10 22:36:58 + 1613970182L, // 2021-02-22 05:03:02 - non leap year February + 1930963331L, // 2031-03-11 02:42:11 + 2249867102L, // 2041-04-18 03:05:02 + 951426858L, // 2000-02-24 21:14:18 - leap year February + }, + iterators::nulls_at({0, 2})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*days_in_month(timestamps_s), + cudf::test::fixed_width_column_wrapper{ + {-1, 31, -1, 31, 30, 31, 31, 30, 31, 30, 31, 31, 28, 31, 30, 29}, + iterators::nulls_at({0, 2})}); +} + TEST_F(BasicDatetimeOpsTest, TestQuarter) { using namespace cudf::test; diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 56ebc3a77fc..b8cac6cd42f 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -18,3 +18,4 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: ) except + cdef unique_ptr[column] day_of_year(const column_view& column) except + cdef unique_ptr[column] is_leap_year(const column_view& column) except + + cdef unique_ptr[column] days_in_month(const column_view& column) except + diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 3b13cedcfd7..3a1c3ebbf5e 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -60,6 +60,8 @@ def extract_datetime_component(Column col, object field): def is_leap_year(Column col): + """Returns a boolean indicator whether the year of the date is a leap year + """ cdef unique_ptr[column] c_result cdef column_view col_view = col.view() @@ -67,3 +69,15 @@ def is_leap_year(Column col): c_result = 
move(libcudf_datetime.is_leap_year(col_view)) return Column.from_unique_ptr(move(c_result)) + + +def days_in_month(Column col): + """Extracts the number of days in the month of the date + """ + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.days_in_month(col_view)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f786853b3f4..db88e3f7620 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6433,6 +6433,42 @@ def is_leap_year(self): ------- Series Booleans indicating if dates belong to a leap year. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) + >>> s + 0 2000-12-31 + 1 2001-12-31 + 2 2002-12-31 + 3 2003-12-31 + 4 2004-12-31 + 5 2005-12-31 + 6 2006-12-31 + 7 2007-12-31 + 8 2008-12-31 + 9 2009-12-31 + 10 2010-12-31 + 11 2011-12-31 + 12 2012-12-31 + dtype: datetime64[ns] + >>> s.dt.is_leap_year + 0 True + 1 False + 2 False + 3 False + 4 True + 5 False + 6 False + 7 False + 8 True + 9 False + 10 False + 11 False + 12 True + dtype: bool """ res = libcudf.datetime.is_leap_year(self.series._column).fillna(False) return Series._from_data( @@ -6453,6 +6489,57 @@ def is_month_start(self): """ return (self.day == 1).fillna(False) + @property + def days_in_month(self): + """ + Get the total number of days in the month that the date falls on. + + Returns + ------- + Series + Integers representing the number of days in month + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... 
pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) + >>> s + 0 2000-08-31 + 1 2000-09-30 + 2 2000-10-31 + 3 2000-11-30 + 4 2000-12-31 + 5 2001-01-31 + 6 2001-02-28 + 7 2001-03-31 + 8 2001-04-30 + 9 2001-05-31 + 10 2001-06-30 + 11 2001-07-31 + dtype: datetime64[ns] + >>> s.dt.days_in_month + 0 31 + 1 30 + 2 31 + 3 30 + 4 31 + 5 31 + 6 28 + 7 31 + 8 30 + 9 31 + 10 30 + 11 31 + dtype: int16 + """ + res = libcudf.datetime.days_in_month(self.series._column) + return Series._from_data( + ColumnAccessor({None: res}), + index=self.series._index, + name=self.series.name, + ) + def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5f5a0a78414..0c2dfb0d268 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,6 +12,7 @@ import pytest import cudf +import cudf.testing.dataset_generator as dataset_generator from cudf.core import DataFrame, Series from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( @@ -1299,6 +1300,25 @@ def test_is_leap_year(): assert_eq(expect2, got2) +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +def test_days_in_months(dtype): + nrows = 1000 + + data = dataset_generator.rand_dataframe( + dtypes_meta=[ + {"dtype": dtype, "null_frequency": 0.4, "cardinality": nrows} + ], + rows=nrows, + use_threads=False, + seed=23, + ) + + ps = data.to_pandas()["0"] + gs = cudf.from_pandas(ps) + + assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) + + @pytest.mark.parametrize( "data", [ From 115f3b6ea76900b3a44db0e31dd61b2ef5810445 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 6 Aug 2021 18:06:32 -0700 Subject: [PATCH 13/14] Refactor Frame reductions (#8944) This PR moves implementations of reductions out of the `Series`/`DataFrame` classes and into `Frame`. 
The resulting reduction code is implemented in terms of columns, which improves the performance of `DataFrame` reductions, and using a single code path makes it easier to maintain. The `median` and `sum_of_squares` reductions, which were previously only available for `Series`, are now transparently enabled for `DataFrame` as well. This PR also explicitly disables reductions for Index objects to match pandas Index APIs. Since a few reductions had previously been implemented, removing these features constitutes a breaking change. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Keith Kraus (https://github.com/kkraus14) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/8944 --- python/cudf/cudf/core/column/column.py | 32 +- python/cudf/cudf/core/dataframe.py | 738 ++++------------------ python/cudf/cudf/core/frame.py | 545 ++++++++++++++++ python/cudf/cudf/core/index.py | 68 -- python/cudf/cudf/core/series.py | 556 +--------------- python/cudf/cudf/tests/test_dataframe.py | 5 +- python/cudf/cudf/tests/test_index.py | 11 +- python/cudf/cudf/tests/test_reductions.py | 7 +- 8 files changed, 732 insertions(+), 1230 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 48e6293c3f4..8aeaf08273f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -172,11 +172,31 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: def _null_equals(self, other: ColumnBase) -> ColumnBase: return self.binary_operator("NULL_EQUALS", other) - def all(self) -> bool: - return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. 
+ result_col = self.nans_to_nulls() if skipna else self - def any(self) -> bool: - return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) + if result_col.null_count == result_col.size: + return True + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + else: + return result_col + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. + result_col = self.nans_to_nulls() if skipna else self + if not skipna and result_col.has_nulls: + return True + elif skipna and result_col.null_count == result_col.size: + return False + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + else: + return result_col def __sizeof__(self) -> int: n = 0 @@ -911,9 +931,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: return self.as_interval_column(dtype, **kwargs) elif is_decimal_dtype(dtype): return self.as_decimal_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.datetime64): + elif np.issubdtype(cast(Any, dtype), np.datetime64): return self.as_datetime_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.timedelta64): + elif np.issubdtype(cast(Any, dtype), np.timedelta64): return self.as_timedelta_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bc068413efb..8cdc6eebaee 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6263,7 +6263,8 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Single 5 dtype: int64 """ - if axis not in (0, "index", None): + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") return self._apply_support_method( @@ -6274,268 +6275,37 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): **kwargs, ) - def min( - self, axis=None, skipna=None, 
level=None, numeric_only=None, **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + 1: 1, + None: 0, + "index": 0, + "columns": 1, + } - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() - a 1 - b 7 - dtype: int64 - """ - return self._apply_support_method( - "min", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - """ - return self._apply_support_method( - "max", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - """ - return self._apply_support_method( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. 
- If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - """ - return self._apply_support_method( - "prod", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def prod( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") - Returns - ------- - scalar + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + axis = self._get_axis_from_axis_arg(axis) - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
+ if axis == 0: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.prod() - a 24 - b 5040 - dtype: int64 - """ - return self.product( - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) + return Series._from_data( + {None: result}, as_index(self._data.names) + ) + elif axis == 1: + return self._apply_support_method_axis_1(op, **kwargs) def cummin(self, axis=None, skipna=True, *args, **kwargs): """ @@ -6686,50 +6456,6 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "cumprod", axis=axis, skipna=skipna, *args, **kwargs ) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. 
- - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._apply_support_method( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -6830,117 +6556,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - """ - - return self._apply_support_method( - "std", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. 
- skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - """ - return self._apply_support_method( - "var", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -7041,213 +6656,126 @@ def skew( ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.all() - a True - b False - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - return self._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).any(axis, skipna, level, **kwargs) - Returns - ------- - Series + def _apply_support_method_axis_0(self, method, *args, **kwargs): + result = [ + getattr(self[col], method)(*args, **kwargs) + for col in self._data.names + ] - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
+ if isinstance(result[0], Series): + support_result = result + result = DataFrame(index=support_result[0].index) + for idx, col in enumerate(self._data.names): + result[col] = support_result[idx] + else: + result = Series(result) + result = result.set_index(self._data.names) + return result - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.any() - a True - b True - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, + def _apply_support_method_axis_1(self, method, *args, **kwargs): + # for dask metadata compatibility + skipna = kwargs.pop("skipna", None) + if method not in _cupy_nan_methods_map and skipna not in ( + None, + True, + 1, + ): + raise NotImplementedError( + f"Row-wise operation to calculate '{method}'" + f" currently do not support `skipna=False`." ) - return self._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - - def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) - if axis in (None, 0): - result = [ - getattr(self[col], method)(*args, **kwargs) - for col in self._data.names - ] + level = kwargs.pop("level", None) + if level not in (None,): + raise NotImplementedError( + "Row-wise operations currently do not support `level`." + ) - if isinstance(result[0], Series): - support_result = result - result = DataFrame(index=support_result[0].index) - for idx, col in enumerate(self._data.names): - result[col] = support_result[idx] - else: - result = Series(result) - result = result.set_index(self._data.names) - return result + numeric_only = kwargs.pop("numeric_only", None) + if numeric_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " + "support `numeric_only=False`." 
+ ) - elif axis == 1: - # for dask metadata compatibility - skipna = kwargs.pop("skipna", None) - if method not in _cupy_nan_methods_map and skipna not in ( - None, - True, - 1, - ): - raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" - f" currently do not support `skipna=False`." - ) + min_count = kwargs.pop("min_count", None) + if min_count not in (None, 0): + raise NotImplementedError( + "Row-wise operations currently do not " "support `min_count`." + ) - level = kwargs.pop("level", None) - if level not in (None,): - raise NotImplementedError( - "Row-wise operations currently do not support `level`." - ) + bool_only = kwargs.pop("bool_only", None) + if bool_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " "support `bool_only`." + ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." + prepared, mask, common_dtype = self._prepare_for_rowwise_op( + method, skipna + ) + for col in prepared._data.names: + if prepared._data[col].nullable: + prepared._data[col] = ( + prepared._data[col] + .astype( + cudf.utils.dtypes.get_min_float_dtype( + prepared._data[col] + ) + if not is_datetime_dtype(common_dtype) + else np.dtype("float64") + ) + .fillna(np.nan) ) - - min_count = kwargs.pop("min_count", None) - if min_count not in (None, 0): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `min_count`." 
+ arr = cupy.asarray(prepared.as_gpu_matrix()) + + if skipna is not False and method in _cupy_nan_methods_map: + method = _cupy_nan_methods_map[method] + + result = getattr(cupy, method)(arr, axis=1, **kwargs) + + if result.ndim == 1: + type_coerced_methods = { + "count", + "min", + "max", + "sum", + "prod", + "cummin", + "cummax", + "cumsum", + "cumprod", + } + result_dtype = ( + common_dtype + if method in type_coerced_methods + or is_datetime_dtype(common_dtype) + else None + ) + result = column.as_column(result, dtype=result_dtype) + if mask is not None: + result = result.set_mask( + cudf._lib.transform.bools_to_mask(mask._column) ) + return Series(result, index=self.index, dtype=result_dtype,) + else: + result_df = DataFrame(result).set_index(self.index) + result_df.columns = prepared.columns + return result_df - bool_only = kwargs.pop("bool_only", None) - if bool_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `bool_only`." 
- ) + def _apply_support_method(self, method, axis=0, *args, **kwargs): + axis = self._get_axis_from_axis_arg(axis) - prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna - ) - for col in prepared._data.names: - if prepared._data[col].nullable: - prepared._data[col] = ( - prepared._data[col] - .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) - if not is_datetime_dtype(common_dtype) - else np.dtype("float64") - ) - .fillna(np.nan) - ) - arr = cupy.asarray(prepared.as_gpu_matrix()) - - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] - - result = getattr(cupy, method)(arr, axis=1, **kwargs) - - if result.ndim == 1: - type_coerced_methods = { - "count", - "min", - "max", - "sum", - "prod", - "cummin", - "cummax", - "cumsum", - "cumprod", - } - result_dtype = ( - common_dtype - if method in type_coerced_methods - or is_datetime_dtype(common_dtype) - else None - ) - result = column.as_column(result, dtype=result_dtype) - if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) - return Series(result, index=self.index, dtype=result_dtype,) - else: - result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns - return result_df + if axis == 0: + return self._apply_support_method_axis_0(method, *args, **kwargs) + elif axis == 1: + return self._apply_support_method_axis_1(method, *args, **kwargs) def _columns_view(self, columns): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 14b8ebe801f..6a976f54c2b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3604,6 +3604,530 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + # Reductions + @classmethod + def _get_axis_from_axis_arg(cls, axis): + try: + return cls._SUPPORT_AXIS_LOOKUP[axis] + except KeyError: + valid_axes = ", ".join( + ( + ax + for ax in 
cls._SUPPORT_AXIS_LOOKUP.keys() + if ax is not None + ) + ) + raise ValueError(f"Invalid axis, must be one of {valid_axes}.") + + def _reduce(self, *args, **kwargs): + raise NotImplementedError( + f"Reductions are not supported for objects of type {type(self)}." + ) + + def min( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the minimum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.min() + a 1 + b 7 + dtype: int64 + """ + return self._reduce( + "min", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def max( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the maximum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. 
+ + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.max() + a 4 + b 10 + dtype: int64 + """ + return self._reduce( + "max", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def sum( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + """ + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + def product( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return product of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. 
+ min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.product() + a 24 + b 5040 + dtype: int64 + """ + axis = self._get_axis_from_axis_arg(axis) + return self._reduce( + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "product" if axis == 0 else "prod", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + # Alias for pandas compatibility. + prod = product + + def mean( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the mean of the values for the requested axis. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. Not implemented for + Series. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + mean : Series or DataFrame (if level specified) + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """ + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def std( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return sample standard deviation of the DataFrame. + + Normalized by N-1 by default. This can be changed using + the `ddof` argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is N - ddof, where N represents the number of elements. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.std() + a 1.290994 + b 1.290994 + dtype: float64 + """ + + return self._reduce( + "std", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def var( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return unbiased variance of the DataFrame. + + Normalized by N-1 by default. This can be changed using the + ddof argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is + N - ddof, where N represents the number of elements. 
+ + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.var() + a 1.666667 + b 1.666667 + dtype: float64 + """ + return self._reduce( + "var", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def all(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether all elements are True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be True, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.all() + a True + b False + dtype: bool + """ + return self._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def any(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether any element is True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be False, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`.
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.any() + a True + b True + dtype: bool + """ + return self._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def sum_of_squares(self, dtype=None): + """Return the sum of squares of values. + + Parameters + ---------- + dtype: data type + Data type to cast the result to. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.sum_of_squares() + a 38 + b 249 + dtype: int64 + """ + return self._reduce("sum_of_squares", dtype=dtype) + + def median( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + + skipna : bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only`. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + """ + return self._reduce( + "median", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -3612,6 +4136,27 @@ class SingleColumnFrame(Frame): this class. 
""" + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e9ab3d5797c..a2f13daf44c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -518,74 +518,6 @@ def gpu_values(self): """ return self._values.data_array_view - def min(self): - """ - Return the minimum value of the Index. - - Returns - ------- - scalar - Minimum value. - - See Also - -------- - cudf.core.index.Index.max : Return the maximum value in an Index. - cudf.core.series.Series.min : Return the minimum value in a Series. - cudf.core.dataframe.DataFrame.min : Return the minimum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.min() - 1 - """ - return self._values.min() - - def max(self): - """ - Return the maximum value of the Index. - - Returns - ------- - scalar - Maximum value. - - See Also - -------- - cudf.core.index.Index.min : Return the minimum value in an Index. - cudf.core.series.Series.max : Return the maximum value in a Series. - cudf.core.dataframe.DataFrame.max : Return the maximum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.max() - 3 - """ - return self._values.max() - - def sum(self): - """ - Return the sum of all values of the Index. - - Returns - ------- - scalar - Sum of all values. 
- - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.sum() - 6 - """ - return self._values.sum() - @classmethod def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index db88e3f7620..cb7a82bd4c8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2726,109 +2726,18 @@ def nans_to_nulls(self): return self._copy_construct(data=result_col) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.all() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return True - else: - result_series = self - return result_series._column.all() + return super().all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. 
If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.any() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - skipna = False if skipna is None else skipna - - if skipna is False and self.has_nulls: - return True - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return False - - else: - result_series = self - - return result_series._column.any() + return super().any(axis, skipna, level, **kwargs) def to_pandas(self, index=True, nullable=False, **kwargs): """ @@ -4021,230 +3930,6 @@ def count(self, level=None, **kwargs): return self.valid_count - def min( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the minimum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.min() - 1 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.min(skipna=skipna, dtype=dtype) - - def max( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the maximum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.max() - 5 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.max(skipna=skipna, dtype=dtype) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. 
This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.sum() - 15 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.sum( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.product() - 120 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.product( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - prod = product - def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series. @@ -4327,8 +4012,6 @@ def cummax(self, axis=0, skipna=True, *args, **kwargs): 3 5 4 5 """ - assert axis in (None, 0) - if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -4479,228 +4162,6 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): index=self.index, ) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values in the series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser.mean() - 15.5 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.mean(skipna=skipna) - - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the Series. 
- - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 10, 20, 30, 40]) - >>> series - 0 10 - 1 10 - 2 20 - 3 30 - 4 40 - dtype: int64 - >>> series.std() - 13.038404810405298 - >>> series.std(ddof=2) - 15.05545305418162 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.std(skipna=skipna, ddof=ddof) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the Series. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0, 1]) - >>> series - 0 10 - 1 11 - 2 12 - 3 0 - 4 1 - dtype: int64 - >>> series.var() - 33.7 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.var(skipna=skipna, ddof=ddof) - - def sum_of_squares(self, dtype=None): - return self._column.sum_of_squares(dtype=dtype) - - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.median(skipna=skipna) - def mode(self, dropna=True): """ Return the mode(s) of the dataset. 
@@ -4957,7 +4418,11 @@ def corr(self, other, method="pearson", min_periods=None): -0.20454263717316112 """ - assert method in ("pearson",) and min_periods in (None,) + if method not in ("pearson",): + raise ValueError(f"Unknown method {method}") + + if min_periods not in (None,): + raise NotImplementedError("Unsupported argument 'min_periods'") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -5406,7 +4871,8 @@ def hash_encode(self, stop, use_name=False): 2 76 dtype: int32 """ - assert stop > 0 + if not stop > 0: + raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None hashed_values = Series(self._hash(initial_hash)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 21683d4bdd0..76d24dcd5d2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1847,6 +1847,7 @@ def gdf(pdf): lambda df, **kwargs: df.cumsum(**kwargs), lambda df, **kwargs: df.cumprod(**kwargs), lambda df, **kwargs: df.mean(**kwargs), + lambda df, **kwargs: df.median(**kwargs), lambda df, **kwargs: df.sum(**kwargs), lambda df, **kwargs: df.max(**kwargs), lambda df, **kwargs: df.std(ddof=1, **kwargs), @@ -3423,8 +3424,6 @@ def test_all(data): expected = pdata.all(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.all(bool_only=False) with pytest.raises(NotImplementedError): gdata.all(level="a") @@ -3484,8 +3483,6 @@ def test_any(data, axis): expected = pdata.any(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.any(bool_only=False) with pytest.raises(NotImplementedError): gdata.any(level="a") diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3f58eb3d6e7..38b924006bf 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ 
-125,7 +125,16 @@ def test_index_comparision(): @pytest.mark.parametrize( - "func", [lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()] + "func", + [ + lambda x: x.min(), + lambda x: x.max(), + lambda x: x.sum(), + lambda x: x.mean(), + lambda x: x.any(), + lambda x: x.all(), + lambda x: x.prod(), + ], ) def test_reductions(func): x = np.asarray([4, 5, 6, 10]) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 7cbc56f943c..2a45c75f6da 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -110,20 +110,25 @@ def test_sum_of_squares(dtype, nelem): dtype = np.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) + df = cudf.DataFrame(sr) got = sr.sum_of_squares() - # got = dtype(got) + got_df = df.sum_of_squares() expect = (data ** 2).sum() if np.dtype(dtype).kind in {"u", "i"}: if 0 <= expect <= np.iinfo(dtype).max: np.testing.assert_array_almost_equal(expect, got) + np.testing.assert_array_almost_equal(expect, got_df.iloc[0]) else: print("overflow, passing") else: np.testing.assert_approx_equal( expect, got, significant=accuracy_for_dtype[dtype] ) + np.testing.assert_approx_equal( + expect, got_df.iloc[0], significant=accuracy_for_dtype[dtype] + ) @pytest.mark.parametrize( From a4eabf0070b499bef3dff460449ec27640fa53fe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sat, 7 Aug 2021 08:38:18 -0700 Subject: [PATCH 14/14] Move compute_column API out of ast namespace (#8957) The `compute_column` API is functionally analogous to `cudf::transform`, except that the former expresses the transformation in terms of a cudf expression while the latter accepts string UDFs to be JITted and evaluated. 
Moving `compute_column` into `transform.hpp` means that the only public APIs in the `ast` namespace are those required to _define_ an expression, so we clearly separate the logic for defining and parsing of expressions from APIs (like conditional joins and `compute_column`) that evaluate those expressions on device and then use the output in different ways. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jason Lowe (https://github.com/jlowe) - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Paul Taylor (https://github.com/trxcllnt) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/8957 --- conda/recipes/libcudf/meta.yaml | 1 - cpp/CMakeLists.txt | 2 +- cpp/benchmarks/ast/transform_benchmark.cpp | 4 +- .../cudf/ast/detail/expression_evaluator.cuh | 10 +-- .../cudf/ast/detail/expression_parser.hpp | 2 +- cpp/include/cudf/ast/detail/transform.cuh | 61 ------------------- cpp/include/cudf/ast/transform.hpp | 43 ------------- cpp/include/cudf/detail/transform.hpp | 12 ++++ cpp/include/cudf/transform.hpp | 17 ++++++ cpp/src/join/conditional_join.cuh | 2 +- .../compute_column.cu} | 20 +++--- cpp/tests/ast/transform_tests.cpp | 50 +++++++-------- .../main/native/src/CompiledExpression.cpp | 4 +- 13 files changed, 75 insertions(+), 153 deletions(-) delete mode 100644 cpp/include/cudf/ast/detail/transform.cuh delete mode 100644 cpp/include/cudf/ast/transform.hpp rename cpp/src/{ast/transform.cu => transform/compute_column.cu} (90%) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 35d444d026c..88065ef49e0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -51,7 +51,6 @@ test: - test -f $PREFIX/lib/libcudf.so - test -f $PREFIX/lib/libcudftestutil.a - test -f $PREFIX/include/cudf/aggregation.hpp - - test -f $PREFIX/include/cudf/ast/transform.hpp - test -f 
$PREFIX/include/cudf/ast/detail/expression_parser.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - test -f $PREFIX/include/cudf/ast/nodes.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a6f7a41825d..f19a44a0c2a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -152,7 +152,6 @@ add_library(cudf src/aggregation/aggregation.cu src/aggregation/result_cache.cpp src/ast/expression_parser.cpp - src/ast/transform.cu src/binaryop/binaryop.cpp src/binaryop/compiled/binary_ops.cu src/binaryop/compiled/Add.cu @@ -437,6 +436,7 @@ add_library(cudf src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/transform/bools_to_mask.cu + src/transform/compute_column.cu src/transform/encode.cu src/transform/mask_to_bools.cu src/transform/nans_to_nulls.cu diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform_benchmark.cpp index 6f131cf0d6a..75b502bf7bf 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include #include #include #include +#include #include #include @@ -119,7 +119,7 @@ static void BM_ast_transform(benchmark::State& state) // Execute benchmark for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::ast::compute_column(table, expression_tree_root); + cudf::compute_column(table, expression_tree_root); } // Use the number of bytes read from global memory diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index df9688754ba..2a3cd059e80 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -191,13 +191,13 @@ struct mutable_column_expression_result }; /** - * @brief Despite to a binary operator based on a single data type. + * @brief Dispatch to a binary operator based on a single data type. 
* * This functor is a dispatcher for binary operations that assumes that both - * operands to a binary operation are of the same type. This assumption is - * encoded in the one non-deducible template parameter LHS, the type of the - * left-hand operand, which is then used as the template parameter for both the - * left and right operands to the binary operator f. + * operands are of the same type. This assumption is encoded in the + * non-deducible template parameter LHS, the type of the left-hand operand, + * which is then used as the template parameter for both the left and right + * operands to the binary operator f. */ struct single_dispatch_binary_operator { /** diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index db8845825c5..bb42bfbc631 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -30,7 +30,7 @@ namespace ast { namespace detail { /** - * @brief Enum defining data reference types used by a node. + * @brief Node data reference types. * * This enum is device-specific. For instance, intermediate data references are generated by the * linearization process but cannot be explicitly created by the user. diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh deleted file mode 100644 index 81d1bc04b3d..00000000000 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace cudf { - -namespace ast { - -namespace detail { - -/** - * @brief Compute a new column by evaluating an expression tree on a table. - * - * This evaluates an expression over a table to produce a new column. Also called an n-ary - * transform. - * - * @param table The table used for expression evaluation. - * @param expr The root of the expression tree. - * @param stream Stream on which to perform the computation. - * @param mr Device memory resource. - * @return std::unique_ptr Output column. - */ -std::unique_ptr compute_column( - table_view const table, - expression const& expr, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace detail - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/ast/transform.hpp b/cpp/include/cudf/ast/transform.hpp deleted file mode 100644 index 59697e5f75c..00000000000 --- a/cpp/include/cudf/ast/transform.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cudf { - -namespace ast { - -/** - * @brief Compute a new column by evaluating an expression tree on a table. - * - * This evaluates an expression over a table to produce a new column. Also called an n-ary - * transform. - * - * @param table The table used for expression evaluation. - * @param expr The root of the expression tree. - * @param mr Device memory resource. - * @return std::unique_ptr Output column. - */ -std::unique_ptr compute_column( - table_view const table, - expression const& expr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index b94223cdabe..96ef27529be 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -35,6 +36,17 @@ std::unique_ptr transform( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::compute_column + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr compute_column( + table_view const table, + ast::expression const& expr, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::nans_to_nulls * diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index f5880e9b37f..6cf62d1c684 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -74,6 +75,22 @@ std::pair, size_type> nans_to_nulls( column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Compute a new column by evaluating an expression tree on a table. + * + * This evaluates an expression over a table to produce a new column. Also called an n-ary + * transform. + * + * @param table The table used for expression evaluation. + * @param expr The root of the expression tree. + * @param mr Device memory resource. + * @return std::unique_ptr Output column. + */ +std::unique_ptr compute_column( + table_view const table, + ast::expression const& expr, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Creates a bitmask from a column of boolean elements. 
* diff --git a/cpp/src/join/conditional_join.cuh b/cpp/src/join/conditional_join.cuh index 6794fc89e9e..3d5af7d0657 100644 --- a/cpp/src/join/conditional_join.cuh +++ b/cpp/src/join/conditional_join.cuh @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/src/ast/transform.cu b/cpp/src/transform/compute_column.cu similarity index 90% rename from cpp/src/ast/transform.cu rename to cpp/src/transform/compute_column.cu index 3d788b4069e..1d4cde10306 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/transform/compute_column.cu @@ -18,14 +18,15 @@ #include #include #include -#include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include namespace cudf { -namespace ast { namespace detail { /** @@ -63,8 +63,8 @@ __launch_bounds__(max_block_size) __global__ // workaround is to declare an arbitrary (here char) array type then cast it // after the fact to the appropriate type. 
extern __shared__ char raw_intermediate_storage[]; - IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); + ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; @@ -74,13 +74,13 @@ __launch_bounds__(max_block_size) __global__ table, device_expression_data, thread_intermediate_storage); for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { - auto output_dest = mutable_column_expression_result(output_column); + auto output_dest = ast::detail::mutable_column_expression_result(output_column); evaluator.evaluate(output_dest, row_index); } } std::unique_ptr compute_column(table_view const table, - expression const& expr, + ast::expression const& expr, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -122,11 +122,11 @@ std::unique_ptr compute_column(table_view const table, // Execute the kernel auto table_device = table_device_view::create(table, stream); if (has_nulls) { - cudf::ast::detail::compute_column_kernel + cudf::detail::compute_column_kernel <<>>( *table_device, device_expression_data, *mutable_output_device); } else { - cudf::ast::detail::compute_column_kernel + cudf::detail::compute_column_kernel <<>>( *table_device, device_expression_data, *mutable_output_device); } @@ -137,13 +137,11 @@ std::unique_ptr compute_column(table_view const table, } // namespace detail std::unique_ptr compute_column(table_view const table, - expression const& expr, + ast::expression const& expr, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::compute_column(table, expr, rmm::cuda_stream_default, mr); } -} // namespace ast - } // namespace cudf diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 738c58c32b8..19797d0ce2e 100644 --- 
a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -24,6 +23,7 @@ #include #include #include +#include #include #include @@ -58,7 +58,7 @@ TEST_F(TransformTest, BasicAddition) auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expected = column_wrapper{13, 27, 21, 50}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -74,7 +74,7 @@ TEST_F(TransformTest, BasicAdditionLarge) auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); auto expected = column_wrapper(b, b + 2000); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -90,7 +90,7 @@ TEST_F(TransformTest, LessComparator) auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto expected = column_wrapper{true, false, true, false}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -109,7 +109,7 @@ TEST_F(TransformTest, LessComparatorLarge) auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; }); auto expected = column_wrapper(c, c + 2000); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -134,7 +134,7 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) auto expression_tree = cudf::ast::expression( cudf::ast::ast_operator::ADD, expression_left_subtree, expression_right_subtree); - auto result = 
cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{7, 73, 22, -99}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -160,7 +160,7 @@ TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree); - auto result = cudf::ast::compute_column(table, expr_tree); + auto result = cudf::compute_column(table, expr_tree); auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); }; auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); }); auto expected = column_wrapper(d, d + 2000); @@ -185,7 +185,7 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic) auto expression_tree = cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{0.6, std::numeric_limits::infinity(), -3.201, -2099.18}; @@ -212,7 +212,7 @@ TEST_F(TransformTest, MultiLevelTreeComparator) auto expression_tree = cudf::ast::expression( cudf::ast::ast_operator::LOGICAL_AND, expression_left_subtree, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{false, true, false, false}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -233,8 +233,8 @@ TEST_F(TransformTest, MultiTypeOperationFailure) cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0); // Operations on different types are not allowed - EXPECT_THROW(cudf::ast::compute_column(table, expression_0_plus_1), cudf::logic_error); - EXPECT_THROW(cudf::ast::compute_column(table, 
expression_1_plus_0), cudf::logic_error); + EXPECT_THROW(cudf::compute_column(table, expression_0_plus_1), cudf::logic_error); + EXPECT_THROW(cudf::compute_column(table, expression_1_plus_0), cudf::logic_error); } TEST_F(TransformTest, LiteralComparison) @@ -248,7 +248,7 @@ TEST_F(TransformTest, LiteralComparison) auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{false, false, false, true}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -263,7 +263,7 @@ TEST_F(TransformTest, UnaryNot) auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{false, true, false, false}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -278,17 +278,17 @@ TEST_F(TransformTest, UnaryTrigonometry) auto expected_sin = column_wrapper{0.0, std::sqrt(2) / 2, std::sqrt(3.0) / 2.0}; auto expression_sin = cudf::ast::expression(cudf::ast::ast_operator::SIN, col_ref_0); - auto result_sin = cudf::ast::compute_column(table, expression_sin); + auto result_sin = cudf::compute_column(table, expression_sin); cudf::test::expect_columns_equivalent(expected_sin, result_sin->view(), verbosity); auto expected_cos = column_wrapper{1.0, std::sqrt(2) / 2, 0.5}; auto expression_cos = cudf::ast::expression(cudf::ast::ast_operator::COS, col_ref_0); - auto result_cos = cudf::ast::compute_column(table, expression_cos); + auto result_cos = cudf::compute_column(table, expression_cos); cudf::test::expect_columns_equivalent(expected_cos, result_cos->view(), verbosity); auto expected_tan = column_wrapper{0.0, 1.0, std::sqrt(3.0)}; auto expression_tan = cudf::ast::expression(cudf::ast::ast_operator::TAN, 
col_ref_0); - auto result_tan = cudf::ast::compute_column(table, expression_tan); + auto result_tan = cudf::compute_column(table, expression_tan); cudf::test::expect_columns_equivalent(expected_tan, result_tan->view(), verbosity); } @@ -311,7 +311,7 @@ TEST_F(TransformTest, StringComparison) auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto expected = column_wrapper{true, false, true, false}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -324,7 +324,7 @@ TEST_F(TransformTest, CopyColumn) auto col_ref_0 = cudf::ast::column_reference(0); auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, col_ref_0); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{3, 0, 1, 50}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -340,7 +340,7 @@ TEST_F(TransformTest, CopyLiteral) auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{-123, -123, -123, -123}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -357,7 +357,7 @@ TEST_F(TransformTest, TrueDiv) auto expression = cudf::ast::expression(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.5, 0.0, 0.5, 25.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -374,7 +374,7 @@ TEST_F(TransformTest, FloorDiv) auto expression = cudf::ast::expression(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal); - auto 
result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, 0.0, 25.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -391,7 +391,7 @@ TEST_F(TransformTest, Mod) auto expression = cudf::ast::expression(cudf::ast::ast_operator::MOD, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, -1.0, 0.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -408,7 +408,7 @@ TEST_F(TransformTest, PyMod) auto expression = cudf::ast::expression(cudf::ast::ast_operator::PYMOD, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, 1.0, 0.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -425,7 +425,7 @@ TEST_F(TransformTest, BasicAdditionNulls) auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expected = column_wrapper{{0, 0, 0, 50}, {0, 0, 0, 1}}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -451,7 +451,7 @@ TEST_F(TransformTest, BasicAdditionLargeNulls) auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); auto expected = column_wrapper(b, b + N, validities.begin()); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp index 31f3184f107..fe57f79c955 100644 --- 
a/java/src/main/native/src/CompiledExpression.cpp +++ b/java/src/main/native/src/CompiledExpression.cpp @@ -20,9 +20,9 @@ #include #include -#include #include #include +#include #include #include "cudf_jni_apis.hpp" @@ -366,7 +366,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_computeColumn auto compiled_expr_ptr = reinterpret_cast(j_ast); auto tview_ptr = reinterpret_cast(j_table); std::unique_ptr result = - cudf::ast::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression()); + cudf::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression()); return reinterpret_cast(result.release()); } CATCH_STD(env, 0);