From e50fa00aed685395a16d252787a834d308a548bc Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 14:58:50 -0800 Subject: [PATCH 01/10] Expose streams in Parquet reader and writer APIs (#14359) This PR contributes to https://github.com/rapidsai/cudf/issues/13744. -Added stream parameters to public APIs ``` cudf::io::read_parquet cudf::io::write_parquet cudf::io::parquet_chunked_writer cudf::io::chunked_parquet_reader ``` -Added stream gtests Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Mark Harris (https://github.com/harrism) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14359 --- cpp/include/cudf/io/parquet.hpp | 16 ++- cpp/src/io/functions.cpp | 27 ++--- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/io/parquet_test.cpp | 138 ++++++++++++++++++++++++++ 4 files changed, 166 insertions(+), 16 deletions(-) create mode 100644 cpp/tests/streams/io/parquet_test.cpp diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ea18da74d5a..dc035db8d39 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -401,6 +401,7 @@ class parquet_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -408,6 +409,7 @@ class parquet_reader_options_builder { */ table_with_metadata read_parquet( parquet_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -438,11 +440,13 @@ class chunked_parquet_reader { * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ chunked_parquet_reader( std::size_t chunk_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -461,12 +465,14 @@ class chunked_parquet_reader { * @param pass_read_limit Limit on the amount of memory used for reading and decompressing data or * `0` if there is no limit * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ chunked_parquet_reader( std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1163,11 +1169,13 @@ class parquet_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel 
launches * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if * requested in parquet_writer_options (empty blob otherwise). */ -std::unique_ptr> write_parquet(parquet_writer_options const& options); +std::unique_ptr> write_parquet( + parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Merges multiple raw metadata blobs that were previously created by write_parquet @@ -1778,8 +1786,10 @@ class parquet_chunked_writer { * @brief Constructor with chunked writer options * * @param[in] options options used to write table + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ - parquet_chunked_writer(chunked_parquet_writer_options const& options); + parquet_chunked_writer(chunked_parquet_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Writes table to output. diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a9049d5640e..e5489963618 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -488,13 +488,14 @@ using namespace cudf::io::parquet::detail; namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = std::make_unique( - std::move(datasources), options, cudf::get_default_stream(), mr); + auto reader = + std::make_unique(std::move(datasources), options, stream, mr); return reader->read(options); } @@ -554,7 +555,8 @@ table_input_metadata::table_input_metadata(table_metadata const& metadata) /** * @copydoc cudf::io::write_parquet */ -std::unique_ptr> write_parquet(parquet_writer_options const& options) +std::unique_ptr> write_parquet(parquet_writer_options const& options, + rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; @@ -562,7 +564,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const auto sinks = make_datasinks(options.get_sink()); auto writer = std::make_unique( - std::move(sinks), options, io_detail::single_write_mode::YES, cudf::get_default_stream()); + std::move(sinks), options, io_detail::single_write_mode::YES, stream); writer->write(options.get_table(), options.get_partitions()); @@ -574,13 +576,10 @@ std::unique_ptr> write_parquet(parquet_writer_options const */ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : reader{std::make_unique(chunk_read_limit, - 0, - make_datasources(options.get_source()), - options, - cudf::get_default_stream(), - mr)} + : reader{std::make_unique( + chunk_read_limit, 0, make_datasources(options.get_source()), options, stream, mr)} { } @@ -590,12 +589,13 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(chunk_read_limit, pass_read_limit, make_datasources(options.get_source()), options, - cudf::get_default_stream(), + stream, mr)} { } @@ -628,14 +628,15 @@ table_with_metadata chunked_parquet_reader::read_chunk() const /** * @copydoc 
cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options) +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, + rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; auto sinks = make_datasinks(options.get_sink()); writer = std::make_unique( - std::move(sinks), options, io_detail::single_write_mode::NO, cudf::get_default_stream()); + std::move(sinks), options, io_detail::single_write_mode::NO, stream); } /** diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 48bc4ac6fc1..f7b805b68f5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -655,6 +655,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp new file mode 100644 index 00000000000..c6d531bc376 --- /dev/null +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Global environment for temporary files +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class ParquetTest : public cudf::test::BaseFixture {}; + +template +std::vector> make_uniqueptrs_vector(UniqPtrs&&... 
uniqptrs) +{ + std::vector> ptrsvec; + (ptrsvec.push_back(std::forward(uniqptrs)), ...); + return ptrsvec; +} + +cudf::table construct_table() +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6 = [&ones, num_rows] { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + return cudf::test::fixed_width_column_wrapper(col6_data, + col6_data + num_rows); + }(); + cudf::test::fixed_width_column_wrapper col7 = [&ones, num_rows] { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + return cudf::test::fixed_width_column_wrapper(col7_data, + col7_data + num_rows); + }(); + + cudf::test::lists_column_wrapper col8{ + {1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}; + + cudf::test::structs_column_wrapper col9 = [&ones] { + cudf::test::fixed_width_column_wrapper child_col(ones.begin(), ones.end()); + return cudf::test::structs_column_wrapper{child_col}; + }(); + + cudf::test::strings_column_wrapper col10 = [] { + std::vector col10_data(num_rows, "rapids"); + return cudf::test::strings_column_wrapper(col10_data.begin(), col10_data.end()); + }(); + + auto colsptr = make_uniqueptrs_vector(col0.release(), + col1.release(), + col2.release(), + col3.release(), + col4.release(), + col5.release(), + col6.release(), + col7.release(), + col8.release(), + col9.release(), + col10.release()); + return cudf::table(std::move(colsptr)); +} + +TEST_F(ParquetTest, ParquetWriter) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_parquet(out_opts, cudf::test::get_default_stream()); +} + +TEST_F(ParquetTest, ParquetReader) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_parquet(out_opts, cudf::test::get_default_stream()); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts, cudf::test::get_default_stream()); + auto meta = cudf::io::read_parquet_metadata(cudf::io::source_info{filepath}); +} + +TEST_F(ParquetTest, ChunkedOperations) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::chunked_parquet_writer_options out_opts = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(out_opts, cudf::test::get_default_stream()).write(tab); + + auto reader = cudf::io::chunked_parquet_reader( + 1L << 31, + 
cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}), + cudf::test::get_default_stream()); + while (reader.has_next()) { + auto chunk = reader.read_chunk(); + } +} From 9937c7f742ee4b453aa26198f4821095db40e671 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:07:49 -1000 Subject: [PATCH 02/10] Remove unnecessary **kwargs in function signatures (#14635) Helps makes function signatures stricter to avoid typo inputs being accepted into signatures Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14635 --- python/cudf/cudf/_lib/groupby.pyx | 4 +- python/cudf/cudf/_lib/string_casting.pyx | 36 +++++------- python/cudf/cudf/core/column/categorical.py | 18 ++++-- python/cudf/cudf/core/column/column.py | 59 +++++++++----------- python/cudf/cudf/core/column/datetime.py | 14 +++-- python/cudf/cudf/core/column/decimal.py | 9 ++- python/cudf/cudf/core/column/interval.py | 4 +- python/cudf/cudf/core/column/lists.py | 6 +- python/cudf/cudf/core/column/numerical.py | 10 ++-- python/cudf/cudf/core/column/string.py | 14 ++--- python/cudf/cudf/core/column/timedelta.py | 10 ++-- python/cudf/cudf/core/dataframe.py | 17 +++--- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/core/series.py | 2 - python/cudf/cudf/core/single_column_frame.py | 4 +- 15 files changed, 104 insertions(+), 107 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index b3778e45cde..f332fead8d1 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from pandas.core.groupby.groupby import DataError @@ -104,7 +104,7 @@ cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj cdef dict __dict__ - def __cinit__(self, list keys, bool dropna=True, *args, **kwargs): + def __cinit__(self, list keys, bool dropna=True): cdef libcudf_types.null_policy c_null_handling cdef table_view keys_view diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 4b44ac83a70..3826e71f850 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from cudf._lib.column cimport Column @@ -95,7 +95,7 @@ def dtos(Column input_col): return floating_to_string(input_col) -def stod(Column input_col, **kwargs): +def stod(Column input_col): """ Converting/Casting input column of type string to double @@ -127,7 +127,7 @@ def ftos(Column input_col): return floating_to_string(input_col) -def stof(Column input_col, **kwargs): +def stof(Column input_col): """ Converting/Casting input column of type string to float @@ -188,7 +188,7 @@ def i8tos(Column input_col): return integer_to_string(input_col) -def stoi8(Column input_col, **kwargs): +def stoi8(Column input_col): """ Converting/Casting input column of type string to int8 @@ -284,7 +284,7 @@ def ltos(Column input_col): return integer_to_string(input_col) -def stol(Column input_col, **kwargs): +def stol(Column input_col): """ Converting/Casting input column of type string to int64 @@ -316,7 +316,7 @@ def ui8tos(Column input_col): return integer_to_string(input_col) -def stoui8(Column input_col, **kwargs): +def stoui8(Column input_col): """ Converting/Casting input column of type string to uint8 @@ -348,7 +348,7 @@ def ui16tos(Column input_col): return integer_to_string(input_col) -def stoui16(Column input_col, **kwargs): +def stoui16(Column input_col): """ Converting/Casting input column of type string to uint16 @@ -380,7 +380,7 @@ def uitos(Column input_col): return integer_to_string(input_col) -def stoui(Column input_col, **kwargs): +def stoui(Column input_col): """ Converting/Casting input column of type string to uint32 @@ -412,7 +412,7 @@ def ultos(Column input_col): return integer_to_string(input_col) -def stoul(Column input_col, **kwargs): +def stoul(Column input_col): """ Converting/Casting input column of type string to uint64 @@ -456,7 +456,7 @@ def _to_booleans(Column input_col, object string_true="True"): return Column.from_unique_ptr(move(c_result)) -def to_booleans(Column input_col, **kwargs): +def to_booleans(Column input_col): return _to_booleans(input_col) @@ -631,9 +631,7 @@ def timedelta2int(Column input_col, dtype, format): return Column.from_unique_ptr(move(c_result)) -def int2timedelta( - Column input_col, - **kwargs): +def int2timedelta(Column input_col, str format): """ Converting/Casting input Timedelta column to string column with specified format @@ -649,8 +647,7 @@ def int2timedelta( """ cdef column_view input_column_view = input_col.view() - cdef string c_duration_format = kwargs.get( - 'format', "%D days %H:%M:%S").encode('UTF-8') + cdef string c_duration_format = format.encode('UTF-8') cdef unique_ptr[column] c_result with nogil: c_result = move( @@ -661,7 +658,7 @@ def int2timedelta( return Column.from_unique_ptr(move(c_result)) -def int2ip(Column input_col, **kwargs): +def int2ip(Column input_col): """ Converting/Casting integer column to string column in ipv4 format @@ -684,7 +681,7 @@ def int2ip(Column input_col, **kwargs): return Column.from_unique_ptr(move(c_result)) -def ip2int(Column input_col, **kwargs): +def ip2int(Column input_col): """ Converting string ipv4 column to integer column @@ -732,7 +729,6 @@ def htoi(Column input_col, **kwargs): Parameters ---------- input_col : input column of type string - out_type : The type of integer column expected Returns ------- @@ -742,9 +738,7 @@ def htoi(Column input_col, **kwargs): cdef column_view input_column_view = input_col.view() cdef type_id tid = ( ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - kwargs.get('dtype', cudf.dtype("int64")) - ] + SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] ) ) cdef data_type 
c_out_type = data_type(tid) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 59fd4631067..71143fa7a95 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1310,22 +1310,28 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: new_categories=dtype.categories, ordered=bool(dtype.ordered) ) - def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column(self, dtype, format=None, **kwargs) -> StringColumn: + def as_string_column( + self, dtype, format: str | None = None + ) -> StringColumn: return self._get_decategorized_column().as_string_column( dtype, format=format ) - def as_datetime_column(self, dtype, **kwargs) -> DatetimeColumn: + def as_datetime_column( + self, dtype, format: str | None = None + ) -> DatetimeColumn: return self._get_decategorized_column().as_datetime_column( - dtype, **kwargs + dtype, format ) - def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: + def as_timedelta_column( + self, dtype, format: str | None = None + ) -> TimeDeltaColumn: return self._get_decategorized_column().as_timedelta_column( - dtype, **kwargs + dtype, format ) def _get_decategorized_column(self) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 19e76d4a95b..81579b53bb7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -978,11 +978,17 @@ def distinct_count(self, dropna: bool = True) -> int: def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() - def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: + def astype( + self, dtype: Dtype, copy: bool = False, format: str | None = None + ) -> ColumnBase: + if copy: + col = self.copy() + else: + col = self if self.dtype == dtype: - return self + return col if is_categorical_dtype(dtype): - return self.as_categorical_column(dtype) + return col.as_categorical_column(dtype) if ( isinstance(dtype, str) @@ -999,9 +1005,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: else: dtype = pandas_dtypes_to_np_dtypes.get(dtype, dtype) if _is_non_decimal_numeric_dtype(dtype): - return self.as_numerical_column(dtype, **kwargs) + return col.as_numerical_column(dtype) elif is_categorical_dtype(dtype): - return self.as_categorical_column(dtype) + return col.as_categorical_column(dtype) elif cudf.dtype(dtype).type in { np.str_, np.object_, @@ -1014,23 +1020,23 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: f"Casting to {dtype} is not supported, use " "`.astype('str')` instead." 
) - return self.as_string_column(dtype, **kwargs) + return col.as_string_column(dtype, format=format) elif isinstance(dtype, (ListDtype, StructDtype)): - if not self.dtype == dtype: + if not col.dtype == dtype: raise NotImplementedError( f"Casting {self.dtype} columns not currently supported" ) - return self + return col elif isinstance(dtype, IntervalDtype): - return self.as_interval_column(dtype, **kwargs) + return col.as_interval_column(dtype) elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - return self.as_decimal_column(dtype, **kwargs) + return col.as_decimal_column(dtype) elif np.issubdtype(cast(Any, dtype), np.datetime64): - return self.as_datetime_column(dtype, **kwargs) + return col.as_datetime_column(dtype, format=format) elif np.issubdtype(cast(Any, dtype), np.timedelta64): - return self.as_timedelta_column(dtype, **kwargs) + return col.as_timedelta_column(dtype, format=format) else: - return self.as_numerical_column(dtype, **kwargs) + return col.as_numerical_column(dtype) def as_categorical_column(self, dtype) -> ColumnBase: if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): @@ -1076,50 +1082,35 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError def as_interval_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.IntervalColumn": raise NotImplementedError def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": raise NotImplementedError def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": raise NotImplementedError def as_decimal_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: raise NotImplementedError - def as_decimal128_column( - self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.Decimal128Column": - raise NotImplementedError - - def as_decimal64_column( - self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.Decimal64Column": - raise NotImplementedError - - def as_decimal32_column( - self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.Decimal32Column": - raise NotImplementedError - def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) if not is_bool_dtype(mask.dtype): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2b44b46bb9e..2ab2dd46c53 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -422,21 +422,23 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: ) return output - def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: + def as_datetime_column( + self, dtype: Dtype, format: str | None = None + ) -> DatetimeColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": raise TypeError( f"cannot astype a datetimelike from {self.dtype} to {dtype}" ) def as_numerical_column( - self, dtype: Dtype, 
**kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, @@ -448,7 +450,7 @@ def as_numerical_column( return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( @@ -725,9 +727,9 @@ def _local_time(self): return utc_to_local(self, str(self.dtype.tz)) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": - return self._local_time.as_string_column(dtype, format, **kwargs) + return self._local_time.as_string_column(dtype, format) def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 299875f0091..0e90b522f2c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,5 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations + import warnings from decimal import Decimal from typing import Any, Optional, Sequence, Union, cast @@ -37,7 +39,8 @@ class DecimalBaseColumn(NumericalBaseColumn): _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS def as_decimal_column( - self, dtype: Dtype, **kwargs + self, + dtype: Dtype, ) -> Union["DecimalBaseColumn"]: if ( isinstance(dtype, cudf.core.dtypes.DecimalDtype) @@ -53,7 +56,7 @@ def as_decimal_column( return libcudf.unary.cast(self, dtype) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if len(self) > 0: return cpp_from_decimal(self) @@ -201,7 +204,7 @@ def _decimal_quantile( return result._with_type_metadata(self.dtype) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": return libcudf.unary.cast(self, dtype) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index eed7bba3628..81059717b20 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from typing import Optional import pandas as pd @@ -99,7 +99,7 @@ def copy(self, deep=True): closed=closed, ) - def as_interval_column(self, dtype, **kwargs): + def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): if isinstance(self.dtype, CategoricalDtype): new_struct = self._get_decategorized_column() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index a5653e66513..0cccec6f28a 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ +from __future__ import annotations from functools import cached_property from typing import List, Optional, Sequence, Tuple, Union @@ -243,7 +245,7 @@ def from_sequences( return res def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": """ Create a strings column from a list column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 148fa252fad..5461d1b13b5 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -340,7 +340,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn": return libcudf.string_casting.int2ip(self) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ @@ -353,7 +353,7 @@ def as_string_column( ) def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": return cast( "cudf.core.column.DatetimeColumn", @@ -367,7 +367,7 @@ def as_datetime_column( ) def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": return cast( "cudf.core.column.TimeDeltaColumn", @@ -381,11 +381,11 @@ def as_timedelta_column( ) def as_decimal_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libcudf.unary.cast(self, dtype) - def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06b5ac31ca6..84333fc205a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5633,7 +5633,7 @@ def __contains__(self, item: ScalarLike) -> bool: ) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": out_dtype = cudf.api.types.dtype(dtype) string_col = self @@ -5696,14 +5696,13 @@ def _as_datetime_or_timedelta_column(self, dtype, format): return result_col def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": out_dtype = cudf.api.types.dtype(dtype) # infer on host from the first not na element # or return all null column if all values # are null in current column - format = kwargs.get("format", None) if format is None: if self.null_count == len(self): return cast( @@ -5720,19 +5719,20 @@ def as_datetime_column( return self._as_datetime_or_timedelta_column(out_dtype, format) def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": out_dtype = cudf.api.types.dtype(dtype) - format = "%D days %H:%M:%S" + if format is None: + format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) def as_decimal_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> StringColumn: return 
self diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2f842130f48..6038a1a1e97 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -288,7 +288,7 @@ def fillna( return super().fillna(fill_value, method) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, @@ -300,14 +300,14 @@ def as_numerical_column( return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": raise TypeError( f"cannot astype a timedelta from {self.dtype} to {dtype}" ) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( @@ -323,7 +323,9 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: + def as_timedelta_column( + self, dtype: Dtype, format: str | None = None + ) -> TimeDeltaColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 813ecc32069..51b661593fc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -24,6 +24,7 @@ Set, Tuple, Union, + cast, ) import cupy @@ -1986,8 +1987,6 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - *args, - **kwargs, ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -2338,7 +2337,7 @@ def to_dict( @_cudf_nvtx_annotate def scatter_by_map( - self, map_index, map_size=None, keep_index=True, **kwargs + self, map_index, map_size=None, keep_index=True, debug: bool = False ): """Scatter to a list of dataframes. @@ -2379,7 +2378,11 @@ def scatter_by_map( # Convert string or categorical to integer if isinstance(map_index, cudf.core.column.StringColumn): - map_index = map_index.as_categorical_column("category").codes + cat_index = cast( + cudf.core.column.CategoricalColumn, + map_index.as_categorical_column("category"), + ) + map_index = cat_index.codes warnings.warn( "Using StringColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." @@ -2391,7 +2394,7 @@ def scatter_by_map( "Use an integer array/column for better performance." ) - if kwargs.get("debug", False) == 1 and map_size is not None: + if debug and map_size is not None: count = map_index.distinct_count() if map_size < count: raise ValueError( @@ -2406,7 +2409,7 @@ def scatter_by_map( partitioned = self._from_columns_like_self( partitioned_columns, column_names=self._column_names, - index_names=self._index_names if keep_index else None, + index_names=list(self._index_names) if keep_index else None, ) # due to the split limitation mentioned @@ -2537,7 +2540,7 @@ def items(self): yield (k, self[k]) @_cudf_nvtx_annotate - def equals(self, other, **kwargs): + def equals(self, other): ret = super().equals(other) # If all other checks matched, validate names. 
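# Editor's illustrative aside, not part of the patch: with **kwargs removed,
# misspelled options now fail loudly because Python validates the explicit
# signature. For example, the debug flag that scatter_by_map used to read via
# kwargs.get("debug", False) is now a real parameter:
#
#     parts = df.scatter_by_map(map_index, map_size=4, debug=True)  # df / map_index are hypothetical
#     parts = df.scatter_by_map(map_index, debugg=True)             # typo now raises TypeError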
if ret: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ab089ceb103..5955e21fea0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations @@ -3612,8 +3612,6 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - *args, - **kwargs, ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8739a61dd8b..df5a62b384e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1578,8 +1578,6 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - *args, - **kwargs, ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 911e7ac905c..b73f756d7dc 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -310,8 +310,6 @@ def _make_operands_for_binop( other: Any, fill_value: Any = None, reflect: bool = False, - *args, - **kwargs, ) -> Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], NotImplementedType, From 9e6400b7d1f9d525b7f45e7b56874dc830c02d1f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 11 Jan 2024 18:38:07 -0600 Subject: [PATCH 03/10] Describe unpickling expectations when cudf.pandas is enabled (#14693) Adds to the docs the unpickling expectations that were noted in #14692. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14693 --- docs/cudf/source/cudf_pandas/faq.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index bf9c2b98c2d..bbeaf0a5f00 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -113,6 +113,9 @@ There are a few known limitations that you should be aware of: pandas - `cudf.pandas` isn't compatible with directly using `import cudf` and is intended to be used with pandas-based workflows. +- Unpickling objects that were pickled with "regular" pandas will not + work: you must have pickled an object with `cudf.pandas` enabled for + it to be unpickled when `cudf.pandas` is enabled. 
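A minimal sketch of this expectation (an editor's illustration, not part of the
patch): both the pickling and the unpickling session have to run with
`cudf.pandas` enabled, e.g. via `python -m cudf.pandas script.py`.

```python
# Session A, run with cudf.pandas enabled
import pandas as pd  # proxied by cudf.pandas

pd.DataFrame({"a": [1, 2, 3]}).to_pickle("frame.pkl")

# Session B must also run with cudf.pandas enabled to read it back;
# conversely, a pickle written by plain pandas cannot be unpickled here.
df = pd.read_pickle("frame.pkl")
```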
- Global variables can be accessed but can't be modified during CPU-fallback ```python From 3c55a6e82e1a53581e1efd1b29d9bc0802bb054b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 11 Jan 2024 16:57:26 -0800 Subject: [PATCH 04/10] Fix CMake args (#14746) This was an oversight in #13531 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14746 --- ci/build_wheel_cudf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index e79b9a35aa2..cde22bb70d1 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail package_dir="python/cudf" -export SKBUILD_CONFIGURE_OPTIONS="-DUSE_LIBARROW_FROM_PYARROW=ON" +export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" ./ci/build_wheel.sh cudf ${package_dir} From 2003ea2e8d2b03fb1b3a3c2f2046893395328fd2 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 11 Jan 2024 20:16:21 -0500 Subject: [PATCH 05/10] Remove usages of rapids-env-update (#14748) Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14748 --- ci/build_cpp.sh | 8 ++++++-- ci/build_python.sh | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index f1ad8ee7778..740a6409ccd 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_python.sh b/ci/build_python.sh index 32fe7b6b3ce..3c2a7761e1a 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja From c0a3cd14eabd18ba8cedd3b7dd87cba8b6706719 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:13:59 -1000 Subject: [PATCH 06/10] Clean up base column methods (#14725) * Removed the need for a `drop_nan` argument in `Column.dropna` * Removed the need for `Column.as_frame` * Removed the need for `Column.force_deep_copy` Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14725 --- .../cudf/benchmarks/internal/bench_column.py | 7 ++--- python/cudf/cudf/core/column/categorical.py | 11 ++++---- python/cudf/cudf/core/column/column.py | 28 ++++--------------- python/cudf/cudf/core/column/interval.py | 5 +--- python/cudf/cudf/core/column/numerical.py | 5 ---- python/cudf/cudf/io/dlpack.py | 4 +-- 6 files changed, 18 insertions(+), 42 deletions(-) diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index d4969b39f7f..8da769b7858 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of Column methods.""" @@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column): @benchmark_with_object(cls="column", dtype="float") -@pytest.mark.parametrize("dropnan", [True, False]) -def bench_dropna(benchmark, column, dropnan): - benchmark(column.dropna, drop_nan=dropnan) +def bench_dropna(benchmark, column): + benchmark(column.dropna) @benchmark_with_object(cls="column", dtype="float") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71143fa7a95..eb4220c5895 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -987,15 +987,16 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) - if isinstance(col.categories.dtype, IntervalDtype): + cats = col.categories + if cats.dtype.kind in "biuf": + cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] + elif not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. 
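# Editor's illustrative aside, not part of the patch: Column.dropna() no longer
# accepts a drop_nan flag; callers that also want NaNs removed convert them to
# nulls explicitly first (the same pattern used for the categories just above):
#
#     cleaned = col.nans_to_nulls().dropna()   # was: col.dropna(drop_nan=True)
#
# where `col` stands for a hypothetical float-typed column.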
# TODO: work on interval index dropna - categories = col.categories.to_pandas() - else: - categories = col.categories.dropna(drop_nan=True).to_pandas() + cats = cats.dropna() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=col.ordered + codes, categories=cats.to_pandas(), ordered=col.ordered ) return pd.Series(data, index=index) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 81579b53bb7..3cf686da7b0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -109,16 +109,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } - def as_frame(self) -> "cudf.core.frame.Frame": - """ - Converts a Column to Frame - """ - return cudf.core.single_column_frame.SingleColumnFrame( - {None: self.copy(deep=False)} - ) - def data_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object @@ -155,7 +147,7 @@ def data_array_view( return cuda.as_cuda_array(obj).view(self.dtype) def mask_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array @@ -291,8 +283,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self, drop_nan: bool = False) -> ColumnBase: - # The drop_nan argument is only used for numerical columns. + def dropna(self) -> ColumnBase: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -437,14 +428,6 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - def force_deep_copy(self) -> Self: - """ - A method to create deep copy irrespective of whether - `copy-on-write` is enabled. - """ - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) - def copy(self, deep: bool = True) -> Self: """ Makes a copy of the Column. @@ -464,7 +447,8 @@ def copy(self, deep: bool = True) -> Self: them. 
""" if deep: - return self.force_deep_copy() + result = libcudf.copying.copy_column(self) + return result._with_type_metadata(self.dtype) else: return cast( Self, @@ -1069,7 +1053,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) # columns include null index in factorization; remove: if self.has_nulls(): - cats = cats.dropna(drop_nan=False) + cats = cats.dropna() min_type = min_unsigned_type(len(cats), 8) if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 81059717b20..6a7e7729123 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -142,7 +142,4 @@ def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self._closed) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5461d1b13b5..0577e0f37ed 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,7 +20,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.stream_compaction import drop_nulls from cudf._lib.types import size_type_dtype from cudf._typing import ( ColumnBinaryOperand, @@ -421,10 +420,6 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count - def dropna(self, drop_nan: bool = False) -> NumericalColumn: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] - def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index e1950c9f250..bed376e4a79 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf_obj.as_frame() + gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " From 7a42b8b57923b9515391cfe2c4668380b15ed118 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:14:30 -1000 Subject: [PATCH 07/10] Use as_column instead of arange for range like inputs (#14689) 1. Allows range-like inputs in `as_column` to short circuit and not materialize when creating columns 2. 
Avoids diverging column construction logic between `column.arange` and `column.as_column` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14689 --- python/cudf/cudf/core/column/__init__.py | 3 +- python/cudf/cudf/core/column/categorical.py | 12 ++- python/cudf/cudf/core/column/column.py | 99 +++++---------------- python/cudf/cudf/core/dataframe.py | 10 ++- python/cudf/cudf/core/groupby/groupby.py | 10 ++- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexed_frame.py | 18 ++-- python/cudf/cudf/core/join/join.py | 8 +- python/cudf/cudf/core/multiindex.py | 16 ++-- python/cudf/cudf/core/series.py | 9 +- python/cudf/cudf/core/window/rolling.py | 6 +- python/cudf/cudf/tests/test_column.py | 10 +-- 12 files changed, 79 insertions(+), 126 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index aba4ded4f9d..3dddcae85dc 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. """ isort: skip_file @@ -8,7 +8,6 @@ from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, - arange, as_column, build_categorical_column, build_column, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index eb4220c5895..f52621dc444 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1159,7 +1159,7 @@ def find_and_replace( new_cats_col = new_cats_col.apply_boolean_mask(bmask) new_cats = cudf.DataFrame._from_data( { - "index": cudf.core.column.arange(len(new_cats_col)), + "index": column.as_column(range(len(new_cats_col))), "cats": new_cats_col, } ) @@ -1531,9 +1531,13 @@ def _set_categories( ) out_code_dtype = min_unsigned_type(max_cat_size) - cur_order = column.arange(len(cur_codes)) - old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) - new_codes = column.arange(len(new_cats), dtype=out_code_dtype) + cur_order = column.as_column(range(len(cur_codes))) + old_codes = column.as_column( + range(len(cur_cats)), dtype=out_code_dtype + ) + new_codes = column.as_column( + range(len(new_cats)), dtype=out_code_dtype + ) new_df = cudf.DataFrame._from_data( data={"new_codes": new_codes, "cats": new_cats} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3cf686da7b0..c13ec33c51c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -554,10 +554,8 @@ def slice( ]._with_type_metadata(self.dtype) else: # Need to create a gather map for given slice with stride - gather_map = arange( - start=start, - stop=stop, - step=stride, + gather_map = as_column( + range(start, stop, stride), dtype=cudf.dtype(np.int32), ) return self.take(gather_map) @@ -626,10 +624,8 @@ def _scatter_by_slice( ) # step != 1, create a scatter map with arange - scatter_map = arange( - start=start, - stop=stop, - step=step, + scatter_map = as_column( + range(start, stop, step), dtype=cudf.dtype(np.int32), ) @@ -745,7 +741,7 @@ def indices_of( assert len(value) == 1 mask = libcudf.search.contains(value, self) return apply_boolean_mask( - [arange(0, len(self), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), 
dtype=size_type_dtype)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: @@ -1379,7 +1375,9 @@ def _return_sentinel_column(): [self], [cats], how="left" ) codes = libcudf.copying.gather( - [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + [as_column(range(len(cats)), dtype=dtype)], + right_gather_map, + nullify=True, ) del right_gather_map # reorder `codes` so that its values correspond to the @@ -1905,13 +1903,26 @@ def as_column( * Objects exposing ``__array_interface__``(e.g., numpy arrays) * pyarrow array * pandas.Categorical objects + * range objects """ - if isinstance(arbitrary, ColumnBase): + if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): + column = libcudf.filling.sequence( + len(arbitrary), + as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), + as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), + ) + if cudf.get_option("default_integer_bitwidth") and dtype is None: + dtype = cudf.dtype( + f'i{cudf.get_option("default_integer_bitwidth")//8}' + ) + if dtype is not None: + column = column.astype(dtype) + return column + elif isinstance(arbitrary, ColumnBase): if dtype is not None: return arbitrary.astype(dtype) else: return arbitrary - elif isinstance(arbitrary, cudf.Series): data = arbitrary._column if dtype is not None: @@ -2614,70 +2625,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: return columns -def arange( - start: Union[int, float], - stop: Optional[Union[int, float]] = None, - step: Union[int, float] = 1, - dtype=None, -) -> cudf.core.column.NumericalColumn: - """ - Returns a column with evenly spaced values within a given interval. - - Values are generated within the half-open interval [start, stop). - The first three arguments are mapped like the range built-in function, - i.e. start and step are optional. - - Parameters - ---------- - start : int/float - Start of the interval. - stop : int/float, default is None - Stop of the interval. - step : int/float, default 1 - Step width between each pair of consecutive values. - dtype : default None - Data type specifier. It is inferred from other arguments by default. 
- - Returns - ------- - cudf.core.column.NumericalColumn - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.arange(2, 7, 1, dtype='int16') - >>> col - - >>> cudf.Series(col) - 0 2 - 1 3 - 2 4 - 3 5 - 4 6 - dtype: int16 - """ - if stop is None: - stop = start - start = 0 - - if step is None: - step = 1 - - size = len(range(int(start), int(stop), int(step))) - if size == 0: - if dtype is None: - dtype = cudf.dtype("int64") - return cast( - cudf.core.column.NumericalColumn, column_empty(0, dtype=dtype) - ) - - return libcudf.filling.sequence( - size, - as_device_scalar(start, dtype=dtype), - as_device_scalar(step, dtype=dtype), - ) - - def full( size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None ) -> ColumnBase: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 51b661593fc..f9cf180ff44 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -342,10 +342,16 @@ def _getitem_tuple_arg(self, arg): tmp_col_name = (tmp_col_name, *extra) cantor_name = (cantor_name, *extra) other_df = DataFrame( - {tmp_col_name: column.arange(len(tmp_arg[0]))}, + { + tmp_col_name: column.as_column( + range(len(tmp_arg[0])) + ) + }, index=as_index(tmp_arg[0]), ) - columns_df[cantor_name] = column.arange(len(columns_df)) + columns_df[cantor_name] = column.as_column( + range(len(columns_df)) + ) df = other_df.join(columns_df, how="inner") # as join is not assigning any names to index, # update it over here diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 73e6774f5ce..fbd85fd9876 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import itertools @@ -23,7 +23,7 @@ from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import ColumnBase, arange, as_column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable @@ -761,7 +761,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # subsample the gather map from the full input ordering, # rather than permuting the gather map of the output. _, (ordering,), _ = self._groupby.groups( - [arange(0, len(self.obj))] + [as_column(range(0, len(self.obj)))] ) # Invert permutation from original order to groups on the # subset of entries we want. @@ -2543,7 +2543,9 @@ def _mimic_pandas_order( # result coming back from libcudf has null_count few rows than # the input, so we must produce an ordering from the full # input range. 
- _, (ordering,), _ = self._groupby.groups([arange(0, len(self.obj))]) + _, (ordering,), _ = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) if self._dropna and any( c.has_nulls(include_nan=True) > 0 for c in self.grouping._key_columns diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5c33cd09ad1..e012d8e7140 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -286,9 +286,7 @@ def _num_rows(self): @_cudf_nvtx_annotate def _values(self): if len(self) > 0: - return column.arange( - self._start, self._stop, self._step, dtype=self.dtype - ) + return column.as_column(self._range, dtype=self.dtype) else: return column.column_empty(0, masked=False, dtype=self.dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5955e21fea0..2a35ac0f959 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -182,12 +182,8 @@ def _indices_from_labels(obj, labels): # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame( - {"__": cudf.core.column.arange(len(labels))}, index=labels - ) - rhs = cudf.DataFrame( - {"_": cudf.core.column.arange(len(obj))}, index=obj.index - ) + lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels) + rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) return lhs.join(rhs).sort_values(by=["__", "_"])["_"] @@ -1897,10 +1893,8 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: if stride != 1: return self._gather( GatherMap.from_column_unchecked( - cudf.core.column.arange( - start, - stop=stop, - step=stride, + as_column( + range(start, stop, stride), dtype=libcudf.types.size_type_dtype, ), len(self), @@ -2541,9 +2535,9 @@ def _align_to_index( # to recover ordering after index alignment. sort_col_id = str(uuid4()) if how == "left": - lhs[sort_col_id] = cudf.core.column.arange(len(lhs)) + lhs[sort_col_id] = as_column(range(len(lhs))) elif how == "right": - rhs[sort_col_id] = cudf.core.column.arange(len(rhs)) + rhs[sort_col_id] = as_column(range(len(rhs))) result = lhs.join(rhs, how=how, sort=sort) if how in ("left", "right"): diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 20f5b7989eb..86f0c8465ba 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -232,7 +232,11 @@ def _gather_maps(self, left_cols, right_cols): key_order = list( itertools.chain.from_iterable( libcudf.copying.gather( - [cudf.core.column.arange(n, dtype=size_type_dtype)], + [ + cudf.core.column.as_column( + range(n), dtype=size_type_dtype + ) + ], map_, nullify=null, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 489f0e74dd6..0f323dd5540 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -501,9 +501,9 @@ def __repr__(self): # TODO: Update the following two arange calls to # a single arange call once arange has support for # a vector start/end points. 
- indices = column.arange(start=0, stop=n, step=1) + indices = column.as_column(range(n)) indices = indices.append( - column.arange(start=len(self) - n, stop=len(self), step=1) + column.as_column(range(len(self) - n, len(self), 1)) ) preprocess = self.take(indices) else: @@ -795,7 +795,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): [ frame, cudf.DataFrame( - {"idx": cudf.Series(column.arange(len(frame)))} + {"idx": cudf.Series(column.as_column(range(len(frame))))} ), ], axis=1, @@ -807,7 +807,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): lookup_order = "_" + "_".join(map(str, lookup._data.names)) - lookup[lookup_order] = column.arange(len(lookup)) + lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] ) @@ -840,14 +840,16 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): ): stop = row_tuple.stop or max_length start, stop, step = row_tuple.indices(stop) - return column.arange(start, stop, step) + return column.as_column(range(start, stop, step)) start_values = self._compute_validity_mask( index, row_tuple.start, max_length ) stop_values = self._compute_validity_mask( index, row_tuple.stop, max_length ) - return column.arange(start_values.min(), stop_values.max() + 1) + return column.as_column( + range(start_values.min(), stop_values.max() + 1) + ) elif isinstance(row_tuple, numbers.Number): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) @@ -1024,7 +1026,7 @@ def __getitem__(self, index): index = np.array(index) elif isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.arange(start, stop, step) + index = column.as_column(range(start, stop, step)) result = MultiIndex.from_frame( self.to_frame(index=False, name=range(0, self.nlevels)).take( index diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index df5a62b384e..bc1eaef86db 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -55,7 +55,6 @@ DatetimeColumn, IntervalColumn, TimeDeltaColumn, - arange, as_column, full, ) @@ -1366,7 +1365,9 @@ def map(self, arg, na_action=None) -> "Series": raise NotImplementedError( "default values in dicts are currently not supported." 
) - lhs = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1386,7 +1387,9 @@ def map(self, arg, na_action=None) -> "Series": "Reindexing only valid with" " uniquely valued Index objects" ) - lhs = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8a92ea86d57..207fb469990 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION +# Copyright (c) 2020-2024, NVIDIA CORPORATION import itertools @@ -235,7 +235,7 @@ def _apply_agg_column(self, source_column, agg_name): start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") - idx = cudf.core.column.arange(len(start)) + idx = as_column(range(len(start))) preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( "int32" ) @@ -531,7 +531,7 @@ def __init__(self, groupby, window, min_periods=None, center=False): def _window_to_window_sizes(self, window): if is_integer(window): return cudautils.grouped_window_sizes_from_offset( - column.arange(len(self.obj)).data_array_view(mode="read"), + as_column(range(len(self.obj))).data_array_view(mode="read"), self._group_starts, window, ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index a4b27ae19ac..3d21994a8d5 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -8,7 +8,7 @@ import cudf from cudf._lib.transform import mask_to_bools -from cudf.core.column.column import arange, as_column +from cudf.core.column.column import as_column from cudf.testing._utils import assert_eq, assert_exceptions_equal from cudf.utils import dtypes as dtypeutils @@ -552,9 +552,3 @@ def test_astype_with_aliases(alias, expect_dtype, data): gd_data = cudf.Series.from_pandas(pd_data) assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) - - -def test_arange_empty(): - result = arange(0) - assert len(result) == 0 - assert result.dtype == np.dtype(np.int64) From 27b106f832999afa5b3353aaa2adcdb695fb4a47 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 11 Jan 2024 18:32:19 -0800 Subject: [PATCH 08/10] [Java] Choose The Correct RoundingMode For Checking Decimal OutOfBounds (#14731) This PR fixes an error in the `outOfBounds` method in which the `RoundingMode` was selected based on positive values only. 
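Concretely, `ROUND_UP` and `ROUND_DOWN` are defined relative to zero, so for a negative bound they move it in the wrong direction. A rough illustration of the lower-bound case using Python's `decimal` module (an analogy only — the patch itself changes Java `BigDecimal` rounding modes, and the bound and value here are invented):

```python
from decimal import Decimal, ROUND_CEILING, ROUND_UP

# Lower-bound check: a value is out of bounds when value < bound, and the
# bound must first be snapped to the column's scale (0.1 here).
bound = Decimal("-0.95")   # hypothetical lower bound
value = Decimal("-1.0")    # column value already at the target scale

old = bound.quantize(Decimal("0.1"), rounding=ROUND_UP)       # -1.0 (away from zero)
new = bound.quantize(Decimal("0.1"), rounding=ROUND_CEILING)  # -0.9 (towards +inf)

print(value < old)  # False -> -1.0 wrongly reported as within bounds
print(value < new)  # True  -> agrees with the exact comparison -1.0 < -0.95
```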
The RHS should be rounded towards positive infinity (ROUND_CEILING) for the lower bound and towards negative infinity (ROUND_FLOOR) for the upper bound closes #14732 Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14731 --- .../java/ai/rapids/cudf/DecimalUtils.java | 30 +++++++------- .../java/ai/rapids/cudf/DecimalUtilsTest.java | 40 +++++++++++++++++++ 2 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java diff --git a/java/src/main/java/ai/rapids/cudf/DecimalUtils.java b/java/src/main/java/ai/rapids/cudf/DecimalUtils.java index 1979bd1bd5b..7a5be9b08b9 100644 --- a/java/src/main/java/ai/rapids/cudf/DecimalUtils.java +++ b/java/src/main/java/ai/rapids/cudf/DecimalUtils.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,13 +82,13 @@ public static ColumnVector lessThan(ColumnView lhs, BigDecimal rhs) { int leftScale = lhs.getType().getScale(); int leftPrecision = lhs.getType().getDecimalMaxPrecision(); - // First we have to round the scalar (rhs) to the same scale as lhs. Because this is a - // less than and it is rhs that we are rounding, we will round away from 0 (UP) - // to make sure we always return the correct value. - // For example: - // 100.1 < 100.19 - // If we rounded down the rhs 100.19 would become 100.1, and now 100.1 is not < 100.1 - BigDecimal roundedRhs = rhs.setScale(-leftScale, BigDecimal.ROUND_UP); + // First we have to round the scalar (rhs) to the same scale as lhs. + // For comparing the two values they should be the same scale, we round the value to positive infinity to maintain + // the relation. Ex: + // 10.2 < 10.29 = true, after rounding rhs to ceiling ===> 10.2 < 10.3 = true, relation is maintained + // 10.3 < 10.29 = false, after rounding rhs to ceiling ===> 10.3 < 10.3 = false, relation is maintained + // 10.1 < 10.10 = false, after rounding rhs to ceiling ===> 10.1 < 10.1 = false, relation is maintained + BigDecimal roundedRhs = rhs.setScale(-leftScale, BigDecimal.ROUND_CEILING); if (roundedRhs.precision() > leftPrecision) { // converting rhs to the same precision as lhs would result in an overflow/error, but @@ -136,13 +136,13 @@ public static ColumnVector greaterThan(ColumnView lhs, BigDecimal rhs) { int cvScale = lhs.getType().getScale(); int maxPrecision = lhs.getType().getDecimalMaxPrecision(); - // First we have to round the scalar (rhs) to the same scale as lhs. Because this is a - // greater than and it is rhs that we are rounding, we will round towards 0 (DOWN) - // to make sure we always return the correct value. - // For example: - // 100.2 > 100.19 - // If we rounded up the rhs 100.19 would become 100.2, and now 100.2 is not > 100.2 - BigDecimal roundedRhs = rhs.setScale(-cvScale, BigDecimal.ROUND_DOWN); + // First we have to round the scalar (rhs) to the same scale as lhs. + // For comparing the two values they should be the same scale, we round the value to negative infinity to maintain + // the relation. 
Ex: + // 10.3 > 10.29 = true, after rounding rhs to floor ===> 10.3 > 10.2 = true, relation is maintained + // 10.2 > 10.29 = false, after rounding rhs to floor ===> 10.2 > 10.2 = false, relation is maintained + // 10.1 > 10.10 = false, after rounding rhs to floor ===> 10.1 > 10.1 = false, relation is maintained + BigDecimal roundedRhs = rhs.setScale(-cvScale, BigDecimal.ROUND_FLOOR); if (roundedRhs.precision() > maxPrecision) { // converting rhs to the same precision as lhs would result in an overflow/error, but diff --git a/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java b/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java new file mode 100644 index 00000000000..a96eeda5dd7 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java @@ -0,0 +1,40 @@ +/* + * + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; + +public class DecimalUtilsTest extends CudfTestBase { + @Test + public void testOutOfBounds() { + try (ColumnView cv = ColumnVector.fromDecimals( + new BigDecimal("-1E+3"), + new BigDecimal("1E+3"), + new BigDecimal("9E+1"), + new BigDecimal("-9E+1"), + new BigDecimal("-91")); + ColumnView expected = ColumnVector.fromBooleans(true, true, false, false, true); + ColumnView result = DecimalUtils.outOfBounds(cv, 1, -1)) { + assertColumnsAreEqual(expected, result); + } + } +} From 5c78b7ea6b75f503d5df4abc828d80a0b470a284 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jan 2024 08:49:20 +0000 Subject: [PATCH 09/10] Fix logic bug introduced in #14730 (#14742) The removal of `valid_count` on columns in #14730 had one logic bug, fixed here. 
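In rough terms, the inverted condition made categorical `_concat` pick the first all-null column as its head instead of the first column that still has valid values. A standalone sketch of the predicate before and after (plain-Python stand-ins with invented names, not the actual cudf column objects):

```python
# Each dict stands in for a column; null_count == length means "entirely null".
cols = [
    {"name": "all_null", "null_count": 3, "length": 3},
    {"name": "has_values", "null_count": 1, "length": 3},
]

# Buggy predicate: `not null_count != length` collapses to `null_count == length`,
# so the all-null column is chosen.
buggy = next((c for c in cols if not c["null_count"] != c["length"]), cols[0])

# Fixed predicate: the first column that is not entirely null.
fixed = next((c for c in cols if c["null_count"] != c["length"]), cols[0])

print(buggy["name"], fixed["name"])  # all_null has_values
```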
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14742 --- python/cudf/cudf/core/column/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f52621dc444..6b3ee0ba852 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1380,7 +1380,7 @@ def _concat( # Find the first non-null column: head = next( - (obj for obj in objs if not obj.null_count != len(obj)), objs[0] + (obj for obj in objs if obj.null_count != len(obj)), objs[0] ) # Combine and de-dupe the categories From 7ca988f207730a3ae936e90d0104c4e6a14749ff Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 12 Jan 2024 12:22:58 -0600 Subject: [PATCH 10/10] Fix ``Groupby.get_group`` (#14728) Closes https://github.com/rapidsai/cudf/issues/14727 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/14728 --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/tests/groupby/test_indexing.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index fbd85fd9876..4e8947652ff 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -377,7 +377,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self.obj - return obj.loc[self.groups[name]] + return obj.loc[self.groups[name].drop_duplicates()] @_cudf_nvtx_annotate def size(self): diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_indexing.py index 06777c8e6af..57e8bc1c2d8 100644 --- a/python/cudf/cudf/tests/groupby/test_indexing.py +++ b/python/cudf/cudf/tests/groupby/test_indexing.py @@ -1 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +import cudf +from cudf.testing._utils import assert_eq + + +def test_rank_return_type_compatible_mode(): + # in compatible mode, rank() always returns floats + df = cudf.DataFrame({"a": range(10), "b": [0] * 10}, index=[0] * 10) + pdf = df.to_pandas() + expect = pdf.groupby("b").get_group(0) + result = df.groupby("b").get_group(0) + assert_eq(expect, result)
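A quick usage sketch of the fixed behavior (the frame and values are invented, mirroring the new test): with a duplicated index, `.loc` matches every occurrence of a requested label, so looking up the raw per-row labels from `self.groups[name]` could return each group row multiple times; de-duplicating the labels first returns the group exactly once.

```python
import cudf

# Hypothetical frame whose index is entirely duplicated (the situation from
# issue #14727): four rows, all labelled 0, forming a single group b == 0.
df = cudf.DataFrame({"a": range(4), "b": [0] * 4}, index=[0] * 4)

# With the de-duplicated label lookup, the group comes back as-is.
result = df.groupby("b").get_group(0)
print(len(result))  # 4 rows, matching pandas
```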