From c2846fb8580a2a6ddd29808bea470623b71feb19 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 14 Feb 2022 12:32:12 -0800 Subject: [PATCH 01/10] Fix incorrect slicing of GDS read/write calls (#10274) Issue happens when the read/write size is a multiple of the maximum slice size. In this case, size of the last slice is computed as `0`, instead of `max_slice_size`: `(t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes` This PR reimplements this part of code and adds unit tests. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/10274 --- cpp/src/io/utilities/file_io_utilities.cpp | 36 ++++++++++------- cpp/src/io/utilities/file_io_utilities.hpp | 17 +++++++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/file_io_test.cpp | 46 ++++++++++++++++++++++ 4 files changed, 85 insertions(+), 15 deletions(-) create mode 100644 cpp/tests/io/file_io_test.cpp diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index dabe992d959..e2893a2e881 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -194,20 +194,13 @@ template > make_sliced_tasks( F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) { + constexpr size_t default_max_slice_size = 4 * 1024 * 1024; + static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); + auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; - constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024; - static auto const max_slice_bytes = - getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes); - size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes); - size_t slice_offset = 0; - for (size_t t = 0; t < n_slices; ++t) { - DataT* ptr_slice = ptr + slice_offset; - - size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes; - slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset)); - - slice_offset += slice_size; - } + std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { + return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + }); return slice_tasks; } @@ -318,6 +311,21 @@ std::unique_ptr make_cufile_output(std::string const& filepa return nullptr; } +std::vector make_file_io_slices(size_t size, size_t max_slice_size) +{ + max_slice_size = std::max(1024ul, max_slice_size); + auto const n_slices = util::div_rounding_up_safe(size, max_slice_size); + std::vector slices; + slices.reserve(n_slices); + std::generate_n(std::back_inserter(slices), n_slices, [&, idx = 0]() mutable { + auto const slice_offset = idx++ * max_slice_size; + auto const slice_size = std::min(size - slice_offset, max_slice_size); + return file_io_slice{slice_offset, slice_size}; + }); + + return slices; +} + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 
fcee4e43a20..be3ecc49ab0 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -291,6 +291,21 @@ std::unique_ptr make_cufile_input(std::string const& filepath */ std::unique_ptr make_cufile_output(std::string const& filepath); +/** + * @brief Byte range to be read/written in a single operation. + */ +struct file_io_slice { + size_t offset; + size_t size; +}; + +/** + * @brief Split the total number of bytes to read/write into slices to enable parallel IO. + * + * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse. + */ +std::vector make_file_io_slices(size_t size, size_t max_slice_size); + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 913761ecd03..27dd472b3f5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -199,6 +199,7 @@ ConfigureTest( ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) ConfigureTest(CSV_TEST io/csv_test.cpp) +ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp new file mode 100644 index 00000000000..b546239fdca --- /dev/null +++ b/cpp/tests/io/file_io_test.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +// Base test fixture for tests +struct CuFileIOTest : public cudf::test::BaseFixture { +}; + +TEST_F(CuFileIOTest, SliceSize) +{ + std::vector> test_cases{ + {1 << 20, 1 << 18}, {1 << 18, 1 << 20}, {1 << 20, 3333}, {0, 1 << 18}, {0, 0}, {1 << 20, 0}}; + for (auto const& test_case : test_cases) { + auto const slices = cudf::io::detail::make_file_io_slices(test_case.first, test_case.second); + if (slices.empty()) { + ASSERT_EQ(test_case.first, 0); + } else { + ASSERT_EQ(slices.front().offset, 0); + ASSERT_EQ(slices.back().offset + slices.back().size, test_case.first); + for (auto i = 1u; i < slices.size(); ++i) { + ASSERT_EQ(slices[i].offset, slices[i - 1].offset + slices[i - 1].size); + } + } + } +} + +CUDF_TEST_PROGRAM_MAIN() From 374b38745932616dbeca2e84e1e86d63c228a179 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 14 Feb 2022 15:59:06 -0500 Subject: [PATCH 02/10] Replace custom `cached_property` implementation with functools (#10272) Replaces our custom `cached_property` with that provided by `functools` since Python 3.8. This uncovered a couple of typing bugs that previously eluded us. 
Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10272 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column_accessor.py | 7 +++---- python/cudf/cudf/core/frame.py | 12 ++++++------ python/cudf/cudf/core/groupby/groupby.py | 3 ++- python/cudf/cudf/core/index.py | 5 +++-- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/join/join.py | 6 +++--- python/cudf/cudf/core/multiindex.py | 9 +++------ python/cudf/cudf/utils/utils.py | 22 ---------------------- 9 files changed, 22 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index cdb1e9fc86f..c96d940c378 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +from functools import cached_property from typing import Any, Set import pandas as pd @@ -31,7 +32,6 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import cached_property class BaseIndex(Serializable): diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 67976ac27d4..9cb86ca1cd2 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,7 +4,7 @@ import itertools from collections.abc import MutableMapping -from functools import reduce +from functools import cached_property, reduce from typing import ( TYPE_CHECKING, Any, @@ -20,7 +20,6 @@ import cudf from cudf.core import column -from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -360,9 +359,9 @@ def select_by_index(self, index: Any) -> ColumnAccessor: start, stop, step = index.indices(len(self._data)) keys = self.names[start:stop:step] elif 
pd.api.types.is_integer(index): - keys = [self.names[index]] + keys = (self.names[index],) else: - keys = (self.names[i] for i in index) + keys = tuple(self.names[i] for i in index) data = {k: self._data[k] for k in keys} return self.__class__( data, multiindex=self.multiindex, level_names=self.level_names, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6e46c107d2e..6038bb49bfb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -90,22 +90,22 @@ def _num_rows(self) -> int: return len(self._data.columns[0]) @property - def _column_names(self) -> List[Any]: # TODO: List[str]? - return self._data.names + def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + return tuple(self._data.names) @property - def _index_names(self) -> List[Any]: # TODO: List[str]? + def _index_names(self) -> Optional[Tuple[Any, ...]]: # TODO: Tuple[str]? # TODO: Temporarily suppressing mypy warnings to avoid introducing bugs # by returning an empty list where one is not expected. return ( None # type: ignore if self._index is None - else self._index._data.names + else tuple(self._index._data.names) ) @property - def _columns(self) -> List[Any]: # TODO: List[Column]? - return self._data.columns + def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
+ return tuple(self._data.columns) def serialize(self): header = { diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ff700144bed..4bd14a2c47b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -4,6 +4,7 @@ import itertools import pickle import warnings +from functools import cached_property import numpy as np import pandas as pd @@ -16,7 +17,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import GetAttrGetItemMixin, cached_property +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f71f930a21c..5b60e8dbd1c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,10 +1,11 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
from __future__ import annotations import math import pickle import warnings +from functools import cached_property from numbers import Number from typing import ( Any, @@ -54,7 +55,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import cached_property, search_range +from cudf.utils.utils import search_range T = TypeVar("T", bound="Frame") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10b9f2396bb..7788a5346c8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6,6 +6,7 @@ import operator import warnings from collections import Counter, abc +from functools import cached_property from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -29,7 +30,6 @@ from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame -from cudf.utils.utils import cached_property doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 39ff4718550..c7e46cf0165 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, List, cast import cudf from cudf import _lib as libcudf @@ -320,7 +320,7 @@ def _sort_result(self, result: Frame) -> Frame: # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. 
- by = [] + by: List[Any] = [] if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8581b97c217..5e0cd2ca8cb 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -6,6 +6,7 @@ import numbers import pickle from collections.abc import Sequence +from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union @@ -22,11 +23,7 @@ from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import ( - NotIterable, - _maybe_indices_to_slice, - cached_property, -) +from cudf.utils.utils import NotIterable, _maybe_indices_to_slice class MultiIndex(Frame, BaseIndex, NotIterable): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 8571d9ffed5..4143cbd1d66 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -144,28 +144,6 @@ def set_allocator( IS_NEP18_ACTIVE = _is_nep18_active() -class cached_property: - """ - Like @property, but only evaluated upon first invocation. - To force re-evaluation of a cached_property, simply delete - it with `del`. - """ - - # TODO: Can be replaced with functools.cached_property when we drop support - # for Python 3.7. - - def __init__(self, func): - self.func = func - - def __get__(self, instance, cls): - if instance is None: - return self - else: - value = self.func(instance) - object.__setattr__(instance, self.func.__name__, value) - return value - - class GetAttrGetItemMixin: """This mixin changes `__getattr__` to attempt a `__getitem__` call. 
From a443dd1ae6144d8afc9b9bcd5390d9165ce3017d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 14 Feb 2022 14:20:46 -0800 Subject: [PATCH 03/10] Convert Column Name to String Before Using Struct Column Factory (#10156) Closes #10155 `build_struct_column` requires that the field names be strings. But dataframe column names can be any hashable types. Passing in column names as field names in `to_struct` is thus unsafe. This PR adds a check and raises a warning if the cast to string is required to take place. Authors: - Michael Wang (https://github.com/isVoid) - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Sheilah Kirui (https://github.com/skirui-source) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10156 --- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 10 +++++++++- python/cudf/cudf/tests/test_struct.py | 8 ++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2535ba5ab8d..393afe4a5b9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1602,8 +1602,8 @@ def build_struct_column( Parameters ---------- - names : list-like - Field names to map to children dtypes + names : sequence of strings + Field names to map to children dtypes, must be strings. children : tuple mask: Buffer diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 371404ca477..90119ba7b17 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5864,8 +5864,16 @@ def to_struct(self, name=None): ----- Note that a copy of the columns is made. """ + if not all(isinstance(name, str) for name in self._data.names): + warnings.warn( + "DataFrame contains non-string column name(s). 
Struct column " + "requires field name to be string. Non-string column names " + "will be casted to string as the field name." + ) + field_names = [str(name) for name in self._data.names] + col = cudf.core.column.build_struct_column( - names=self._data.names, children=self._data.columns, size=len(self) + names=field_names, children=self._data.columns, size=len(self) ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor( diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index dbff626c363..167f171fa26 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -205,6 +205,14 @@ def test_dataframe_to_struct(): df["a"][0] = 5 assert_eq(got, expect) + # check that a non-string (but convertible to string) named column can be + # converted to struct + df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) + expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) + with pytest.warns(UserWarning, match="will be casted"): + got = df.to_struct() + assert_eq(got, expect) + @pytest.mark.parametrize( "series, slce", From f5ae74f720eb3fe5c82d96d306587a470144dc4f Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Tue, 15 Feb 2022 09:30:18 +0800 Subject: [PATCH 04/10] Allow Java bindings to use default decimal precisions when writing columns (#10276) Closes #9851 This PR is to fix the bug raised in #9851 : Currently, Java bindings set decimal precision for each column when building OrcWriterOptions. It fills zero for non-decimal columns which do not carry precision information. In principle, we should only set precision for decimal columns. It is not easy to write a test for this change, since we won't try to read non-decimal data as decimal type in both spark-rapids and cuDF Java. 
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Jason Lowe (https://github.com/jlowe) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10276 --- .../java/ai/rapids/cudf/ColumnWriterOptions.java | 9 ++++++--- java/src/main/native/src/TableJni.cpp | 13 ++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 0e49636fae6..78b3d5d52ec 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,9 @@ private ColumnWriterOptions(AbstractStructBuilder builder) { (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } + // The sentinel value of unknown precision (default value) + public static int UNKNOWN_PRECISION = -1; + /** * Constructor used for list */ @@ -103,7 +106,7 @@ protected ColumnWriterOptions withDecimal(String name, int precision, protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { - return new ColumnWriterOptions(name, isInt96, 0, isNullable); + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } /** @@ -243,7 +246,7 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; - this.precision = 0; + this.precision = UNKNOWN_PRECISION; this.isNullable = isNullable; this.columnName = columnName; } diff --git a/java/src/main/native/src/TableJni.cpp 
b/java/src/main/native/src/TableJni.cpp index eac76222475..1cf56da35da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -676,9 +676,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; - child.set_name(col_names[read_index]) - .set_decimal_precision(precisions[read_index]) - .set_nullability(nullability[read_index]); + child.set_name(col_names[read_index]).set_nullability(nullability[read_index]); + if (precisions[read_index] > -1) { + child.set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); } @@ -717,8 +718,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) { metadata.column_metadata[write_index] .set_name(cpp_names[read_index]) - .set_nullability(col_nullability[read_index]) - .set_decimal_precision(precisions[read_index]); + .set_nullability(col_nullability[read_index]); + if (precisions[read_index] > -1) { + metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); } From 17b7907ee56747caf946e07870ccb5187729d57a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Feb 2022 17:31:58 -0800 Subject: [PATCH 05/10] Add copyright check as pre-commit hook. (#10290) Since #10253 added copyright checks, it is helpful to have the corresponding check enabled via pre-commit so that copyright issues can be found locally before pushing. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10290 --- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1e1ad94ab0b..9e72c0119f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -88,6 +88,13 @@ repos: # of dependencies, so we'll have to update this manually. additional_dependencies: - cmake-format==0.6.11 + - id: copyright-check + name: copyright-check + # This hook's use of Git tools appears to conflict with + # existing CI invocations so we don't invoke it during CI runs. + stages: [commit] + entry: python ./ci/checks/copyright.py --git-modified-only + language: python default_language_version: python: python3 From 8b0737d7a4cfd3a266a5450b15df4b978fb6dc4f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Feb 2022 17:32:09 -0800 Subject: [PATCH 06/10] Enable numpy ufuncs for DataFrame (#10287) This PR addresses the primary issue in #9083, enabling all numpy ufuncs for DataFrame objects. It builds on the work in #10217, generalizing that code path to support multiple columns and moving the method up to `IndexedFrame` to share the logic with `DataFrame`. The custom preprocessing of inputs before handing off to cupy that was implemented in #10217 has been replaced by reusing parts of the existing binop machinery for greater generality, which is especially important for DataFrame binops since they support a wider range of alternative operand types. The current internal refactor is intentionally minimal to leave the focus on the new ufunc features. 
I will make a follow-up to clean up the internal functions by adding a proper set of hooks into the binop and ufunc implementations so that we can share these implementations with Index types as well, at which point we will be able to remove the extraneous APIs discussed in https://github.com/rapidsai/cudf/issues/9083#issuecomment-1005175782. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10287 --- python/cudf/cudf/core/dataframe.py | 33 ++-- python/cudf/cudf/core/indexed_frame.py | 148 +++++++++++++++++ python/cudf/cudf/core/series.py | 159 +++---------------- python/cudf/cudf/core/single_column_frame.py | 6 +- python/cudf/cudf/tests/test_array_ufunc.py | 101 +++++++++++- 5 files changed, 292 insertions(+), 155 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 90119ba7b17..fb15f8da8d9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False): {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): @@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1885,6 +1876,7 @@ def _binaryop( # implementation assumes that binary operations between a column and # NULL are always 
commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). operands = { @@ -1948,11 +1940,30 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7788a5346c8..e1ff3984948 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1697,6 +1697,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. 
+ "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. 
+ if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. 
+ inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 90ebeba5087..3aef4447a28 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,6 @@ import pickle import warnings from collections import abc as abc -from itertools import repeat from numbers import Number from shutil import get_terminal_size from typing import Any, MutableMapping, Optional, Set, Union @@ -959,141 +958,6 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - # For more detail on this function and how it should work, see - # https://numpy.org/doc/stable/reference/ufuncs.html - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. - if method != "__call__" or kwargs or ufunc.nin > 2: - return NotImplemented - - # Binary operations - binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. 
- "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", - } - - # First look for methods of the class. - fname = ufunc.__name__ - if fname in binary_operations: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - # These operators need to be mapped to their inverses when - # performing a reflected operation because no reflected version of - # the operators themselves exist. - ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", - } - - op = binary_operations[fname] - if reflect and op in ops_without_reflection: - op = ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" - - # pandas bitwise operations return bools if indexes are misaligned. - # TODO: Generalize for other types of Frames - if ( - "bitwise" in fname - and isinstance(other, Series) - and not self.index.equals(other.index) - ): - return getattr(self, op)(other).astype(bool) - # Float_power returns float irrespective of the input type. - if fname == "float_power": - return getattr(self, op)(other).astype(float) - return getattr(self, op)(other) - - # Special handling for unary operations. - if fname == "negative": - return self * -1 - if fname == "positive": - return self.copy(deep=True) - if fname == "invert": - return ~self - if fname == "absolute": - return self.abs() - if fname == "fabs": - return self.abs().astype(np.float64) - - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. In particular, libcudf binary - # operations support logical and/or operations, but those operations - # are not defined on pd.Series/DataFrame. 
For now those operations will - # dispatch to cupy, but if ufuncs are ever a bottleneck we could add - # special handling to dispatch those (or any other) functions that we - # could implement without cupy. - - # Attempt to dispatch all other functions to cupy. - cupy_func = getattr(cupy, fname) - if cupy_func: - # Indices must be aligned before converting to arrays. - if ufunc.nin == 2 and all(map(isinstance, inputs, repeat(Series))): - inputs = _align_indices(inputs, allow_non_unique=True) - index = inputs[0].index - else: - index = self.index - - cupy_inputs = [] - mask = None - for inp in inputs: - # TODO: Generalize for other types of Frames - if isinstance(inp, Series) and inp.has_nulls: - new_mask = as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and of - # bitmasks. Once we expose cudf::detail::bitwise_and, then - # we can use that instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume that - # the end result propagates nulls via a bitwise and, so - # these elements are irrelevant. 
- inp = inp.fillna(0) - cupy_inputs.append(cupy.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - - def make_frame(arr): - return self.__class__._from_data( - {self.name: as_column(arr).set_mask(mask)}, index=index - ) - - if ufunc.nout > 1: - return tuple(make_frame(out) for out in cp_output) - return make_frame(cp_output) - - return NotImplemented - def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: @@ -1342,9 +1206,9 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop( + def _prep_for_binop( self, - other: Frame, + other: Any, fn: str, fill_value: Any = None, reflect: bool = False, @@ -1376,9 +1240,24 @@ def _binaryop( lhs = self operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return operands, lhs._index + + def _binaryop( + self, + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, + self._from_data( + data=self._colwise_binop(operands, fn), index=out_index, ) if operands is not NotImplemented else NotImplemented diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index bf867923b57..50b206d3388 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
"""Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1): def _make_operands_for_binop( self, - other: T, + other: Any, fill_value: Any = None, reflect: bool = False, *args, @@ -310,7 +310,7 @@ def _make_operands_for_binop( else: result_name = self.name - # This needs to be tested correctly + # TODO: This needs to be tested correctly if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index a384ddecca6..f1aad1af9e6 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -109,7 +109,7 @@ def test_ufunc_series(ufunc, has_nulls, indexed): @pytest.mark.parametrize("reflect", [True, False]) def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): fname = ufunc.__name__ - if fname in ("greater", "greater_equal") and has_nulls: + if fname in ("greater", "greater_equal", "logical_and") and has_nulls: pytest.xfail( "The way cudf casts nans in arrays to nulls during binops with " "cudf objects is currently incompatible with pandas." @@ -181,3 +181,102 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # this throws a value-error because of presence of out kwarg with pytest.raises(TypeError): func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + # TODO: When pandas starts supporting misaligned indexes properly, remove + # this check but enable the one below. 
+ if indexed: + pytest.xfail( + "pandas does not currently support misaligned indexes in " + "DataFrames, but we do. Until this is fixed we will skip these " + "tests. See the error here: " + "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, " # noqa: E501 + "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258" # noqa: E501 + ) + # TODO: Enable the check below when we remove the check above. + # if indexed and fname in ( + # "greater", + # "greater_equal", + # "less", + # "less_equal", + # "not_equal", + # "equal", + # ): + # pytest.skip("Comparison operators do not support misaligned indexes.") # noqa: E501 + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. 
+ aligned = ( + cudf.core.dataframe._align_indices(*args) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. + if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise From 851e23545d29bca17a778f84dbc2000b3dde0ba8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:31:45 -0600 Subject: [PATCH 07/10] Reduce pytest runtime (#10203) This PR reduces the overall runtime of the cuDF pytest suite. 
Changes include: - asserting equal on the GPU where possible for large datasets - in some cases reducing excessive test data size part of https://github.com/rapidsai/cudf/issues/9999 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10203 --- python/cudf/cudf/testing/_utils.py | 9 +++- .../test_avro_reader_fastavro_integration.py | 4 +- python/cudf/cudf/tests/test_binops.py | 15 +++--- python/cudf/cudf/tests/test_csv.py | 11 +++-- .../cudf/tests/test_extension_compilation.py | 12 +++-- python/cudf/cudf/tests/test_indexing.py | 46 +++++++++---------- python/cudf/cudf/tests/test_orc.py | 24 +++++----- python/cudf/cudf/tests/test_parquet.py | 8 ++-- python/cudf/cudf/tests/test_repr.py | 45 ++++++++---------- python/cudf/cudf/tests/test_reshape.py | 8 ++-- python/cudf/cudf/tests/test_string.py | 7 +-- python/cudf/cudf/tests/test_udf_masked_ops.py | 18 +++++--- 12 files changed, 106 insertions(+), 101 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index b97b2d660d6..e767c0c62be 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+import itertools import re import warnings from collections.abc import Mapping, Sequence @@ -330,3 +331,9 @@ def does_not_raise(): def xfail_param(param, **kwargs): return pytest.param(param, marks=pytest.mark.xfail(**kwargs)) + + +parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( + "left_dtype,right_dtype", + list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), +) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index a064bec9e82..9eb01ae31b4 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -210,7 +210,7 @@ def test_can_parse_no_schema(): assert_eq(expected, actual) -@pytest.mark.parametrize("rows", [0, 1, 10, 100000]) +@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) def test_avro_compression(rows, codec): schema = { diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 76add8b9c5d..02ca7a0cd58 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -4,7 +4,7 @@ import decimal import operator import random -from itertools import product +from itertools import combinations_with_replacement, product import cupy as cp import numpy as np @@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype): def _series_compare_nulls_typegen(): - tests = [] - tests += list(product(DATETIME_TYPES, DATETIME_TYPES)) - tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES)) - tests += list(product(STRING_TYPES, 
STRING_TYPES)) - - return tests + return [ + *combinations_with_replacement(DATETIME_TYPES, 2), + *combinations_with_replacement(TIMEDELTA_TYPES, 2), + *combinations_with_replacement(NUMERIC_TYPES, 2), + *combinations_with_replacement(STRING_TYPES, 2), + ] @pytest.mark.parametrize("cmpop", _cmpops) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 9208b8c7cd4..f3d69e1745e 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import gzip import os @@ -8,6 +8,7 @@ from io import BytesIO, StringIO from pathlib import Path +import cupy as cp import numpy as np import pandas as pd import pytest @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir): def test_csv_reader_carriage_return(tmpdir): rows = 1000 names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" for row in range(rows): buffer += str(row) + ", " + str(2 * row) + "\r\n" df = read_csv(StringIO(buffer)) + expect = cudf.DataFrame( + {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} + ) assert len(df) == rows - for row in range(0, rows): - assert df[names[0]][row] == row - assert df[names[1]][row] == 2 * row + assert_eq(expect, df) def test_csv_reader_tabs(): diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 47c9448cf63..692f40873d7 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,13 +1,17 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
import operator import cupy as cp +import numpy as np import pytest from numba import cuda, types from numba.cuda import compile_ptx +from numba.np.numpy_support import from_dtype from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType +from cudf.testing._utils import parametrize_numeric_dtypes_pairwise arith_ops = ( operator.add, @@ -159,19 +163,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@pytest.mark.parametrize("ty1", number_types, ids=number_ids) -@pytest.mark.parametrize("ty2", number_types, ids=number_ids) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, ty1, ty2, masked): +def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): def func(x, y): return op(x, y) cc = (7, 5) + ty1 = from_dtype(np.dtype(left_dtype)) + ty2 = from_dtype(np.dtype(right_dtype)) + if masked[0]: ty1 = MaskedType(ty1) if masked[1]: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 102e5b57e8e..19d7c8a10ab 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from itertools import combinations @@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe): @pytest.mark.parametrize( - "gdf", + "gdf_kwargs", [ - cudf.DataFrame({"a": range(1000000)}), - cudf.DataFrame({"a": range(1000000), "b": range(1000000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( - { + {"data": {"a": range(100000)}}, + {"data": {"a": range(100000), "b": range(100000)}}, + { + "data": { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(1000000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(1000000)), - cudf.DataFrame( - columns=["a", "col2", "...col n"], index=range(1000000) - ), - cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(1000000)).astype("str"), - ), + }, + {"index": [1, 2, 3]}, + {"index": range(100000)}, + {"columns": ["a", "b", "c", "d"]}, + {"columns": ["a"], "index": range(100000)}, + {"columns": ["a", "col2", "...col n"], "index": range(100000)}, + {"index": cudf.Series(range(100000)).astype("str")}, + { + "columns": ["a", "b", "c", "d"], + "index": cudf.Series(range(100000)).astype("str"), + }, ], ) @pytest.mark.parametrize( "slice", [ - slice(250000, 500000), - slice(250000, 250001), - slice(500000), + slice(25000, 50000), + slice(25000, 25001), + slice(50000), slice(1, 10), slice(10, 20), slice(15, 24000), slice(6), ], ) -def test_dataframe_sliced(gdf, slice): +def test_dataframe_sliced(gdf_kwargs, slice): + gdf = cudf.DataFrame(**gdf_kwargs) pdf = gdf.to_pandas() actual = gdf[slice] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 8689f773a02..623098741a9 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -16,6 +16,7 @@ import cudf from cudf.io.orc import ORCWriter +from 
cudf.testing import assert_frame_equal from cudf.testing._utils import ( assert_eq, gen_rand_series, @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path, engine=engine, columns=columns, use_index=use_index ) - assert_eq(expect, got, check_categorical=False) + assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) def test_orc_reader_filenotfound(tmpdir): @@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): else: print(type(excpr).__name__) - expect = orcfile.read(columns=columns).to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression) - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect.to_orc(gdf_fname.strpath, compression=compression) + got = cudf.from_pandas( + pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + ) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): else: print(type(excpr).__name__) - expect = orcfile.read().to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect.to_orc(gdf_fname.strpath, statistics=stats_freq) + got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -492,8 +495,7 @@ def test_chunked_orc_writer( writer.close() got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - - assert_eq(expect, got) + assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @pytest.mark.parametrize( diff --git 
a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e1ca9f6f006..7feaa400446 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): assert_eq(expect, got) -@pytest.mark.parametrize("skip", range(0, 128)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_skiprows(skip, tmpdir): - num_rows = 128 + num_rows = 10 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), @@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("skip", range(0, 120)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_num_rows(skip, tmpdir): - num_rows = 128 + num_rows = 20 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index ca02ee55df0..8f2e4811e36 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
import textwrap @@ -13,7 +13,14 @@ from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes -repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +repr_categories = [ + "uint16", + "int64", + "float64", + "str", + "category", + "datetime64[ns]", +] @pytest.mark.parametrize("dtype", repr_categories) @@ -84,36 +91,22 @@ def test_full_series(nrows, dtype): pd.reset_option("display.max_rows") +@pytest.mark.parametrize("nrows", [5, 10, 15]) +@pytest.mark.parametrize("ncols", [5, 10, 15]) +@pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -def test_full_dataframe_20(dtype, nrows, ncols): - size = 20 - pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - assert pdf.__repr__() == gdf.__repr__() - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1]) -@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1]) -def test_full_dataframe_21(dtype, nrows, ncols): - size = 21 +def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context( + "display.max_rows", int(nrows), "display.max_columns", int(ncols) + ): + assert repr(pdf) == repr(gdf) + assert pdf._repr_html_() == gdf._repr_html_() + assert pdf._repr_latex_() == 
gdf._repr_latex_() @given( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b8f975f233e..2efa781c506 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import re @@ -17,9 +17,9 @@ ) -@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) +@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_rows", [1, 2, 100]) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index efe8e523d4e..56218372c23 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -532,12 +532,7 @@ def _cat_convert_seq_to_cudf(others): @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) @pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) @pytest.mark.parametrize( - "index", - [ - ["1", "2", "3", "4", "5"], - pd.Series(["1", "2", "3", "4", "5"]), - pd.Index(["1", "2", "3", "4", "5"]), - ], + "index", [["1", "2", "3", "4", "5"]], ) def test_string_cat(ps_gs, others, sep, na_rep, index): ps, gs = ps_gs diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 56090c8eacf..faaea6eec08 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
import math import operator @@ -14,7 +15,11 @@ unary_ops, ) from cudf.core.udf.utils import precompiled -from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq +from cudf.testing._utils import ( + _decimal_series, + assert_eq, + parametrize_numeric_dtypes_pairwise, +) def run_masked_udf_test(func, data, args=(), **kwargs): @@ -238,10 +243,9 @@ def func(row): run_masked_udf_test(func, gdf, check_dtype=False) -@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES)) -@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES)) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(dtype_a, dtype_b, op): +def test_apply_mixed_dtypes(left_dtype, right_dtype, op): """ Test that operations can be performed between columns of different dtypes and return a column with the correct @@ -251,7 +255,7 @@ def test_apply_mixed_dtypes(dtype_a, dtype_b, op): # First perform the op on two dummy data on host, if numpy can # safely type cast, we should expect it to work in udf too. try: - op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42)) + op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) except TypeError: pytest.skip("Operation is unsupported for corresponding dtype.") @@ -261,8 +265,8 @@ def func(row): return op(x, y) gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(dtype_a) - gdf["b"] = gdf["b"].astype(dtype_b) + gdf["a"] = gdf["a"].astype(left_dtype) + gdf["b"] = gdf["b"].astype(right_dtype) run_masked_udf_test(func, gdf, check_dtype=False) From ea2508ededc85ebba1a1bec362af8e27cd7c020e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 15 Feb 2022 11:49:24 -0600 Subject: [PATCH 08/10] Deprecate `DataFrame.iteritems` and introduce `.items` (#10298) This PR deprecates `DataFrame.iteritems` to be in line with pandas https://github.com/pandas-dev/pandas/pull/45321/files.
This PR also introduces `DataFrame.items` to support a dask breaking change: https://github.com/dask/dask/pull/8660 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10298 --- python/cudf/cudf/core/dataframe.py | 12 +++++++++++- python/cudf/cudf/tests/test_concat.py | 6 +++--- python/cudf/cudf/tests/test_dataframe.py | 8 +++----- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fb15f8da8d9..75c515df719 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2066,6 +2066,16 @@ def __iter__(self): @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): + """Iterate over column names and series pairs""" + warnings.warn( + "iteritems is deprecated and will be removed in a future version. " + "Use .items instead.", + FutureWarning, + ) + return self.items() + + @annotate("DATAFRAME_ITEMS", color="blue", domain="cudf_python") + def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) @@ -4570,7 +4580,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): df = cls() # Set columns - for col_name, col_value in dataframe.iteritems(): + for col_name, col_value in dataframe.items(): # necessary because multi-index can return multiple # columns for a single key if len(col_value.shape) == 1: diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index b8724fe36f5..1ab5931fe5f 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -574,7 +574,7 @@ def test_concat_empty_dataframes(df, other, ignore_index): expected = pd.concat(other_pd, ignore_index=ignore_index) actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: - for key, 
col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1184,7 +1184,7 @@ def test_concat_join_empty_dataframes( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1306,7 +1306,7 @@ def test_concat_join_empty_dataframes_axis_1( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fb173bc0eab..fdb3f430a18 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -917,9 +917,7 @@ def test_dataframe_dtypes(): dtypes = pd.Series( [np.int32, np.float32, np.float64], index=["c", "a", "b"] ) - df = cudf.DataFrame( - {k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()} - ) + df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) assert df.dtypes.equals(dtypes) @@ -1956,7 +1954,7 @@ def test_dataframe_reductions(data, axis, func, skipna): elif func not in cudf.core.dataframe._cupy_nan_methods_map: if skipna is False: expected_exception = NotImplementedError - elif any(col.nullable for name, col in gdf.iteritems()): + elif any(col.nullable for name, col in gdf.items()): expected_exception = ValueError elif func in ("cummin", "cummax"): expected_exception = AttributeError @@ -2134,7 +2132,7 @@ def test_iter(pdf, gdf): def test_iteritems(gdf): - for k, v in gdf.iteritems(): + for k, v in gdf.items(): assert k in gdf.columns assert 
isinstance(v, cudf.Series) assert_eq(v, gdf[k]) From 7a620c4cad3a4ab5f2c32c451e5b661ffb6cedb8 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 15 Feb 2022 23:51:44 +0530 Subject: [PATCH 09/10] generate url decode benchmark input in device (#10278) Use device functions to move input generation to device in url_decode benchmark. Splitting PR https://github.com/rapidsai/cudf/pull/10109 for review Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/10278 --- cpp/benchmarks/CMakeLists.txt | 2 +- .../string/{url_decode.cpp => url_decode.cu} | 65 +++++++++++-------- 2 files changed, 40 insertions(+), 27 deletions(-) rename cpp/benchmarks/string/{url_decode.cpp => url_decode.cu} (53%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 0704180bad0..3bc6dc10fdf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -265,7 +265,7 @@ ConfigureBench( string/split.cpp string/substring.cpp string/translate.cpp - string/url_decode.cpp + string/url_decode.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/string/url_decode.cpp b/cpp/benchmarks/string/url_decode.cu similarity index 53% rename from cpp/benchmarks/string/url_decode.cpp rename to cpp/benchmarks/string/url_decode.cu index 6dc79c44437..c460820d788 100644 --- a/cpp/benchmarks/string/url_decode.cpp +++ b/cpp/benchmarks/string/url_decode.cu @@ -16,11 +16,12 @@ #include #include -#include #include #include +#include #include +#include #include #include @@ -28,43 +29,55 @@ #include #include -#include -#include +#include +#include +#include +#include struct url_string_generator { - size_t num_chars; - std::bernoulli_distribution dist; - - 
url_string_generator(size_t num_chars, double esc_seq_chance) - : num_chars{num_chars}, dist{esc_seq_chance} + char* chars; + double esc_seq_chance; + thrust::minstd_rand engine; + thrust::uniform_real_distribution esc_seq_dist; + url_string_generator(char* c, double esc_seq_chance, thrust::minstd_rand& engine) + : chars(c), esc_seq_chance(esc_seq_chance), engine(engine), esc_seq_dist(0, 1) { } - std::string operator()(std::mt19937& engine) + __device__ void operator()(thrust::tuple str_begin_end) { - std::string str; - str.reserve(num_chars); - while (str.size() < num_chars) { - if (str.size() < num_chars - 3 && dist(engine)) { - str += "%20"; + auto begin = thrust::get<0>(str_begin_end); + auto end = thrust::get<1>(str_begin_end); + engine.discard(begin); + for (auto i = begin; i < end; ++i) { + if (esc_seq_dist(engine) < esc_seq_chance and i < end - 3) { + chars[i] = '%'; + chars[i + 1] = '2'; + chars[i + 2] = '0'; + i += 2; } else { - str.push_back('a'); + chars[i] = 'a'; } } - return str; } }; -cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, - cudf::size_type chars_per_row, - double esc_seq_chance) +auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, double esc_seq_chance) { - std::mt19937 engine(1); - url_string_generator url_gen(chars_per_row, esc_seq_chance); - std::vector strings; - strings.reserve(num_rows); - std::generate_n(std::back_inserter(strings), num_rows, [&]() { return url_gen(engine); }); - return cudf::test::strings_column_wrapper(strings.begin(), strings.end()); + std::vector strings{std::string(chars_per_row, 'a')}; + auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); + auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows); + auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa... 
+ auto chars_col = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view(); + auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); + + auto engine = thrust::default_random_engine{}; + thrust::for_each_n(thrust::device, + thrust::make_zip_iterator(offset_col.begin(), + offset_col.begin() + 1), + num_rows, + url_string_generator{chars_col.begin(), esc_seq_chance, engine}); + return result_col; } class UrlDecode : public cudf::benchmark { @@ -76,7 +89,7 @@ void BM_url_decode(benchmark::State& state, int esc_seq_pct) cudf::size_type const chars_per_row = state.range(1); auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); - auto strings_view = cudf::strings_column_view(column); + auto strings_view = cudf::strings_column_view(column->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); From f263820f98cb280b17f5409e9c9204b943fe1968 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 15 Feb 2022 23:58:03 +0530 Subject: [PATCH 10/10] move input generation for type dispatcher benchmark to device (#10280) Use `cudf::sequence` to move input generation to device in type dispatcher benchmark. 
Splitting PR https://github.com/rapidsai/cudf/pull/10109 for review Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/10280 --- .../type_dispatcher/type_dispatcher.cu | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 90097889a86..ca19e3046ad 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include #include #include #include -#include #include +#include +#include #include #include @@ -170,21 +171,18 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const auto source_size = static_cast(state.range(1)); - - const auto n_cols = static_cast(state.range(0)); - + const auto n_cols = static_cast(state.range(0)); + const auto source_size = static_cast(state.range(1)); const auto work_per_thread = static_cast(state.range(2)); - auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); - std::vector> source_column_wrappers; + std::vector> source_column_wrappers; std::vector source_columns; for (int i = 0; i < n_cols; ++i) { - source_column_wrappers.push_back( - cudf::test::fixed_width_column_wrapper(data, data + source_size)); - source_columns.push_back(source_column_wrappers[i]); + source_column_wrappers.push_back(cudf::sequence(source_size, *init)); + source_columns.push_back(*source_column_wrappers[i]); } cudf::mutable_table_view source_table{source_columns};