From aeef0a1f4159d4c87f987d20225401040973d10f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:56:30 -0400 Subject: [PATCH 1/2] Remove hash_character_ngrams dependency from jaccard_index (#16241) Removes internal dependency of `nvtext::hash_character_ngrams` from `nvtext::jaccard_index`. Works around the size-type limit imposed by `hash_character_ngrams` which returns a `list` column. This also specializes the hashing logic for the jaccard calculation specifically. The overall algorithm has not changed. Code has moved around a bit and internal list-columns have been replaced with just offsets and values vectors. Closes #16157 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16241 --- cpp/benchmarks/text/jaccard.cpp | 4 +- cpp/src/text/jaccard.cu | 478 ++++++++++++++++++++++---------- 2 files changed, 339 insertions(+), 143 deletions(-) diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d05c195d077..d5b74da6773 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {32768, 131072, 262144}) + .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 9cf934165f6..e465fb79c89 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -19,16 +19,19 @@ #include #include #include +#include #include -#include +#include +#include +#include #include #include #include -#include #include #include +#include #include #include @@ -36,127 +39,375 @@ #include #include #include +#include +#include +#include #include namespace nvtext { namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type bytes_per_thread = 4; + /** * @brief Retrieve the row data (span) for the given column/row-index * - * @param d_input Input lists column + * @param values Flat vector of all values + * @param offsets Offsets identifying rows within values * @param idx Row index to retrieve * @return A device-span of the row values */ -__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx) +__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx) { - auto const offsets = - d_input.child(cudf::lists_column_view::offsets_column_index).data(); - auto const offset = offsets[idx]; - auto const size = offsets[idx + 1] - offset; - auto const begin = - d_input.child(cudf::lists_column_view::child_column_index).data() + offset; + auto const offset = offsets[row_idx]; + auto const size = offsets[row_idx + 1] - offset; + auto const begin = values + offset; return cudf::device_span(begin, size); } /** - * @brief Count the unique values within each row of the input column + * @brief Kernel to count the unique values within each row of the input column + * + * This is called with a warp per row. * - * This is called with a warp per row + * @param d_values Sorted hash values to count uniqueness + * @param d_offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param d_results Number of unique values in each row */ -struct sorted_unique_fn { - cudf::column_device_view const d_input; - cudf::size_type* d_results; +CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values, + int64_t const* d_offsets, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - auto const row = get_row(d_input, row_idx); - auto const begin = row.begin(); + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + auto const row = get_row(d_values, d_offsets, row_idx); + auto const begin = row.begin(); - cudf::size_type count = 0; - for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { - count += (itr == begin || *itr != *(itr - 1)); - } - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + cudf::size_type count = 0; + for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { + count += (itr == begin || *itr != *(itr - 1)); } -}; + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} -rmm::device_uvector compute_unique_counts(cudf::column_view const& input, +/** + * @brief Count the unique values within each row of the input column + * + * @param values Sorted hash values to count uniqueness + * @param offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of unique values + */ +rmm::device_uvector compute_unique_counts(uint32_t const* values, + int64_t const* offsets, + cudf::size_type rows, rmm::cuda_stream_view stream) { - auto const d_input = cudf::column_device_view::create(input, stream); - auto d_results = rmm::device_uvector(input.size(), stream); - sorted_unique_fn fn{*d_input, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input.size() * cudf::detail::warp_size, - fn); + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_unique_fn<<>>( + values, offsets, rows, d_results.data()); return d_results; } +/** + * @brief Kernel to count the number of common values within each row of the 2 input columns + * + * This is called with a warp per row. + * + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param d_results Number of common values in each row + */ +CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1, + int64_t const* d_offsets1, + uint32_t const* d_values2, + int64_t const* d_offsets2, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; + + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + auto const needles = get_row(d_values1, d_offsets1, row_idx); + auto const haystack = get_row(d_values2, d_offsets2, row_idx); + + auto begin = haystack.begin(); + auto const end = haystack.end(); + + cudf::size_type count = 0; + for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; + itr += cudf::detail::warp_size) { + if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates + // search haystack for this needle (*itr) + auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); + count += (found != end) && (*found == *itr); // increment if found; + begin = found; // shorten the next lower-bound range + } + // sum up the counts across this warp + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} + /** * @brief Count the number of common values within each row of the 2 input columns * - * This is called with a warp per row + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of common values */ -struct sorted_intersect_fn { - cudf::column_device_view const d_input1; - cudf::column_device_view const d_input2; - cudf::size_type* d_results; +rmm::device_uvector compute_intersect_counts(uint32_t const* values1, + int64_t const* offsets1, + uint32_t const* values2, + int64_t const* offsets2, + cudf::size_type rows, + rmm::cuda_stream_view stream) +{ + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_intersect_fn<<>>( + values1, offsets1, values2, offsets2, rows, d_results.data()); + return d_results; +} - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; +/** + * @brief Counts the number of substrings in each row of the given strings column + * + * Each warp processes a single string. + * Formula is `count = max(1, str.length() - width + 1)` + * If a string has less than width characters (but not empty), the count is 1 + * since the entire string is still hashed. + * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_counts Output number of substring per row of input + */ +CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t* d_counts) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } - auto const needles = get_row(d_input1, row_idx); - auto const haystack = get_row(d_input2, row_idx); + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } - auto begin = haystack.begin(); - auto const end = haystack.end(); + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - // TODO: investigate cuCollections device-side static-map to match row values + auto const end = d_str.data() + d_str.size_bytes(); + auto const lane_idx = idx % cudf::detail::warp_size; + cudf::size_type count = 0; + for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; + itr += cudf::detail::warp_size * bytes_per_thread) { + for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { + count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); + } + } + auto const char_count = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } +} + +/** + * @brief Kernel to hash the substrings for each input row + * + * Each warp processes a single string. + * Substrings of string "hello world" with width=4 produce: + * "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld" + * Each of these substrings is hashed and the hash stored in d_results + * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_output_offsets Offsets into d_results + * @param d_results Hash values for each substring + */ +CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t const* d_output_offsets, + uint32_t* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - cudf::size_type count = 0; - for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; - itr += cudf::detail::warp_size) { - if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates - // search haystack for this needle (*itr) - auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); - count += (found != end) && (*found == *itr); // increment if found; - begin = found; // shorten the next lower-bound range + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + + __shared__ uint32_t hvs[block_size]; // temp store for hash values + + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const end = d_str.data() + d_str.size_bytes(); + auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; + + auto d_hashes = d_results + d_output_offsets[str_idx]; + auto itr = d_str.data() + lane_idx; + for (auto i = 0; i < warp_count; ++i) { + uint32_t hash = 0; + if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) { + // resolve substring + auto const sub_str = + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width); + // hash only if we have the full width of characters or this is the beginning of the string + if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); } } - // sum up the counts across this warp - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + hvs[threadIdx.x] = hash; // store hash into shared memory + __syncwarp(); + if (lane_idx == 0) { + // copy valid hash values for this warp into d_hashes + auto const hashes = &hvs[threadIdx.x]; + auto const hashes_end = hashes + cudf::detail::warp_size; + d_hashes = + thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; }); + } + __syncwarp(); + itr += cudf::detail::warp_size; } -}; +} -rmm::device_uvector compute_intersect_counts(cudf::column_view const& input1, - cudf::column_view const& input2, - rmm::cuda_stream_view stream) +void segmented_sort(uint32_t const* input, + uint32_t* output, + int64_t items, + cudf::size_type segments, + int64_t const* offsets, + rmm::cuda_stream_view stream) { - auto const d_input1 = cudf::column_device_view::create(input1, stream); - auto const d_input2 = cudf::column_device_view::create(input2, stream); - auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input1.size() * cudf::detail::warp_size, - fn); - return d_results; + rmm::device_buffer temp; + std::size_t temp_bytes = 0; + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); + temp = rmm::device_buffer(temp_bytes, stream); + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); +} + +/** + * @brief Create hashes for each substring + * + * The hashes are sorted using a segmented-sort as setup to + * perform the unique and intersect operations. + * + * @param input Input strings column to hash + * @param width Substring width in characters + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The sorted hash values and offsets to each row + */ +std::pair, rmm::device_uvector> hash_substrings( + cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream) +{ + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + // count substrings + auto offsets = rmm::device_uvector(input.size() + 1, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(input.size()) * cudf::detail::warp_size, block_size); + count_substrings_kernel<<>>( + *d_strings, width, offsets.data()); + auto const total_hashes = + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + + // hash substrings + rmm::device_uvector hashes(total_hashes, stream); + substring_hash_kernel<<>>( + *d_strings, width, offsets.data(), hashes.data()); + + // sort hashes + rmm::device_uvector sorted(total_hashes, stream); + if (total_hashes < static_cast(std::numeric_limits::max())) { + segmented_sort( + hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream); + } else { + // The CUB segmented sort can only handle max total values + // so this code calls it in sections. + auto const section_size = std::numeric_limits::max() / 2L; + auto const sort_sections = cudf::util::div_rounding_up_safe(total_hashes, section_size); + auto const offset_indices = [&] { + // build a set of indices that point to offsets subsections + auto sub_offsets = rmm::device_uvector(sort_sections + 1, stream); + thrust::sequence( + rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size); + auto indices = rmm::device_uvector(sub_offsets.size(), stream); + thrust::lower_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + sub_offsets.begin(), + sub_offsets.end(), + indices.begin()); + return cudf::detail::make_std_vector_sync(indices, stream); + }(); + + // Call segmented sort with the sort sections + for (auto i = 0L; i < sort_sections; ++i) { + auto const index1 = offset_indices[i]; + auto const index2 = std::min(offset_indices[i + 1], static_cast(offsets.size() - 1)); + auto const offset1 = offsets.element(index1, stream); + auto const offset2 = offsets.element(index2, stream); + + auto const num_items = offset2 - offset1; + auto const num_segments = index2 - index1; + + // There is a bug in the CUB segmented sort and the workaround is to + // shift the offset values so the first offset is 0. + // This transform can be removed once the bug is fixed. + auto sort_offsets = rmm::device_uvector(num_segments + 1, stream); + thrust::transform(rmm::exec_policy(stream), + offsets.begin() + index1, + offsets.begin() + index2 + 1, + sort_offsets.begin(), + [offset1] __device__(auto const o) { return o - offset1; }); + + segmented_sort(hashes.begin() + offset1, + sorted.begin() + offset1, + num_items, + num_segments, + sort_offsets.begin(), + stream); + } + } + return std::make_pair(std::move(sorted), std::move(offsets)); } /** @@ -186,62 +437,6 @@ struct jaccard_fn { } }; -/** - * @brief Create hashes for each substring - * - * Uses the hash_character_ngrams to hash substrings of the input column. - * This returns a lists column where each row is the hashes for the substrings - * of the corresponding input string row. - * - * The hashes are then sorted using a segmented-sort as setup to - * perform the unique and intersect operations. - */ -std::unique_ptr hash_substrings(cudf::strings_column_view const& col, - cudf::size_type width, - rmm::cuda_stream_view stream) -{ - auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource()); - auto const input = cudf::lists_column_view(hashes->view()); - auto const offsets = input.offsets_begin(); - auto const data = input.child().data(); - - rmm::device_uvector sorted(input.child().size(), stream); - - // this is wicked fast and much faster than using cudf::lists::detail::sort_list - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - - auto contents = hashes->release(); - // the offsets are taken from the hashes column since they are the same - // before and after the segmented-sort - return cudf::make_lists_column( - col.size(), - std::move(contents.children.front()), - std::make_unique(std::move(sorted), rmm::device_buffer{}, 0), - 0, - rmm::device_buffer{}, - stream, - rmm::mr::get_current_device_resource()); -} } // namespace std::unique_ptr jaccard_index(cudf::strings_column_view const& input1, @@ -261,13 +456,14 @@ std::unique_ptr jaccard_index(cudf::strings_column_view const& inp auto const [d_uniques1, d_uniques2, d_intersects] = [&] { // build hashes of the substrings - auto const hash1 = hash_substrings(input1, width, stream); - auto const hash2 = hash_substrings(input2, width, stream); + auto const [hash1, offsets1] = hash_substrings(input1, width, stream); + auto const [hash2, offsets2] = hash_substrings(input2, width, stream); // compute the unique counts in each set and the intersection counts - auto d_uniques1 = compute_unique_counts(hash1->view(), stream); - auto d_uniques2 = compute_unique_counts(hash2->view(), stream); - auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream); + auto d_uniques1 = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream); + auto d_uniques2 = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream); + auto d_intersects = compute_intersect_counts( + hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream); return std::tuple{std::move(d_uniques1), std::move(d_uniques2), std::move(d_intersects)}; }(); From 4acca4d57303f52907aa158a2ef996c9d42a73d6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jul 2024 11:07:07 -1000 Subject: [PATCH 2/2] Use Column.can_cast_safely instead of some ad-hoc dtype functions in .where (#16303) There were a couple of dedicated functions in `python/cudf/cudf/utils/dtypes.py` specific to `.where` that could be subsumed by `Column.can_cast_safely`. The minor downside is that we need to cast where's argument to a Column first, but IMO it's probably OK given the deduplication Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16303 --- python/cudf/cudf/core/_internals/where.py | 78 ++++++++++++++---- python/cudf/cudf/utils/dtypes.py | 96 +---------------------- 2 files changed, 62 insertions(+), 112 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 4a36be76b6d..6003a0f6aea 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -9,12 +9,7 @@ import cudf from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import ( - _can_cast, - _dtype_can_hold_element, - find_common_type, - is_mixed_with_object_dtype, -) +from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype if TYPE_CHECKING: from cudf._typing import ScalarLike @@ -44,6 +39,8 @@ def _check_and_cast_columns_with_other( inplace: bool, ) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. + from cudf.core.column import as_column + source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): return _normalize_categorical(source_col, other) @@ -84,17 +81,9 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if ( - _is_non_decimal_numeric_dtype(source_dtype) - and not other_is_scalar # can-cast fails for Python scalars - and _can_cast(other, source_dtype) - ): - common_dtype = source_dtype - elif ( - isinstance(source_col, cudf.core.column.NumericalColumn) - and other_is_scalar - and _dtype_can_hold_element(source_dtype, other) - ): + if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + other + ).can_cast_safely(source_dtype): common_dtype = source_dtype else: common_dtype = find_common_type( @@ -130,3 +119,58 @@ def _make_categorical_like(result, column): ordered=column.ordered, ) return result + + +def _can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. + """ + if cudf.utils.utils.is_na_like(from_dtype): + return True + if isinstance(from_dtype, type): + from_dtype = cudf.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = cudf.dtype(to_dtype) + + # TODO : Add precision & scale checking for + # decimal types in future + + if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): + if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 59e5ec1df04..af912bee342 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -10,8 +10,6 @@ from pandas.core.dtypes.common import infer_dtype_from_object import cudf -from cudf._typing import DtypeObj -from cudf.api.types import is_bool, is_float, is_integer """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special @@ -584,61 +582,6 @@ def _dtype_pandas_compatible(dtype): return dtype -def _can_cast(from_dtype, to_dtype): - """ - Utility function to determine if we can cast - from `from_dtype` to `to_dtype`. This function primarily calls - `np.can_cast` but with some special handling around - cudf specific dtypes. - """ - if cudf.utils.utils.is_na_like(from_dtype): - return True - if isinstance(from_dtype, type): - from_dtype = cudf.dtype(from_dtype) - if isinstance(to_dtype, type): - to_dtype = cudf.dtype(to_dtype) - - # TODO : Add precision & scale checking for - # decimal types in future - - if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - return True - elif isinstance(to_dtype, np.dtype): - if to_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype, to_dtype) - elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - if from_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): - return True - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): - # TODO: Add level based checks too once casting of - # list columns is supported - if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): - if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): - return True - elif isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype._categories.dtype, to_dtype) - else: - return False - else: - return np.can_cast(from_dtype, to_dtype) - - def _maybe_convert_to_default_type(dtype): """Convert `dtype` to default if specified by user. @@ -661,44 +604,7 @@ def _maybe_convert_to_default_type(dtype): return dtype -def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: - if not len(rng): - return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) - - -def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: - if dtype.kind in {"i", "u"}: - if isinstance(element, range): - if _dtype_can_hold_range(element, dtype): - return True - return False - - elif is_integer(element) or ( - is_float(element) and element.is_integer() - ): - info = np.iinfo(dtype) - if info.min <= element <= info.max: - return True - return False - - elif dtype.kind == "f": - if is_integer(element) or is_float(element): - casted = dtype.type(element) - if np.isnan(casted) or casted == element: - return True - # otherwise e.g. overflow see TestCoercionFloat32 - return False - - elif dtype.kind == "b": - if is_bool(element): - return True - return False - - raise NotImplementedError(f"Unsupported dtype: {dtype}") - - -def _get_base_dtype(dtype: DtypeObj) -> DtypeObj: +def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: # TODO: replace the use of this function with just `dtype.base` # when Pandas 2.1.0 is the minimum version we support: # https://github.com/pandas-dev/pandas/pull/52706