From dda3f316cecd2cc23f97cd4fa9e44ec93efe5395 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Sat, 23 Mar 2024 00:09:11 +0000 Subject: [PATCH 01/69] Fix arrow-based round trip of empty dataframes (#15373) When materializing range indices we were not previously creating the correct metadata. So do that. While here, tidy up a few corner cases around creating range indices when constructing empty data frames. - Closes #12243 - Closes #14159 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15373 --- python/cudf/cudf/_lib/utils.pyx | 40 +++++++------ python/cudf/cudf/core/dataframe.py | 43 ++++++++------ .../tests/dataframe/test_io_serialization.py | 59 ++++++++++++++++++- python/cudf/cudf/tests/test_parquet.py | 14 ++++- 4 files changed, 115 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index b6637e9df08..0afecb215e4 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -59,7 +59,7 @@ cpdef generate_pandas_metadata(table, index): types = [] index_levels = [] index_descriptors = [] - + columns_to_convert = list(table._columns) # Columns for name, col in table._data.items(): if cudf.get_option("mode.pandas_compatible"): @@ -90,6 +90,7 @@ cpdef generate_pandas_metadata(table, index): types.append(np_to_pa_dtype(col.dtype)) # Indexes + materialize_index = False if index is not False: for level, name in enumerate(table._index.names): if isinstance(table._index, cudf.core.multiindex.MultiIndex): @@ -107,22 +108,26 @@ cpdef generate_pandas_metadata(table, index): "step": table.index.step, } else: + materialize_index = True # When `index=True`, RangeIndex needs to be materialized. 
materialized_idx = cudf.Index(idx._values, name=idx.name) - descr = \ - _index_level_name( - index_name=materialized_idx.name, - level=level, - column_names=col_names - ) - index_levels.append(materialized_idx) - else: - descr = \ - _index_level_name( - index_name=idx.name, + descr = _index_level_name( + index_name=materialized_idx.name, level=level, column_names=col_names ) + index_levels.append(materialized_idx) + columns_to_convert.append(materialized_idx._values) + col_names.append(descr) + types.append(np_to_pa_dtype(materialized_idx.dtype)) + else: + descr = _index_level_name( + index_name=idx.name, + level=level, + column_names=col_names + ) + columns_to_convert.append(idx._values) + col_names.append(descr) if isinstance(idx.dtype, cudf.CategoricalDtype): raise ValueError( "'category' column dtypes are currently not " @@ -141,17 +146,16 @@ cpdef generate_pandas_metadata(table, index): types.append(np_to_pa_dtype(idx.dtype)) index_levels.append(idx) - col_names.append(name) index_descriptors.append(descr) + df_meta = table.head(0) + if materialize_index: + df_meta.index = df_meta.index._as_int_index() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[ - col - for col in table._columns - ], + columns_to_convert=columns_to_convert, # It is OKAY to do `.head(0).to_pandas()` because # this method will extract `.columns` metadata only - df=table.head(0).to_pandas(), + df=df_meta.to_pandas(), column_names=col_names, index_levels=index_levels, index_descriptors=index_descriptors, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index da0a969b70c..2a4f93c1716 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5485,14 +5485,18 @@ def from_arrow(cls, table): return out @_cudf_nvtx_annotate - def to_arrow(self, preserve_index=True): + def to_arrow(self, preserve_index=None): """ Convert to a PyArrow Table. 
Parameters ---------- - preserve_index : bool, default True - whether index column and its meta data needs to be saved or not + preserve_index : bool, optional + whether index column and its meta data needs to be saved + or not. The default of None will store the index as a + column, except for a RangeIndex which is stored as + metadata only. Setting preserve_index to True will force + a RangeIndex to be materialized. Returns ------- @@ -5523,34 +5527,35 @@ def to_arrow(self, preserve_index=True): data = self.copy(deep=False) index_descr = [] - if preserve_index: - if isinstance(self.index, cudf.RangeIndex): + write_index = preserve_index is not False + keep_range_index = write_index and preserve_index is None + index = self.index + if write_index: + if isinstance(index, cudf.RangeIndex) and keep_range_index: descr = { "kind": "range", - "name": self.index.name, - "start": self.index._start, - "stop": self.index._stop, + "name": index.name, + "start": index._start, + "stop": index._stop, "step": 1, } else: - if isinstance(self.index, MultiIndex): + if isinstance(index, cudf.RangeIndex): + index = index._as_int_index() + index.name = "__index_level_0__" + if isinstance(index, MultiIndex): gen_names = tuple( - f"level_{i}" - for i, _ in enumerate(self.index._data.names) + f"level_{i}" for i, _ in enumerate(index._data.names) ) else: gen_names = ( - self.index.names - if self.index.name is not None - else ("index",) + index.names if index.name is not None else ("index",) ) - for gen_name, col_name in zip( - gen_names, self.index._data.names - ): + for gen_name, col_name in zip(gen_names, index._data.names): data._insert( data.shape[1], gen_name, - self.index._data[col_name], + index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5560,7 +5565,7 @@ def to_arrow(self, preserve_index=True): columns_to_convert=[self[col] for col in self._data.names], df=self, column_names=out.schema.names, - index_levels=[self.index], + index_levels=[index], 
index_descriptors=index_descr, preserve_index=preserve_index, types=out.schema.types, diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py index 06777c8e6af..ad81609470c 100644 --- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py +++ b/python/cudf/cudf/tests/dataframe/test_io_serialization.py @@ -1 +1,58 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +import contextlib +from io import BytesIO + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.mark.parametrize( + "index", + [range(1, 11), list(range(1, 11)), range(1, 11)[::2]], + ids=["RangeIndex", "IntIndex", "StridedRange"], +) +@pytest.mark.parametrize("write_index", [False, True, None]) +@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"]) +def test_dataframe_parquet_roundtrip(index, write_index, empty): + if empty: + data = {} + else: + data = {"a": [i * 2 for i in index]} + df = cudf.DataFrame(data=data, index=index) + pf = pd.DataFrame(data=data, index=index) + gpu_buf = BytesIO() + cpu_buf = BytesIO() + + df.to_parquet(gpu_buf, index=write_index) + pf.to_parquet(cpu_buf, index=write_index) + gpu_table = pq.read_table(gpu_buf) + cpu_table = pq.read_table(cpu_buf) + metadata_equal = ( + gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata + ) + if empty and write_index is not False: + # https://github.com/rapidsai/cudf/issues/15372 + ctx = pytest.raises(AssertionError) + else: + ctx = contextlib.nullcontext() + with ctx: + assert metadata_equal + + gpu_read = cudf.read_parquet(gpu_buf) + cpu_read = cudf.read_parquet(cpu_buf) + with ctx: + assert_eq(gpu_read, cpu_read) + + +@pytest.mark.parametrize("preserve_index", [False, True, None]) +def test_dataframe_to_arrow_preserve_index(preserve_index): + df = cudf.DataFrame({"x": ["cat", 
"dog"] * 5}) + pf = df.to_pandas() + expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema + got = df.to_arrow(preserve_index=preserve_index).schema + assert expect == got diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b72fe84359..9ba71b28637 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2442,9 +2442,17 @@ def test_parquet_index(pdf, index): run_parquet_index(pdf, index) -@pytest.mark.parametrize("index", [None, True]) -@pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/12243", +@pytest.mark.parametrize( + "index", + [ + pytest.param( + None, + marks=pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/40743" + ), + ), + True, + ], ) def test_parquet_index_empty(index): pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1)) From 933e32ab9ad8e5057282c48129ddbd745c538967 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 25 Mar 2024 11:47:51 -0500 Subject: [PATCH 02/69] Update udf_cpp to use rapids_cpm_cccl. (#15331) This PR updates the `udf_cpp` target to use `rapids_cpm_cccl`. The previous `rapids_cpm_libcudacxx` has been deprecated. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/15331 --- python/cudf/udf_cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index 57b52559f00..fe7f9d0b00d 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -26,8 +26,8 @@ rapids_find_package( INSTALL_EXPORT_SET udf-exports ) -include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) -rapids_cpm_libcudacxx(BUILD_EXPORT_SET udf-exports INSTALL_EXPORT_SET udf-exports) +include(${rapids-cmake-dir}/cpm/cccl.cmake) +rapids_cpm_cccl(BUILD_EXPORT_SET udf-exports INSTALL_EXPORT_SET udf-exports) add_library(cudf_strings_udf SHARED strings/src/strings/udf/udf_apis.cu) target_include_directories( From e3cbf62fcef479a051d116c451e69ddaa4568b57 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Mar 2024 15:45:09 -0400 Subject: [PATCH 03/69] Ignore DLManagedTensor in the docs build (#15392) Fixes a docs build error since `DLManagedTensor` cannot be resolved from the dlpack documentation. --- docs/cudf/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 3bba50b482c..7afc8fe19bf 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -388,6 +388,7 @@ def _generate_namespaces(namespaces): "thrust", "cuda", "arrow", + "DLManagedTensor", # Unknown types "int8_t", "int16_t", From a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Mar 2024 17:24:41 -0700 Subject: [PATCH 04/69] Use logical types in Parquet reader (#15365) Closes #15224. Now use logical type exclusively in the reader rather than the deprecated converted type. 
Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15365 --- cpp/src/io/parquet/decode_fixed.cu | 4 +- cpp/src/io/parquet/decode_preprocess.cu | 2 +- cpp/src/io/parquet/page_data.cu | 18 +- cpp/src/io/parquet/page_data.cuh | 3 +- cpp/src/io/parquet/page_decode.cuh | 58 ++--- cpp/src/io/parquet/page_hdr.cu | 4 +- cpp/src/io/parquet/page_string_decode.cu | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 41 ++-- cpp/src/io/parquet/reader_impl.cpp | 16 +- cpp/src/io/parquet/reader_impl_chunking.cu | 49 ++--- cpp/src/io/parquet/reader_impl_helpers.cpp | 210 ++++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- 12 files changed, 220 insertions(+), 193 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 062363db503..945a7dcb4c6 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -165,7 +165,7 @@ __device__ inline void gpuDecodeValues( constexpr int max_batch_size = num_warps * cudf::detail::warp_size; PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.data_type & 7; + int const dtype = s->col.physical_type; // decode values int pos = start; @@ -187,7 +187,7 @@ __device__ inline void gpuDecodeValues( uint32_t dtype_len = s->dtype_len; void* dst = nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; - if (s->col.converted_type == DECIMAL) { + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; case INT64: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu 
index 8f772636c7e..e49801e6172 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -389,7 +389,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) // we only need to preprocess hierarchies with repetition in them (ie, hierarchies // containing lists anywhere within). compute_string_sizes = - compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + compute_string_sizes && s->col.physical_type == BYTE_ARRAY && !s->col.is_strings_to_cat; // early out optimizations: diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 261e04e3f19..62ce5b9f9a5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -77,7 +77,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; } else { - switch (s->col.data_type & 7) { + switch (s->col.physical_type) { case BOOLEAN: [[fallthrough]]; case BYTE_ARRAY: [[fallthrough]]; case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break; @@ -123,16 +123,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) // be needed in the other DecodeXXX kernels. 
if (s->dict_base) { src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; - } else if ((s->col.data_type & 7) == BOOLEAN) { + } else if (s->col.physical_type == BOOLEAN) { src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); - } else if ((s->col.data_type & 7) == BYTE_ARRAY or - (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + } else if (s->col.physical_type == BYTE_ARRAY or + s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } if (t == 32) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values - int const dtype = s->col.data_type & 7; + int const dtype = s->col.physical_type; src_pos += t - out_thread0; // the position in the output column/buffer @@ -166,10 +166,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) uint32_t dtype_len = s->dtype_len; void* dst = nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + auto const is_decimal = + s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; if (dtype == BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { + if (is_decimal) { auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); - auto const decimal_precision = s->col.decimal_precision; + auto const decimal_precision = s->col.logical_type->precision(); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { @@ -182,7 +184,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } } else if (dtype == BOOLEAN) { gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); - } else if (s->col.converted_type == DECIMAL) { + } else if (is_decimal) { switch (dtype) { case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; diff --git a/cpp/src/io/parquet/page_data.cuh 
b/cpp/src/io/parquet/page_data.cuh index f0fa7d814cf..df8d801d66c 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -34,8 +34,7 @@ template inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv) { auto [ptr, len] = gpuGetStringData(s, sb, src_pos); - // make sure to only hash `BYTE_ARRAY` when specified with the output type size - if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) { + if (s->col.is_strings_to_cat and s->col.physical_type == BYTE_ARRAY) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. uint32_t constexpr hash_seed = 33; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a081ee4e03f..fa1de5f301d 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -441,7 +441,7 @@ gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int ta while (pos < target_pos) { int len = 0; - if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { if (k < dict_size) { len = s->dtype_len_in; } } else { if (k + 4 <= dict_size) { @@ -1144,11 +1144,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if (s->page.num_input_values > 0) { uint8_t* cur = s->page.page_data; uint8_t* end = cur + s->page.uncompressed_page_size; - - uint32_t dtype_len_out = s->col.data_type >> 3; - s->ts_scale = 0; + s->ts_scale = 0; // Validate data type - auto const data_type = s->col.data_type & 7; + auto const data_type = s->col.physical_type; + auto const is_decimal = + s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; switch (data_type) { case BOOLEAN: s->dtype_len = 1; // Boolean are stored as 1 byte on the output @@ -1159,13 +1159,15 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if 
(s->col.ts_clock_rate) { int32_t units = 0; // Duration types are not included because no scaling is done when reading - if (s->col.converted_type == TIMESTAMP_MILLIS) { - units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIMESTAMP_MICROS) { - units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.has_value() and - s->col.logical_type->is_timestamp_nanos()) { - units = cudf::timestamp_ns::period::den; + if (s->col.logical_type.has_value()) { + auto const& lt = s->col.logical_type.value(); + if (lt.is_timestamp_millis()) { + units = cudf::timestamp_ms::period::den; + } else if (lt.is_timestamp_micros()) { + units = cudf::timestamp_us::period::den; + } else if (lt.is_timestamp_nanos()) { + units = cudf::timestamp_ns::period::den; + } } if (units and units != s->col.ts_clock_rate) { s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) @@ -1176,8 +1178,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case DOUBLE: s->dtype_len = 8; break; case INT96: s->dtype_len = 12; break; case BYTE_ARRAY: - if (s->col.converted_type == DECIMAL) { - auto const decimal_precision = s->col.decimal_precision; + if (is_decimal) { + auto const decimal_precision = s->col.logical_type->precision(); s->dtype_len = [decimal_precision]() { if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return sizeof(int32_t); @@ -1192,14 +1194,14 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } break; default: // FIXED_LEN_BYTE_ARRAY: - s->dtype_len = dtype_len_out; + s->dtype_len = s->col.type_length; if (s->dtype_len <= 0) { s->set_error_code(decode_error::INVALID_DATA_TYPE); } break; } // Special check for downconversions s->dtype_len_in = s->dtype_len; if (data_type == FIXED_LEN_BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { + if (is_decimal) { s->dtype_len = [dtype_len = s->dtype_len]() { if (dtype_len <= sizeof(int32_t)) { return sizeof(int32_t); @@ -1213,17 +1215,17 @@ inline 
__device__ bool setupLocalPageInfo(page_state_s* const s, s->dtype_len = sizeof(string_index_pair); } } else if (data_type == INT32) { - if (dtype_len_out == 1) { - // INT8 output - s->dtype_len = 1; - } else if (dtype_len_out == 2) { - // INT16 output - s->dtype_len = 2; - } else if (s->col.converted_type == TIME_MILLIS) { - // INT64 output - s->dtype_len = 8; + // check for smaller bitwidths + if (s->col.logical_type.has_value()) { + auto const& lt = s->col.logical_type.value(); + if (lt.type == LogicalType::INTEGER) { + s->dtype_len = lt.bit_width() / 8; + } else if (lt.is_time_millis()) { + // cudf outputs as INT64 + s->dtype_len = 8; + } } - } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { + } else if (data_type == BYTE_ARRAY && s->col.is_strings_to_cat) { s->dtype_len = 4; // HASH32 output } else if (data_type == INT96) { s->dtype_len = 8; // Convert to 64-bit timestamp @@ -1298,7 +1300,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case Encoding::PLAIN_DICTIONARY: case Encoding::RLE_DICTIONARY: // RLE-packed dictionary indices, first byte indicates index length in bits - if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { + if (s->col.physical_type == BYTE_ARRAY && s->col.str_dict_index != nullptr) { // String dictionary: use index s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair); @@ -1316,7 +1318,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case Encoding::PLAIN: s->dict_size = static_cast(end - cur); s->dict_val = 0; - if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } + if (s->col.physical_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } break; case Encoding::RLE: { // first 4 bytes are length of RLE data diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 4a50c7445b3..07e03460ecb 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ 
b/cpp/src/io/parquet/page_hdr.cu @@ -147,12 +147,12 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk) __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BYTE_ARRAY; + return chunk.physical_type == BYTE_ARRAY; } __device__ inline bool is_boolean(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BOOLEAN; + return chunk.physical_type == BOOLEAN; } /** diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d8b1c1cc046..6f96d4dd1cf 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -689,7 +689,7 @@ CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPage auto const start_value = pp->start_val; // if data size is known, can short circuit here - if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (chunks[pp->chunk_idx].physical_type == FIXED_LEN_BYTE_ARRAY) { if (t == 0) { pp->str_bytes = pp->num_valids * s->dtype_len_in; @@ -881,7 +881,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSi auto const& col = s->col; size_t str_bytes = 0; // short circuit for FIXED_LEN_BYTE_ARRAY - if ((col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (col.physical_type == FIXED_LEN_BYTE_ARRAY) { str_bytes = pp->num_valids * s->dtype_len_in; } else { // now process string info in the range [start_value, end_value) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 82ccb2b314a..200a8ec9ddb 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -370,8 +370,8 @@ struct ColumnChunkDesc { explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, - uint16_t datatype_, - uint16_t datatype_length_, + Type datatype_, + int32_t datatype_length_, size_t start_row_, uint32_t num_rows_, int16_t max_definition_level_, @@ -379,15 
+379,14 @@ struct ColumnChunkDesc { int16_t max_nesting_depth_, uint8_t def_level_bits_, uint8_t rep_level_bits_, - int8_t codec_, - int8_t converted_type_, + Compression codec_, thrust::optional logical_type_, - int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, column_chunk_info const* chunk_info_, - float list_bytes_per_row_est_) + float list_bytes_per_row_est_, + bool strings_to_categorical_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -395,7 +394,8 @@ struct ColumnChunkDesc { num_rows(num_rows_), max_level{max_definition_level_, max_repetition_level_}, max_nesting_depth{max_nesting_depth_}, - data_type(datatype_ | (datatype_length_ << 3)), + type_length(datatype_length_), + physical_type(datatype_), level_bits{def_level_bits_, rep_level_bits_}, num_data_pages(0), num_dict_pages(0), @@ -405,14 +405,13 @@ struct ColumnChunkDesc { column_data_base{nullptr}, column_string_base{nullptr}, codec(codec_), - converted_type(converted_type_), logical_type(logical_type_), - decimal_precision(decimal_precision_), ts_clock_rate(ts_clock_rate_), src_col_index(src_col_index_), src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), - list_bytes_per_row_est(list_bytes_per_row_est_) + list_bytes_per_row_est(list_bytes_per_row_est_), + is_strings_to_cat(strings_to_categorical_) { } @@ -423,7 +422,8 @@ struct ColumnChunkDesc { uint32_t num_rows{}; // number of rows in this chunk int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level int16_t max_nesting_depth{}; // max nesting depth of the output - uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels int32_t num_data_pages{}; // number of 
data pages @@ -433,10 +433,8 @@ struct ColumnChunkDesc { bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column void** column_data_base{}; // base pointers of column data void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum + Compression codec{}; // compressed codec enum thrust::optional logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -446,6 +444,8 @@ struct ColumnChunkDesc { column_chunk_info const* h_chunk_info{}; float list_bytes_per_row_est{}; // for LIST columns, an estimate on number of bytes per row + + bool is_strings_to_cat{}; // convert strings to hashes }; /** @@ -615,11 +615,16 @@ struct EncPage { */ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { - auto const not_converted_to_decimal = chunk.converted_type != DECIMAL; + // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing + // a decimal. 
+ if (chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL) { + return false; + } + auto const non_hashed_byte_array = - (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4; - auto const fixed_len_byte_array = (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY; - return not_converted_to_decimal and (non_hashed_byte_array or fixed_len_byte_array); + chunk.physical_type == BYTE_ARRAY and not chunk.is_strings_to_cat; + auto const fixed_len_byte_array = chunk.physical_type == FIXED_LEN_BYTE_ARRAY; + return non_hashed_byte_array or fixed_len_byte_array; } /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8112328d962..2356878f6ba 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -28,6 +28,19 @@ namespace cudf::io::parquet::detail { +namespace { +// Tests the passed in logical type for a FIXED_LENGTH_BYTE_ARRAY column to see if it should +// be treated as a string. Currently the only logical type that has special handling is DECIMAL. +// Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which +// for now would also be treated as a string). 
+inline bool is_treat_fixed_length_as_string(thrust::optional const& logical_type) +{ + if (!logical_type.has_value()) { return true; } + return logical_type->type != LogicalType::DECIMAL; +} + +} // namespace + void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_rows, size_t num_rows) { auto& pass = *_pass_itm_data; @@ -66,7 +79,8 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row // TODO: we could probably dummy up size stats for FLBA data since we know the width auto const has_flba = std::any_of(pass.chunks.begin(), pass.chunks.end(), [](auto const& chunk) { - return (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY && chunk.converted_type != DECIMAL; + return chunk.physical_type == FIXED_LEN_BYTE_ARRAY and + is_treat_fixed_length_as_string(chunk.logical_type); }); if (!_has_page_index || uses_custom_row_bounds || has_flba) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5c387147e4b..912f53a8277 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -364,33 +364,28 @@ int64_t find_next_split(int64_t cur_pos, /** * @brief Converts cuDF units to Parquet units. * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - thrust::optional converted, - int32_t length) + thrust::optional logical_type) { - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? 
length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); + int32_t const clock_rate = + is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; + + // TODO(ets): this is leftover from the original code, but will we ever output decimal as + // anything but fixed point? + if (logical_type.has_value() and logical_type->type == LogicalType::DECIMAL) { + // if decimal but not outputting as float or decimal, then convert to no logical type + if (column_type_id != type_id::FLOAT64 and + not cudf::is_fixed_point(data_type{column_type_id})) { + return std::make_tuple(clock_rate, thrust::nullopt); + } } - int8_t converted_type = converted.value_or(UNKNOWN); - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); + return std::make_tuple(clock_rate, std::move(logical_type)); } /** @@ -1515,12 +1510,11 @@ void reader::impl::create_global_chunk_info() auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); auto& schema = _metadata->get_schema(col.schema_idx); - auto [type_width, clock_rate, converted_type] = + auto [clock_rate, logical_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), _timestamp_type.id(), schema.type, - schema.converted_type, - schema.type_length); + schema.logical_type); // for lists, estimate the number of bytes per row. 
this is used by the subpass reader to // determine where to split the decompression boundaries @@ -1538,7 +1532,7 @@ void reader::impl::create_global_chunk_info() nullptr, col_meta.num_values, schema.type, - type_width, + schema.type_length, row_group_start, row_group_rows, schema.max_definition_level, @@ -1547,14 +1541,13 @@ void reader::impl::create_global_chunk_info() required_bits(schema.max_definition_level), required_bits(schema.max_repetition_level), col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, + logical_type, clock_rate, i, col.schema_idx, chunk_info, - list_bytes_per_row_est)); + list_bytes_per_row_est, + schema.type == BYTE_ARRAY and _strings_to_categorical)); } remaining_rows -= row_group_rows; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 776caa99ac9..bfc69264ab2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -16,6 +16,7 @@ #include "reader_impl_helpers.hpp" +#include "io/parquet/parquet.hpp" #include "io/utilities/row_selection.hpp" #include @@ -25,44 +26,35 @@ namespace cudf::io::parquet::detail { namespace { -ConvertedType logical_type_to_converted_type(thrust::optional const& logical) +thrust::optional converted_to_logical_type(SchemaElement const& schema) { - if (not logical.has_value()) { return UNKNOWN; } - switch (logical->type) { - case LogicalType::STRING: return UTF8; - case LogicalType::MAP: return MAP; - case LogicalType::LIST: return LIST; - case LogicalType::ENUM: return ENUM; - case LogicalType::DECIMAL: return DECIMAL; // TODO use decimal scale/precision - case LogicalType::DATE: return DATE; - case LogicalType::TIME: - if (logical->is_time_millis()) { - return TIME_MILLIS; - } else if (logical->is_time_micros()) { - return TIME_MICROS; - } - break; - case LogicalType::TIMESTAMP: - if (logical->is_timestamp_millis()) { - return TIMESTAMP_MILLIS; - } else if 
(logical->is_timestamp_micros()) { - return TIMESTAMP_MICROS; - } - break; - case LogicalType::INTEGER: - switch (logical->bit_width()) { - case 8: return logical->is_signed() ? INT_8 : UINT_8; - case 16: return logical->is_signed() ? INT_16 : UINT_16; - case 32: return logical->is_signed() ? INT_32 : UINT_32; - case 64: return logical->is_signed() ? INT_64 : UINT_64; - default: break; - } - case LogicalType::UNKNOWN: return NA; - case LogicalType::JSON: return JSON; - case LogicalType::BSON: return BSON; - default: break; + if (schema.converted_type.has_value()) { + switch (schema.converted_type.value()) { + case ENUM: // treat ENUM as UTF8 string + case UTF8: return LogicalType{LogicalType::STRING}; + case MAP: return LogicalType{LogicalType::MAP}; + case LIST: return LogicalType{LogicalType::LIST}; + case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; + case DATE: return LogicalType{LogicalType::DATE}; + case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; + case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; + case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; + case TIMESTAMP_MICROS: return LogicalType{TimestampType{true, TimeUnit::MICROS}}; + case UINT_8: return LogicalType{IntType{8, false}}; + case UINT_16: return LogicalType{IntType{16, false}}; + case UINT_32: return LogicalType{IntType{32, false}}; + case UINT_64: return LogicalType{IntType{64, false}}; + case INT_8: return LogicalType{IntType{8, true}}; + case INT_16: return LogicalType{IntType{16, true}}; + case INT_32: return LogicalType{IntType{32, true}}; + case INT_64: return LogicalType{IntType{64, true}}; + case JSON: return LogicalType{LogicalType::JSON}; + case BSON: return LogicalType{LogicalType::BSON}; + case INTERVAL: // there is no logical type for INTERVAL yet + default: return LogicalType{LogicalType::UNDEFINED}; + } } - return UNKNOWN; + return thrust::nullopt; } } // 
namespace @@ -74,76 +66,90 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - auto const physical = schema.type; - auto const logical_type = schema.logical_type; - auto converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; - - // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to - // converted_type if logical_type isn't set - // Logical type used for actual data interpretation; the legacy converted type - // is superseded by 'logical' type whenever available. - auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); } - - switch (converted_type.value_or(UNKNOWN)) { - case UINT_8: return type_id::UINT8; - case INT_8: return type_id::INT8; - case UINT_16: return type_id::UINT16; - case INT_16: return type_id::INT16; - case UINT_32: return type_id::UINT32; - case UINT_64: return type_id::UINT64; - case DATE: return type_id::TIMESTAMP_DAYS; - case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case TIMESTAMP_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MILLISECONDS; - case TIMESTAMP_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id - : type_id::TIMESTAMP_MICROSECONDS; - case DECIMAL: - if (physical == INT32) { return type_id::DECIMAL32; } - if (physical == INT64) { return type_id::DECIMAL64; } - if (physical == FIXED_LEN_BYTE_ARRAY) { - if (schema.type_length <= static_cast(sizeof(int32_t))) { - return type_id::DECIMAL32; + auto const physical = schema.type; + auto logical_type = schema.logical_type; + + // sanity check, but not worth failing over + if (schema.converted_type.has_value() and not logical_type.has_value()) { + CUDF_LOG_WARN("ConvertedType is specified but not LogicalType"); + logical_type = converted_to_logical_type(schema); + } + + if (logical_type.has_value()) { + switch (logical_type->type) { + case LogicalType::INTEGER: { + auto const is_signed = logical_type->is_signed(); + switch (logical_type->bit_width()) { + case 8: return is_signed ? type_id::INT8 : type_id::UINT8; + case 16: return is_signed ? type_id::INT16 : type_id::UINT16; + case 32: return is_signed ? type_id::INT32 : type_id::UINT32; + case 64: return is_signed ? 
type_id::INT64 : type_id::UINT64; + default: CUDF_FAIL("Invalid integer bitwidth"); } - if (schema.type_length <= static_cast(sizeof(int64_t))) { - return type_id::DECIMAL64; + } break; + + case LogicalType::DATE: return type_id::TIMESTAMP_DAYS; + + case LogicalType::TIME: + if (logical_type->is_time_millis()) { + return type_id::DURATION_MILLISECONDS; + } else if (logical_type->is_time_micros()) { + return type_id::DURATION_MICROSECONDS; + } else if (logical_type->is_time_nanos()) { + return type_id::DURATION_NANOSECONDS; } - if (schema.type_length <= static_cast(sizeof(__int128_t))) { - return type_id::DECIMAL128; + break; + + case LogicalType::TIMESTAMP: + if (timestamp_type_id != type_id::EMPTY) { + return timestamp_type_id; + } else if (logical_type->is_timestamp_millis()) { + return type_id::TIMESTAMP_MILLISECONDS; + } else if (logical_type->is_timestamp_micros()) { + return type_id::TIMESTAMP_MICROSECONDS; + } else if (logical_type->is_timestamp_nanos()) { + return type_id::TIMESTAMP_NANOSECONDS; } - } - if (physical == BYTE_ARRAY) { - CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + + case LogicalType::DECIMAL: { + int32_t const decimal_precision = logical_type->precision(); + if (physical == INT32) { return type_id::DECIMAL32; - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + } else if (physical == INT64) { return type_id::DECIMAL64; + } else if (physical == FIXED_LEN_BYTE_ARRAY) { + if (schema.type_length <= static_cast(sizeof(int32_t))) { + return type_id::DECIMAL32; + } else if (schema.type_length <= static_cast(sizeof(int64_t))) { + return type_id::DECIMAL64; + } else if (schema.type_length <= static_cast(sizeof(__int128_t))) { + return type_id::DECIMAL128; + } + } else if (physical == BYTE_ARRAY) { + CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + 
return type_id::DECIMAL32; + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + return type_id::DECIMAL64; + } else { + return type_id::DECIMAL128; + } } else { - return type_id::DECIMAL128; + CUDF_FAIL("Invalid representation of decimal type"); } - } - CUDF_FAIL("Invalid representation of decimal type"); - break; - - // maps are just List>. - case MAP: - case LIST: return type_id::LIST; - case NA: return type_id::STRING; - // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support - default: break; - } + } break; - if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) { - if (logical_type->is_timestamp_nanos()) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } else if (logical_type->is_time_nanos()) { - return type_id::DURATION_NANOSECONDS; + // maps are just List>. + case LogicalType::MAP: + case LogicalType::LIST: return type_id::LIST; + + // All null column that can't have its type deduced. 
+ // Note: originally LogicalType::UNKNOWN was converted to ConvertedType::NA, and + // NA then became type_id::STRING, but with the following TODO: + // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support + case LogicalType::UNKNOWN: return type_id::STRING; + + default: break; } } @@ -208,6 +214,7 @@ void metadata::sanitize_schema() // This is a list of structs, so we need to mark this as a list, but also // add a struct child and move this element's children to the struct schema_elem.converted_type = LIST; + schema_elem.logical_type = LogicalType::LIST; schema_elem.repetition_type = OPTIONAL; auto const struct_node_idx = static_cast(schema.size()); @@ -216,7 +223,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = UNKNOWN; + struct_elem.converted_type = thrust::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); @@ -238,6 +245,11 @@ void metadata::sanitize_schema() } } + // convert ConvertedType to LogicalType for older files + if (schema_elem.converted_type.has_value() and not schema_elem.logical_type.has_value()) { + schema_elem.logical_type = converted_to_logical_type(schema_elem); + } + for (auto& child_idx : schema_elem.children_idx) { process(child_idx); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e39445108a6..4b7a64ac6ab 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -643,7 +643,7 @@ struct set_str_dict_index_count { __device__ void operator()(PageInfo const& page) { auto const& chunk = chunks[page.chunk_idx]; - if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && (chunk.data_type & 0x7) == BYTE_ARRAY && + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { // there 
is only ever one dictionary page per chunk, so this is safe to do in parallel. str_dict_index_count[page.chunk_idx] = page.num_input_values; @@ -659,7 +659,7 @@ struct set_str_dict_index_ptr { __device__ void operator()(size_t i) { auto& chunk = chunks[i]; - if ((chunk.data_type & 0x7) == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { + if (chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { chunk.str_dict_index = base + str_dict_index_offsets[i]; } } From 35f818b3e4bef8e331f083dadc9a4c45e2987a78 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 27 Mar 2024 13:39:41 -0500 Subject: [PATCH 05/69] Use `conda env create --yes` instead of `--force` (#15403) conda dropped support for the `--force` flag to `conda env create`. This changes that flag name to `--yes`. See https://github.com/conda/conda/blob/main/CHANGELOG.md#2430-2024-03-12 and https://github.com/rapidsai/miniforge-cuda/pull/63 for more info. --- ci/build_docs.sh | 2 +- ci/check_style.sh | 2 +- ci/test_cpp_common.sh | 4 ++-- ci/test_java.sh | 4 ++-- ci/test_notebooks.sh | 4 ++-- ci/test_python_common.sh | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 8e22f02b484..668d52e530b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -17,7 +17,7 @@ rapids-dependency-file-generator \ --file_key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n docs +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs conda activate docs rapids-print-env diff --git a/ci/check_style.sh b/ci/check_style.sh index b3890607f64..029cd305f1d 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -13,7 +13,7 @@ rapids-dependency-file-generator \ --file_key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" 
-rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n checks +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks conda activate checks RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index 163d381c1d4..e1b2a367187 100644 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ --file_key test_cpp \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test # Temporarily allow unbound variables for conda activation. set +u diff --git a/ci/test_java.sh b/ci/test_java.sh index 0863795162d..c93079742f0 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ --file_key test_java \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test export CMAKE_GENERATOR=Ninja diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index b746a18aed1..8be2d374bed 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
set -euo pipefail @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ --file_key test_notebooks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test # Temporarily allow unbound variables for conda activation. set +u diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index 1c330d47ac6..7559d970f6d 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Common setup steps shared by Python test jobs @@ -16,7 +16,7 @@ rapids-dependency-file-generator \ --file_key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test # Temporarily allow unbound variables for conda activation. set +u From aab6137c80c50eccc5007120f7140cfe6646b5e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Apr 2024 04:01:36 -0700 Subject: [PATCH 06/69] First pass at adding testing for pylibcudf (#15300) This PR adds tests of the `pylibcudf.copying` module along with establishing the infrastructure and best practices for writing pylibcudf tests going forward (and adding associated documentation). 
Resolves #15133 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Ashwin Srinath (https://github.com/shwina) - Jake Awe (https://github.com/AyodeAwe) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/15300 --- ci/test_python_cudf.sh | 8 + ci/test_wheel_cudf.sh | 8 + cpp/include/cudf/copying.hpp | 3 + cpp/src/copying/copy.cpp | 5 +- cpp/src/copying/copy_range.cu | 2 +- cpp/src/copying/scatter.cu | 11 +- docs/cudf/source/developer_guide/pylibcudf.md | 66 ++ docs/cudf/source/developer_guide/testing.md | 6 + python/cudf/cudf/_lib/cpp/copying.pxd | 42 +- python/cudf/cudf/_lib/pylibcudf/column.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 6 +- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 126 ++- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 1 + python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 8 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 5 + .../cudf/cudf/pylibcudf_tests/common/utils.py | 111 +++ python/cudf/cudf/pylibcudf_tests/conftest.py | 31 + python/cudf/cudf/pylibcudf_tests/pytest.ini | 8 + .../cudf/cudf/pylibcudf_tests/test_copying.py | 848 ++++++++++++++++++ 21 files changed, 1254 insertions(+), 54 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/common/utils.py create mode 100644 python/cudf/cudf/pylibcudf_tests/conftest.py create mode 100644 python/cudf/cudf/pylibcudf_tests/pytest.ini create mode 100644 python/cudf/cudf/pylibcudf_tests/test_copying.py diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index bacb54b3896..217dd2fd9a8 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -14,6 +14,14 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +rapids-logger "pytest pylibcudf" +pushd python/cudf/cudf/pylibcudf_tests +python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . 
+popd + rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 83f0b976128..a6f122491b0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -18,6 +18,14 @@ if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then rapids-logger "Run smoke tests for cudf" python ./ci/wheel_smoke_test_cudf.py else + rapids-logger "pytest pylibcudf" + pushd python/cudf/cudf/pylibcudf_tests + python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . + popd + rapids-logger "pytest cudf" pushd python/cudf/cudf/tests python -m pytest \ diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index b2cde82fada..df96efdaffc 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -253,6 +253,8 @@ std::unique_ptr empty_like(scalar const& input); * If the `mask_alloc` allocates a validity mask that mask is also uninitialized * and the validity bits and the null count should be set by the caller. * + * @throws cudf::data_type_error if input type is not of fixed width. + * * @param input Immutable view of input column to emulate * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN * @param mr Device memory resource used to allocate the returned column's device memory @@ -360,6 +362,7 @@ void copy_range_in_place(column_view const& source, * * @throws std::out_of_range for any invalid range. * @throws cudf::data_type_error if @p target and @p source have different types. 
+ * @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary * * @param source The column to copy from inside the range * @param target The column to copy from outside the range diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 490a1ccb254..cb7d507de81 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,7 +122,8 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); + CUDF_EXPECTS( + is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error); mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable()); return std::make_unique(input.type(), diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 038646d8cf4..e10d7081a55 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -119,7 +119,7 @@ struct out_of_place_copy_range_dispatch { std::enable_if_t(), std::unique_ptr> operator()(Args...) 
{ - CUDF_FAIL("Unsupported type for out of place copy."); + CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error); } }; diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 7931df4c9f0..3bc3979ec1b 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -144,7 +144,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto const scalar_impl = static_cast(&source.get()); auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); @@ -166,6 +168,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto result = lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); @@ -249,6 +254,10 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); + // For each field of `source`, copy construct a scalar from the field // and dispatch to the corresponding scalar scatterer diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 0120cbb286e..0b881b2b057 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -96,6 +96,72 @@ There are a couple of notable points from the snippet above: - The object returned from libcudf is immediately converted to a pylibcudf type. 
- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case). +## Testing + +When writing pylibcudf tests, it is important to remember that all the APIs should be tested in the C++ layer in libcudf already. +The primary purpose of pylibcudf tests is to ensure the correctness of the _bindings_; the correctness of the underlying implementation should generally be validated in libcudf. +If pylibcudf tests uncover a libcudf bug, a suitable libcudf test should be added to cover this case rather than relying solely on pylibcudf testing. + +pylibcudf's ``conftest.py`` contains some standard parametrized dtype fixture lists that may in turn be used to parametrize other fixtures. +Fixtures allocating data should leverage these dtype lists wherever possible to simplify testing across the matrix of important types. +Where appropriate, new fixture lists may be added. + +To run tests as efficiently as possible, the test suite should make generous use of fixtures. +The simplest general structure to follow is for pyarrow array/table/scalar fixtures to be parametrized by one of the dtype lists. +Then, a corresponding pylibcudf fixture may be created using a simple `from_arrow` call. +This approach ensures consistent global coverage across types for various tests. + +In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data. +This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above. +Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module. 
+ +Here is an example demonstrating the above points: + +```python +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from cudf._lib import pylibcudf as plc +from utils import assert_column_eq + +# The pa_dtype fixture is defined in conftest.py. +@pytest.fixture(scope="module") +def pa_column(pa_dtype): + return pa.array([1, 2, 3]) + + +@pytest.fixture(scope="module") +def column(pa_column): + return plc.interop.from_arrow(pa_column) + + +def test_foo(pa_column, column): + index = 1 + result = plc.foo(column) + expected = pa.foo(pa_column) + + assert_column_eq(result, expected) +``` + +Some guidelines on what should be tested: +- Tests SHOULD comprehensively cover the API, including all possible combinations of arguments required to ensure good test coverage. +- pylibcudf SHOULD NOT attempt to stress test large data sizes, and SHOULD instead defer to libcudf tests. + - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead. +- Nullable data should always be tested. +- Expected exceptions should be tested. Tests should be written with the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated. + - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). 
If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd). + +Some guidelines on how best to use pytests. +- By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance. +- Where necessary, mutable fixtures should be named as such (e.g. `mutable_col`) and be of function scope. If possible, they can be implemented as simply making a copy of a corresponding module-scope immutable fixture to avoid duplicating the generation logic. + +Tests should be organized corresponding to pylibcudf modules, i.e. one test module for each pylibcudf module. + +The following sections of the cuDF Python testing guide also generally apply to pylibcudf unless superseded by any statements above: +- [](#test_parametrization) +- [](#xfailing_tests) +- [](#testing_warnings) + ## Miscellaneous Notes ### Cython Scoped Enums diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index a28a6b9192d..f12f809d5db 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -55,6 +55,8 @@ Typically, exception cases require specific assertions or other special logic, s The main exception to this rule is tests based on comparison to pandas. Such tests may test exceptional cases alongside more typical cases since the logic is generally identical. 
+(test_parametrization)= + ### Parametrization: custom fixtures and `pytest.mark.parametrize` When it comes to parametrizing tests written with `pytest`, @@ -140,6 +142,8 @@ def test_odds(): Other approaches are also possible, and the best solution should be discussed on a case-by-case basis during PR review. +(xfailing_tests)= + ### Tests with expected failures (`xfail`s) In some circumstances it makes sense to mark a test as _expected_ to @@ -218,6 +222,8 @@ This way, when the bug is fixed, the test suite will fail at this point (and we will remember to update the test). +(testing_warnings)= + ### Testing code that throws warnings Some code may be expected to throw warnings. diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index f3e5c0aec72..053e2299f22 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool @@ -33,19 +33,19 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& input, size_type offset, const scalar& fill_values - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] scatter ( const table_view& source_table, const column_view& scatter_map, const table_view& target_table, - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] scatter ( const vector[reference_wrapper[constscalar]]& source_scalars, const column_view& indices, const table_view& target, - ) except + + ) except +cudf_exception_handler cpdef enum class mask_allocation_policy(int32_t): NEVER @@ -54,22 +54,22 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[column] empty_like ( const column_view& input_column - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, mask_allocation_policy policy - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, size_type size, mask_allocation_policy policy - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] empty_like ( const table_view& input_table - ) except + + ) except +cudf_exception_handler cdef void copy_range_in_place ( const column_view& input_column, @@ -77,7 +77,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_range ( const column_view& input_column, @@ -85,68 +85,68 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except + + ) except +cudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, vector[size_type] indices - ) except + + ) except +cudf_exception_handler 
cdef vector[table_view] slice ( const table_view& input_table, vector[size_type] indices - ) except + + ) except +cudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, vector[size_type] splits - ) except + + ) except +cudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, vector[size_type] splits - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const column_view& rhs, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const scalar& rhs, const column_view boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const scalar& rhs, const column_view boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const table_view& input, const table_view& target, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const vector[reference_wrapper[constscalar]]& input, const table_view& target, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, size_type index - ) except + + ) except +cudf_exception_handler cpdef enum class sample_with_replacement(bool): FALSE diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index fc5cc77c9e7..66ccdb53d1a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -43,6 +43,7 @@ cdef class Column: cpdef gpumemoryview data(self) cpdef gpumemoryview null_mask(self) cpdef list children(self) + cpdef 
Column copy(self) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 3c5c53f99cf..2565e92d5c9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer @@ -274,6 +274,13 @@ cdef class Column: """The children of the column.""" return self._children + cpdef Column copy(self): + """Create a copy of the column.""" + cdef unique_ptr[column] c_result + with nogil: + c_result = move(make_unique[column](self.view())) + return Column.from_libcudf(move(c_result)) + cdef class ListColumnView: """Accessor for methods of a Column that are specific to lists.""" diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index 7b5f1e70ea3..0211d122c8e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -58,12 +58,12 @@ cpdef Column copy_range( size_type target_begin, ) -cpdef Column shift(Column input, size_type offset, Scalar fill_values) - -cpdef list split(ColumnOrTable input, list splits) +cpdef Column shift(Column input, size_type offset, Scalar fill_value) cpdef list slice(ColumnOrTable input, list indices) +cpdef list split(ColumnOrTable input, list splits) + cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index d78955dc325..125a4ffe65f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -54,6 +54,11 @@ cpdef Table gather( ------- pylibcudf.Table The result of the gather + 
+ Raises + ------ + ValueError + If the gather_map contains nulls. """ cdef unique_ptr[table] c_result with nogil: @@ -92,6 +97,20 @@ cpdef Table scatter( ------- Table The result of the scatter + + Raises + ------ + ValueError + If any of the following occur: + - scatter_map contains null values. + - source is a Table and the number of columns in source does not match the + number of columns in target. + - source is a Table and the number of rows in source does not match the + number of elements in scatter_map. + - source is a List[Scalar] and the number of scalars does not match the + number of columns in target. + TypeError + If data types of the source and target columns do not match. """ cdef unique_ptr[table] c_result cdef vector[reference_wrapper[const scalar]] source_scalars @@ -207,6 +226,17 @@ cpdef Column copy_range_in_place( The index of the last element in input_column to copy. target_begin : int The index of the first element in target_column to overwrite. + + Raises + ------ + TypeError + If the operation is attempted on non-fixed width types since those would require + memory reallocations, or if the input and target columns have different types. + IndexError + If the indices accessed by the ranges implied by input_begin, input_end, and + target_begin are out of bounds. + ValueError + If source has null values and target is not nullable. """ # Need to initialize this outside the function call so that Cython doesn't @@ -251,6 +281,14 @@ cpdef Column copy_range( ------- pylibcudf.Column A copy of target_column with the specified range overwritten. + + Raises + ------ + IndexError + If the indices accessed by the ranges implied by input_begin, input_end, and + target_begin are out of bounds. + TypeError + If target and source have different types. 
""" cdef unique_ptr[column] c_result @@ -266,7 +304,7 @@ cpdef Column copy_range( return Column.from_libcudf(move(c_result)) -cpdef Column shift(Column input, size_type offset, Scalar fill_values): +cpdef Column shift(Column input, size_type offset, Scalar fill_value): """Shift the elements of input by offset. For details on the implementation, see :cpp:func:`shift`. @@ -285,6 +323,12 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): ------- pylibcudf.Column A copy of input shifted by offset. + + Raises + ------ + TypeError + If the fill_value is not of the same type as input, or if the input type is not + of fixed width or string type. """ cdef unique_ptr[column] c_result with nogil: @@ -292,37 +336,44 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): cpp_copying.shift( input.view(), offset, - dereference(fill_values.c_obj) + dereference(fill_value.c_obj) ) ) return Column.from_libcudf(move(c_result)) -cpdef list split(ColumnOrTable input, list splits): - """Split input into multiple. +cpdef list slice(ColumnOrTable input, list indices): + """Slice input according to indices. - For details on the implementation, see :cpp:func:`split`. + For details on the implementation, see :cpp:func:`slice`. Parameters ---------- - input : Union[Column, Table] - The column to split. - splits : List[int] - The indices at which to split the column. + input_column : Union[Column, Table] + The column or table to slice. + indices : List[int] + The indices to select from input. Returns ------- List[Union[Column, Table]] - The result of splitting input. + The result of slicing ``input``. + + Raises + ------ + ValueError + If indices size is not even or the values in any pair of lower/upper bounds are + strictly decreasing. + IndexError + When any of the indices don't belong to the range ``[0, input_column.size())``. 
""" - cdef vector[size_type] c_splits = splits + cdef vector[size_type] c_indices = indices cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i - if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = move(cpp_copying.slice(input.view(), c_indices)) return [ Column.from_column_view(c_col_result[i], input) @@ -330,7 +381,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) return [ Table.from_table_view(c_tbl_result[i], input) @@ -338,30 +389,31 @@ cpdef list split(ColumnOrTable input, list splits): ] -cpdef list slice(ColumnOrTable input, list indices): - """Slice input according to indices. +cpdef list split(ColumnOrTable input, list splits): + """Split input into multiple. - For details on the implementation, see :cpp:func:`slice`. + For details on the implementation, see :cpp:func:`split`. Parameters ---------- - input_column : Union[Column, Table] - The column or table to slice. - indices : List[int] - The indices to select from input. + input : Union[Column, Table] + The column to split. + splits : List[int] + The indices at which to split the column. Returns ------- List[Union[Column, Table]] - The result of slicing ``input``. + The result of splitting input. 
""" - cdef vector[size_type] c_indices = indices + cdef vector[size_type] c_splits = splits cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i + if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = move(cpp_copying.split(input.view(), c_splits)) return [ Column.from_column_view(c_col_result[i], input) @@ -369,7 +421,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) return [ Table.from_table_view(c_tbl_result[i], input) @@ -401,6 +453,15 @@ cpdef Column copy_if_else( ------- pylibcudf.Column The result of copying elements from lhs and rhs according to boolean_mask. + + Raises + ------ + TypeError + If lhs and rhs are not of the same type or if the boolean mask is not of type + bool. + ValueError + If boolean mask is not of the same length as lhs and rhs (whichever are + columns), or if lhs and rhs are not of the same length (if both are columns). """ cdef unique_ptr[column] result @@ -459,6 +520,16 @@ cpdef Table boolean_mask_scatter( ------- Table The result of the scatter + + Raises + ------ + ValueError + If input.num_columns() != target.num_columns(), boolean_mask.size() != + target.num_rows(), or if input is a Table and the number of `true` in + `boolean_mask` > input.num_rows(). + TypeError + If any input type does not match the corresponding target column's type, or + if boolean_mask.type() is not bool. """ cdef unique_ptr[table] result cdef vector[reference_wrapper[const scalar]] source_scalars @@ -502,6 +573,11 @@ cpdef Scalar get_element(Column input_column, size_type index): ------- pylibcudf.Scalar The element at index from input_column. + + Raises + ------ + IndexError + If index is out of bounds. 
""" cdef unique_ptr[scalar] c_output with nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index e7471033fc8..8dc41fccc0c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -140,6 +140,7 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): @from_arrow.register(pa.Array) +@from_arrow.register(pa.ChunkedArray) def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 327f3911489..7467bfccaa8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -12,6 +12,9 @@ cdef class Table: cdef table_view view(self) nogil + cpdef int num_columns(self) + cpdef int num_rows(self) + @staticmethod cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 793e6330244..1fa60ec2b6c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -77,6 +77,14 @@ cdef class Table: for i in range(tv.num_columns()) ]) + cpdef int num_columns(self): + """The number of columns in this table.""" + return len(self._columns) + + cpdef int num_rows(self): + """The number of rows in this table.""" + return self._columns[0].size() + cpdef list columns(self): """The columns in this table.""" return self._columns diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index f6ff6e5a2fc..d8b92283412 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -39,6 +39,11 @@ cdef class DataType: """Get the scale associated with this data type.""" return self.c_obj.scale() + def __eq__(self, 
other): + if not isinstance(other, DataType): + return False + return self.id() == other.id() and self.scale() == other.scale() + @staticmethod cdef DataType from_libcudf(data_type dt): """Create a DataType from a libcudf data_type. diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py new file mode 100644 index 00000000000..6636ab9e5f8 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Optional + +import pyarrow as pa +import pytest + +from cudf._lib import pylibcudf as plc + + +def metadata_from_arrow_array( + pa_array: pa.Array, +) -> Optional[plc.interop.ColumnMetadata]: + metadata = None + if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): + metadata = plc.interop.ColumnMetadata( + "", + # libcudf does not store field names, so just match pyarrow's. + [ + plc.interop.ColumnMetadata(pa_array.type.field(i).name) + for i in range(pa_array.type.num_fields) + ], + ) + return metadata + + +def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: + """Verify that the pylibcudf array and PyArrow array are equal.""" + # Nested types require children metadata to be passed to the conversion function. 
+ plc_pa = plc.interop.to_arrow( + plc_column, metadata=metadata_from_arrow_array(pa_array) + ) + + if isinstance(plc_pa, pa.ChunkedArray): + plc_pa = plc_pa.combine_chunks() + if isinstance(pa_array, pa.ChunkedArray): + pa_array = pa_array.combine_chunks() + + assert plc_pa.equals(pa_array) + + +def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: + """Verify that the pylibcudf table and PyArrow table are equal.""" + plc_shape = (plc_table.num_rows(), plc_table.num_columns()) + assert plc_shape == pa_table.shape + + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + assert_column_eq(plc_col, pa_col) + + +def cudf_raises(expected_exception: BaseException, *args, **kwargs): + # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions + match = kwargs.get("match", None) + if match is None: + kwargs["match"] = "CUDF failure at" + return pytest.raises(expected_exception, *args, **kwargs) + + +# TODO: Consider moving these type utilities into pylibcudf.types itself.
+def is_signed_integer(plc_dtype: plc.DataType): + return ( + plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value + ) + + +def is_unsigned_integer(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, + ) + + +def is_integer(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.INT8, + plc.TypeId.INT16, + plc.TypeId.INT32, + plc.TypeId.INT64, + ) + + +def is_floating(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.FLOAT32, + plc.TypeId.FLOAT64, + ) + + +def is_boolean(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.BOOL8 + + +def is_string(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.STRING + + +def is_fixed_width(plc_dtype: plc.DataType): + return ( + is_integer(plc_dtype) + or is_floating(plc_dtype) + or is_boolean(plc_dtype) + ) + + +# We must explicitly specify this type via a field to ensure we don't include +# nullability accidentally. +DEFAULT_STRUCT_TESTING_TYPE = pa.struct( + [pa.field("v", pa.int64(), nullable=False)] +) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py new file mode 100644 index 00000000000..6d8284fb3db --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Tell ruff it's OK that some imports occur after the sys.path.insert +# ruff: noqa: E402 +import os +import sys + +import pyarrow as pa +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) + +from utils import DEFAULT_STRUCT_TESTING_TYPE + + +# This fixture defines the standard set of types that all tests should default to +# running on. If there is a need for some tests to run on a different set of types, that +# type list fixture should also be defined below here if it is likely to be reused +# across modules. 
Otherwise it may be defined on a per-module basis. +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.string(), + pa.bool_(), + pa.list_(pa.int64()), + DEFAULT_STRUCT_TESTING_TYPE, + ], +) +def pa_type(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/pytest.ini b/python/cudf/cudf/pylibcudf_tests/pytest.ini new file mode 100644 index 00000000000..1761c0f011c --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/pytest.ini @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +xfail_strict = true +filterwarnings = + error + ignore:::.*xdist.* + ignore:::.*pytest.* diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py new file mode 100644 index 00000000000..0bf30f98636 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -0,0 +1,848 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import ( + DEFAULT_STRUCT_TESTING_TYPE, + assert_column_eq, + assert_table_eq, + cudf_raises, + is_fixed_width, + is_floating, + is_integer, + is_string, + metadata_from_arrow_array, +) + +from cudf._lib import pylibcudf as plc + + +# TODO: Test nullable data +@pytest.fixture(scope="module") +def pa_input_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([1, 2, 3], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["a", "b", "c"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([True, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[1], [2], [3]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def input_column(pa_input_column): + return 
plc.interop.from_arrow(pa_input_column) + + +@pytest.fixture(scope="module") +def pa_index_column(): + # Index column for testing gather/scatter, always integral. + return pa.array([1, 2, 3]) + + +@pytest.fixture(scope="module") +def index_column(pa_index_column): + return plc.interop.from_arrow(pa_index_column) + + +@pytest.fixture(scope="module") +def pa_target_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([4, 5, 6, 7, 8, 9], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([False, True, True, False, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array( + [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], + type=pa_type, + ) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def target_column(pa_target_column): + return plc.interop.from_arrow(pa_target_column) + + +@pytest.fixture +def mutable_target_column(target_column): + return target_column.copy() + + +@pytest.fixture(scope="module") +def pa_source_table(pa_input_column): + return pa.table([pa_input_column] * 3, [""] * 3) + + +@pytest.fixture(scope="module") +def source_table(pa_source_table): + return plc.interop.from_arrow(pa_source_table) + + +@pytest.fixture(scope="module") +def pa_target_table(pa_target_column): + return pa.table([pa_target_column] * 3, [""] * 3) + + +@pytest.fixture(scope="module") +def target_table(pa_target_table): + return plc.interop.from_arrow(pa_target_table) + + +@pytest.fixture(scope="module") +def pa_source_scalar(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.scalar(1, type=pa_type) + elif pa.types.is_string(pa_type): + return pa.scalar("a", type=pa_type) + elif 
pa.types.is_boolean(pa_type): + return pa.scalar(False, type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Longer list? + return pa.scalar([1], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.scalar({"v": 1}, type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def source_scalar(pa_source_scalar): + return plc.interop.from_arrow(pa_source_scalar) + + +@pytest.fixture(scope="module") +def pa_mask(pa_target_column): + return pa.array([True, False] * (len(pa_target_column) // 2)) + + +@pytest.fixture(scope="module") +def mask(pa_mask): + return plc.interop.from_arrow(pa_mask) + + +def test_gather(target_table, pa_target_table, index_column, pa_index_column): + result = plc.copying.gather( + target_table, + index_column, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + expected = pa_target_table.take(pa_index_column) + assert_table_eq(result, expected) + + +def test_gather_map_has_nulls(target_table): + gather_map = plc.interop.from_arrow(pa.array([0, 1, None])) + with cudf_raises(ValueError): + plc.copying.gather( + target_table, + gather_map, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + + +def _pyarrow_index_to_mask(indices, mask_size): + # Convert a list of indices to a boolean mask. + return pc.is_in(pa.array(range(mask_size)), pa.array(indices)) + + +def _pyarrow_boolean_mask_scatter_column(source, mask, target): + if isinstance(source, pa.Scalar): + # if_else requires array lengths to match exactly or the replacement must be a + # scalar, so we use this in the scalar case. + return pc.if_else(mask, target, source) + + if isinstance(source, pa.ChunkedArray): + source = source.combine_chunks() + if isinstance(target, pa.ChunkedArray): + target = target.combine_chunks() + + # replace_with_mask accepts a column whose size is the number of true values in + # the mask, so we can use it for columnar scatters. 
+ return pc.replace_with_mask(target, mask, source) + + +def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): + # pyarrow equivalent of cudf's boolean_mask_scatter. + return pa.table( + [ + _pyarrow_boolean_mask_scatter_column(r, mask, v) + for v, r in zip(target_table, source) + ], + [""] * target_table.num_columns, + ) + + +def test_scatter_table( + source_table, + pa_source_table, + index_column, + pa_index_column, + target_table, + pa_target_table, +): + result = plc.copying.scatter( + source_table, + index_column, + target_table, + ) + + if pa.types.is_list( + dtype := pa_target_table[0].type + ) or pa.types.is_struct(dtype): + # pyarrow does not support scattering with list data. If and when they do, + # replace this hardcoding with their implementation. + with pytest.raises(pa.ArrowNotImplementedError): + _pyarrow_boolean_mask_scatter_table( + pa_source_table, + _pyarrow_index_to_mask( + pa_index_column, pa_target_table.num_rows + ), + pa_target_table, + ) + + if pa.types.is_list(dtype := pa_target_table[0].type): + expected = pa.table( + [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3 + ) + elif pa.types.is_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"v": 4}, + {"v": 1}, + {"v": 2}, + {"v": 3}, + {"v": 8}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = _pyarrow_boolean_mask_scatter_table( + pa_source_table, + _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_scatter_table_num_col_mismatch( + source_table, index_column, target_table +): + # Number of columns in source and target must match. + with cudf_raises(ValueError): + plc.copying.scatter( + plc.Table(source_table.columns()[:2]), + index_column, + target_table, + ) + + +def test_scatter_table_num_row_mismatch(source_table, target_table): + # Number of rows in source and scatter map must match. 
+ with cudf_raises(ValueError): + plc.copying.scatter( + source_table, + plc.interop.from_arrow( + pa.array(range(source_table.num_rows() * 2)) + ), + target_table, + ) + + +def test_scatter_table_map_has_nulls(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.scatter( + source_table, + plc.interop.from_arrow(pa.array([None] * source_table.num_rows())), + target_table, + ) + + +def test_scatter_table_type_mismatch(source_table, index_column, target_table): + with cudf_raises(TypeError): + if is_integer( + dtype := target_table.columns()[0].type() + ) or is_floating(dtype): + pa_array = pa.array([True] * source_table.num_rows()) + else: + pa_array = pa.array([1] * source_table.num_rows()) + ncol = source_table.num_columns() + pa_table = pa.table([pa_array] * ncol, [""] * ncol) + plc.copying.scatter( + plc.interop.from_arrow(pa_table), + index_column, + target_table, + ) + + +def test_scatter_scalars( + source_scalar, + pa_source_scalar, + index_column, + pa_index_column, + target_table, + pa_target_table, +): + result = plc.copying.scatter( + [source_scalar] * target_table.num_columns(), + index_column, + target_table, + ) + + expected = _pyarrow_boolean_mask_scatter_table( + [pa_source_scalar] * target_table.num_columns(), + pc.invert( + _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows) + ), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_scatter_scalars_num_scalars_mismatch( + source_scalar, index_column, target_table +): + with cudf_raises(ValueError): + plc.copying.scatter( + [source_scalar] * (target_table.num_columns() - 1), + index_column, + target_table, + ) + + +def test_scatter_scalars_map_has_nulls(source_scalar, target_table): + with cudf_raises(ValueError): + plc.copying.scatter( + [source_scalar] * target_table.num_columns(), + plc.interop.from_arrow(pa.array([None, None])), + target_table, + ) + + +def test_scatter_scalars_type_mismatch(index_column, target_table): + with 
cudf_raises(TypeError): + if is_integer( + dtype := target_table.columns()[0].type() + ) or is_floating(dtype): + source_scalar = [plc.interop.from_arrow(pa.scalar(True))] + else: + source_scalar = [plc.interop.from_arrow(pa.scalar(1))] + plc.copying.scatter( + source_scalar * target_table.num_columns(), + index_column, + target_table, + ) + + +def test_empty_like_column(input_column): + result = plc.copying.empty_like(input_column) + assert result.type() == input_column.type() + + +def test_empty_like_table(source_table): + result = plc.copying.empty_like(source_table) + assert result.num_columns() == source_table.num_columns() + for icol, rcol in zip(source_table.columns(), result.columns()): + assert rcol.type() == icol.type() + + +@pytest.mark.parametrize("size", [None, 10]) +def test_allocate_like(input_column, size): + if is_fixed_width(input_column.type()): + result = plc.copying.allocate_like( + input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size + ) + assert result.type() == input_column.type() + assert result.size() == (input_column.size() if size is None else size) + else: + with pytest.raises(TypeError): + plc.copying.allocate_like( + input_column, + plc.copying.MaskAllocationPolicy.RETAIN, + size=size, + ) + + +def test_copy_range_in_place( + input_column, pa_input_column, mutable_target_column, pa_target_column +): + if not is_fixed_width(mutable_target_column.type()): + with pytest.raises(TypeError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + else: + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + expected = _pyarrow_boolean_mask_scatter_column( + pa_input_column, + _pyarrow_index_to_mask( + range(len(pa_input_column)), len(pa_target_column) + ), + pa_target_column, + ) + assert_column_eq(mutable_target_column, expected) + + +def test_copy_range_in_place_out_of_bounds( + input_column, 
mutable_target_column +): + if is_fixed_width(mutable_target_column.type()): + with cudf_raises(IndexError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 5, + 5 + input_column.size(), + 0, + ) + + +def test_copy_range_in_place_different_types(mutable_target_column): + if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range_in_place_null_mismatch( + pa_input_column, mutable_target_column +): + if is_fixed_width(mutable_target_column.type()): + pa_input_column = pc.if_else( + _pyarrow_index_to_mask([0], len(pa_input_column)), + pa_input_column, + pa.scalar(None, type=pa_input_column.type), + ) + input_column = plc.interop.from_arrow(pa_input_column) + with cudf_raises(ValueError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range( + input_column, pa_input_column, target_column, pa_target_column +): + if is_fixed_width(dtype := target_column.type()) or is_string(dtype): + result = plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + expected = _pyarrow_boolean_mask_scatter_column( + pa_input_column, + _pyarrow_index_to_mask( + range(len(pa_input_column)), len(pa_target_column) + ), + pa_target_column, + ) + assert_column_eq(result, expected) + else: + with pytest.raises(TypeError): + plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range_out_of_bounds(input_column, target_column): + with cudf_raises(IndexError): + plc.copying.copy_range( + input_column, + target_column, + 5, + 5 + input_column.size(), + 0, + ) + + +def 
test_copy_range_different_types(target_column): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + + +def test_shift( + target_column, pa_target_column, source_scalar, pa_source_scalar +): + shift = 2 + if is_fixed_width(dtype := target_column.type()) or is_string(dtype): + result = plc.copying.shift(target_column, shift, source_scalar) + expected = pa.concat_arrays( + [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] + ) + assert_column_eq(result, expected) + else: + with pytest.raises(TypeError): + plc.copying.shift(target_column, shift, source_scalar) + + +def test_shift_type_mismatch(target_column): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + fill_value = plc.interop.from_arrow(pa.scalar("a")) + else: + fill_value = plc.interop.from_arrow(pa.scalar(1)) + + with cudf_raises(TypeError): + plc.copying.shift(target_column, 2, fill_value) + + +def test_slice_column(target_column, pa_target_column): + bounds = list(range(6)) + upper_bounds = bounds[1::2] + lower_bounds = bounds[::2] + result = plc.copying.slice(target_column, bounds) + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + assert_column_eq(slice_, pa_target_column[lb:ub]) + + +def test_slice_column_wrong_length(target_column): + with cudf_raises(ValueError): + plc.copying.slice(target_column, list(range(5))) + + +def test_slice_column_decreasing(target_column): + with cudf_raises(ValueError): + plc.copying.slice(target_column, list(range(5, -1, -1))) + + +def test_slice_column_out_of_bounds(target_column): + with cudf_raises(IndexError): + plc.copying.slice(target_column, list(range(2, 8))) + + +def test_slice_table(target_table, pa_target_table): + bounds = 
list(range(6)) + upper_bounds = bounds[1::2] + lower_bounds = bounds[::2] + result = plc.copying.slice(target_table, bounds) + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + assert_table_eq(slice_, pa_target_table[lb:ub]) + + +def test_split_column(target_column, pa_target_column): + upper_bounds = [1, 3, 5] + lower_bounds = [0] + upper_bounds[:-1] + result = plc.copying.split(target_column, upper_bounds) + for lb, ub, split in zip(lower_bounds, upper_bounds, result): + assert_column_eq(split, pa_target_column[lb:ub]) + + +def test_split_column_decreasing(target_column): + with cudf_raises(ValueError): + plc.copying.split(target_column, list(range(5, -1, -1))) + + +def test_split_column_out_of_bounds(target_column): + with cudf_raises(IndexError): + plc.copying.split(target_column, list(range(5, 8))) + + +def test_split_table(target_table, pa_target_table): + upper_bounds = [1, 3, 5] + lower_bounds = [0] + upper_bounds[:-1] + result = plc.copying.split(target_table, upper_bounds) + for lb, ub, split in zip(lower_bounds, upper_bounds, result): + assert_table_eq(split, pa_target_table[lb:ub]) + + +def test_copy_if_else_column_column( + target_column, pa_target_column, pa_source_scalar, mask, pa_mask +): + pa_other_column = pa.concat_arrays( + [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]] + ) + other_column = plc.interop.from_arrow(pa_other_column) + + result = plc.copying.copy_if_else( + target_column, + other_column, + mask, + ) + + expected = pc.if_else( + pa_mask, + pa_target_column, + pa_other_column, + ) + assert_column_eq(result, expected) + + +def test_copy_if_else_wrong_type(target_column, mask): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow( + pa.array(["a"] * target_column.size()) + ) + else: + input_column = plc.interop.from_arrow( + pa.array([1] * target_column.size()) + ) + + with cudf_raises(TypeError): + plc.copying.copy_if_else(input_column, target_column, 
mask) + + +def test_copy_if_else_wrong_type_mask(target_column): + with cudf_raises(TypeError): + plc.copying.copy_if_else( + target_column, + target_column, + plc.interop.from_arrow( + pa.array([1.0, 2.0] * (target_column.size() // 2)) + ), + ) + + +def test_copy_if_else_wrong_size(target_column): + with cudf_raises(ValueError): + plc.copying.copy_if_else( + plc.interop.from_arrow(pa.array([1])), + target_column, + plc.interop.from_arrow( + pa.array([True, False] * (target_column.size() // 2)) + ), + ) + + +def test_copy_if_else_wrong_size_mask(target_column): + with cudf_raises(ValueError): + plc.copying.copy_if_else( + target_column, + target_column, + plc.interop.from_arrow(pa.array([True])), + ) + + +@pytest.mark.parametrize("array_left", [True, False]) +def test_copy_if_else_column_scalar( + target_column, + pa_target_column, + source_scalar, + pa_source_scalar, + array_left, + mask, + pa_mask, +): + args = ( + (target_column, source_scalar) + if array_left + else (source_scalar, target_column) + ) + result = plc.copying.copy_if_else( + *args, + mask, + ) + + pa_args = ( + (pa_target_column, pa_source_scalar) + if array_left + else (pa_source_scalar, pa_target_column) + ) + expected = pc.if_else( + pa_mask, + *pa_args, + ) + assert_column_eq(result, expected) + + +def test_boolean_mask_scatter_from_table( + source_table, + pa_source_table, + target_table, + pa_target_table, + mask, + pa_mask, +): + result = plc.copying.boolean_mask_scatter( + source_table, + target_table, + mask, + ) + + if pa.types.is_list( + dtype := pa_target_table[0].type + ) or pa.types.is_struct(dtype): + # pyarrow does not support scattering with list data. If and when they do, + # replace this hardcoding with their implementation. 
+ with pytest.raises(pa.ArrowNotImplementedError): + _pyarrow_boolean_mask_scatter_table( + pa_source_table, pa_mask, pa_target_table + ) + + if pa.types.is_list(dtype := pa_target_table[0].type): + expected = pa.table( + [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3 + ) + elif pa.types.is_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"v": 1}, + {"v": 5}, + {"v": 2}, + {"v": 7}, + {"v": 3}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = _pyarrow_boolean_mask_scatter_table( + pa_source_table, pa_mask, pa_target_table + ) + + assert_table_eq(result, expected) + + +def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + plc.Table(source_table.columns()[:2]), + target_table, + plc.interop.from_arrow(pa.array([True, False] * 3)), + ) + + +def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + source_table, + target_table, + plc.interop.from_arrow(pa.array([True, False] * 2)), + ) + + +def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + plc.Table(source_table.columns()[:2]), + target_table, + plc.interop.from_arrow( + pa.array([True, False] * 2 + [False, False]) + ), + ) + + +def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): + if is_integer(dtype := target_table.columns()[0].type()) or is_floating( + dtype + ): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.boolean_mask_scatter( + plc.Table([input_column] * 3), target_table, mask + ) + + +def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): + with cudf_raises(TypeError): + 
plc.copying.boolean_mask_scatter( + source_table, + target_table, + plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)), + ) + + +def test_boolean_mask_scatter_from_scalars( + source_scalar, + pa_source_scalar, + target_table, + pa_target_table, + mask, + pa_mask, +): + result = plc.copying.boolean_mask_scatter( + [source_scalar] * 3, + target_table, + mask, + ) + + expected = _pyarrow_boolean_mask_scatter_table( + [pa_source_scalar] * target_table.num_columns(), + pc.invert(pa_mask), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_get_element(input_column, pa_input_column): + index = 1 + result = plc.copying.get_element(input_column, index) + + assert ( + plc.interop.to_arrow( + result, metadata_from_arrow_array(pa_input_column) + ).as_py() + == pa_input_column[index].as_py() + ) + + +def test_get_element_out_of_bounds(input_column): + with cudf_raises(IndexError): + plc.copying.get_element(input_column, 100) From 0a8807eb2f8f87cbfdc49538b73ff498526adf66 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 1 Apr 2024 14:31:16 -0700 Subject: [PATCH 07/69] Apply the cuFile error work around to data_sink as well (#15335) Issue #14140 Follow-up on https://github.com/rapidsai/cudf/pull/15293 Moving the `cudaFree(0)` call to a function called both by file `datasource` and `data_sink`. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15335 --- cpp/src/io/utilities/data_sink.cpp | 1 + cpp/src/io/utilities/datasource.cpp | 6 +----- cpp/src/io/utilities/file_io_utilities.cpp | 8 ++++++++ cpp/src/io/utilities/file_io_utilities.hpp | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 5557648ebbe..66905c5256f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -36,6 +36,7 @@ class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) { + detail::force_init_cuda_context(); _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 54e7c6bf1d6..d8dbd3614c8 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -43,12 +43,8 @@ class file_source : public datasource { public: explicit file_source(char const* filepath) : _file(filepath, O_RDONLY) { + detail::force_init_cuda_context(); if (detail::cufile_integration::is_kvikio_enabled()) { - // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors - // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is - // already initialized - cudaFree(0); - _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 01090a43a0e..39031526fc8 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -34,6 +34,14 @@ namespace cudf { namespace io { namespace detail { +void force_init_cuda_context() +{ + // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors + // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is already + // initialized. + cudaFree(0); +} + [[noreturn]] void throw_on_file_open_failure(std::string const& filepath, bool is_create) { // save errno because it may be overwritten by subsequent calls diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 0d5a5b218da..74a2ae53961 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -37,6 +37,9 @@ namespace detail { [[noreturn]] void throw_on_file_open_failure(std::string const& filepath, bool is_create); +// Call before any cuFile API calls to ensure the CUDA context is initialized. +void force_init_cuda_context(); + /** * @brief Class that provides RAII for file handling. */ From e5f9e2d6d39df4c5f4a6b7bab150a1fa00f0a1cb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 1 Apr 2024 17:43:37 -0400 Subject: [PATCH 08/69] Refactor stream mode setup for gtests (#15337) Setting up the stream mode logic was duplicated in `testing_main.hpp` and `error_handing_test.cu`. Refactoring the logic will help setup for a large strings test fixture in a follow-on PR. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/nvdbaranec - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/15337 --- cpp/include/cudf_test/testing_main.hpp | 57 ++++++++++++++++---------- cpp/tests/error/error_handling_test.cu | 14 +------ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 88e3088d794..ecac761f7cb 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -145,6 +145,32 @@ inline auto parse_cudf_test_opts(int argc, char** argv) } } +/** + * @brief Sets up stream mode memory resource adaptor + * + * The resource adaptor is only set as the current device resource if the + * stream mode is enabled. + * + * The caller must keep the return object alive for the life of the test runs. + * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return Memory resource adaptor + */ +inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto resource = rmm::mr::get_current_device_resource(); + auto const stream_mode = cmd_opts["stream_mode"].as(); + auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); + auto const error_on_invalid_stream = (stream_error_mode == "error"); + auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto adaptor = + make_stream_checking_resource_adaptor(resource, error_on_invalid_stream, check_default_stream); + if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + rmm::mr::set_current_device_resource(&adaptor); + } + return adaptor; +} + /** * @brief Macro that defines main function for gtest programs that use rmm * @@ -155,25 +181,14 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * function parses the command line to customize test behavior, like the * allocation mode used for creating the 
default memory resource. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - \ - auto const stream_mode = cmd_opts["stream_mode"].as(); \ - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { \ - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); \ - auto const error_on_invalid_stream = (stream_error_mode == "error"); \ - auto const check_default_stream = (stream_mode == "new_cudf_default"); \ - auto adaptor = make_stream_checking_resource_adaptor( \ - resource.get(), error_on_invalid_stream, check_default_stream); \ - rmm::mr::set_current_device_resource(&adaptor); \ - return RUN_ALL_TESTS(); \ - } \ - \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cudf::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + auto adaptor = make_stream_mode_adaptor(cmd_opts); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 674d2e0a6ea..46d01ec14ff 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -128,17 +128,7 @@ TEST(DebugAssert, cudf_assert_true) int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - auto const stream_mode = cmd_opts["stream_mode"].as(); - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { 
- auto resource = rmm::mr::get_current_device_resource(); - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); - auto const error_on_invalid_stream = (stream_error_mode == "error"); - auto const check_default_stream = (stream_mode == "new_cudf_default"); - auto adaptor = make_stream_checking_resource_adaptor( - resource, error_on_invalid_stream, check_default_stream); - rmm::mr::set_current_device_resource(&adaptor); - return RUN_ALL_TESTS(); - } + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + auto adaptor = make_stream_mode_adaptor(cmd_opts); return RUN_ALL_TESTS(); } From 09f8c8ad92b5b59a4525ee256feca6a68564b003 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 1 Apr 2024 17:23:28 -0500 Subject: [PATCH 09/69] Enable ``dask_cudf`` json and s3 tests with query-planning on (#15408) Addresses parts of https://github.com/rapidsai/cudf/issues/15027 (json and s3 testing). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15408 --- python/dask_cudf/dask_cudf/backends.py | 15 +++++++++++- .../dask_cudf/dask_cudf/io/tests/test_json.py | 4 ++-- .../dask_cudf/io/tests/test_parquet.py | 2 +- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 6 +---- python/dask_cudf/dask_cudf/tests/utils.py | 24 +++++++++++++++---- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index c7b4a1c4c6a..d05be30602e 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -2,6 +2,7 @@ import warnings from collections.abc import Iterator +from functools import partial import cupy as cp import numpy as np @@ -484,7 +485,6 @@ def sizeof_cudf_series_index(obj): def _simple_cudf_encode(_): # Basic pickle-based encoding for a partd k-v store import pickle - from functools import partial import partd @@ -686,6 
+686,19 @@ def from_dict( constructor=constructor, ) + @staticmethod + def read_json(*args, engine="auto", **kwargs): + return _default_backend( + dd.read_json, + *args, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + **kwargs, + ) + # Import/register cudf-specific classes for dask-expr try: diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index a2b1d7fc114..8dcf3f05e89 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -12,8 +12,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support -pytestmark = skip_dask_expr() +# No dask-expr support for dask_expr<=1.0.5 +pytestmark = skip_dask_expr(lt_version="1.0.5+a") def test_read_json_backend_dispatch(tmp_path): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index de2a735b2ce..df41ef77b7c 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -535,7 +535,7 @@ def test_check_file_size(tmpdir): dask_cudf.io.read_parquet(fn, check_file_size=1).compute() -@xfail_dask_expr("HivePartitioning cannot be hashed") +@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="1.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index f4a6fabdb60..a67404da4fe 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -10,10 +10,6 @@ import pytest import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support -pytestmark = skip_dask_expr() moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = 
pytest.importorskip("boto3") @@ -111,7 +107,7 @@ def test_read_csv(s3_base, s3so): s3_base=s3_base, bucket="daskcsv", files={"a.csv": b"a,b\n1,2\n3,4\n"} ): df = dask_cudf.read_csv( - "s3://daskcsv/*.csv", chunksize="50 B", storage_options=s3so + "s3://daskcsv/*.csv", blocksize="50 B", storage_options=s3so ) assert df.a.sum().compute() == 4 diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index e838b8d63bc..1ca1758736b 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +from packaging.version import Version import dask.dataframe as dd @@ -10,6 +11,13 @@ from dask_cudf.expr import QUERY_PLANNING_ON +if QUERY_PLANNING_ON: + import dask_expr + + DASK_EXPR_VERSION = Version(dask_expr.__version__) +else: + DASK_EXPR_VERSION = None + def _make_random_frame(nelem, npartitions=2, include_na=False): df = pd.DataFrame( @@ -27,9 +35,17 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): _default_reason = "Not compatible with dask-expr" -def skip_dask_expr(reason=_default_reason): - return pytest.mark.skipif(QUERY_PLANNING_ON, reason=reason) +def skip_dask_expr(reason=_default_reason, lt_version=None): + if lt_version is not None: + skip = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + else: + skip = QUERY_PLANNING_ON + return pytest.mark.skipif(skip, reason=reason) -def xfail_dask_expr(reason=_default_reason): - return pytest.mark.xfail(QUERY_PLANNING_ON, reason=reason) +def xfail_dask_expr(reason=_default_reason, lt_version=None): + if lt_version is not None: + xfail = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + else: + xfail = QUERY_PLANNING_ON + return pytest.mark.xfail(xfail, reason=reason) From 268996ad101dc69414992aa0227eba4f93012c91 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 1 Apr 2024 18:59:48 -0400 Subject: [PATCH 10/69] Add 
`to_arrow_device` function to cudf interop using nanoarrow (#15047) Introduce new `to_arrow_device` and `to_arrow_schema` functions to utilize the `ArrowDeviceArray` structure for zero-copy passing of libcudf::table. Add nanoarrow as a vendored lib and a script to update it. Initial step towards addressing #14926 Authors: - Matt Topol (https://github.com/zeroshade) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15047 --- cpp/CMakeLists.txt | 8 +- cpp/cmake/thirdparty/get_nanoarrow.cmake | 36 + cpp/include/cudf/interop.hpp | 96 ++- cpp/include/cudf/interop/detail/arrow.hpp | 48 ++ cpp/src/interop/to_arrow_device.cu | 727 ++++++++++++++++++++ cpp/tests/CMakeLists.txt | 7 +- cpp/tests/interop/nanoarrow_utils.hpp | 226 +++++++ cpp/tests/interop/to_arrow_device_test.cpp | 739 +++++++++++++++++++++ docs/cudf/source/conf.py | 1 + 9 files changed, 1882 insertions(+), 6 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_nanoarrow.cmake create mode 100644 cpp/include/cudf/interop/detail/arrow.hpp create mode 100644 cpp/src/interop/to_arrow_device.cu create mode 100644 cpp/tests/interop/nanoarrow_utils.hpp create mode 100644 cpp/tests/interop/to_arrow_device_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 618d03f7078..f1d43e3c35f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -210,12 +210,14 @@ include(cmake/thirdparty/get_kvikio.cmake) include(cmake/thirdparty/get_fmt.cmake) # find spdlog include(cmake/thirdparty/get_spdlog.cmake) +# find nanoarrow +include(cmake/thirdparty/get_nanoarrow.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") list(APPEND METADATA_KINDS BUILD INSTALL) - list(APPEND dependencies KvikIO ZLIB 
nvcomp) + list(APPEND dependencies KvikIO ZLIB nvcomp nanoarrow) if(TARGET cufile::cuFile_interface) list(APPEND dependencies cuFile) endif() @@ -358,6 +360,7 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu + src/interop/to_arrow_device.cu src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu @@ -735,6 +738,7 @@ target_include_directories( "$" "$" PRIVATE "$" + "$" INTERFACE "$" ) @@ -783,7 +787,7 @@ target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio - $ + $ nanoarrow ) # Add Conda library, and include paths if specified diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake new file mode 100644 index 00000000000..be938a89ccd --- /dev/null +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -0,0 +1,36 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds nanoarrow and sets any additional necessary environment variables. 
+function(find_and_configure_nanoarrow) + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + rapids_cpm_find( + nanoarrow ${PKG_VERSION} + GLOBAL_TARGETS nanoarrow + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git + GIT_TAG ${PKG_PINNED_TAG} + # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin + # to an actual tag. + GIT_SHALLOW FALSE + OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" + ) + set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) +endfunction() + +find_and_configure_nanoarrow( + VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8 +) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 2ee6f19614d..871f48e3aac 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,11 +34,16 @@ #include #include #include +#include #include struct DLManagedTensor; +struct ArrowDeviceArray; + +struct ArrowSchema; + namespace cudf { /** * @addtogroup interop_dlpack @@ -162,6 +167,95 @@ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + +/** + * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter + * + */ +using unique_schema_t = std::unique_ptr; + +/** + * @brief typedef for a unique_ptr to an ArrowDeviceArray with a custom deleter + * + */ +using unique_device_array_t = std::unique_ptr; + +/** + * @brief Create ArrowSchema from cudf table and metadata + * + * Populates and returns an ArrowSchema C struct using a table and metadata. + * + * @note For decimals, since the precision is not stored for them in libcudf, + * decimals will be converted to an Arrow decimal128 which has the widest precision that cudf + * decimal type supports. For example, `numeric::decimal32` will be converted to Arrow decimal128 + * with the precision of 9 which is the maximum precision for 32-bit types. Similarly, + * `numeric::decimal128` will be converted to Arrow decimal128 with the precision of 38. + * + * @param input Table to create a schema from + * @param metadata Contains the hierarchy of names of columns and children + * @return ArrowSchema generated from `input` + */ +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata); + +/** + * @brief Create `ArrowDeviceArray` from cudf table and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. 
+ * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow + * such as in the representation of bools (Arrow uses a bitmap, cudf uses 1-byte per value). + * + * @param table Input table, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data, consumer must call release + */ +unique_device_array_t to_arrow_device( + cudf::table&& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `ArrowDeviceArray` from cudf column and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the column + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types.
Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow such as + * in the representation of bools (Arrow uses a bitmap, cudf uses 1 byte per value). + * + * @param col Input column, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data + */ +unique_device_array_t to_arrow_device( + cudf::column&& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp new file mode 100644 index 00000000000..8043ecf5422 --- /dev/null +++ b/cpp/include/cudf/interop/detail/arrow.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +// from Arrow C Device Data Interface +// https://arrow.apache.org/docs/format/CDeviceDataInterface.html +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +#define ARROW_C_DEVICE_DATA_INTERFACE + +// Device type for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +#define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +#define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +#define ARROW_DEVICE_CUDA_HOST 3 +// CUDA managed/unified memory allocated by cudaMallocManaged +#define ARROW_DEVICE_CUDA_MANAGED 13 + +struct ArrowDeviceArray { + struct ArrowArray array; + int64_t device_id; + ArrowDeviceType device_type; + void* sync_event; + + // reserved bytes for future expansion + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu new file mode 100644 index 00000000000..e824412e71c --- /dev/null +++ b/cpp/src/interop/to_arrow_device.cu @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { +static constexpr int validity_buffer_idx = 0; +static constexpr int fixed_width_data_buffer_idx = 1; + +ArrowType id_to_arrow_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; + case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; + case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; + case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; + case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; + case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; + case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; + case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; + case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; + case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; + case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + default: CUDF_FAIL("Unsupported type_id conversion to arrow type"); + } +} + +struct dispatch_to_arrow_type { + template ())> + int operator()(column_view, column_metadata const&, ArrowSchema*) + { + CUDF_FAIL("Unsupported type for to_arrow_schema"); + } + + template ())> + int operator()(column_view input_view, column_metadata const&, ArrowSchema* out) + { + cudf::type_id id = input_view.type().id(); + switch (id) { + case cudf::type_id::TIMESTAMP_SECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, nullptr); + case cudf::type_id::TIMESTAMP_MILLISECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); + case cudf::type_id::TIMESTAMP_MICROSECONDS: + return ArrowSchemaSetTypeDateTime( + out, 
NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, nullptr); + case cudf::type_id::TIMESTAMP_NANOSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, nullptr); + case cudf::type_id::DURATION_SECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_SECOND, nullptr); + case cudf::type_id::DURATION_MILLISECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MILLI, nullptr); + case cudf::type_id::DURATION_MICROSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MICRO, nullptr); + case cudf::type_id::DURATION_NANOSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_NANO, nullptr); + default: return ArrowSchemaSetType(out, id_to_arrow_type(id)); + } + } +}; + +template +int decimals_to_arrow(column_view input, ArrowSchema* out) +{ + // Arrow doesn't support decimal32/decimal64 currently. decimal128 + // is the smallest that arrow supports besides float32/float64 so we + // upcast to decimal128. 
+ return ArrowSchemaSetTypeDecimal(out, + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -input.type().scale()); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = int32_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = int64_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = __int128_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING); +} + +// these forward declarations are needed due to the recursive calls to them +// inside their definitions and in struct_view for handling children +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out); + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out); + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + CUDF_EXPECTS(metadata.children_meta.size() == static_cast(input.num_children()), + "Number of field names and number of children doesn't match\n"); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(out, input.num_children())); + for (int i = 0; i < input.num_children(); ++i) { + auto child = out->children[i]; + auto col = input.child(i); + ArrowSchemaInit(child); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(child, metadata.children_meta[i].name.c_str())); + + child->flags = col.has_nulls() ?
ARROW_FLAG_NULLABLE : 0; + + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA)); + continue; + } + + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + col.type(), detail::dispatch_to_arrow_type{}, col, metadata.children_meta[i], child)); + } + + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST)); + auto child = input.child(cudf::lists_column_view::child_column_index); + ArrowSchemaInit(out->children[0]); + if (child.type().id() == cudf::type_id::EMPTY) { + return ArrowSchemaSetType(out->children[0], NANOARROW_TYPE_NA); + } + auto child_meta = + metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0]; + + out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str())); + out->children[0]->flags = child.has_nulls() ? ARROW_FLAG_NULLABLE : 0; + return cudf::type_dispatcher( + child.type(), detail::dispatch_to_arrow_type{}, child, child_meta, out->children[0]); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + cudf::dictionary_column_view dview{input}; + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id()))); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out)); + ArrowSchemaInit(out->dictionary); + + auto dict_keys = dview.keys(); + return cudf::type_dispatcher( + dict_keys.type(), + detail::dispatch_to_arrow_type{}, + dict_keys, + metadata.children_meta.empty() ? 
column_metadata{"keys"} : metadata.children_meta[0], + out->dictionary); +} + +template +void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t) +{ + auto* unique_buffer = reinterpret_cast*>(allocator->private_data); + delete unique_buffer; +} + +template +struct is_device_scalar : public std::false_type {}; + +template +struct is_device_scalar> : public std::true_type {}; + +template +struct is_device_uvector : public std::false_type {}; + +template +struct is_device_uvector> : public std::true_type {}; + +template +int set_buffer(std::unique_ptr device_buf, int64_t i, ArrowArray* out) +{ + ArrowBuffer* buf = ArrowArrayBuffer(out, i); + auto ptr = reinterpret_cast(device_buf->data()); + buf->size_bytes = [&] { + if constexpr (is_device_scalar::value) { + return sizeof(typename T::value_type); + } else if constexpr (is_device_uvector::value) { + return sizeof(typename T::value_type) * device_buf->size(); + } else { + return device_buf->size(); + } + }(); + // we make a new unique_ptr and move to it in case there was a custom deleter + NANOARROW_RETURN_NOT_OK( + ArrowBufferSetAllocator(buf, + ArrowBufferDeallocator(&device_buffer_finalize, + new std::unique_ptr(std::move(device_buf))))); + buf->data = ptr; + return NANOARROW_OK; +} + +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column const& column) +{ + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); + arr->length = column.size(); + arr->null_count = column.null_count(); + return NANOARROW_OK; +} + +struct dispatch_to_arrow_device { + template ())> + int operator()(cudf::column&&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*, + ArrowArray*) + { + CUDF_FAIL("Unsupported type for to_arrow_device"); + } + + template ())> + int operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) + { + nanoarrow::UniqueArray tmp; + + const ArrowType storage_type = [&] { + switch 
(column.type().id()) { + case cudf::type_id::TIMESTAMP_SECONDS: + case cudf::type_id::TIMESTAMP_MILLISECONDS: + case cudf::type_id::TIMESTAMP_MICROSECONDS: + case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; + case cudf::type_id::DURATION_SECONDS: + case cudf::type_id::DURATION_MILLISECONDS: + case cudf::type_id::DURATION_MICROSECONDS: + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; + default: return id_to_arrow_type(column.type().id()); + } + }(); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column)); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } +}; + +template +int decimals_to_arrow(cudf::column&& input, + int32_t precision, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); + + if constexpr (!std::is_same_v) { + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); + auto buf = + std::make_unique>(input.size() * BIT_WIDTH_RATIO, stream, mr); + + auto count = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(stream, mr), + count, + count + input.size(), + [in = input.view().begin(), + out = buf->data(), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. 
+ out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); + } + + auto contents = input.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + if constexpr (std::is_same_v) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get())); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = int32_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = int64_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = __int128_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); + + auto bitmask = bools_to_mask(column.view(), stream, mr); + auto contents = column.release(); + if (contents.null_mask) { + 
NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + + if (column.size() == 0) { + // the scalar zero here is necessary because the spec for string arrays states + // that the offsets buffer should contain "length + 1" signed integers. So in + // the case of a 0 length string array, there should be exactly 1 value, zero, + // in the offsets buffer. While some arrow implementations may accept a zero-sized + // offsets buffer, best practices would be to allocate the buffer with the single value. 
+ auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto offsets_contents = + contents.children[cudf::strings_column_view::offsets_column_index]->release(); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get())); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(contents.data), 2, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children())); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + for (size_t i = 0; i < size_t(tmp->n_children); ++i) { + ArrowArray* child_ptr = tmp->children[i]; + auto& child = contents.children[i]; + if (child->type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(child_ptr, NANOARROW_TYPE_NA)); + child_ptr->length = child->size(); + child_ptr->null_count = child->size(); + } else { + 
NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, child_ptr)); + } + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto offsets_contents = + contents.children[cudf::lists_column_view::offsets_column_index]->release(); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get())); + + auto& child = contents.children[cudf::lists_column_view::child_column_index]; + if (child->type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(tmp->children[0], NANOARROW_TYPE_NA)); + tmp->children[0]->length = 0; + tmp->children[0]->null_count = 0; + } else { + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, tmp->children[0])); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array( + tmp.get(), + id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()), + column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get())); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + 
set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto indices_contents = + contents.children[cudf::dictionary_column_view::indices_column_index]->release(); + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(indices_contents.data), fixed_width_data_buffer_idx, tmp.get())); + + auto& keys = contents.children[cudf::dictionary_column_view::keys_column_index]; + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + keys->type(), dispatch_to_arrow_device{}, std::move(*keys), stream, mr, tmp->dictionary)); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +struct ArrowDeviceArrayPrivateData { + ArrowArray parent; + cudaEvent_t sync_event; +}; + +void ArrowDeviceArrayRelease(ArrowArray* array) +{ + auto private_data = reinterpret_cast(array->private_data); + cudaEventDestroy(private_data->sync_event); + ArrowArrayRelease(&private_data->parent); + delete private_data; + array->release = nullptr; +} + +} // namespace +} // namespace detail + +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata) +{ + CUDF_EXPECTS((metadata.size() == static_cast(input.num_columns())), + "columns' metadata should be equal to the number of columns in table"); + + nanoarrow::UniqueSchema result; + ArrowSchemaInit(result.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(result.get(), input.num_columns())); + + for (int i = 0; i < input.num_columns(); ++i) { + auto child = result->children[i]; + auto col = input.column(i); + ArrowSchemaInit(child); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child, metadata[i].name.c_str())); + child->flags = col.has_nulls() ? 
ARROW_FLAG_NULLABLE : 0; + + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA)); + continue; + } + + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_type{}, col, metadata[i], child)); + } + + unique_schema_t out(new ArrowSchema, [](ArrowSchema* schema) { + if (schema->release != nullptr) { ArrowSchemaRelease(schema); } + delete schema; + }); + result.move(out.get()); + return out; +} + +unique_device_array_t to_arrow_device(cudf::table&& table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT)); + + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns())); + tmp->length = table.num_rows(); + tmp->null_count = 0; + + auto cols = table.release(); + for (size_t i = 0; i < cols.size(); ++i) { + auto child = tmp->children[i]; + auto col = cols[i].get(); + + if (col->type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(child, NANOARROW_TYPE_NA)); + child->length = col->size(); + child->null_count = col->size(); + continue; + } + + NANOARROW_THROW_NOT_OK(cudf::type_dispatcher( + col->type(), detail::dispatch_to_arrow_device{}, std::move(*col), stream, mr, child)); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + auto private_data = std::make_unique(); + cudaEventCreate(&private_data->sync_event); + + auto status = cudaEventRecord(private_data->sync_event, stream); + if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); } + + ArrowArrayMove(tmp.get(), &private_data->parent); + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + result->device_id = 
rmm::get_current_cuda_device().value(); + result->device_type = ARROW_DEVICE_CUDA; + result->sync_event = &private_data->sync_event; + result->array = private_data->parent; + result->array.private_data = private_data.release(); + result->array.release = &detail::ArrowDeviceArrayRelease; + return result; +} + +unique_device_array_t to_arrow_device(cudf::column&& col, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + nanoarrow::UniqueArray tmp; + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_NA)); + tmp->length = col.size(); + tmp->null_count = col.size(); + } else { // typed columns only: an EMPTY column is fully described by the NA array built above + NANOARROW_THROW_NOT_OK(cudf::type_dispatcher( + col.type(), detail::dispatch_to_arrow_device{}, std::move(col), stream, mr, tmp.get())); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + auto private_data = std::make_unique(); + cudaEventCreate(&private_data->sync_event); + + auto status = cudaEventRecord(private_data->sync_event, stream); + if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); } + + ArrowArrayMove(tmp.get(), &private_data->parent); + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + result->device_id = rmm::get_current_cuda_device().value(); + result->device_type = ARROW_DEVICE_CUDA; + result->sync_event = &private_data->sync_event; + result->array = private_data->parent; + result->array.private_data = private_data.release(); + result->array.release = &detail::ArrowDeviceArrayRelease; + return result; +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9dbf278c71d..053fcc0989a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -24,7 +24,7 @@ rapids_test_init() # properties and linking to build the test
function(ConfigureTest CMAKE_TEST_NAME) set(options) - set(one_value GPUS PERCENT STREAM_MODE) + set(one_value GPUS PERCENT STREAM_MODE EXTRA_LIB) set(multi_value) cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT) @@ -56,7 +56,7 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main nvtx3-cpp - $ + $ "${_CUDF_TEST_EXTRA_LIB}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -267,7 +267,8 @@ ConfigureTest( # ################################################################################################## # * interop tests ------------------------------------------------------------------------- ConfigureTest( - INTEROP_TEST interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/dlpack_test.cpp + INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp + interop/from_arrow_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow ) # ################################################################################################## diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp new file mode 100644 index 00000000000..e7ffa9e40f4 --- /dev/null +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// no-op allocator/deallocator to set into ArrowArray buffers that we don't +// want to own their buffers. +static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){ + .reallocate = [](ArrowBufferAllocator*, uint8_t* ptr, int64_t, int64_t) -> uint8_t* { + return ptr; + }, + .free = [](ArrowBufferAllocator*, uint8_t*, int64_t) {}, + .private_data = nullptr, +}; + +// populate the ArrowArray by copying host data buffers for fixed width types other +// than boolean. +template +std::enable_if_t() and !std::is_same_v, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + arr->length = data.size(); + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(ArrowArrayBuffer(arr, 1), data.data(), sizeof(T) * data.size())); + if (!mask.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + ArrowBitmapAppendInt8Unsafe( + ArrowArrayValidityBitmap(arr), reinterpret_cast(mask.data()), mask.size()); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct array"); +} + +// populate an ArrowArray with pointers to the raw device buffers of a cudf::column_view +// and use the no-op alloc so that the ArrowArray doesn't presume ownership of the data +template +std::enable_if_t() and !std::is_same_v, void> populate_from_col( + ArrowArray* arr, cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + 
ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(view.data()); +} + +// populate an ArrowArray with boolean data by generating the appropriate +// bitmaps to copy the data. +template +std::enable_if_t, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + ArrowBitmap bool_data; + ArrowBitmapInit(&bool_data); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bool_data, data.size())); + std::for_each(data.begin(), data.end(), [&](const auto&& elem) { + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&bool_data, (elem) ? 1 : 0, 1)); + }); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(arr, 1, &bool_data.buffer)); + + if (!mask.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + std::for_each(mask.begin(), mask.end(), [&](const auto&& elem) { + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(arr), (elem) ? 1 : 0, 1)); + }); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct boolean array"); +} + +// populate an ArrowArray from a boolean cudf column. Since Arrow and cudf +// still represent boolean arrays differently, we have to use bools_to_mask +// and give the ArrowArray object ownership of the device data. 
+template +std::enable_if_t, void> populate_from_col(ArrowArray* arr, + cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + auto bitmask = cudf::bools_to_mask(view); + auto ptr = reinterpret_cast(bitmask.first->data()); + ArrowBufferSetAllocator( + ArrowArrayBuffer(arr, 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = reinterpret_cast*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr(std::move(bitmask.first)))); + ArrowArrayBuffer(arr, 1)->data = ptr; +} + +// populate an ArrowArray by copying the string data and constructing the offsets +// buffer. +template +std::enable_if_t, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(arr)); + for (auto& str : data) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(arr, ArrowCharView(str.c_str()))); + } + + if (!mask.empty()) { + ArrowBitmapReset(ArrowArrayValidityBitmap(arr)); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + ArrowBitmapAppendInt8Unsafe( + ArrowArrayValidityBitmap(arr), reinterpret_cast(mask.data()), mask.size()); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct string array"); +} + +// populate an ArrowArray with the string data buffers of a cudf column_view +// using no-op allocator so the ArrowArray knows it doesn't have ownership +// of the device buffers. 
+template +std::enable_if_t, void> populate_from_col( + ArrowArray* arr, cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + cudf::strings_column_view sview{view}; + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(sview.offsets().data()); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc); + ArrowArrayBuffer(arr, 2)->data = const_cast(view.data()); +} + +// populate a dictionary ArrowArray by delegating the copying of the indices +// and key arrays +template +void get_nanoarrow_dict_array(ArrowArray* arr, + std::vector const& keys, + std::vector const& ind, + std::vector const& validity = {}) +{ + get_nanoarrow_array(arr->dictionary, keys); + get_nanoarrow_array(arr, ind, validity); +} + +// populate a list ArrowArray by copying the offsets and data buffers +template +void get_nanoarrow_list_array(ArrowArray* arr, + std::vector data, + std::vector offsets, + std::vector data_validity = {}, + std::vector list_validity = {}) +{ + get_nanoarrow_array(arr->children[0], data, data_validity); + + arr->length = offsets.size() - 1; + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(ArrowArrayBuffer(arr, 1), offsets.data(), sizeof(int32_t) * offsets.size())); + if (!list_validity.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), list_validity.size())); + ArrowBitmapAppendInt8Unsafe(ArrowArrayValidityBitmap(arr), + reinterpret_cast(list_validity.data()), + arr->length); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, arr->length); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct list array"); +} + +// populate an ArrowArray list array from 
device buffers using a no-op +// allocator so that the ArrowArray doesn't have ownership of the buffers +void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(view.offsets().data()); +} diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp new file mode 100644 index 00000000000..243aa4e81af --- /dev/null +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using vector_of_columns = std::vector>; + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_tables(cudf::size_type length) +{ + std::vector int64_data(length); + std::vector bool_data(length); + std::vector string_data(length); + std::vector validity(length); + std::vector bool_validity(length); + std::vector bool_data_validity; + cudf::size_type length_of_individual_list = 3; + cudf::size_type length_of_list = length_of_individual_list * length; + std::vector list_int64_data(length_of_list); + std::vector list_int64_data_validity(length_of_list); + std::vector list_offsets(length + 1); + + std::vector> columns; + + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release()); + columns.emplace_back( + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release()); + auto col4 = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()); + auto dict_col = cudf::dictionary::encode(col4); + columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + bool_data.begin(), bool_data.end(), bool_validity.begin()) + .release()); + auto list_child_column = cudf::test::fixed_width_column_wrapper( + list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back(cudf::make_lists_column(length, + 
list_offsets_column.release(), + list_child_column.release(), + list_nulls, + std::move(*list_mask))); + auto int_column = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release(); + auto str_column = + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release(); + vector_of_columns cols; + cols.push_back(move(int_column)); + cols.push_back(move(str_column)); + auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back( + cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); + + nanoarrow::UniqueSchema schema; + ArrowSchemaInit(schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema.get(), 6)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[0], "a")); + if (columns[0]->null_count() > 0) { + schema->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[1], "b")); + if (columns[1]->null_count() > 0) { + schema->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[2], "c")); + if (columns[2]->null_count() > 0) { + schema->children[2]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[2]->flags = 0; + } + + 
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[3], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[3], "d")); + if (columns[3]->null_count() > 0) { + schema->children[3]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[3]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[4], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[4]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4]->children[0], "element")); + if (columns[4]->child(1).null_count() > 0) { + schema->children[4]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4], "e")); + if (columns[4]->has_nulls()) { + schema->children[4]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->flags = 0; + } + + ArrowSchemaInit(schema->children[5]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema->children[5], 2)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[0], "integral")); + if (columns[5]->child(0).has_nulls()) { + schema->children[5]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[1], "string")); + if (columns[5]->child(1).has_nulls()) { + schema->children[5]->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5], "f")); + if (columns[5]->has_nulls()) { + schema->children[5]->flags |= 
ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->flags = 0; + } + + nanoarrow::UniqueArray arrow; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); + + get_nanoarrow_array(arrow->children[0], int64_data, validity); + get_nanoarrow_array(arrow->children[1], string_data, validity); + cudf::dictionary_column_view view(dict_col->view()); + auto keys = cudf::test::to_host(view.keys()).first; + auto indices = cudf::test::to_host(view.indices()).first; + get_nanoarrow_dict_array(arrow->children[2], + std::vector(keys.begin(), keys.end()), + std::vector(indices.begin(), indices.end()), + validity); + get_nanoarrow_array(arrow->children[3], bool_data, bool_validity); + get_nanoarrow_list_array(arrow->children[4], + list_int64_data, + list_offsets, + list_int64_data_validity, + bool_data_validity); + + get_nanoarrow_array(arrow->children[5]->children[0], int64_data, validity); + get_nanoarrow_array(arrow->children[5]->children[1], string_data, validity); + arrow->children[5]->length = length; + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arrow->children[5]), length)); + std::for_each(bool_data_validity.begin(), bool_data_validity.end(), [&](auto&& elem) { + NANOARROW_THROW_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(arrow->children[5]), (elem) ? 
1 : 0, 1)); + }); + arrow->children[5]->null_count = + ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length); + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arrow.get(), nullptr) == NANOARROW_OK, + "failed to build example Arrays"); + + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); +} + +struct BaseArrowFixture : public cudf::test::BaseFixture { + void compare_schemas(const ArrowSchema* expected, const ArrowSchema* actual) + { + EXPECT_STREQ(expected->format, actual->format); + EXPECT_STREQ(expected->name, actual->name); + EXPECT_STREQ(expected->metadata, actual->metadata); + EXPECT_EQ(expected->flags, actual->flags); + EXPECT_EQ(expected->n_children, actual->n_children); + + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + } else { + for (int i = 0; i < expected->n_children; ++i) { + SCOPED_TRACE(expected->children[i]->name); + compare_schemas(expected->children[i], actual->children[i]); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_schemas(expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } + + void compare_device_buffers(const size_t nbytes, + const int buffer_idx, + const ArrowArray* expected, + const ArrowArray* actual) + { + std::vector actual_bytes; + std::vector expected_bytes; + expected_bytes.resize(nbytes); + actual_bytes.resize(nbytes); + + // synchronous copies so we don't have to worry about async weirdness + cudaMemcpy( + expected_bytes.data(), expected->buffers[buffer_idx], nbytes, cudaMemcpyDeviceToHost); + cudaMemcpy(actual_bytes.data(), actual->buffers[buffer_idx], nbytes, cudaMemcpyDeviceToHost); + + ASSERT_EQ(expected_bytes, actual_bytes); + } + + void compare_arrays(const ArrowSchema* schema, + const ArrowArray* expected, + const ArrowArray* actual) + { + ArrowSchemaView schema_view; + 
ArrowSchemaViewInit(&schema_view, schema, nullptr); + + EXPECT_EQ(expected->length, actual->length); + EXPECT_EQ(expected->null_count, actual->null_count); + EXPECT_EQ(expected->offset, actual->offset); + EXPECT_EQ(expected->n_buffers, actual->n_buffers); + EXPECT_EQ(expected->n_children, actual->n_children); + + if (expected->length > 0) { + EXPECT_EQ(expected->buffers[0], actual->buffers[0]); + if (schema_view.type == NANOARROW_TYPE_BOOL) { + const size_t nbytes = (expected->length + 7) >> 3; + compare_device_buffers(nbytes, 1, expected, actual); + } else if (schema_view.type == NANOARROW_TYPE_DECIMAL128) { + const size_t nbytes = (expected->length * sizeof(__int128_t)); + compare_device_buffers(nbytes, 1, expected, actual); + } else { + for (int i = 1; i < expected->n_buffers; ++i) { + EXPECT_EQ(expected->buffers[i], actual->buffers[i]); + } + } + } + + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + } else { + for (int i = 0; i < expected->n_children; ++i) { + SCOPED_TRACE(schema->children[i]->name); + compare_arrays(schema->children[i], expected->children[i], actual->children[i]); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_arrays(schema->dictionary, expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } +}; + +struct ToArrowDeviceTest : public BaseArrowFixture {}; + +template +struct ToArrowDeviceTestDurationsTest : public BaseArrowFixture {}; + +TYPED_TEST_SUITE(ToArrowDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(ToArrowDeviceTest, EmptyTable) +{ + const auto [table, schema, arr] = get_nanoarrow_tables(0); + + auto struct_meta = cudf::column_metadata{"f"}; + struct_meta.children_meta = {{"integral"}, {"string"}}; + + cudf::dictionary_column_view dview{table->view().column(2)}; + + std::vector meta{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}; + auto got_arrow_schema = 
cudf::to_arrow_schema(table->view(), meta); + + compare_schemas(schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto got_arrow_device = cudf::to_arrow_device(std::move(*table)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + + compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); + ArrowArrayRelease(&got_arrow_device->array); +} + +TEST_F(ToArrowDeviceTest, DateTimeTable) +{ + auto data = {1, 2, 3, 4, 5, 6}; + auto col = + cudf::test::fixed_width_column_wrapper(data); + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto data_ptr = input.get_column(0).view().data(); + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + EXPECT_EQ(data.size(), got_arrow_array->array.length); + EXPECT_EQ(0, got_arrow_array->array.null_count); + EXPECT_EQ(0, got_arrow_array->array.offset); + EXPECT_EQ(1, got_arrow_array->array.n_children); + EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]); + + EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length); + EXPECT_EQ(0, 
got_arrow_array->array.children[0]->null_count); + EXPECT_EQ(0, got_arrow_array->array.children[0]->offset); + EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]); + EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]); + + ArrowArrayRelease(&got_arrow_array->array); +} + +TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInit(expected_schema->children[0]); + const ArrowTimeUnit arrow_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + BaseArrowFixture::compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto data_ptr = input.get_column(0).view().data(); + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, 
got_arrow_array->device_type); + + EXPECT_EQ(data.size(), got_arrow_array->array.length); + EXPECT_EQ(0, got_arrow_array->array.null_count); + EXPECT_EQ(0, got_arrow_array->array.offset); + EXPECT_EQ(1, got_arrow_array->array.n_children); + EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]); + + EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length); + EXPECT_EQ(0, got_arrow_array->array.children[0]->null_count); + EXPECT_EQ(0, got_arrow_array->array.children[0]->offset); + EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]); + EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]); + + ArrowArrayRelease(&got_arrow_array->array); +} + +TEST_F(ToArrowDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(expected_schema->children[0]->children[0], "element"); + expected_schema->children[0]->children[0]->flags = 0; + + ArrowSchemaInitFromType(expected_schema->children[0]->children[0]->children[0], + NANOARROW_TYPE_INT64); + ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element"); + expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + 
compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + EXPECT_EQ(NANOARROW_OK, + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + auto top_list = expected_array->children[0]; + cudf::lists_column_view lview{input.get_column(0).view()}; + populate_list_from_col(top_list, lview); + cudf::lists_column_view nested_view{lview.child()}; + populate_list_from_col(top_list->children[0], nested_view); + populate_from_col(top_list->children[0]->children[0], nested_view.child()); + + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); +} + +TEST_F(ToArrowDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + 
cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + std::vector> table_cols; + table_cols.emplace_back(struct_col.release()); + cudf::table input(std::move(table_cols)); + + // Create name metadata + auto sub_metadata = cudf::column_metadata{"struct"}; + sub_metadata.children_meta = {{"string2"}, {"integral2"}}; + auto metadata = cudf::column_metadata{"a"}; + metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeStruct(expected_schema->children[0], 5); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto child = expected_schema->children[0]; + ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING); + ArrowSchemaSetName(child->children[0], "string"); + child->children[0]->flags = 0; + + ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32); + ArrowSchemaSetName(child->children[1], "integral"); + child->children[1]->flags = 0; + + ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL); + ArrowSchemaSetName(child->children[2], "bool"); + child->children[2]->flags = 0; + + ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(child->children[3], "nested_list"); + 
child->children[3]->flags = 0; + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(child->children[3]->children[0], "element"); + child->children[3]->children[0]->flags = 0; + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64); + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + ArrowSchemaSetTypeStruct(child->children[4], 2); + ArrowSchemaSetName(child->children[4], "struct"); + + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING); + ArrowSchemaSetName(child->children[4]->children[0], "string2"); + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32); + ArrowSchemaSetName(child->children[4]->children[1], "integral2"); + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{metadata}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + + expected_array->length = input.num_rows(); + + auto array_a = expected_array->children[0]; + auto view_a = input.view().column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc); + ArrowArrayValidityBitmap(array_a)->buffer.data = + const_cast(reinterpret_cast(view_a.null_mask())); + + populate_from_col(array_a->children[0], view_a.child(0)); + populate_from_col(array_a->children[1], view_a.child(1)); + populate_from_col(array_a->children[2], view_a.child(2)); + populate_list_from_col(array_a->children[3], cudf::lists_column_view{view_a.child(3)}); + populate_list_from_col(array_a->children[3]->children[0], + 
cudf::lists_column_view{view_a.child(3).child(1)}); + populate_from_col(array_a->children[3]->children[0]->children[0], + view_a.child(3).child(1).child(1)); + + auto array_struct = array_a->children[4]; + auto view_struct = view_a.child(4); + array_struct->length = view_struct.size(); + array_struct->null_count = view_struct.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc); + ArrowArrayValidityBitmap(array_struct)->buffer.data = + const_cast(reinterpret_cast(view_struct.null_mask())); + + populate_from_col(array_struct->children[0], view_struct.child(0)); + populate_from_col(array_struct->children[1], view_struct.child(1)); + + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowDeviceTest, FixedPoint64Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const expect_data = std::vector{-1, -1, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale); + ArrowSchemaSetName(expected_schema->children[0], "a"); + 
expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto result_dev_data = std::make_unique>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int64_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast(reinterpret_cast(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast(result_dev_data->data()); + ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr>(std::move(result_dev_data)))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); + } +} + +TEST_F(ToArrowDeviceTest, FixedPoint128Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const expect_data = 
std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + auto col = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + expected_array->length = input.num_rows(); + + populate_from_col<__int128_t>(expected_array->children[0], input.view().column(0)); + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); + } +} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 7afc8fe19bf..b891ff99d47 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -306,6 +306,7 @@ def clean_all_xml_files(path): intersphinx_mapping = { "cupy": ("https://docs.cupy.dev/en/stable/", None), "dlpack": ("https://dmlc.github.io/dlpack/latest/", None), + "nanoarrow": ("https://arrow.apache.org/nanoarrow/latest", None), 
"numpy": ("https://numpy.org/doc/stable", None), "pandas": ("https://pandas.pydata.org/docs/", None), "pyarrow": ("https://arrow.apache.org/docs/", None), From aab8a76b532b46713b9784302ffd202586ecb5cc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 2 Apr 2024 02:14:01 +0200 Subject: [PATCH 11/69] Fixes potential race in JSON parser when parsing JSON lines format and when recovering from invalid lines (#15419) PR adds a missing synchronization before the FST destructor of the FST used for cleaning excess characters following the first valid record on a JSON line. The problem is that the FST's destructor could otherwise free memory that is yet to be used by the still running FST instance. Closes https://github.com/rapidsai/cudf/issues/15409 Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Alessandro Bellina (https://github.com/abellina) - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15419 --- cpp/src/io/json/nested_json_gpu.cu | 3 + cpp/tests/io/json_test.cpp | 107 +++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index a6a57c36b08..4ddbe735963 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1583,6 +1583,9 @@ std::pair, rmm::device_uvector> ge thrust::make_discard_iterator(), fix_stack_of_excess_chars::start_state, stream); + + // Make sure memory of the FST's lookup tables isn't freed before the FST completes + stream.synchronize(); } constexpr auto max_translation_table_size = diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0b70e5e3f93..bae71d3c2a8 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -35,12 +36,15 @@ #include #include 
+#include + #include #include #include #include +#include #include #define wrapper cudf::test::fixed_width_column_wrapper @@ -2050,6 +2054,109 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } +// Sanity test that checks whether there's a race on the FST destructor +TEST_F(JsonReaderTest, JSONLinesRecoveringSync) +{ + // Set up host pinned memory pool to avoid implicit synchronizations to test for any potential + // races due to missing host-device synchronizations + using host_pooled_mr = rmm::mr::pool_memory_resource; + host_pooled_mr mr{std::make_shared().get(), + size_t{128} * 1024 * 1024}; + + // Set new resource + auto last_mr = cudf::io::set_host_memory_resource(mr); + + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + "\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + // Create input of a certain size to potentially reveal a missing host/device sync + std::size_t const target_size = 40000000; + auto const repetitions_log2 = + static_cast(std::ceil(std::log2(target_size / data.size()))); + auto const repetitions = 1ULL << repetitions_log2; + + for (std::size_t i = 0; i < repetitions_log2; ++i) { + data = data + "\n" + data; + } + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + 
cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8 * repetitions); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + std::vector a_data{-2, 0, 0, 0, 4, 5, 6, 0}; + std::vector b_a_data{0, 0, 3, 0, 0, 0, 0, 0}; + std::vector c_data{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}; + + for (std::size_t i = 0; i < repetitions_log2; ++i) { + a_validity.insert(a_validity.end(), a_validity.cbegin(), a_validity.cend()); + b_validity.insert(b_validity.end(), b_validity.cbegin(), b_validity.cend()); + c_validity.insert(c_validity.end(), c_validity.cbegin(), c_validity.cend()); + a_data.insert(a_data.end(), a_data.cbegin(), a_data.cend()); + b_a_data.insert(b_a_data.end(), b_a_data.cbegin(), b_a_data.cend()); + c_data.insert(c_data.end(), c_data.cbegin(), c_data.cend()); + } + + // Child column b->a + auto b_a_col = int64_wrapper(b_a_data.cbegin(), b_a_data.cend()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(0), int64_wrapper{a_data.cbegin(), a_data.cend(), a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{c_data.cbegin(), c_data.cend(), c_validity.cbegin()}); + + // Restore original memory source + 
cudf::io::set_host_memory_resource(last_mr); +} + TEST_F(JsonReaderTest, MixedTypes) { using LCWS = cudf::test::lists_column_wrapper; From 08ac1eb7832fe99f44b25f192d9931d393a96983 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Apr 2024 08:27:49 -1000 Subject: [PATCH 12/69] Bump ruff and codespell pre-commit checks (#15407) xref https://github.com/rapidsai/cudf/pull/15345#discussion_r1532379047 Before pursuing migrating isort to ruff, bumping ruff to the latest version Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15407 --- .pre-commit-config.yaml | 4 ++-- cpp/include/cudf/io/detail/parquet.hpp | 4 ++-- cpp/src/copying/contiguous_split.cu | 2 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- pyproject.toml | 8 +++++--- python/cudf/benchmarks/common/config.py | 3 ++- python/cudf/cudf/_fuzz_testing/utils.py | 6 +++--- python/cudf/cudf/core/buffer/buffer.py | 2 +- python/cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/methods.py | 12 ++++-------- python/cudf/cudf/core/column/string.py | 6 ++---- python/cudf/cudf/io/parquet.py | 6 +++--- .../cudf/pandas/scripts/analyze-test-failures.py | 3 ++- .../cudf/pandas/scripts/summarize-test-results.py | 3 ++- python/cudf/cudf/tests/test_index.py | 1 + python/cudf/cudf/tests/test_monotonic.py | 1 + python/cudf/cudf/tests/test_multiindex.py | 1 + python/cudf/cudf/utils/docutils.py | 1 + python/cudf/cudf/utils/dtypes.py | 2 +- 20 files changed, 36 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 06fdcb9f761..3e99cf3fa9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,7 +113,7 @@ repos: pass_filenames: false verbose: true - repo: 
https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: [tomli] @@ -129,7 +129,7 @@ repos: - id: rapids-dependency-file-generator args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.3.4 hooks: - id: ruff files: python/.*$ diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 0b8ee9676de..df870f6f1e4 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,7 +110,7 @@ class chunked_reader : private reader { * The chunk_read_limit parameter controls the size of the output chunks produces. If the user * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have * a total bytes size (over all columns) of 100 MB or less. This is a soft limit and the code - * will not fail if it cannot satisfy the limit. It will make a best-effort atttempt only. + * will not fail if it cannot satisfy the limit. It will make a best-effort attempt only. * * The pass_read_limit parameter controls how much temporary memory is used in the process of * decoding the file. The primary contributor to this memory usage is the uncompressed size of diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 23224d3225d..23bcd344a32 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1139,7 +1139,7 @@ struct packed_src_and_dst_pointers { /** * @brief Create an instance of `packed_src_and_dst_pointers` populating destination - * partitition buffers (if any) from `out_buffers`. 
In the chunked_pack case + * partition buffers (if any) from `out_buffers`. In the chunked_pack case * `out_buffers` is empty, and the destination pointer is provided separately * to the `copy_partitions` kernel. * diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index f5f540bc3a4..d54524f0f0d 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -194,7 +194,7 @@ aggregate_orc_metadata::select_stripes( } else { int64_t count = 0; int64_t stripe_skip_rows = 0; - // Iterate all source files, each source file has corelating metadata + // Iterate all source files, each source file has correlating metadata for (size_t src_file_idx = 0; src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read; ++src_file_idx) { diff --git a/pyproject.toml b/pyproject.toml index 28eac66c1d6..797b5374cb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,14 @@ exclude = [ skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp" # ignore short words, and typename parameters like OffsetT ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" -ignore-words-list = "inout,unparseable,falsy" +ignore-words-list = "inout,unparseable,falsy,couldn,Couldn" builtin = "clear" quiet-level = 3 [tool.ruff] +line-length = 79 + +[tool.ruff.lint] select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] ignore = [ # whitespace before : @@ -36,9 +39,8 @@ exclude = [ # TODO: Remove this in a follow-up where we fix __all__. 
"__init__.py", ] -line-length = 79 -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] "python/cudf/cudf/pandas/scripts/*" = ["D"] diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py index 305a21d0a29..c1e9d4d6116 100644 --- a/python/cudf/benchmarks/common/config.py +++ b/python/cudf/benchmarks/common/config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Module used for global configuration of benchmarks. @@ -20,6 +20,7 @@ in this file and import them in conftest.py to ensure that they are handled appropriately. """ + import os import sys diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 6e53195ac2d..d685174f3c2 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -99,9 +99,9 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): low=1, high=10 ) else: - meta[ - "max_types_at_each_level" - ] = obj._max_struct_types_at_each_level + meta["max_types_at_each_level"] = ( + obj._max_struct_types_at_each_level + ) elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 8d278c9c065..1631fa00412 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -181,7 +181,7 @@ def _from_host_memory(cls, data: Any) -> Self: Parameters ---------- data : Any - An object that represens host memory. + An object that represents host memory. 
Returns ------- diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b25af13679c..a9569190e75 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -154,7 +154,7 @@ def _from_host_memory(cls, data: Any) -> Self: Parameters ---------- data : Any - An object that represens host memory. + An object that represents host memory. Returns ------- diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 2a46654ccc2..e7119fcdf47 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -4,7 +4,6 @@ isort: skip_file """ - from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 0f5a0eb086b..e827c7a3dd3 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -26,8 +26,7 @@ def _return_or_inplace( inplace: Literal[True], expand: bool = False, retain_index: bool = True, - ) -> None: - ... + ) -> None: ... @overload def _return_or_inplace( @@ -36,8 +35,7 @@ def _return_or_inplace( inplace: Literal[False], expand: bool = False, retain_index: bool = True, - ) -> ParentType: - ... + ) -> ParentType: ... @overload def _return_or_inplace( @@ -45,8 +43,7 @@ def _return_or_inplace( new_col, expand: bool = False, retain_index: bool = True, - ) -> ParentType: - ... + ) -> ParentType: ... @overload def _return_or_inplace( @@ -55,8 +52,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[ParentType]: - ... + ) -> Optional[ParentType]: ... 
def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fb76fcdaf39..06d7aa030db 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -257,14 +257,12 @@ def byte_count(self) -> SeriesOrIndex: @overload def cat( self, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> str: - ... + ) -> str: ... @overload def cat( self, others, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: - ... + ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... def cat(self, others=None, sep=None, na_rep=None): """ diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bead9c352ef..e55898de675 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1220,9 +1220,9 @@ def __init__( ) -> None: if isinstance(path, str) and path.startswith("s3://"): self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: Optional[ - tempfile.TemporaryDirectory - ] = tempfile.TemporaryDirectory() + self.dir_: Optional[tempfile.TemporaryDirectory] = ( + tempfile.TemporaryDirectory() + ) self.path = self.dir_.name else: self.fs_meta = {} diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index f1744c9e92b..8870fbc5c28 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -11,6 +11,7 @@ Example: python analyze-test-failures.py log.json frame/* """ + import json import sys from collections import Counter diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index bfc56319d82..ffd2abb960d 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -10,6 +10,7 @@ python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table """ + import argparse import json diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05213d7601c..ebbca57bd40 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3,6 +3,7 @@ """ Test related to Index """ + import datetime import operator import re diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 53919a95115..3c627a5fe89 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -4,6 +4,7 @@ Tests related to is_unique, is_monotonic_increasing & is_monotonic_decreasing attributes """ + import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 4926d79e734..76a82afb78e 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -3,6 +3,7 @@ """ Test related to MultiIndex """ + import datetime import itertools import operator diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 
68447f423a4..4136d97d69f 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -3,6 +3,7 @@ """ Helper functions for parameterized docstring """ + import functools import re import string diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e9dbc23d767..8521239413e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -587,7 +587,7 @@ def find_common_type(dtypes): def _dtype_pandas_compatible(dtype): """ A utility function, that returns `str` instead of `object` - dtype when pandas comptibility mode is enabled. + dtype when pandas compatibility mode is enabled. """ if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"): return "str" From 08d86c92b3e3ccd950e4d63033d44675510cbb74 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 2 Apr 2024 12:29:43 -0700 Subject: [PATCH 13/69] Fix errors in chunked ORC writer when no tables were (successfully) written (#15393) Closes https://github.com/rapidsai/cudf/issues/15386, https://github.com/rapidsai/cudf/issues/15387 The fixes for the two issues overlap, so I included both in a single PR. Expanded the `_closed` flag to an enum that tracks if the operations in `close()` should be performed (one or more tables were written to the sink). This way, we don't perform the steps in close when there is no valid file to write the footer for. This includes: - No `write` calls; - All `write` calls failed; The new enum replaces `skip_close()` that used to fix this issue for a smaller subset of cases. Additionally, writing of the ORC header has been moved after the encode and uses the new state to only write the header in the first `write` call. This way we don't write anything to the sink if there were no `write` calls with the writer, and if the encode failed in the `write`s. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15393 --- cpp/include/cudf/io/detail/orc.hpp | 8 ----- cpp/src/io/functions.cpp | 11 +----- cpp/src/io/orc/writer_impl.cu | 29 +++++++-------- cpp/src/io/orc/writer_impl.hpp | 20 +++++------ cpp/tests/io/orc_test.cpp | 58 +++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 47 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 3c1486b60c2..c63c952e148 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -124,14 +124,6 @@ class writer { * @brief Finishes the chunked/streamed write process. */ void close(); - - /** - * @brief Skip work done in `close()`; should be called if `write()` failed. - * - * Calling skip_close() prevents the writer from writing the (invalid) file footer and the - * postscript. - */ - void skip_close(); }; } // namespace orc::detail } // namespace cudf::io diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b8353d312fe..46c6c67c8df 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -436,16 +436,7 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) auto writer = std::make_unique( std::move(sinks[0]), options, io_detail::single_write_mode::YES, stream); - try { - writer->write(options.get_table()); - } catch (...) { - // If an exception is thrown, the output is incomplete/corrupted. - // Make sure the writer will not close with such corrupted data. - // In addition, the writer may throw an exception while trying to close, which would terminate - // the process. 
- writer->skip_close(); - throw; - } + writer->write(options.get_table()); } /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ade0e75de35..750a593920c 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2438,7 +2438,6 @@ writer::impl::impl(std::unique_ptr sink, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } writer::impl::impl(std::unique_ptr sink, @@ -2460,20 +2459,13 @@ writer::impl::impl(std::unique_ptr sink, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } writer::impl::~impl() { close(); } -void writer::impl::init_state() -{ - // Write file header - _out_sink->host_write(MAGIC, std::strlen(MAGIC)); -} - void writer::impl::write(table_view const& input) { - CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); + CUDF_EXPECTS(_state != writer_state::CLOSED, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = make_table_meta(input); } @@ -2516,6 +2508,11 @@ void writer::impl::write(table_view const& input) } }(); + if (_state == writer_state::NO_DATA_WRITTEN) { + // Write the ORC file header if this is the first write + _out_sink->host_write(MAGIC, std::strlen(MAGIC)); + } + // Compression/encoding were all successful. Now write the intermediate results. 
write_orc_data_to_sink(enc_data, segmentation, @@ -2533,6 +2530,8 @@ void writer::impl::write(table_view const& input) // Update file-level and compression statistics update_statistics(orc_table.num_rows(), std::move(intermediate_stats), compression_stats); + + _state = writer_state::DATA_WRITTEN; } void writer::impl::update_statistics( @@ -2683,8 +2682,11 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table, void writer::impl::close() { - if (_closed) { return; } - _closed = true; + if (_state != writer_state::DATA_WRITTEN) { + // writer is either closed or no data has been written + _state = writer_state::CLOSED; + return; + } PostScript ps; if (_stats_freq != statistics_freq::STATISTICS_NONE) { @@ -2769,6 +2771,8 @@ void writer::impl::close() pbw.put_byte(ps_length); _out_sink->host_write(pbw.data(), pbw.size()); _out_sink->flush(); + + _state = writer_state::CLOSED; } // Forward to implementation @@ -2795,9 +2799,6 @@ writer::~writer() = default; // Forward to implementation void writer::write(table_view const& table) { _impl->write(table); } -// Forward to implementation -void writer::skip_close() { _impl->skip_close(); } - // Forward to implementation void writer::close() { _impl->close(); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 417d29efb58..bd082befe0c 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -227,6 +227,14 @@ struct encoded_footer_statistics { std::vector file_level; }; +enum class writer_state { + NO_DATA_WRITTEN, // No table data has been written to the sink; if the writer is closed or + // destroyed in this state, it should not write the footer. + DATA_WRITTEN, // At least one table has been written to the sink; when the writer is closed, + // it should write the footer. + CLOSED // Writer has been closed; no further writes are allowed. 
+}; + /** * @brief Implementation for ORC writer */ @@ -266,11 +274,6 @@ class writer::impl { */ ~impl(); - /** - * @brief Begins the chunked/streamed write process. - */ - void init_state(); - /** * @brief Writes a single subtable as part of a larger ORC file/table write. * @@ -283,11 +286,6 @@ class writer::impl { */ void close(); - /** - * @brief Skip writing the footer when closing/deleting the writer. - */ - void skip_close() { _closed = true; } - private: /** * @brief Write the intermediate ORC data into the data sink. @@ -363,7 +361,7 @@ class writer::impl { Footer _footer; Metadata _orc_meta; persisted_statistics _persisted_stripe_statistics; // Statistics data saved between calls. - bool _closed = false; // To track if the output has been written to sink. + writer_state _state = writer_state::NO_DATA_WRITTEN; }; } // namespace cudf::io::orc::detail diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 24e2e2cfea0..e108e68e1f9 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -2100,8 +2101,7 @@ TEST_F(OrcWriterTest, BounceBufferBug) auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); constexpr auto num_rows = 150000; - column_wrapper col(sequence, - sequence + num_rows); + column_wrapper col(sequence, sequence + num_rows); table_view expected({col}); auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc"); @@ -2120,8 +2120,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) static_assert(total_rows > std::numeric_limits::max()); auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; }); - column_wrapper col(sequence, - sequence + num_rows); + column_wrapper col(sequence, sequence + num_rows); table_view chunk_table({col}); std::vector out_buffer; @@ -2169,4 +2168,55 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_with_stripe_selection->view()); } +TEST_F(OrcChunkedWriterTest, NoWriteCloseNotThrow) +{ + std::vector out_buffer; + + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + EXPECT_NO_THROW(writer.close()); +} + +TEST_F(OrcChunkedWriterTest, FailedWriteCloseNotThrow) +{ + // A sink that throws on write() + class throw_sink : public cudf::io::data_sink { + public: + void host_write(void const* data, size_t size) override { throw std::runtime_error("write"); } + void flush() override {} + size_t bytes_written() override { return 0; } + }; + + auto sequence = thrust::make_counting_iterator(0); + column_wrapper col(sequence, sequence + 10); + table_view table({col}); + + throw_sink sink; + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&sink}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + try { + writer.write(table); + } catch (...) 
{ + // ignore the exception; we're testing that close() doesn't throw when the only write() fails + } + + EXPECT_NO_THROW(writer.close()); +} + +TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite) +{ + std::vector out_buffer; + + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + EXPECT_NO_THROW(writer.close()); + EXPECT_EQ(out_buffer.size(), 0); +} + CUDF_TEST_PROGRAM_MAIN() From 13a5c7be33bec538a9f81872471c29796e67bce5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:54:09 -0400 Subject: [PATCH 14/69] Rework cudf::replace_nulls to use strings::detail::copy_if_else (#15286) Removes the specialized kernels for strings in `cudf::replace_nulls` and replaces them with a call to `cudf::strings::detail::copy_if_else` which is already enabled with offsetalator support and optimized for long strings. This will also allow `cudf::replace_nulls` to use large strings with no further changes. Also includes a `replace_nulls` benchmark for strings. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15286 --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/replace/nulls.cpp | 59 ++++++++++++++ cpp/src/replace/nulls.cu | 127 +++++-------------------------- 3 files changed, 79 insertions(+), 110 deletions(-) create mode 100644 cpp/benchmarks/replace/nulls.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index c82e475dece..798e4e76141 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -208,8 +208,9 @@ ConfigureNVBench( ) # ################################################################################################## -# * reduction benchmark --------------------------------------------------------------------------- +# * replace benchmark --------------------------------------------------------------------------- ConfigureBench(REPLACE_BENCH replace/clamp.cpp replace/nans.cpp) +ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp) # ################################################################################################## # * filling benchmark ----------------------------------------------------------------------------- diff --git a/cpp/benchmarks/replace/nulls.cpp b/cpp/benchmarks/replace/nulls.cpp new file mode 100644 index 00000000000..ccd00050789 --- /dev/null +++ b/cpp/benchmarks/replace/nulls.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static void replace_nulls(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("row_width")); + + if (static_cast(n_rows) * static_cast(max_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); + + auto const input_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile); + auto const input = input_table->view().column(0); + auto const repl = input_table->view().column(1); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = cudf::strings_column_view(input).chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::replace_nulls(input, repl); }); +} + +NVBENCH_BENCH(replace_nulls) + .set_name("replace_nulls") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}); diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 014171f2b40..299cdc6a160 100644 --- 
a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -32,8 +32,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -56,63 +56,6 @@ namespace { // anonymous static constexpr int BLOCK_SIZE = 256; -template -CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::bitmask_type* output_valid, - cudf::size_type* offsets, - char* chars, - cudf::size_type* valid_counter) -{ - cudf::size_type nrows = input.size(); - auto i = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - - uint32_t active_mask = 0xffff'ffff; - active_mask = __ballot_sync(active_mask, i < nrows); - auto const lane_id{threadIdx.x % cudf::detail::warp_size}; - uint32_t valid_sum{0}; - - while (i < nrows) { - bool input_is_valid = input.is_valid_nocheck(i); - bool output_is_valid = true; - - if (replacement_has_nulls && !input_is_valid) { - output_is_valid = replacement.is_valid_nocheck(i); - } - - cudf::string_view out; - if (input_is_valid) { - out = input.element(i); - } else if (output_is_valid) { - out = replacement.element(i); - } - - bool nonzero_output = (input_is_valid || output_is_valid); - - if (phase == 0) { - offsets[i] = nonzero_output ? 
out.size_bytes() : 0; - uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); - if (0 == lane_id) { - output_valid[cudf::word_index(i)] = bitmask; - valid_sum += __popc(bitmask); - } - } else if (phase == 1) { - if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes()); - } - - i += stride; - active_mask = __ballot_sync(active_mask, i < nrows); - } - - // Compute total valid count for this block and add it to global count - uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); - // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { - atomicAdd(valid_counter, static_cast(block_valid_count)); - } -} - template CUDF_KERNEL void replace_nulls(cudf::column_device_view input, cudf::column_device_view replacement, @@ -222,58 +165,24 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - rmm::device_scalar valid_counter(0, stream); - cudf::size_type* valid_count = valid_counter.data(); - - auto replace_first = replace_nulls_strings<0, false>; - auto replace_second = replace_nulls_strings<1, false>; - if (replacement.has_nulls()) { - replace_first = replace_nulls_strings<0, true>; - replace_second = replace_nulls_strings<1, true>; + auto d_input = cudf::column_device_view::create(input, stream); + auto d_replacement = cudf::column_device_view::create(replacement, stream); + + auto lhs_iter = + cudf::detail::make_optional_iterator(*d_input, cudf::nullate::YES{}); + auto rhs_iter = cudf::detail::make_optional_iterator( + *d_replacement, cudf::nullate::DYNAMIC{replacement.nullable()}); + + auto filter = cudf::detail::validity_accessor{*d_input}; + auto result = cudf::strings::detail::copy_if_else( + lhs_iter, lhs_iter + input.size(), rhs_iter, filter, stream, mr); + + // input is nullable so result should always be nullable here + if (!result->nullable()) { + result->set_null_mask( + 
cudf::detail::create_null_mask(input.size(), cudf::mask_state::ALL_VALID, stream, mr), 0); } - - // Create new offsets column to use in kernel - std::unique_ptr sizes = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input.size(), cudf::mask_state::UNALLOCATED, stream); - - auto sizes_view = sizes->mutable_view(); - auto device_in = cudf::column_device_view::create(input, stream); - auto device_replacement = cudf::column_device_view::create(replacement, stream); - - rmm::device_buffer valid_bits = - cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); - - // Call first pass kernel to get sizes in offsets - cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; - replace_first<<>>( - *device_in, - *device_replacement, - reinterpret_cast(valid_bits.data()), - sizes_view.begin(), - nullptr, - valid_count); - - auto [offsets, bytes] = cudf::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), stream, mr); - - auto offsets_view = offsets->mutable_view(); - - // Allocate chars array and output null mask - rmm::device_uvector output_chars(bytes, stream, mr); - - replace_second<<>>( - *device_in, - *device_replacement, - reinterpret_cast(valid_bits.data()), - offsets_view.begin(), - output_chars.data(), - valid_count); - - return cudf::make_strings_column(input.size(), - std::move(offsets), - output_chars.release(), - input.size() - valid_counter.value(stream), - std::move(valid_bits)); + return result; } template <> From 2584fd9d1e1fffb2aefd0417ba0994d7a563e076 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 2 Apr 2024 16:39:46 -0700 Subject: [PATCH 15/69] Test static builds in CI and fix nanoarrow configure (#15437) Resolves #15275 Resolves #15434 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: 
https://github.com/rapidsai/cudf/pull/15437 --- .github/workflows/pr.yaml | 11 ++ .github/workflows/test.yaml | 10 ++ ci/configure_cpp_static.sh | 23 +++ cpp/cmake/thirdparty/get_nanoarrow.cmake | 20 +++ .../thirdparty/patches/nanoarrow_cmake.diff | 161 ++++++++++++++++++ dependencies.yaml | 18 +- 6 files changed, 239 insertions(+), 4 deletions(-) create mode 100755 ci/configure_cpp_static.sh create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 303988212d3..2d7ebb62fa8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,6 +20,7 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests + - static-configure - conda-notebook-tests - docs-build - wheel-build-cudf @@ -88,6 +89,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. + container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: conda-python-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6f7aef79881..ea47b6ad466 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -43,6 +43,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. 
+ container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh new file mode 100755 index 00000000000..675e0c3981f --- /dev/null +++ b/ci/configure_cpp_static.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-configure-conda-channels + +source rapids-date-string + +rapids-logger "Configure static cpp build" + +ENV_YAML_DIR="$(mktemp -d)" +REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" + +rapids-dependency-file-generator \ + --output requirements \ + --file_key test_static_build \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" + +python -m pip install -r "${REQUIREMENTS_FILE}" +pyenv rehash + +cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DBUILD_TESTS=OFF diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index be938a89ccd..4316db99a8d 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -17,6 +17,25 @@ function(find_and_configure_nanoarrow) set(oneValueArgs VERSION FORK PINNED_TAG) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + # Only run if PKG_VERSION is < 0.5.0 + if(PKG_VERSION VERSION_LESS 0.5.0) + set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff") + set(patch_issues_to_ref + "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]" + ) + set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake") + set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log") + string(TIMESTAMP current_year "%Y" UTC) + configure_file( + ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in 
"${patch_script}" @ONLY + ) + else() + message( + FATAL_ERROR + "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf." + ) + endif() + rapids_cpm_find( nanoarrow ${PKG_VERSION} GLOBAL_TARGETS nanoarrow @@ -26,6 +45,7 @@ function(find_and_configure_nanoarrow) # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin # to an actual tag. GIT_SHALLOW FALSE + PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff new file mode 100644 index 00000000000..b53e134ed2c --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff @@ -0,0 +1,161 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 8714c70..1feec13 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -49,7 +49,6 @@ else() + endif() + + option(NANOARROW_CODE_COVERAGE "Enable coverage reporting" OFF) +-add_library(coverage_config INTERFACE) + + # Avoids a warning about timestamps on downloaded files (prefer new policy + # if available)) +@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE) + if(NANOARROW_BUILD_TESTS) + include_directories(${CMAKE_BINARY_DIR}/amalgamation) + add_library(nanoarrow ${NANOARROW_C_TEMP}) ++ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) ++ + target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") + endif() + +@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE) + else() + add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c + src/nanoarrow/array_stream.c src/nanoarrow/utils.c) ++ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) + + target_include_directories(nanoarrow + PUBLIC $ +@@ -154,13 +156,50 @@ else() + endif() + endif() + +- install(TARGETS nanoarrow DESTINATION lib) ++ install(TARGETS nanoarrow ++ DESTINATION lib ++ EXPORT 
nanoarrow-exports) + install(DIRECTORY src/ + DESTINATION include + FILES_MATCHING +- PATTERN "*.h") ++ PATTERN "*.h*") + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h + DESTINATION include/nanoarrow) ++ ++ # Generate package files for the build and install trees. ++ include(CMakePackageConfigHelpers) ++ include(GNUInstallDirs) ++ ++ foreach(tree_type BUILD INSTALL) ++ if(tree_type STREQUAL "BUILD") ++ set(install_location ".") ++ else() ++ set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/nanoarrow") ++ endif() ++ ++ set(build_location "${PROJECT_BINARY_DIR}/${install_location}") ++ write_basic_package_version_file( ++ "${build_location}/nanoarrow-config-version.cmake" ++ VERSION ${nanoarrow_VERSION} ++ # After 1.0.0, we can use `SameMajorVersion` here. ++ COMPATIBILITY ExactVersion) ++ configure_package_config_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.cmake.in" ++ "${build_location}/nanoarrow-config.cmake" ++ INSTALL_DESTINATION "${install_location}") ++ ++ if(tree_type STREQUAL "BUILD") ++ export(EXPORT nanoarrow-exports ++ FILE "${build_location}/nanoarrow-targets.cmake" ++ NAMESPACE nanoarrow::) ++ ++ else() ++ install(DIRECTORY "${build_location}/" DESTINATION "${install_location}") ++ install(EXPORT nanoarrow-exports ++ DESTINATION "${install_location}" ++ FILE "nanoarrow-targets.cmake" ++ NAMESPACE nanoarrow::) ++ endif() ++ endforeach() + endif() + + # Always build integration test if building tests +@@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) + src/nanoarrow/integration/c_data_integration_test.cc) + + if(NANOARROW_CODE_COVERAGE) +- target_compile_options(coverage_config INTERFACE -O0 -g --coverage) +- target_link_options(coverage_config INTERFACE --coverage) +- target_link_libraries(nanoarrow coverage_config) ++ target_compile_options(nanoarrow PUBLIC -O0 -g --coverage) ++ target_link_options(nanoarrow PUBLIC --coverage) + endif() + +- target_link_libraries(utils_test +- nanoarrow +- gtest_main +- 
${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(buffer_test nanoarrow gtest_main coverage_config) +- target_link_libraries(array_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(schema_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(array_stream_test nanoarrow gtest_main coverage_config) +- target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main coverage_config) +- target_link_libraries(nanoarrow_testing_test +- nanoarrow +- gtest_main +- nlohmann_json::nlohmann_json +- coverage_config) ++ target_link_libraries(utils_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(buffer_test nanoarrow gtest_main) ++ target_link_libraries(array_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(schema_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(array_stream_test nanoarrow gtest_main) ++ target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main) ++ target_link_libraries(nanoarrow_testing_test nanoarrow gtest_main ++ nlohmann_json::nlohmann_json) + target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration + gtest_main) + +diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in +new file mode 100644 +index 0000000..021dc31 +--- /dev/null ++++ b/cmake/config.cmake.in +@@ -0,0 +1,28 @@ ++# Licensed to the Apache Software Foundation (ASF) under one ++# or more contributor license agreements. See the NOTICE file ++# distributed with this work for additional information ++# regarding copyright ownership. The ASF licenses this file ++# to you under the Apache License, Version 2.0 (the ++# "License"); you may not use this file except in compliance ++# with the License. 
You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, ++# software distributed under the License is distributed on an ++# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY ++# KIND, either express or implied. See the License for the ++# specific language governing permissions and limitations ++# under the License. ++ ++ ++@PACKAGE_INIT@ ++ ++cmake_minimum_required(VERSION @CMAKE_MINIMUM_REQUIRED_VERSION@) ++ ++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-targets.cmake" REQUIRED) ++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-config-version.cmake" REQUIRED) ++ ++set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") ++include(FindPackageHandleStandardArgs) ++find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE) diff --git a/dependencies.yaml b/dependencies.yaml index 85f5a86d938..5bb555df818 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,6 +6,7 @@ files: cuda: ["11.8", "12.2"] arch: [x86_64] includes: + - build_base - build_all - build_cpp - build_wheels @@ -27,6 +28,10 @@ files: - test_python_cudf - test_python_dask_cudf - depends_on_cupy + test_static_build: + output: none + includes: + - build_base test_cpp: output: none includes: @@ -45,6 +50,7 @@ files: test_java: output: none includes: + - build_base - build_all - cuda - cuda_version @@ -75,6 +81,7 @@ files: extras: table: build-system includes: + - build_base - build_python_common - build_python_cudf py_run_cudf: @@ -144,6 +151,7 @@ files: extras: table: build-system includes: + - build_base - build_python_common py_run_cudf_kafka: output: pyproject @@ -191,12 +199,16 @@ channels: - conda-forge - nvidia dependencies: - build_all: + build_base: common: - - output_types: conda + - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.26.4 - &ninja ninja + build_all: + common: + - output_types: conda + packages: - c-compiler - 
cxx-compiler - dlpack>=0.8,<1.0 @@ -254,9 +266,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - *cmake_ver - cython>=3.0.3 - - *ninja # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==14.0.2.* From 082f6c91eb3906dbdf785348160ad5631ec91458 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:27:47 -0400 Subject: [PATCH 16/69] Use offsetalator in cudf::strings::replace functions (#14824) Adds offsetalator in place of hardcoded offset size_type arrays to the strings replace functions. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14824 --- cpp/src/strings/replace/multi.cu | 236 +++---- cpp/src/strings/replace/replace.cu | 791 +++++++++-------------- cpp/src/strings/replace/replace_nulls.cu | 12 +- cpp/src/strings/replace/replace_slice.cu | 25 +- 4 files changed, 463 insertions(+), 601 deletions(-) diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 8b5a4317b50..c93add01f69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include "strings/split/split.cuh" + #include #include -#include #include #include +#include #include -#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -67,7 +69,7 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; * @brief Type used for holding the target position (first) and the * target index (second). 
*/ -using target_pair = thrust::pair; +using target_pair = thrust::tuple; /** * @brief Helper functions for performing character-parallel replace @@ -75,12 +77,6 @@ using target_pair = thrust::pair; struct replace_multi_parallel_fn { __device__ char const* get_base_ptr() const { return d_strings.head(); } - __device__ size_type const* get_offsets_ptr() const - { - return d_strings.child(strings_column_view::offsets_column_index).data() + - d_strings.offset(); - } - __device__ string_view const get_string(size_type idx) const { return d_strings.element(idx); @@ -100,11 +96,12 @@ struct replace_multi_parallel_fn { * @param idx Index of the byte position in the chars column * @param chars_bytes Number of bytes in the chars column */ - __device__ thrust::optional has_target(size_type idx, size_type chars_bytes) const + __device__ size_type target_index(int64_t idx, int64_t chars_bytes) const { - auto const d_offsets = get_offsets_ptr(); + auto const d_offsets = d_strings_offsets; auto const d_chars = get_base_ptr() + d_offsets[0] + idx; size_type str_idx = -1; + string_view d_str{}; for (std::size_t t = 0; t < d_targets.size(); ++t) { auto const d_tgt = d_targets[t]; if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && @@ -113,12 +110,24 @@ struct replace_multi_parallel_fn { auto const idx_itr = thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); str_idx = thrust::distance(d_offsets, idx_itr) - 1; + d_str = get_string(str_idx - d_offsets[0]); } - auto const d_str = get_string(str_idx - d_offsets[0]); if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return t; } } } - return thrust::nullopt; + return -1; + } + + __device__ bool has_target(int64_t idx, int64_t chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx; + for (auto& d_tgt : d_targets) { + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) 
{ + return true; + } + } + return false; } /** @@ -133,28 +142,32 @@ struct replace_multi_parallel_fn { * @return Number of substrings resulting from the replace operations on this row */ __device__ size_type count_strings(size_type idx, - target_pair const* d_positions, - size_type const* d_targets_offsets) const + int64_t const* d_positions, + size_type const* d_indices, + cudf::detail::input_offsetalator d_targets_offsets) const { if (!is_valid(idx)) { return 0; } - auto const d_str = get_string(idx); - auto const d_str_end = d_str.data() + d_str.size_bytes(); - auto const base_ptr = get_base_ptr(); - auto const targets_positions = cudf::device_span( - d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + auto const indices = d_indices + target_offset; size_type count = 1; // always at least one string auto str_ptr = d_str.data(); - for (auto d_pair : targets_positions) { - auto const d_pos = d_pair.first; - auto const d_tgt = d_targets[d_pair.second]; - auto const tgt_ptr = base_ptr + d_pos; + for (std::size_t i = 0; i < targets_size; ++i) { + auto const tgt_idx = indices[i]; + auto const d_tgt = d_targets[tgt_idx]; + auto const tgt_ptr = base_ptr + positions[i]; if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); if (keep_size > 0) { count++; } // don't bother counting empty strings - auto const d_repl = get_replacement_string(d_pair.second); + auto const d_repl = get_replacement_string(tgt_idx); if (!d_repl.empty()) { count++; } str_ptr += keep_size + d_tgt.size_bytes(); @@ -182,9 +195,10 @@ struct replace_multi_parallel_fn { 
* @return The size in bytes of the output string for this row */ __device__ size_type get_strings(size_type idx, - size_type const* d_offsets, - target_pair const* d_positions, - size_type const* d_targets_offsets, + cudf::detail::input_offsetalator const d_offsets, + int64_t const* d_positions, + size_type const* d_indices, + cudf::detail::input_offsetalator d_targets_offsets, string_index_pair* d_all_strings) const { if (!is_valid(idx)) { return 0; } @@ -194,22 +208,24 @@ struct replace_multi_parallel_fn { auto const d_str_end = d_str.data() + d_str.size_bytes(); auto const base_ptr = get_base_ptr(); - auto const targets_positions = cudf::device_span( - d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + auto const indices = d_indices + target_offset; size_type output_idx = 0; size_type output_size = 0; auto str_ptr = d_str.data(); - for (auto d_pair : targets_positions) { - auto const d_pos = d_pair.first; - auto const d_tgt = d_targets[d_pair.second]; - auto const tgt_ptr = base_ptr + d_pos; + for (std::size_t i = 0; i < targets_size; ++i) { + auto const tgt_idx = indices[i]; + auto const d_tgt = d_targets[tgt_idx]; + auto const tgt_ptr = base_ptr + positions[i]; if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } output_size += keep_size; - auto const d_repl = get_replacement_string(d_pair.second); + auto const d_repl = get_replacement_string(tgt_idx); if (!d_repl.empty()) { d_output[output_idx++] = string_index_pair{d_repl.data(), d_repl.size_bytes()}; } @@ -228,14 +244,19 @@ struct replace_multi_parallel_fn { } replace_multi_parallel_fn(column_device_view const& d_strings, + 
cudf::detail::input_offsetalator d_strings_offsets, device_span d_targets, device_span d_replacements) - : d_strings(d_strings), d_targets{d_targets}, d_replacements{d_replacements} + : d_strings(d_strings), + d_strings_offsets(d_strings_offsets), + d_targets{d_targets}, + d_replacements{d_replacements} { } protected: column_device_view d_strings; + cudf::detail::input_offsetalator d_strings_offsets; device_span d_targets; device_span d_replacements; }; @@ -247,17 +268,16 @@ struct replace_multi_parallel_fn { * (this happens sometimes when passing device lambdas to thrust algorithms) */ struct pair_generator { - __device__ target_pair operator()(int idx) const + __device__ target_pair operator()(int64_t idx) const { - auto pos = fn.has_target(idx, chars_bytes); - return target_pair{idx, pos.value_or(-1)}; + return thrust::make_tuple(idx, fn.target_index(idx, chars_bytes)); } replace_multi_parallel_fn fn; - size_type chars_bytes; + int64_t chars_bytes; }; struct copy_if_fn { - __device__ bool operator()(target_pair pos) { return pos.second >= 0; } + __device__ bool operator()(target_pair pos) { return thrust::get<1>(pos) >= 0; } }; std::unique_ptr replace_character_parallel(strings_column_view const& input, @@ -270,105 +290,91 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in auto const strings_count = input.size(); auto const chars_bytes = - cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - - cudf::detail::get_value(input.offsets(), input.offset(), stream); + get_offset_value(input.offsets(), input.offset() + strings_count, stream) - + get_offset_value(input.offsets(), input.offset(), stream); auto d_targets = create_string_vector_from_column(targets, stream, rmm::mr::get_current_device_resource()); auto d_replacements = create_string_vector_from_column(repls, stream, rmm::mr::get_current_device_resource()); - replace_multi_parallel_fn fn{*d_strings, d_targets, d_replacements}; + replace_multi_parallel_fn 
fn{ + *d_strings, + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()), + d_targets, + d_replacements, + }; + + // Count the number of targets in the entire column. + // Note this may over-count in the case where a target spans adjacent strings. + auto target_count = thrust::count_if( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); }); - // count the number of targets in the entire column - auto const target_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [fn, chars_bytes] __device__(size_type idx) { - return fn.has_target(idx, chars_bytes).has_value(); - }); // Create a vector of every target position in the chars column. - // These may include overlapping targets which will be resolved later. - auto targets_positions = rmm::device_uvector(target_count, stream); + // These may also include overlapping targets which will be resolved later. 
+ auto targets_positions = rmm::device_uvector(target_count, stream); + auto targets_indices = rmm::device_uvector(target_count, stream); + + // cudf::detail::make_counting_transform_iterator hardcodes size_type + auto const copy_itr = thrust::make_transform_iterator(thrust::counting_iterator(0), + pair_generator{fn, chars_bytes}); + auto const out_itr = thrust::make_zip_iterator( + thrust::make_tuple(targets_positions.begin(), targets_indices.begin())); + auto const copy_end = + cudf::detail::copy_if_safe(copy_itr, copy_itr + chars_bytes, out_itr, copy_if_fn{}, stream); + + // adjust target count since the copy-if may have eliminated some invalid targets + target_count = std::min(static_cast(std::distance(out_itr, copy_end)), target_count); + targets_positions.resize(target_count, stream); + targets_indices.resize(target_count, stream); auto d_positions = targets_positions.data(); - - auto const copy_itr = - cudf::detail::make_counting_transform_iterator(0, pair_generator{fn, chars_bytes}); - auto const copy_end = thrust::copy_if( - rmm::exec_policy(stream), copy_itr, copy_itr + chars_bytes, d_positions, copy_if_fn{}); + auto d_targets_indices = targets_indices.data(); // create a vector of offsets to each string's set of target positions - auto const targets_offsets = [&] { - auto string_indices = rmm::device_uvector(target_count, stream); - - auto const pos_itr = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type([d_positions] __device__(auto idx) -> int64_t { - return d_positions[idx].first; - })); - auto pos_count = std::distance(d_positions, copy_end); - - auto begin = - cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - auto end = begin + input.offsets().size(); - thrust::upper_bound( - rmm::exec_policy(stream), begin, end, pos_itr, pos_itr + pos_count, string_indices.begin()); - - // compute offsets per string - auto targets_offsets = rmm::device_uvector(strings_count + 1, stream); - auto 
d_targets_offsets = targets_offsets.data(); - - // memset to zero-out the target counts for any null-entries or strings with no targets - thrust::uninitialized_fill( - rmm::exec_policy(stream), targets_offsets.begin(), targets_offsets.end(), 0); - - // next, count the number of targets per string - auto d_string_indices = string_indices.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - target_count, - [d_string_indices, d_targets_offsets] __device__(size_type idx) { - auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_targets_offsets + str_idx, 1); - }); - // finally, convert the counts into offsets - thrust::exclusive_scan(rmm::exec_policy(stream), - targets_offsets.begin(), - targets_offsets.end(), - targets_offsets.begin()); - return targets_offsets; - }(); - auto const d_targets_offsets = targets_offsets.data(); + auto const targets_offsets = create_offsets_from_positions( + input, targets_positions, stream, rmm::mr::get_current_device_resource()); + auto const d_targets_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view()); // compute the number of string segments produced by replace in each string auto counts = rmm::device_uvector(strings_count, stream); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count), counts.begin(), cuda::proclaim_return_type( - [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { - return fn.count_strings(idx, d_positions, d_targets_offsets); + [fn, d_positions, d_targets_indices, d_targets_offsets] __device__( + size_type idx) -> size_type { + return fn.count_strings( + idx, d_positions, d_targets_indices, d_targets_offsets); })); // create offsets from the counts - auto offsets = - 
std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); - auto const total_strings = - cudf::detail::get_value(offsets->view(), strings_count, stream); - auto const d_strings_offsets = offsets->view().data(); + auto [offsets, total_strings] = + cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + auto const d_strings_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); // build a vector of all the positions for all the strings auto indices = rmm::device_uvector(total_strings, stream); auto d_indices = indices.data(); auto d_sizes = counts.data(); // reusing this vector to hold output sizes now thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), strings_count, - [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( - size_type idx) { - d_sizes[idx] = - fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); + [fn, + d_strings_offsets, + d_positions, + d_targets_indices, + d_targets_offsets, + d_indices, + d_sizes] __device__(size_type idx) { + d_sizes[idx] = fn.get_strings( + idx, d_strings_offsets, d_positions, d_targets_indices, d_targets_offsets, d_indices); }); // use this utility to gather the string parts into a contiguous chars column @@ -376,8 +382,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in auto chars_data = chars->release().data; // create offsets from the sizes - offsets = - std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); + offsets = std::get<0>( + cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); // build the strings columns from the chars and offsets return make_strings_column(strings_count, diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 
1f752f543d0..2c548f2f7cd 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -14,20 +14,21 @@ * limitations under the License. */ +#include "strings/split/split.cuh" + #include #include -#include #include #include -#include +#include #include #include +#include #include #include #include #include #include -#include #include #include @@ -39,11 +40,7 @@ #include #include #include -#include #include -#include -#include -#include #include namespace cudf { @@ -52,505 +49,375 @@ namespace detail { namespace { /** - * @brief Average string byte-length threshold for deciding character-level vs row-level parallel - * algorithm. + * @brief Threshold to decide on using string or character-parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the character-parallel function is used. + * Otherwise, a regular string-parallel function is used. * - * This value was determined by running the replace string scalar benchmark against different - * power-of-2 string lengths and observing the point at which the performance only improved for - * all trials. + * This value was found using the replace-multi benchmark results using an + * RTX A6000. */ -constexpr size_type BYTES_PER_VALID_ROW_THRESHOLD = 64; +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; /** - * @brief Function logic for the row-level parallelism replace API. - * - * This will perform a replace operation on each string. 
+ * @brief Helper functions for performing character-parallel replace */ -struct replace_row_parallel_fn { - column_device_view const d_strings; - string_view const d_target; - string_view const d_repl; - int32_t const max_repl; - int32_t* d_offsets{}; - char* d_chars{}; +struct replace_parallel_chars_fn { + __device__ inline char const* get_base_ptr() const { return d_strings.head(); } - __device__ void operator()(size_type idx) + __device__ inline string_view const get_string(size_type idx) const { - if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - char const* in_ptr = d_str.data(); - - char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; - auto max_n = (max_repl < 0) ? d_str.length() : max_repl; - auto bytes = d_str.size_bytes(); - auto position = d_str.find(d_target); - - size_type last_pos = 0; - while ((position != string_view::npos) && (max_n > 0)) { - if (out_ptr) { - auto const curr_pos = d_str.byte_offset(position); - out_ptr = copy_and_increment(out_ptr, in_ptr + last_pos, curr_pos - last_pos); // copy left - out_ptr = copy_string(out_ptr, d_repl); // copy repl - last_pos = curr_pos + d_target.size_bytes(); - } else { - bytes += d_repl.size_bytes() - d_target.size_bytes(); - } - position = d_str.find(d_target, position + d_target.length()); - --max_n; - } - if (out_ptr) // copy whats left (or right depending on your point of view) - memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos); - else - d_offsets[idx] = bytes; + return d_strings.element(idx); } -}; -/** - * @brief Functor for detecting falsely-overlapped target positions. - * - * This functor examines target positions that have been flagged as potentially overlapped by - * a previous target position and identifies the overlaps that are false. A false overlap can occur - * when a target position is overlapped by another target position that is itself overlapped. 
- * - * For example, a target string of "+++" and string to search of "++++++" will generate 4 potential - * target positions at char offsets 0 through 3. The targets at offsets 1, 2, and 3 will be flagged - * as potential overlaps since a prior target position is within range of the target string length. - * The targets at offset 1 and 2 are true overlaps, since the footprint of the valid target at - * offset 0 overlaps with them. The target at offset 3 is not truly overlapped because it is only - * overlapped by invalid targets, targets that were themselves overlapped by a valid target. - */ -struct target_false_overlap_filter_fn { - size_type const* const d_overlap_pos_indices{}; - size_type const* const d_target_positions{}; - size_type const target_size{}; + __device__ inline bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - __device__ bool operator()(size_type overlap_idx) const + /** + * @brief Returns true if the target string is found at the given byte position + * in the input strings column and is legally within a string row + * + * @param idx Index of the byte position in the chars column + */ + __device__ bool is_target_within_row(int64_t idx) const { - if (overlap_idx == 0) { - // The first overlap has no prior overlap to chain, so it should be kept as an overlap. 
- return false; + auto const d_offsets = d_strings_offsets; + auto const d_chars = get_base_ptr() + idx; + auto const d_tgt = d_target; + auto const chars_end = chars_bytes + d_offsets[0]; + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_end) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) { + auto const idx_itr = + thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); + auto str_idx = static_cast(thrust::distance(d_offsets, idx_itr) - 1); + auto d_str = get_string(str_idx); + if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return true; } } + return false; + } - size_type const this_pos_idx = d_overlap_pos_indices[overlap_idx]; - - // Searching backwards for the first target position index of an overlap that is not adjacent - // to its overlap predecessor. The result will be the first overlap in this chain of overlaps. - size_type first_overlap_idx = overlap_idx; - size_type first_pos_idx = this_pos_idx; - while (first_overlap_idx > 0) { - size_type prev_pos_idx = d_overlap_pos_indices[--first_overlap_idx]; - if (prev_pos_idx + 1 != first_pos_idx) { break; } - first_pos_idx = prev_pos_idx; - } + /** + * @brief Returns true if the target string found at the given byte position + * + * @param idx Index of the byte position in the chars column + */ + __device__ bool has_target(int64_t idx) const + { + auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx; + return (!d_target.empty() && (idx + d_target.size_bytes() <= chars_bytes) && + (d_target.compare(d_chars, d_target.size_bytes()) == 0)); + } - // The prior target position to the first overlapped position in the chain is a valid target. - size_type valid_pos_idx = first_pos_idx - 1; - size_type valid_pos = d_target_positions[valid_pos_idx]; - - // Walk forward from this valid target. Any targets within the range of this valid one are true - // overlaps. 
The first overlap beyond the range of this valid target is another valid target, - // as it was falsely overlapped by a target that was itself overlapped. Repeat until we get to - // the overlapped position being queried by this call. - while (valid_pos_idx < this_pos_idx) { - size_type next_pos_idx = valid_pos_idx + 1; - size_type next_pos = d_target_positions[next_pos_idx]; - // Every target position within the range of a valid target position is a true overlap. - while (next_pos < valid_pos + target_size) { - if (next_pos_idx == this_pos_idx) { return false; } - next_pos = d_target_positions[++next_pos_idx]; + /** + * @brief Count the number of strings that will be produced by the replace + * + * This includes segments of the string that are not replaced as well as those + * that are replaced. + * + * @param idx Index of the row in d_strings to be processed + * @param d_positions Positions of the targets found in the chars column + * @param d_targets_offsets Offsets identify which target positions go with the current string + * @return Number of substrings resulting from the replace operations on this row + */ + __device__ size_type count_strings(size_type idx, + int64_t const* d_positions, + cudf::detail::input_offsetalator d_targets_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto max_n = (maxrepl < 0) ? 
d_str.length() : maxrepl; + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + + size_type count = 1; // always at least one string + auto str_ptr = d_str.data(); + for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) { + auto const tgt_ptr = base_ptr + positions[i]; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { count++; } // don't bother counting empty strings + if (!d_replacement.empty()) { count++; } + str_ptr += keep_size + d_target.size_bytes(); + --max_n; } - valid_pos_idx = next_pos_idx; - valid_pos = next_pos; } - - // This was overlapped only by false overlaps and therefore is a valid target. - return true; + return count; } -}; -/** - * @brief Functor for replacing each target string with the replacement string. - * - * This will perform a replace operation at each target position. - */ -struct target_replacer_fn { - device_span const d_target_positions; - char const* const d_in_chars{}; - char* const d_out_chars{}; - size_type const target_size{}; - string_view const d_repl; - int32_t const in_char_offset = 0; - - __device__ void operator()(size_type input_idx) const + /** + * @brief Retrieve the strings for each row + * + * This will return string segments as string_index_pair objects for + * parts of the string that are not replaced interlaced with the + * appropriate replacement string where replacement targets are found. + * + * This function is called only once to produce both the string_index_pair objects + * and the output row size in bytes. 
+ * + * @param idx Index of the row in d_strings + * @param d_offsets Offsets to identify where to store the results of the replace for this string + * @param d_positions The target positions found in the chars column + * @param d_targets_offsets The offsets to identify which target positions go with this string + * @param d_all_strings The output of all the produced string segments + * @return The size in bytes of the output string for this row + */ + __device__ size_type get_strings(size_type idx, + cudf::detail::input_offsetalator const d_offsets, + int64_t const* d_positions, + cudf::detail::input_offsetalator d_targets_offsets, + string_index_pair* d_all_strings) const { - // Calculate the adjustment from input index to output index for each prior target position. - auto const repl_size = d_repl.size_bytes(); - auto const idx_delta_per_pos = repl_size - target_size; - - // determine the number of target positions at or before this character position - size_type const* next_target_pos_ptr = thrust::upper_bound( - thrust::seq, d_target_positions.begin(), d_target_positions.end(), input_idx); - size_type const num_prev_targets = next_target_pos_ptr - d_target_positions.data(); - size_type output_idx = input_idx - in_char_offset + idx_delta_per_pos * num_prev_targets; - - if (num_prev_targets == 0) { - // not within a target string - d_out_chars[output_idx] = d_in_chars[input_idx]; - } else { - // check if this input position is within a target string - size_type const prev_target_pos = *(next_target_pos_ptr - 1); - size_type target_idx = input_idx - prev_target_pos; - if (target_idx < target_size) { - // within the target string, so the original calculation was off by one target string - output_idx -= idx_delta_per_pos; - - // Copy the corresponding byte from the replacement string. If the replacement string is - // larger than the target string then the thread reading the last target byte is - // responsible for copying the remainder of the replacement string. 
- if (target_idx < repl_size) { - d_out_chars[output_idx++] = d_repl.data()[target_idx++]; - if (target_idx == target_size) { - memcpy(d_out_chars + output_idx, d_repl.data() + target_idx, repl_size - target_idx); - } + if (!is_valid(idx)) { return 0; } + + auto const d_output = d_all_strings + d_offsets[idx]; + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto max_n = (maxrepl < 0) ? d_str.length() : maxrepl; + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + + size_type output_idx = 0; + size_type output_size = 0; + auto str_ptr = d_str.data(); + for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) { + auto const tgt_ptr = base_ptr + positions[i]; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } + output_size += keep_size; + + if (!d_replacement.empty()) { + d_output[output_idx++] = + string_index_pair{d_replacement.data(), d_replacement.size_bytes()}; } - } else { - // not within a target string - d_out_chars[output_idx] = d_in_chars[input_idx]; + output_size += d_replacement.size_bytes(); + + str_ptr += keep_size + d_target.size_bytes(); + --max_n; } } + // include any leftover parts of the string + if (str_ptr <= d_str_end) { + auto const left_size = static_cast(thrust::distance(str_ptr, d_str_end)); + d_output[output_idx] = string_index_pair{str_ptr, left_size}; + output_size += left_size; + } + return output_size; } + + replace_parallel_chars_fn(column_device_view const& d_strings, + cudf::detail::input_offsetalator d_strings_offsets, + int64_t chars_bytes, + string_view d_target, + string_view d_replacement, + cudf::size_type maxrepl) + : 
d_strings(d_strings), + d_strings_offsets(d_strings_offsets), + chars_bytes(chars_bytes), + d_target{d_target}, + d_replacement{d_replacement}, + maxrepl(maxrepl) + { + } + + protected: + column_device_view d_strings; + cudf::detail::input_offsetalator d_strings_offsets; + int64_t chars_bytes; + string_view d_target; + string_view d_replacement; + cudf::size_type maxrepl; }; -/** - * @brief Filter target positions that are overlapped by other, valid target positions. - * - * This performs an in-place modification of the target positions to remove any target positions - * that are overlapped by other, valid target positions. For example, if the target string is "++" - * and the string to search is "+++" then there will be two potential targets at character offsets - * 0 and 1. The target at offset 0 is valid and overlaps the target at offset 1, invalidating the - * target at offset 1. - * - * @param[in,out] d_target_positions Potential target positions to filter in-place. - * @param[in] target_count Number of potential target positions. - * @param[in] target_size Size of the target string in bytes. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. - */ -size_type filter_overlap_target_positions(size_type* d_target_positions, - size_type target_count, - size_type target_size, - rmm::cuda_stream_view stream) +std::unique_ptr replace_character_parallel(strings_column_view const& input, + string_view const& d_target, + string_view const& d_replacement, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto overlap_detector = [d_target_positions, target_size] __device__(size_type pos_idx) -> bool { - return (pos_idx > 0) - ? 
d_target_positions[pos_idx] - d_target_positions[pos_idx - 1] < target_size - : false; - }; - - // count the potential number of overlapped target positions - size_type overlap_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(target_count), - overlap_detector); - if (overlap_count == 0) { return target_count; } - - // create a vector indexing the potential overlapped target positions - rmm::device_uvector potential_overlapped_pos_indices(overlap_count, stream); - auto d_potential_overlapped_pos_indices = potential_overlapped_pos_indices.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(target_count), - d_potential_overlapped_pos_indices, - overlap_detector); - - // filter out the false overlaps that are actually valid - rmm::device_uvector overlapped_pos_indices(overlap_count, stream); - auto d_overlapped_pos_indices = overlapped_pos_indices.data(); - auto overlap_end = - thrust::remove_copy_if(rmm::exec_policy(stream), - d_potential_overlapped_pos_indices, - d_potential_overlapped_pos_indices + overlap_count, - thrust::make_counting_iterator(0), - d_overlapped_pos_indices, - target_false_overlap_filter_fn{ - d_potential_overlapped_pos_indices, d_target_positions, target_size}); - overlap_count = cudf::distance(d_overlapped_pos_indices, overlap_end); - - // In-place remove any target positions that are overlapped by valid target positions - auto target_pos_end = thrust::remove_if( - rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_count, + auto d_strings = column_device_view::create(input.parent(), stream); + + auto const strings_count = input.size(); + auto const chars_offset = get_offset_value(input.offsets(), input.offset(), stream); + auto const chars_bytes = + get_offset_value(input.offsets(), input.offset() + strings_count, stream) - chars_offset; + + auto const offsets_begin = + 
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + replace_parallel_chars_fn fn{ + *d_strings, offsets_begin, chars_bytes, d_target, d_replacement, maxrepl}; + + // Count the number of targets in the entire column. + // Note this may over-count in the case where a target spans adjacent strings. + auto target_count = thrust::count_if(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [fn] __device__(int64_t idx) { return fn.has_target(idx); }); + + // Create a vector of every target position in the chars column. + // These may also include overlapping targets which will be resolved later. + auto targets_positions = rmm::device_uvector(target_count, stream); + auto const copy_itr = thrust::counting_iterator(chars_offset); + auto const copy_end = cudf::detail::copy_if_safe( + copy_itr, + copy_itr + chars_bytes + chars_offset, + targets_positions.begin(), + [fn] __device__(int64_t idx) { return fn.is_target_within_row(idx); }, + stream); + + // adjust target count since the copy-if may have eliminated some invalid targets + target_count = std::min(std::distance(targets_positions.begin(), copy_end), target_count); + targets_positions.resize(target_count, stream); + auto d_positions = targets_positions.data(); + + // create a vector of offsets to each string's set of target positions + auto const targets_offsets = create_offsets_from_positions( + input, targets_positions, stream, rmm::mr::get_current_device_resource()); + auto const d_targets_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view()); + + // compute the number of string segments produced by replace in each string + auto counts = rmm::device_uvector(strings_count, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count), + counts.begin(), + cuda::proclaim_return_type( + [fn, 
d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { + return fn.count_strings(idx, d_positions, d_targets_offsets); + })); + + // create offsets from the counts + auto [offsets, total_strings] = + cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + auto const d_strings_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); + + // build a vector of all the positions for all the strings + auto indices = rmm::device_uvector(total_strings, stream); + auto d_indices = indices.data(); + auto d_sizes = counts.data(); // reusing this vector to hold output sizes now + thrust::for_each_n( + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - [d_overlapped_pos_indices, overlap_count] __device__(size_type target_position_idx) -> bool { - return thrust::binary_search(thrust::seq, - d_overlapped_pos_indices, - d_overlapped_pos_indices + overlap_count, - target_position_idx); + strings_count, + [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( + size_type idx) { + d_sizes[idx] = + fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); }); - return cudf::distance(d_target_positions, target_pos_end); -} -/** - * @brief Filter target positions to remove any invalid target positions. - * - * This performs an in-place modification of the target positions to remove any target positions - * that are invalid, either by the target string overlapping a row boundary or being overlapped by - * another valid target string. - * - * @param[in,out] target_positions Potential target positions to filter in-place. - * @param[in] d_offsets_span Memory range encompassing the string column offsets. - * @param[in] target_size Size of the target string in bytes. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. 
- */ -size_type filter_false_target_positions(rmm::device_uvector& target_positions, - device_span d_offsets_span, - size_type target_size, - rmm::cuda_stream_view stream) -{ - // In-place remove any positions for target strings that crossed string boundaries. - auto d_target_positions = target_positions.data(); - auto target_pos_end = - thrust::remove_if(rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_positions.size(), - [d_offsets_span, target_size] __device__(size_type target_pos) -> bool { - // find the end of the string containing the start of this target - size_type const* offset_ptr = thrust::upper_bound( - thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos); - return target_pos + target_size > *offset_ptr; - }); - auto const target_count = cudf::distance(d_target_positions, target_pos_end); - if (target_count == 0) { return 0; } - - // Filter out target positions that are the result of overlapping target matches. - return (target_count > 1) - ? filter_overlap_target_positions(d_target_positions, target_count, target_size, stream) - : target_count; -} + // use this utility to gather the string parts into a contiguous chars column + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; -/** - * @brief Filter target positions beyond the maximum target replacements per row limit. - * - * This performs an in-place modification of the target positions to remove any target positions - * corresponding to targets that should not be replaced due to the maximum target replacement per - * row limit. - * - * @param[in,out] target_positions Target positions to filter in-place. - * @param[in] target_count Number of target positions. - * @param[in] d_offsets_span Memory range encompassing the string column offsets. - * @param[in] max_repl_per_row Maximum target replacements per row limit. - * @param[in] stream CUDA stream to use for device operations. 
- * @return Number of target positions after filtering. - */ -size_type filter_maxrepl_target_positions(size_type* d_target_positions, - size_type target_count, - device_span d_offsets_span, - size_type max_repl_per_row, - rmm::cuda_stream_view stream) -{ - auto pos_to_row_fn = cuda::proclaim_return_type( - [d_offsets_span] __device__(size_type target_pos) -> size_type { - auto upper_bound = - thrust::upper_bound(thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos); - return thrust::distance(d_offsets_span.begin(), upper_bound); - }); + // create offsets from the sizes + offsets = std::get<0>( + cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); - // compute the match count per row for each target position - rmm::device_uvector match_counts(target_count, stream); - auto d_match_counts = match_counts.data(); - thrust::inclusive_scan_by_key( - rmm::exec_policy(stream), - thrust::make_transform_iterator(d_target_positions, pos_to_row_fn), - thrust::make_transform_iterator(d_target_positions + target_count, pos_to_row_fn), - thrust::make_constant_iterator(1), - d_match_counts); - - // In-place remove any positions that exceed the per-row match limit - auto target_pos_end = - thrust::remove_if(rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_count, - d_match_counts, - [max_repl_per_row] __device__(size_type match_count) -> bool { - return match_count > max_repl_per_row; - }); - - return cudf::distance(d_target_positions, target_pos_end); + // build the strings columns from the chars and offsets + return make_strings_column(strings_count, + std::move(offsets), + std::move(chars_data.release()[0]), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** - * @brief Scalar string replacement using a character-level parallel algorithm. 
- * - * Replaces occurrences of the target string with the replacement string using an algorithm with - * character-level parallelism. This algorithm will perform well when the strings in the string - * column are relatively long. - * @see BYTES_PER_VALID_ROW_THRESHOLD + * @brief Function logic for the replace_string_parallel * - * @param strings String column to search for target strings. - * @param chars_start Offset of the first character in the string column. - * @param chars_end Offset beyond the last character in the string column to search. - * @param d_target String to search for within the string column. - * @param d_repl Replacement string if target string is found. - * @param maxrepl Maximum times to replace if target appears multiple times in a string. - * @param stream CUDA stream to use for device operations - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column. + * Performs the multi-replace operation with a thread per string. + * This performs best on smaller strings. 
@see AVG_CHAR_BYTES_THRESHOLD */ -std::unique_ptr replace_char_parallel(strings_column_view const& strings, - size_type chars_start, - size_type chars_end, - string_view const& d_target, - string_view const& d_repl, - int32_t maxrepl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const strings_count = strings.size(); - auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().begin() + strings.offset(); // TODO: PR 14824 - auto const d_in_chars = strings.chars_begin(stream); - auto const chars_bytes = chars_end - chars_start; - auto const target_size = d_target.size_bytes(); - - // detect a target match at the specified byte position - device_span const d_chars_span(d_in_chars, chars_end); - auto target_detector = [d_chars_span, d_target] __device__(size_type char_idx) { - auto target_size = d_target.size_bytes(); - auto target_ptr = d_chars_span.begin() + char_idx; - return target_ptr + target_size <= d_chars_span.end() && - d_target.compare(target_ptr, target_size) == 0; - }; - - // Count target string matches across all character positions, ignoring string boundaries and - // overlapping target strings. This may produce false-positives. 
- size_type target_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - target_detector); - if (target_count == 0) { - // nothing to replace, copy the input column - return std::make_unique(strings.parent(), stream, mr); - } +struct replace_fn { + column_device_view const d_strings; + string_view d_target; + string_view d_replacement; + cudf::size_type maxrepl; + cudf::size_type* d_offsets{}; + char* d_chars{}; - // create a vector of the potential target match positions - rmm::device_uvector target_positions(target_count, stream); - auto d_target_positions = target_positions.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - d_target_positions, - target_detector); - - device_span d_offsets_span(d_offsets, offset_count); - if (target_size > 1) { - target_count = - filter_false_target_positions(target_positions, d_offsets_span, target_size, stream); - if (target_count == 0) { - // nothing to replace, copy the input column - return std::make_unique(strings.parent(), stream, mr); + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { + if (!d_chars) { d_offsets[idx] = 0; } + return; } - } + auto const d_str = d_strings.element(idx); + char const* in_ptr = d_str.data(); - // filter out any target positions that exceed the per-row match limit - if (maxrepl > 0 && target_count > maxrepl) { - target_count = filter_maxrepl_target_positions( - d_target_positions, target_count, d_offsets_span, maxrepl, stream); + size_type bytes = d_str.size_bytes(); + size_type spos = 0; + size_type lpos = 0; + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto max_n = (maxrepl < 0) ? 
d_str.length() : maxrepl; + + // check each character against each target + while (spos < d_str.size_bytes() && (max_n > 0)) { + auto const d_tgt = d_target; + if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + { + auto const d_repl = d_replacement; + bytes += d_repl.size_bytes() - d_tgt.size_bytes(); + if (out_ptr) { + out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); + out_ptr = copy_string(out_ptr, d_repl); + lpos = spos + d_tgt.size_bytes(); + } + spos += d_tgt.size_bytes() - 1; + --max_n; + } + ++spos; + } + if (out_ptr) { // copy remainder + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + } else { + d_offsets[idx] = bytes; + } } +}; - // build the offsets column - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto delta_per_target = d_repl.size_bytes() - target_size; - device_span d_target_positions_span(d_target_positions, target_count); - auto offsets_update_fn = cuda::proclaim_return_type( - [d_target_positions_span, delta_per_target, chars_start] __device__(int32_t offset) -> int32_t { - // determine the number of target positions occurring before this offset - size_type const* next_target_pos_ptr = thrust::lower_bound( - thrust::seq, d_target_positions_span.begin(), d_target_positions_span.end(), offset); - size_type num_prev_targets = - thrust::distance(d_target_positions_span.data(), next_target_pos_ptr); - return offset - chars_start + delta_per_target * num_prev_targets; - }); - thrust::transform(rmm::exec_policy(stream), - d_offsets_span.begin(), - d_offsets_span.end(), - offsets_view.begin(), - offsets_update_fn); - - // build the characters column - rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars.data(); - thrust::for_each_n( - 
rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - chars_bytes, - target_replacer_fn{ - d_target_positions_span, d_in_chars, d_out_chars, target_size, d_repl, chars_start}); - - // free the target positions buffer as it is no longer needed - (void)target_positions.release(); - - return make_strings_column(strings_count, - std::move(offsets_column), - chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); -} - -/** - * @brief Scalar string replacement using a row-level parallel algorithm. - * - * Replaces occurrences of the target string with the replacement string using an algorithm with - * row-level parallelism. This algorithm will perform well when the strings in the string - * column are relatively short. - * @see BYTES_PER_VALID_ROW_THRESHOLD - * - * @param strings String column to search for target strings. - * @param d_target String to search for within the string column. - * @param d_repl Replacement string if target string is found. - * @param maxrepl Maximum times to replace if target appears multiple times in a string. - * @param stream CUDA stream to use for device operations - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column. 
- */ -std::unique_ptr replace_row_parallel(strings_column_view const& strings, - string_view const& d_target, - string_view const& d_repl, - int32_t maxrepl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr replace_string_parallel(strings_column_view const& input, + string_view const& d_target, + string_view const& d_replacement, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_strings = column_device_view::create(input.parent(), stream); - // this utility calls the given functor to build the offsets and chars columns auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); + replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets_column), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace -std::unique_ptr replace(strings_column_view const& strings, +std::unique_ptr replace(strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl, + cudf::size_type maxrepl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); - if (maxrepl == 0) return std::make_unique(strings.parent(), stream, mr); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } + if (maxrepl == 0) { return std::make_unique(input.parent(), stream, mr); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); 
CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -558,25 +425,11 @@ std::unique_ptr replace(strings_column_view const& strings, string_view d_target(target.data(), target.size()); string_view d_repl(repl.data(), repl.size()); - // determine range of characters in the base column - auto const strings_count = strings.size(); - auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().data() + strings.offset(); - size_type const chars_start = - (strings.offset() == 0) - ? 0 - : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); - size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size(stream) - : cudf::detail::get_value( - strings.offsets(), strings.offset() + strings_count, stream); - size_type const chars_bytes = chars_end - chars_start; - - auto const avg_bytes_per_row = chars_bytes / std::max(strings_count - strings.null_count(), 1); - return (avg_bytes_per_row < BYTES_PER_VALID_ROW_THRESHOLD) - ? replace_row_parallel(strings, d_target, d_repl, maxrepl, stream, mr) - : replace_char_parallel( - strings, chars_start, chars_end, d_target, d_repl, maxrepl, stream, mr); + return (input.size() == input.null_count() || + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) + ? 
replace_string_parallel(input, d_target, d_repl, maxrepl, stream, mr) + : replace_character_parallel(input, d_target, d_repl, maxrepl, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_nulls.cu b/cpp/src/strings/replace/replace_nulls.cu index 26fb1c7819f..bbca4997f57 100644 --- a/cpp/src/strings/replace/replace_nulls.cu +++ b/cpp/src/strings/replace/replace_nulls.cu @@ -36,18 +36,18 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr replace_nulls(strings_column_view const& strings, +std::unique_ptr replace_nulls(strings_column_view const& input, string_scalar const& repl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + size_type strings_count = input.size(); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // build offsets column @@ -58,12 +58,12 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); + auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // build chars column rmm::device_uvector chars(bytes, stream, mr); auto d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), strings_count, [d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) { diff --git 
a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu index 041801336e6..c11664c86d4 100644 --- a/cpp/src/strings/replace/replace_slice.cu +++ b/cpp/src/strings/replace/replace_slice.cu @@ -50,7 +50,7 @@ struct replace_slice_fn { __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_offsets[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -75,34 +75,37 @@ struct replace_slice_fn { } // namespace -std::unique_ptr replace_slice(strings_column_view const& strings, +std::unique_ptr replace_slice(strings_column_view const& input, string_scalar const& repl, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); - if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); + if (stop > 0) { + CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); + } string_view d_repl(repl.data(), repl.size()); - auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_strings = column_device_view::create(input.parent(), stream); // this utility calls the given functor to build the offsets and chars columns auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); + replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets_column), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), 
stream, mr)); } + } // namespace detail -std::unique_ptr replace_slice(strings_column_view const& strings, +std::unique_ptr replace_slice(strings_column_view const& input, string_scalar const& repl, size_type start, size_type stop, @@ -110,7 +113,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, stream, mr); + return detail::replace_slice(input, repl, start, stop, stream, mr); } } // namespace strings From 5192b608eeed4bda9317c657253c3a5630aa4c5d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Apr 2024 09:11:37 -1000 Subject: [PATCH 17/69] Align date_range defaults with pandas, support tz (#15139) Precursor to https://github.com/rapidsai/cudf/issues/15116 * Aligns `date_range` signature with pandas, _technically_ an API breakage with `closed` changing defaults even though it still isn't supported * Copies pandas behavior of allowing `date_range` with just two of `start/end/periods` * Supports `tz` arg now Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15139 --- python/cudf/cudf/core/tools/datetimes.py | 49 +++++++++++++----------- python/cudf/cudf/tests/test_datetime.py | 16 ++++++++ 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 65f97c99934..ed8fca88acd 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -799,9 +799,11 @@ def date_range( periods=None, freq=None, tz=None, - normalize=False, + normalize: bool = False, name=None, - closed=None, + closed: Literal["left", "right", "both", "neither"] = "both", + *, + unit: Optional[str] = None, ): """Return a fixed frequency DatetimeIndex. 
@@ -837,8 +839,13 @@ def date_range( name : str, default None Name of the resulting DatetimeIndex - closed : {None, 'left', 'right'}, optional - Not Supported + closed : {"left", "right", "both", "neither"}, default "both" + Whether to set each bound as closed or open. + Currently only "both" is supported + + unit : str, default None + Specify the desired resolution of the result. Currently + not supported. Returns ------- @@ -875,11 +882,15 @@ def date_range( '2026-04-23 08:00:00'], dtype='datetime64[ns]') """ - if tz is not None: - raise NotImplementedError("tz is currently unsupported.") + if closed != "both": + raise NotImplementedError(f"{closed=} is currently unsupported.") + if unit is not None: + raise NotImplementedError(f"{unit=} is currently unsupported.") + if normalize is not False: + raise NotImplementedError(f"{normalize=} is currently unsupported.") - if closed is not None: - raise NotImplementedError("closed is currently unsupported.") + if freq is None and any(arg is None for arg in (start, end, periods)): + freq = "D" if (start, end, periods, freq).count(None) > 1: raise ValueError( @@ -894,7 +905,7 @@ def date_range( FutureWarning, ) - dtype = np.dtype(" bool: @@ -1026,14 +1039,6 @@ def _has_non_fixed_frequency(freq: DateOffset) -> bool: return len(freq.kwds.keys() & non_fixed_frequencies) > 0 -def _has_mixed_freqeuency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains mixed fixed and non-fixed - frequency offset. e.g. 
{months=1, days=5} - """ - - return _has_fixed_frequency(freq) and _has_non_fixed_frequency(freq) - - def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: """Given a DateOffset, which can consist of either fixed frequency or non-fixed frequency offset, convert to the smallest possible fixed diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7c209078fd2..37ba7acf044 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2357,3 +2357,19 @@ def test_timezone_array_notimplemented(): def test_to_datetime_errors_ignore_deprecated(): with pytest.warns(FutureWarning): cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") + + +def test_date_range_freq_default(): + result = pd.date_range("2020-01-01", periods=2, name="foo") + expected = cudf.date_range("2020-01-01", periods=2, name="foo") + assert_eq(result, expected) + + +def test_date_range_tz(): + result = pd.date_range("2020-01-01", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", periods=2, tz="UTC") + assert_eq(result, expected) + + result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + assert_eq(result, expected) From fbaad8a480d3b2755afe04431c5abe6c098224b4 Mon Sep 17 00:00:00 2001 From: Tanmay Gujar Date: Wed, 3 Apr 2024 18:10:19 -0400 Subject: [PATCH 18/69] [FEA] Performance improvement for mixed left semi/anti join (#15288) Current implementation of mixed semi/anti join probes the built hash table twice -- once to find the output table size and once to build the output. Since the upper bound on output table size is O(N) where N is the size of the left table, we can avoid probing twice and achieve a faster join implementation. This implementation reserves the required upper memory bound, builds the output, and then collects the relevant output rows. This probes the hash table only once. 
This PR also removes the size kernels for mixed semi join and output size parameters passed to the mixed semi join. Closes #15250 # Benchmark Results from cudf repository ## mixed_left_semi_join_32bit (New implementation) ### [0] NVIDIA TITAN V ``` | Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples | CPU Time | Noise | GPU Time | Noise | |----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------| | I32 | I32 | 0 | 100000 | 100000 | 1920x | 266.239 us | 3.43% | 261.324 us | 2.84% | | I32 | I32 | 0 | 100000 | 400000 | 1024x | 495.434 us | 1.18% | 490.544 us | 0.63% | | I32 | I32 | 0 | 10000000 | 10000000 | 24x | 20.919 ms | 0.04% | 20.914 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 40000000 | 11x | 54.697 ms | 0.03% | 54.692 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 100000000 | 11x | 122.171 ms | 0.03% | 122.166 ms | 0.03% | | I32 | I32 | 0 | 80000000 | 100000000 | 11x | 192.979 ms | 0.01% | 192.975 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 100000000 | 11x | 212.878 ms | 0.01% | 212.874 ms | 0.01% | | I32 | I32 | 0 | 10000000 | 240000000 | 11x | 279.794 ms | 0.01% | 279.790 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 240000000 | 11x | 351.186 ms | 0.01% | 351.183 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 240000000 | 11x | 370.794 ms | 0.01% | 370.790 ms | 0.01% | ``` ## mixed_left_semi_join_32bit (Old implementation) ### [0] NVIDIA TITAN V ``` | Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples | CPU Time | Noise | GPU Time | Noise | |----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------| | I32 | I32 | 0 | 100000 | 100000 | 1392x | 368.030 us | 3.05% | 363.065 us | 2.70% | | I32 | I32 | 0 | 100000 | 400000 | 832x | 832.492 us | 0.84% | 827.586 us | 0.60% | | I32 | I32 | 0 | 10000000 | 10000000 | 16x | 32.310 ms | 0.03% | 32.305 ms | 0.03% | | I32 | I32 | 0 | 
10000000 | 40000000 | 11x | 100.222 ms | 0.03% | 100.218 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 100000000 | 11x | 235.874 ms | 0.01% | 235.870 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 100000000 | 11x | 307.042 ms | 0.01% | 307.038 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 100000000 | 11x | 326.797 ms | 0.01% | 326.794 ms | 0.01% | | I32 | I32 | 0 | 10000000 | 240000000 | 11x | 552.730 ms | 0.01% | 552.728 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 240000000 | 11x | 624.958 ms | 0.01% | 624.956 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 240000000 | 11x | 644.148 ms | 0.00% | 644.146 ms | 0.00% | ``` Authors: - Tanmay Gujar (https://github.com/tgujar) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jason Lowe (https://github.com/jlowe) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15288 --- cpp/CMakeLists.txt | 1 - cpp/include/cudf/join.hpp | 90 +---- cpp/src/join/mixed_join_kernels_semi.cu | 31 +- cpp/src/join/mixed_join_kernels_semi.cuh | 64 +--- cpp/src/join/mixed_join_semi.cu | 360 ++---------------- cpp/src/join/mixed_join_size_kernels_semi.cu | 125 ------ cpp/tests/join/mixed_join_tests.cu | 41 -- java/src/main/java/ai/rapids/cudf/Table.java | 146 ------- java/src/main/native/src/TableJni.cpp | 60 --- .../test/java/ai/rapids/cudf/TableTest.java | 116 ------ 10 files changed, 42 insertions(+), 992 deletions(-) delete mode 100644 cpp/src/join/mixed_join_size_kernels_semi.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f1d43e3c35f..7c32474ea56 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -453,7 +453,6 @@ add_library( src/join/mixed_join_semi.cu src/join/mixed_join_size_kernel.cu src/join/mixed_join_size_kernel_nulls.cu - src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/json/json_path.cu src/lists/contains.cu diff --git a/cpp/include/cudf/join.hpp 
b/cpp/include/cudf/join.hpp index b7a3129cfec..e343ad9ee32 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -944,9 +944,6 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -958,8 +955,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, + null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -996,9 +992,6 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). 
* @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1010,8 +1003,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, + null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1094,84 +1086,6 @@ std::pair>> mixed_le null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left semi join between the specified tables where the columns of the - * equality table are equal and the predicate evaluates to true on the - * conditional tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. 
- * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left anti join between the specified tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. 
- * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns the exact number of matches (rows) when performing a * conditional inner join between the specified tables where the predicate diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 5a543997a50..01e3fe09b38 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -41,12 +41,9 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* 
join_result_offsets, - bool const swap_tables) + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -60,7 +57,7 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + auto const outer_num_rows = left_num_rows; cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; @@ -70,12 +67,10 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ if (outer_row_index < outer_num_rows) { // Figure out the number of elements for this key. auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - if ((join_type == join_kind::LEFT_ANTI_JOIN) != - (hash_table_view.contains(outer_row_index, hash_probe, equality))) { - *(join_output_l + join_result_offsets[outer_row_index]) = outer_row_index; - } + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -86,12 +81,9 @@ template __global__ void mixed_join_semi( table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); template __global__ void mixed_join_semi( table_device_view 
left_table, @@ -100,12 +92,9 @@ template __global__ void mixed_join_semi( table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index f411d36f0a8..4ea404d451c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -27,53 +27,7 @@ namespace cudf { namespace detail { /** - * @brief Computes the output size of joining the left table to the right table for semi/anti joins. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. 
- * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. - */ -template -__global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -/** - * @brief Performs a semi/anti join using the combination of a hash lookup to + * @brief Performs a semi join using the combination of a hash lookup to * identify equal rows between one pair of tables and the evaluation of an * expression containing an arbitrary expression. * @@ -91,16 +45,11 @@ __global__ void compute_mixed_join_output_size_semi( * @param[in] build The table with which the hash table was built. * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed * @param[in] hash_table_view The hash table built from `build`. 
- * @param[out] join_output_l The left result of the join operation + * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating + * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ template __global__ void mixed_join_semi(table_device_view left_table, @@ -109,12 +58,9 @@ __global__ void mixed_join_semi(table_device_view left_table, table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); } // namespace detail diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index edf6c32eadf..d654f580cad 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -92,7 +92,6 @@ std::unique_ptr> mixed_join_semi( ast::expression const& binary_predicate, null_equality compare_nulls, join_kind join_type, - std::optional>> output_size_data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -107,12 +106,7 @@ std::unique_ptr> mixed_join_semi( auto const right_num_rows{right_conditional.num_rows()}; auto const left_num_rows{left_conditional.num_rows()}; - auto const swap_tables = (join_type == join_kind::INNER_JOIN) 
&& (right_num_rows > left_num_rows); - - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + auto const outer_num_rows{left_num_rows}; // We can immediately filter out cases where the right table is empty. In // some cases, we return all the rows of the left table with a corresponding @@ -155,8 +149,8 @@ std::unique_ptr> mixed_join_semi( // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins // need to do the same). - auto& probe = swap_tables ? right_equality : left_equality; - auto& build = swap_tables ? left_equality : right_equality; + auto& probe = left_equality; + auto& build = right_equality; auto probe_view = table_device_view::create(probe, stream); auto build_view = table_device_view::create(build, stream); auto left_conditional_view = table_device_view::create(left_conditional, stream); @@ -197,8 +191,7 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create( - swap_tables ? left_conditional : right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; @@ -225,84 +218,14 @@ std::unique_ptr> mixed_join_semi( auto hash_table_view = hash_table.get_device_view(); - // For inner joins we support optimizing the join by launching one thread for - // whichever table is larger rather than always using the left table. 
detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - join_kind const kernel_join_type = - join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; - - // If the join size data was not provided as an input, compute it here. - std::size_t join_size; - // Using an optional because we only need to allocate a new vector if one was - // not passed as input, and rmm::device_uvector is not default constructible - std::optional> matches_per_row{}; - device_span matches_per_row_span{}; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - if (output_size_data.has_value()) { - join_size = output_size_data->first; - matches_per_row_span = output_size_data->second; - } else { - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - - matches_per_row = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - // Note that the view goes out of scope after this else statement, but the - // data owned by matches_per_row stays alive so the data pointer is valid. 
- auto mutable_matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - if (has_nulls) { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - mutable_matches_per_row_span); - } else { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - mutable_matches_per_row_span); - } - join_size = size.value(stream); - } - - if (join_size == 0) { return std::make_unique>(0, stream, mr); } - - // Given the number of matches per row, we need to compute the offsets for insertion. 
- auto join_result_offsets = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - thrust::exclusive_scan(rmm::exec_policy{stream}, - matches_per_row_span.begin(), - matches_per_row_span.end(), - join_result_offsets.begin()); - - auto left_indices = std::make_unique>(join_size, stream, mr); - auto const& join_output_l = left_indices->data(); + // Vector used to indicate indices from left/probe table which are present in output + auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); if (has_nulls) { mixed_join_semi @@ -313,12 +236,9 @@ std::unique_ptr> mixed_join_semi( *build_view, hash_probe, equality_probe, - kernel_join_type, hash_table_view, - join_output_l, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + cudf::device_span(left_table_keep_mask), + parser.device_expression_data); } else { mixed_join_semi <<>>( @@ -328,235 +248,30 @@ std::unique_ptr> mixed_join_semi( *build_view, hash_probe, equality_probe, - kernel_join_type, hash_table_view, - join_output_l, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + cudf::device_span(left_table_keep_mask), + parser.device_expression_data); } - return left_indices; -} - -std::pair>> -compute_mixed_join_output_size_semi(table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - join_kind join_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS( - (join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && - (join_type != join_kind::FULL_JOIN), - "Inner, left, and full join size estimation should use compute_mixed_join_output_size."); - - CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), - "The left conditional and equality tables must have the same number of rows."); - 
CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), - "The right conditional and equality tables must have the same number of rows."); - - auto const right_num_rows{right_conditional.num_rows()}; - auto const left_num_rows{left_conditional.num_rows()}; - auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); - - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; - - auto matches_per_row = std::make_unique>( - static_cast(outer_num_rows), stream, mr); - auto matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - - // We can immediately filter out cases where one table is empty. In - // some cases, we return all the rows of the other table with a corresponding - // null index for the empty table; in others, we return an empty output. - if (right_num_rows == 0) { - switch (join_type) { - // Left, left anti, and full all return all the row indices from left - // with a corresponding NULL from the right. - case join_kind::LEFT_ANTI_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); - return {left_num_rows, std::move(matches_per_row)}; - } - // Inner and left semi joins return empty output because no matches can exist. - case join_kind::LEFT_SEMI_JOIN: return {0, std::move(matches_per_row)}; - default: CUDF_FAIL("Invalid join kind."); break; - } - } else if (left_num_rows == 0) { - switch (join_type) { - // Left, left anti, left semi, and inner joins all return empty sets. 
- case join_kind::LEFT_ANTI_JOIN: - case join_kind::LEFT_SEMI_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); - return {0, std::move(matches_per_row)}; - } - default: CUDF_FAIL("Invalid join kind."); break; - } - } - - // If evaluating the expression may produce null outputs we create a nullable - // output column and follow the null-supporting expression evaluation code - // path. - auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || - binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; - - auto const parser = ast::detail::expression_parser{ - binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; - CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); - - // TODO: The non-conditional join impls start with a dictionary matching, - // figure out what that is and what it's needed for (and if conditional joins - // need to do the same). - auto& probe = swap_tables ? right_equality : left_equality; - auto& build = swap_tables ? 
left_equality : right_equality; - auto probe_view = table_device_view::create(probe, stream); - auto build_view = table_device_view::create(build, stream); - auto left_conditional_view = table_device_view::create(left_conditional, stream); - auto right_conditional_view = table_device_view::create(right_conditional, stream); - - auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); - auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); - auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; - auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - - semi_map_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; - - // Create hash table containing all keys found in right table - // TODO: To add support for nested columns we will need to flatten in many - // places. However, this probably isn't worth adding any time soon since we - // won't be able to support AST conditions for those types anyway. - auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; - auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); - // Since we may see multiple rows that are identical in the equality tables - // but differ in the conditional tables, the equality comparator used for - // insertion must account for both sets of tables. An alternative solution - // would be to use a multimap, but that solution would store duplicates where - // equality and conditional rows are equal, so this approach is preferable. 
- // One way to make this solution even more efficient would be to only include - // the columns of the conditional table that are used by the expression, but - // that requires additional plumbing through the AST machinery and is out of - // scope for now. - auto const row_comparator_build = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_build}; - auto const equality_build_equality = - row_comparator_build.equal_to(build_nulls, compare_nulls); - auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create( - swap_tables ? left_conditional : right_conditional, stream); - auto const row_comparator_conditional_build = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, - preprocessed_build_condtional}; - auto const equality_build_conditional = - row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); - - // skip rows that are null here. - if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); - } else { - thrust::counting_iterator stencil(0); - auto const [row_bitmask, _] = - cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()); - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); - } - - auto hash_table_view = hash_table.get_device_view(); - - // For inner joins we support optimizing the join by launching one thread for - // whichever table is larger rather than always using the left table. 
- detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - - auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; - auto const hash_probe = row_hash.device_hasher(has_nulls); - - // Determine number of output rows without actually building the output to simply - // find what the size of the output will be. - if (has_nulls) { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); - } else { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); - } - - return {size.value(stream), std::move(matches_per_row)}; + auto gather_map = std::make_unique>(probe.num_rows(), stream, mr); + + // gather_map_end will be the end of valid data in gather_map + auto gather_map_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(probe.num_rows()), + left_table_keep_mask.begin(), + gather_map->begin(), + [join_type] __device__(bool keep_row) { + return keep_row == (join_type == detail::join_kind::LEFT_SEMI_JOIN); + }); + + gather_map->resize(thrust::distance(gather_map->begin(), gather_map_end), stream); + return gather_map; } } // namespace detail -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& 
right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::compute_mixed_join_output_size_semi(left_equality, - right_equality, - left_conditional, - right_conditional, - binary_predicate, - compare_nulls, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); -} - std::unique_ptr> mixed_left_semi_join( table_view const& left_equality, table_view const& right_equality, @@ -564,7 +279,6 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, - std::optional>> output_size_data, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -575,32 +289,10 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - output_size_data, cudf::get_default_stream(), mr); } -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::compute_mixed_join_output_size_semi(left_equality, - right_equality, - left_conditional, - right_conditional, - binary_predicate, - compare_nulls, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); -} - std::unique_ptr> mixed_left_anti_join( table_view const& left_equality, table_view const& right_equality, @@ -608,7 +300,6 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, - std::optional>> output_size_data, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -619,7 +310,6 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, 
detail::join_kind::LEFT_ANTI_JOIN, - output_size_data, cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu deleted file mode 100644 index 7a22ac60710..00000000000 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "join/join_common_utils.cuh" -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" - -#include -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { - -namespace cg = cooperative_groups; - -#pragma GCC diagnostic ignored "-Wattributes" - -template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) -{ - // The (required) extern storage of the shared memory array leads to - // conflicting declarations between different templates. 
The easiest - // workaround is to declare an arbitrary (here char) array type then cast it - // after the fact to the appropriate type. - extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = - intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); - - std::size_t thread_counter{0}; - cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; - cudf::size_type const stride = block_size * gridDim.x; - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); - - // TODO: Address asymmetry in operator. - auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - - for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; - outer_row_index += stride) { - matches_per_row[outer_row_index] = - ((join_type == join_kind::LEFT_ANTI_JOIN) != - (hash_table_view.contains(outer_row_index, hash_probe, equality))); - thread_counter += matches_per_row[outer_row_index]; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - - // Add block counter to global counter - if (threadIdx.x == 0) { - cuda::atomic_ref ref{*output_size}; - ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); - } -} - -template __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const 
equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -template __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -} // namespace detail - -} // namespace cudf diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index cc37dadffd8..6c147c8a128 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -657,10 +657,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest { std::vector expected_outputs, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) { - auto [result_size, actual_counts] = this->join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - EXPECT_TRUE(result_size == expected_outputs.size()); - auto result = this->join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); std::vector resulting_indices; @@ -751,19 +747,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest { cudf::table_view right_conditional, cudf::ast::operation predicate, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; - - /** - * This method must be implemented by subclasses for specific types of joins. - * It should be a simply forwarding of arguments to the appropriate cudf - * mixed join size computation API. 
- */ - virtual std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; }; /** @@ -781,18 +764,6 @@ struct MixedLeftSemiJoinTest : public MixedJoinSingleReturnTest { return cudf::mixed_left_semi_join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override - { - return cudf::mixed_left_semi_join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - } }; TYPED_TEST_SUITE(MixedLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); @@ -874,18 +845,6 @@ struct MixedLeftAntiJoinTest : public MixedJoinSingleReturnTest { return cudf::mixed_left_anti_join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override - { - return cudf::mixed_left_anti_join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - } }; TYPED_TEST_SUITE(MixedLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 5ce2f9d2d6e..4038b3a40b8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -732,32 
+732,14 @@ private static native long[] mixedFullJoinGatherMaps(long leftKeysTable, long ri long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinGatherMap(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); - - private static native long[] mixedLeftAntiJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); - private static native long[] mixedLeftAntiJoinGatherMap(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftAntiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); - private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException; private static native long[] concatenate(long[] cudfTablePointers) throws CudfException; @@ -3747,34 +3729,6 @@ public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes output size information for a left semi join between two tables using a mix of - * equality and inequality conditions. 
The entire join condition is assumed to be a logical AND - * of the equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @return size information for the join - */ - public static MixedJoinSize mixedLeftSemiJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftSemiJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); - } - /** * Computes the gather map that can be used to manifest the result of a left semi join between * two tables using a mix of equality and inequality conditions. The entire join condition is @@ -3804,42 +3758,6 @@ public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKe return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes the gather map that can be used to manifest the result of a left semi join between - * two tables using a mix of equality and inequality conditions. 
The entire join condition is - * assumed to be a logical AND of the equality condition and inequality condition. - * A {@link GatherMap} instance will be returned that can be used to gather - * the left table to produce the result of the left semi join. - * - * It is the responsibility of the caller to close the resulting gather map instances. - * - * This interface allows passing the size result from - * {@link #mixedLeftSemiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} - * when the output size was computed previously. - * - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result - * @return left and right table gather maps - */ - public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftSemiJoinGatherMapWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); - return buildSingleJoinGatherMap(gatherMapData); - } - /** * Computes the gather map that can be used to manifest the result of a left anti-join between * two tables. 
It is assumed this table instance holds the key columns from the left table, and @@ -3919,34 +3837,6 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes output size information for a left anti join between two tables using a mix of - * equality and inequality conditions. The entire join condition is assumed to be a logical AND - * of the equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @return size information for the join - */ - public static MixedJoinSize mixedLeftAntiJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftAntiJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); - } - /** * Computes the gather map that can be used to manifest the result of a left anti join between * two tables using a mix of equality and inequality conditions. 
The entire join condition is @@ -3976,42 +3866,6 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes the gather map that can be used to manifest the result of a left anti join between - * two tables using a mix of equality and inequality conditions. The entire join condition is - * assumed to be a logical AND of the equality condition and inequality condition. - * A {@link GatherMap} instance will be returned that can be used to gather - * the left table to produce the result of the left anti join. - * - * It is the responsibility of the caller to close the resulting gather map instances. - * - * This interface allows passing the size result from - * {@link #mixedLeftAntiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} - * when the output size was computed previously. - * - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result - * @return left and right table gather maps - */ - public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftAntiJoinGatherMapWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), 
joinSize.getMatches().getNativeView()); - return buildSingleJoinGatherMap(gatherMapData); - } - /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 51b8eb853de..e8616710217 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2838,20 +2838,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { - return cudf::jni::mixed_join_size( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_semi_join_size(left_keys, right_keys, left_condition, - right_condition, condition, nulls_equal); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { @@ -2866,22 +2852,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMapWithSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, - jlong j_matches_view) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, 
j_matches_view); - return cudf::jni::mixed_join_gather_single_map( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition, - condition, nulls_equal, size_info); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_single_map( @@ -2930,20 +2900,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { - return cudf::jni::mixed_join_size( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_anti_join_size(left_keys, right_keys, left_condition, - right_condition, condition, nulls_equal); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { @@ -2958,22 +2914,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMa }); } -JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMapWithSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, - jlong j_matches_view) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); - return cudf::jni::mixed_join_gather_single_map( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition, - condition, nulls_equal, size_info); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass, jlong left_table, jlong right_table) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 30905783c7f..8560a9caad7 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3058,64 +3058,6 @@ void testMixedLeftSemiJoinGatherMapNulls() { } } - @Test - void testMixedLeftSemiJoinGatherMapWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(6, 5, 9, 8, 10, 32) - .column(0, 1, 2, 3, 4, 5) - .column(7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new 
Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(2, 7, 8) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - - @Test - void testMixedLeftSemiJoinGatherMapNullsWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) - .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 0, 1, 2, 3, 4, 5) - .column( 7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(0, 7, 8) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - @Test void testMixedLeftAntiJoinGatherMap() { BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, @@ -3166,64 +3108,6 @@ void testMixedLeftAntiJoinGatherMapNulls() { } } - @Test - void testMixedLeftAntiJoinGatherMapWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, 
TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(6, 5, 9, 8, 10, 32) - .column(0, 1, 2, 3, 4, 5) - .column(7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(0, 1, 3, 4, 5, 6, 9) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - - @Test - void testMixedLeftAntiJoinGatherMapNullsWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) - .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 0, 1, 2, 3, 4, 5) - .column( 7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(1, 2, 3, 4, 5, 6, 9) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL, sizeInfo)) 
{ - verifySemiJoinGatherMap(map, expected); - } - } - } - @Test void testLeftSemiJoinGatherMap() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); From 61dbfe8dc7635264465ce46d7de9e87ca0353267 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 4 Apr 2024 15:22:48 -0400 Subject: [PATCH 19/69] Allow jit compilation when using a splayed CUDA toolkit (#15451) The `JitifyPreprocessKernels.cmake` module now handles when `CUDAToolkit_INCLUDE_DIRS` has multiple values correctly, allowing for compilation with splayed CUDA Toolkit installs. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15451 --- cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 8c4e2b47fca..752c2028350 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -23,8 +23,9 @@ target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS}) function(jit_preprocess_files) cmake_parse_arguments(ARG "" "SOURCE_DIRECTORY" "FILES" ${ARGN}) - foreach(inc IN LISTS libcudacxx_raw_includes) - list(APPEND libcudacxx_includes "-I${inc}") + set(includes) + foreach(inc IN LISTS libcudacxx_raw_includes CUDAToolkit_INCLUDE_DIRS) + list(APPEND includes "-I${inc}") endforeach() foreach(ARG_FILE ${ARG_FILES}) set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp) @@ -44,8 +45,7 @@ function(jit_preprocess_files) $ ${ARG_FILE} -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include - -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} 
-I${CUDAToolkit_INCLUDE_DIRS} - --no-preinclude-workarounds --no-replace-pragma-once + -I${CUDF_SOURCE_DIR}/src ${includes} --no-preinclude-workarounds --no-replace-pragma-once COMMENT "Custom command to JIT-compile files." ) endforeach() From c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 4 Apr 2024 15:24:04 -0400 Subject: [PATCH 20/69] Allow consumers of static builds to find nanoarrow (#15456) Allows consumers like spark-rapids to bring in libcudf static builds from the install and build trees. Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15456 --- cpp/cmake/thirdparty/get_nanoarrow.cmake | 1 + .../thirdparty/patches/nanoarrow_cmake.diff | 39 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 4316db99a8d..884e5a2f368 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -49,6 +49,7 @@ function(find_and_configure_nanoarrow) OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) endfunction() find_and_configure_nanoarrow( diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff index b53e134ed2c..1262a38c0a4 100644 --- a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff +++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8714c70..1feec13 100644 +index 8714c70..6a9e505 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,6 @@ else() 
@@ -10,7 +10,15 @@ index 8714c70..1feec13 100644 # Avoids a warning about timestamps on downloaded files (prefer new policy # if available)) -@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE) +@@ -59,6 +58,7 @@ endif() + + configure_file(src/nanoarrow/nanoarrow_config.h.in generated/nanoarrow_config.h) + ++include(GNUInstallDirs) + if(NANOARROW_BUNDLE) + # Combine all headers into amalgamation/nanoarrow.h in the build directory + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation) +@@ -111,6 +111,8 @@ if(NANOARROW_BUNDLE) if(NANOARROW_BUILD_TESTS) include_directories(${CMAKE_BINARY_DIR}/amalgamation) add_library(nanoarrow ${NANOARROW_C_TEMP}) @@ -19,7 +27,7 @@ index 8714c70..1feec13 100644 target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") endif() -@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE) +@@ -120,10 +122,11 @@ if(NANOARROW_BUNDLE) else() add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c src/nanoarrow/array_stream.c src/nanoarrow/utils.c) @@ -27,25 +35,31 @@ index 8714c70..1feec13 100644 target_include_directories(nanoarrow PUBLIC $ -@@ -154,13 +156,50 @@ else() +- $) ++ $) + target_include_directories(nanoarrow + PUBLIC $ + ) +@@ -154,13 +157,49 @@ else() endif() endif() - install(TARGETS nanoarrow DESTINATION lib) + install(TARGETS nanoarrow -+ DESTINATION lib ++ DESTINATION "${CMAKE_INSTALL_LIBDIR}" + EXPORT nanoarrow-exports) install(DIRECTORY src/ - DESTINATION include +- DESTINATION include ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING - PATTERN "*.h") + PATTERN "*.h*") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h - DESTINATION include/nanoarrow) +- DESTINATION include/nanoarrow) ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nanoarrow") + + # Generate package files for the build and install trees. 
+ include(CMakePackageConfigHelpers) -+ include(GNUInstallDirs) + + foreach(tree_type BUILD INSTALL) + if(tree_type STREQUAL "BUILD") @@ -80,6 +94,15 @@ index 8714c70..1feec13 100644 endif() # Always build integration test if building tests +@@ -171,7 +210,7 @@ if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS) + src/nanoarrow/integration/c_data_integration.cc) + target_include_directories(nanoarrow_c_data_integration + PUBLIC $ +- $) ++ $) + target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json) + endif() + @@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) src/nanoarrow/integration/c_data_integration_test.cc) From 8509054861f57379524982cc70db294d85a0dc5c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:09:45 -0400 Subject: [PATCH 21/69] Remove deprecated hash() and spark_murmurhash3_x86_32() (#15375) Remove deprecated libcudf hash functions. The `cudf::hash()` and `cudf::hashing::spark_murmurhash3_x86_32()` were deprecated in previous releases. The `cudf::hash_partition()` function still relies on the enum `hash_id` so it has been moved from `hashing.cpp` to `partitioning.hpp`. Calls to `cudf::hashing::spark_murmurhash3_x86_32()` were also removed from the JNI code. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec - Jason Lowe (https://github.com/jlowe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15375 --- cpp/CMakeLists.txt | 2 - cpp/include/cudf/hashing.hpp | 52 -- cpp/include/cudf/hashing/detail/hashing.hpp | 5 - cpp/include/cudf/partitioning.hpp | 10 +- cpp/src/hash/hashing.cu | 53 -- cpp/src/hash/spark_murmurhash3_x86_32.cu | 442 -------------- .../hashing/spark_murmurhash3_x86_32_test.cpp | 576 ------------------ .../partitioning/hash_partition_test.cpp | 15 - .../java/ai/rapids/cudf/ColumnVector.java | 44 +- .../main/java/ai/rapids/cudf/HashType.java | 6 +- java/src/main/native/src/ColumnVectorJni.cpp | 10 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 219 ------- 12 files changed, 18 insertions(+), 1416 deletions(-) delete mode 100644 cpp/src/hash/hashing.cu delete mode 100644 cpp/src/hash/spark_murmurhash3_x86_32.cu delete mode 100644 cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7c32474ea56..7d62e0acb10 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,7 +346,6 @@ add_library( src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu src/groupby/sort/sort_helper.cu - src/hash/hashing.cu src/hash/md5_hash.cu src/hash/murmurhash3_x86_32.cu src/hash/murmurhash3_x64_128.cu @@ -355,7 +354,6 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu - src/hash/spark_murmurhash3_x86_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 64a78da1803..83962b50a10 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -34,42 +34,11 @@ namespace cudf { */ using hash_value_type = uint32_t; -/** - * @brief Identifies the hash function to be 
used - * - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function - HASH_MD5 ///< MD5 hash function -}; - /** * @brief The default seed value for hash functions */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; -/** - * @brief Computes the hash value of each row in the input set of columns. - * - * @deprecated Since 23.08 - * - * @param input The table of columns to hash - * @param hash_function The hash function enum to use - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a column from the input - */ -[[deprecated]] std::unique_ptr hash( - table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - //! Hash APIs namespace hashing { @@ -112,27 +81,6 @@ std::unique_ptr murmurhash3_x64_128( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table - * - * @deprecated Since 24.04 - * - * This function computes the hash similar to MurmurHash3_x86_32 with special processing - * to match Spark's implementation results. 
- * - * @param input The table of columns to hash - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a row from the input - */ -[[deprecated]] std::unique_ptr spark_murmurhash3_x86_32( - table_view const& input, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Computes the MD5 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index eaeb5d6b068..88a43a64638 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -37,11 +37,6 @@ std::unique_ptr
murmurhash3_x64_128(table_view const& input, rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr); -std::unique_ptr spark_murmurhash3_x86_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); - std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 2c91bdf64f5..7033aa500a2 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,14 @@ namespace cudf { * @brief Column partitioning APIs */ +/** + * @brief Identifies the hash function to be used in hash partitioning + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3 ///< Murmur3 hash function +}; + /** * @brief Partitions rows of `t` according to the mapping specified by * `partition_map`. diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu deleted file mode 100644 index 68e02ef3cf4..00000000000 --- a/cpp/src/hash/hashing.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include - -#include - -namespace cudf { -namespace hashing { -namespace detail { - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_MD5): return md5(input, stream, mr); - default: CUDF_FAIL("Unsupported hash function."); - } -} - -} // namespace detail -} // namespace hashing - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return hashing::detail::hash(input, hash_function, seed, stream, mr); -} - -} // namespace cudf diff --git a/cpp/src/hash/spark_murmurhash3_x86_32.cu b/cpp/src/hash/spark_murmurhash3_x86_32.cu deleted file mode 100644 index c7992b4afa0..00000000000 --- a/cpp/src/hash/spark_murmurhash3_x86_32.cu +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace hashing { -namespace detail { - -namespace { - -using spark_hash_value_type = int32_t; - -template ())> -struct Spark_MurmurHash3_x86_32 { - using result_type = spark_hash_value_type; - - constexpr Spark_MurmurHash3_x86_32() = default; - constexpr Spark_MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {} - - [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - [[nodiscard]] result_type __device__ inline operator()(Key const& key) const - { - return compute(key); - } - - template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3). - for (auto i = tail_offset; i < len; i++) { - // We require a two-step cast to get the k1 value from the byte. First, - // we must cast to a signed int8_t. Then, the sign bit is preserved when - // casting to uint32_t under 2's complement. Java preserves the sign when - // casting byte-to-int, but C++ does not. 
- uint32_t k1 = static_cast(std::to_integer(data[i])); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const - { - constexpr cudf::size_type BLOCK_SIZE = 4; - cudf::size_type const nblocks = len / BLOCK_SIZE; - cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(data, i * BLOCK_SIZE); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - - h = compute_remaining_bytes(data, len, tail_offset, h); - - // Finalize hash. - h ^= len; - h = fmix32(h); - return h; - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; - static constexpr uint32_t c1 = 0xcc9e2d51; - static constexpr uint32_t c2 = 0x1b873593; - static constexpr uint32_t c3 = 0xe6546b64; - static constexpr uint32_t rot_c1 = 15; - static constexpr uint32_t rot_c2 = 13; -}; - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - bool const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline 
Spark_MurmurHash3_x86_32::operator()( - float const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - double const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal64 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal128 const& key) const -{ - // Generates the Spark MurmurHash3 hash value, mimicking the conversion: - // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray() - // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381 - __int128_t const val = key.value(); - constexpr cudf::size_type key_size = sizeof(__int128_t); - std::byte const* data = reinterpret_cast(&val); - - // Small negative values start with 0xff..., small positive values start with 0x00... - bool const is_negative = val < 0; - std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00}; - - // If the value can be represented with a shorter than 16-byte integer, the - // leading bytes of the little-endian value are truncated and are not hashed. 
- auto const reverse_begin = thrust::reverse_iterator(data + key_size); - auto const reverse_end = thrust::reverse_iterator(data); - auto const first_nonzero_byte = - thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) { - return v == zero_value; - }).base(); - // Max handles special case of 0 and -1 which would shorten to 0 length otherwise - cudf::size_type length = - std::max(1, static_cast(thrust::distance(data, first_nonzero_byte))); - - // Preserve the 2's complement sign bit by adding a byte back on if necessary. - // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to - // preserve the sign bit, rather than leaving an "f" at the front which would - // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte - // is needed because the leftmost bit matches the sign bit. Similarly for - // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80. - if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; } - - // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed. - __int128_t big_endian_value = 0; - auto big_endian_data = reinterpret_cast(&big_endian_value); - thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); - return compute_bytes(big_endian_data, length); -} - -/** - * @brief Computes the hash value of a row in the given table. - * - * This functor uses Spark conventions for Murmur hashing, which differs from - * the Murmur implementation used in the rest of libcudf. These differences - * include: - * - Serially using the output hash as an input seed for the next item - * - Ignorance of null values - * - * The serial use of hashes as seeds means that data of different nested types - * can exhibit hash collisions. 
For example, a row of an integer column - * containing a 1 will have the same hash as a lists column of integers - * containing a list of [1] and a struct column of a single integer column - * containing a struct of {1}. - * - * As a consequence of ignoring null values, inputs like [1], [1, null], and - * [null, 1] have the same hash (an expected hash collision). This kind of - * collision can also occur across a table of nullable columns and with nulls - * in structs ({1, null} and {null, 1} have the same hash). The seed value (the - * previous element's hash value) is returned as the hash if an element is - * null. - * - * For additional differences such as special tail processing and decimal type - * handling, refer to the Spark_MurmurHash3_x86_32 functor. - * - * @tparam hash_function Hash functor to use for hashing elements. Must be Spark_MurmurHash3_x86_32. - * @tparam Nullate A cudf::nullate type describing whether to check for nulls. - */ -template