From e3cbf62fcef479a051d116c451e69ddaa4568b57 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Mar 2024 15:45:09 -0400 Subject: [PATCH 1/4] Ignore DLManagedTensor in the docs build (#15392) Fixes a docs build error since `DLManagedTensor` cannot be resolved from the dlpack documentation. --- docs/cudf/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 3bba50b482c..7afc8fe19bf 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -388,6 +388,7 @@ def _generate_namespaces(namespaces): "thrust", "cuda", "arrow", + "DLManagedTensor", # Unknown types "int8_t", "int16_t", From a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Mar 2024 17:24:41 -0700 Subject: [PATCH 2/4] Use logical types in Parquet reader (#15365) Closes #15224. Now use logical type exclusively in the reader rather than the deprecated converted type. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15365 --- cpp/src/io/parquet/decode_fixed.cu | 4 +- cpp/src/io/parquet/decode_preprocess.cu | 2 +- cpp/src/io/parquet/page_data.cu | 18 +- cpp/src/io/parquet/page_data.cuh | 3 +- cpp/src/io/parquet/page_decode.cuh | 58 ++--- cpp/src/io/parquet/page_hdr.cu | 4 +- cpp/src/io/parquet/page_string_decode.cu | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 41 ++-- cpp/src/io/parquet/reader_impl.cpp | 16 +- cpp/src/io/parquet/reader_impl_chunking.cu | 49 ++--- cpp/src/io/parquet/reader_impl_helpers.cpp | 210 ++++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- 12 files changed, 220 insertions(+), 193 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 062363db503..945a7dcb4c6 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -165,7 +165,7 @@ __device__ inline void gpuDecodeValues( constexpr int max_batch_size = num_warps * cudf::detail::warp_size; PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.data_type & 7; + int const dtype = s->col.physical_type; // decode values int pos = start; @@ -187,7 +187,7 @@ __device__ inline void gpuDecodeValues( uint32_t dtype_len = s->dtype_len; void* dst = nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; - if (s->col.converted_type == DECIMAL) { + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; case INT64: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8f772636c7e..e49801e6172 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -389,7 +389,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) // we only need to preprocess hierarchies with repetition in them (ie, hierarchies // containing lists anywhere within). compute_string_sizes = - compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + compute_string_sizes && s->col.physical_type == BYTE_ARRAY && !s->col.is_strings_to_cat; // early out optimizations: diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 261e04e3f19..62ce5b9f9a5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -77,7 +77,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; } else { - switch (s->col.data_type & 7) { + switch (s->col.physical_type) { case BOOLEAN: [[fallthrough]]; case BYTE_ARRAY: [[fallthrough]]; case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break; @@ -123,16 +123,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) // be needed in the other DecodeXXX kernels. if (s->dict_base) { src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; - } else if ((s->col.data_type & 7) == BOOLEAN) { + } else if (s->col.physical_type == BOOLEAN) { src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); - } else if ((s->col.data_type & 7) == BYTE_ARRAY or - (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + } else if (s->col.physical_type == BYTE_ARRAY or + s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } if (t == 32) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values - int const dtype = s->col.data_type & 7; + int const dtype = s->col.physical_type; src_pos += t - out_thread0; // the position in the output column/buffer @@ -166,10 +166,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) uint32_t dtype_len = s->dtype_len; void* dst = nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + auto const is_decimal = + s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; if (dtype == BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { + if (is_decimal) { auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); - auto const decimal_precision = s->col.decimal_precision; + auto const decimal_precision = s->col.logical_type->precision(); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { @@ -182,7 +184,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } } else if (dtype == BOOLEAN) { gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); - } else if (s->col.converted_type == DECIMAL) { + } else if (is_decimal) { switch (dtype) { case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; diff --git a/cpp/src/io/parquet/page_data.cuh b/cpp/src/io/parquet/page_data.cuh index f0fa7d814cf..df8d801d66c 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -34,8 +34,7 @@ template inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv) { auto [ptr, len] = gpuGetStringData(s, sb, src_pos); - // make sure to only hash `BYTE_ARRAY` when specified with the output type size - if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) { + if (s->col.is_strings_to_cat and s->col.physical_type == BYTE_ARRAY) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. uint32_t constexpr hash_seed = 33; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a081ee4e03f..fa1de5f301d 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -441,7 +441,7 @@ gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int ta while (pos < target_pos) { int len = 0; - if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { if (k < dict_size) { len = s->dtype_len_in; } } else { if (k + 4 <= dict_size) { @@ -1144,11 +1144,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if (s->page.num_input_values > 0) { uint8_t* cur = s->page.page_data; uint8_t* end = cur + s->page.uncompressed_page_size; - - uint32_t dtype_len_out = s->col.data_type >> 3; - s->ts_scale = 0; + s->ts_scale = 0; // Validate data type - auto const data_type = s->col.data_type & 7; + auto const data_type = s->col.physical_type; + auto const is_decimal = + s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; switch (data_type) { case BOOLEAN: s->dtype_len = 1; // Boolean are stored as 1 byte on the output @@ -1159,13 +1159,15 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if (s->col.ts_clock_rate) { int32_t units = 0; // Duration types are not included because no scaling is done when reading - if (s->col.converted_type == TIMESTAMP_MILLIS) { - units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIMESTAMP_MICROS) { - units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.has_value() and - s->col.logical_type->is_timestamp_nanos()) { - units = cudf::timestamp_ns::period::den; + if (s->col.logical_type.has_value()) { + auto const& lt = s->col.logical_type.value(); + if (lt.is_timestamp_millis()) { + units = cudf::timestamp_ms::period::den; + } else if (lt.is_timestamp_micros()) { + units = cudf::timestamp_us::period::den; + } else if (lt.is_timestamp_nanos()) { + units = cudf::timestamp_ns::period::den; + } } if (units and units != s->col.ts_clock_rate) { s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) @@ -1176,8 +1178,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case DOUBLE: s->dtype_len = 8; break; case INT96: s->dtype_len = 12; break; case BYTE_ARRAY: - if (s->col.converted_type == DECIMAL) { - auto const decimal_precision = s->col.decimal_precision; + if (is_decimal) { + auto const decimal_precision = s->col.logical_type->precision(); s->dtype_len = [decimal_precision]() { if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return sizeof(int32_t); @@ -1192,14 +1194,14 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } break; default: // FIXED_LEN_BYTE_ARRAY: - s->dtype_len = dtype_len_out; + s->dtype_len = s->col.type_length; if (s->dtype_len <= 0) { s->set_error_code(decode_error::INVALID_DATA_TYPE); } break; } // Special check for downconversions s->dtype_len_in = s->dtype_len; if (data_type == FIXED_LEN_BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { + if (is_decimal) { s->dtype_len = [dtype_len = s->dtype_len]() { if (dtype_len <= sizeof(int32_t)) { return sizeof(int32_t); @@ -1213,17 +1215,17 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dtype_len = sizeof(string_index_pair); } } else if (data_type == INT32) { - if (dtype_len_out == 1) { - // INT8 output - s->dtype_len = 1; - } else if (dtype_len_out == 2) { - // INT16 output - s->dtype_len = 2; - } else if (s->col.converted_type == TIME_MILLIS) { - // INT64 output - s->dtype_len = 8; + // check for smaller bitwidths + if (s->col.logical_type.has_value()) { + auto const& lt = s->col.logical_type.value(); + if (lt.type == LogicalType::INTEGER) { + s->dtype_len = lt.bit_width() / 8; + } else if (lt.is_time_millis()) { + // cudf outputs as INT64 + s->dtype_len = 8; + } } - } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { + } else if (data_type == BYTE_ARRAY && s->col.is_strings_to_cat) { s->dtype_len = 4; // HASH32 output } else if (data_type == INT96) { s->dtype_len = 8; // Convert to 64-bit timestamp @@ -1298,7 +1300,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case Encoding::PLAIN_DICTIONARY: case Encoding::RLE_DICTIONARY: // RLE-packed dictionary indices, first byte indicates index length in bits - if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { + if (s->col.physical_type == BYTE_ARRAY && s->col.str_dict_index != nullptr) { // String dictionary: use index s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair); @@ -1316,7 +1318,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case Encoding::PLAIN: s->dict_size = static_cast(end - cur); s->dict_val = 0; - if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } + if (s->col.physical_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } break; case Encoding::RLE: { // first 4 bytes are length of RLE data diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 4a50c7445b3..07e03460ecb 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -147,12 +147,12 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk) __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BYTE_ARRAY; + return chunk.physical_type == BYTE_ARRAY; } __device__ inline bool is_boolean(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BOOLEAN; + return chunk.physical_type == BOOLEAN; } /** diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d8b1c1cc046..6f96d4dd1cf 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -689,7 +689,7 @@ CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPage auto const start_value = pp->start_val; // if data size is known, can short circuit here - if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (chunks[pp->chunk_idx].physical_type == FIXED_LEN_BYTE_ARRAY) { if (t == 0) { pp->str_bytes = pp->num_valids * s->dtype_len_in; @@ -881,7 +881,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSi auto const& col = s->col; size_t str_bytes = 0; // short circuit for FIXED_LEN_BYTE_ARRAY - if ((col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (col.physical_type == FIXED_LEN_BYTE_ARRAY) { str_bytes = pp->num_valids * s->dtype_len_in; } else { // now process string info in the range [start_value, end_value) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 82ccb2b314a..200a8ec9ddb 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -370,8 +370,8 @@ struct ColumnChunkDesc { explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, - uint16_t datatype_, - uint16_t datatype_length_, + Type datatype_, + int32_t datatype_length_, size_t start_row_, uint32_t num_rows_, int16_t max_definition_level_, @@ -379,15 +379,14 @@ struct ColumnChunkDesc { int16_t max_nesting_depth_, uint8_t def_level_bits_, uint8_t rep_level_bits_, - int8_t codec_, - int8_t converted_type_, + Compression codec_, thrust::optional logical_type_, - int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, column_chunk_info const* chunk_info_, - float list_bytes_per_row_est_) + float list_bytes_per_row_est_, + bool strings_to_categorical_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -395,7 +394,8 @@ struct ColumnChunkDesc { num_rows(num_rows_), max_level{max_definition_level_, max_repetition_level_}, max_nesting_depth{max_nesting_depth_}, - data_type(datatype_ | (datatype_length_ << 3)), + type_length(datatype_length_), + physical_type(datatype_), level_bits{def_level_bits_, rep_level_bits_}, num_data_pages(0), num_dict_pages(0), @@ -405,14 +405,13 @@ struct ColumnChunkDesc { column_data_base{nullptr}, column_string_base{nullptr}, codec(codec_), - converted_type(converted_type_), logical_type(logical_type_), - decimal_precision(decimal_precision_), ts_clock_rate(ts_clock_rate_), src_col_index(src_col_index_), src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), - list_bytes_per_row_est(list_bytes_per_row_est_) + list_bytes_per_row_est(list_bytes_per_row_est_), + is_strings_to_cat(strings_to_categorical_) { } @@ -423,7 +422,8 @@ struct ColumnChunkDesc { uint32_t num_rows{}; // number of rows in this chunk int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level int16_t max_nesting_depth{}; // max nesting depth of the output - uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels int32_t num_data_pages{}; // number of data pages @@ -433,10 +433,8 @@ struct ColumnChunkDesc { bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column void** column_data_base{}; // base pointers of column data void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum + Compression codec{}; // compressed codec enum thrust::optional logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -446,6 +444,8 @@ struct ColumnChunkDesc { column_chunk_info const* h_chunk_info{}; float list_bytes_per_row_est{}; // for LIST columns, an estimate on number of bytes per row + + bool is_strings_to_cat{}; // convert strings to hashes }; /** @@ -615,11 +615,16 @@ struct EncPage { */ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { - auto const not_converted_to_decimal = chunk.converted_type != DECIMAL; + // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing + // a decimal. + if (chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL) { + return false; + } + auto const non_hashed_byte_array = - (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4; - auto const fixed_len_byte_array = (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY; - return not_converted_to_decimal and (non_hashed_byte_array or fixed_len_byte_array); + chunk.physical_type == BYTE_ARRAY and not chunk.is_strings_to_cat; + auto const fixed_len_byte_array = chunk.physical_type == FIXED_LEN_BYTE_ARRAY; + return non_hashed_byte_array or fixed_len_byte_array; } /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8112328d962..2356878f6ba 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -28,6 +28,19 @@ namespace cudf::io::parquet::detail { +namespace { +// Tests the passed in logical type for a FIXED_LENGTH_BYTE_ARRAY column to see if it should +// be treated as a string. Currently the only logical type that has special handling is DECIMAL. +// Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which +// for now would also be treated as a string). +inline bool is_treat_fixed_length_as_string(thrust::optional const& logical_type) +{ + if (!logical_type.has_value()) { return true; } + return logical_type->type != LogicalType::DECIMAL; +} + +} // namespace + void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_rows, size_t num_rows) { auto& pass = *_pass_itm_data; @@ -66,7 +79,8 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row // TODO: we could probably dummy up size stats for FLBA data since we know the width auto const has_flba = std::any_of(pass.chunks.begin(), pass.chunks.end(), [](auto const& chunk) { - return (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY && chunk.converted_type != DECIMAL; + return chunk.physical_type == FIXED_LEN_BYTE_ARRAY and + is_treat_fixed_length_as_string(chunk.logical_type); }); if (!_has_page_index || uses_custom_row_bounds || has_flba) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5c387147e4b..912f53a8277 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -364,33 +364,28 @@ int64_t find_next_split(int64_t cur_pos, /** * @brief Converts cuDF units to Parquet units. * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - thrust::optional converted, - int32_t length) + thrust::optional logical_type) { - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); + int32_t const clock_rate = + is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; + + // TODO(ets): this is leftover from the original code, but will we ever output decimal as + // anything but fixed point? + if (logical_type.has_value() and logical_type->type == LogicalType::DECIMAL) { + // if decimal but not outputting as float or decimal, then convert to no logical type + if (column_type_id != type_id::FLOAT64 and + not cudf::is_fixed_point(data_type{column_type_id})) { + return std::make_tuple(clock_rate, thrust::nullopt); + } } - int8_t converted_type = converted.value_or(UNKNOWN); - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); + return std::make_tuple(clock_rate, std::move(logical_type)); } /** @@ -1515,12 +1510,11 @@ void reader::impl::create_global_chunk_info() auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); auto& schema = _metadata->get_schema(col.schema_idx); - auto [type_width, clock_rate, converted_type] = + auto [clock_rate, logical_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), _timestamp_type.id(), schema.type, - schema.converted_type, - schema.type_length); + schema.logical_type); // for lists, estimate the number of bytes per row. this is used by the subpass reader to // determine where to split the decompression boundaries @@ -1538,7 +1532,7 @@ void reader::impl::create_global_chunk_info() nullptr, col_meta.num_values, schema.type, - type_width, + schema.type_length, row_group_start, row_group_rows, schema.max_definition_level, @@ -1547,14 +1541,13 @@ void reader::impl::create_global_chunk_info() required_bits(schema.max_definition_level), required_bits(schema.max_repetition_level), col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, + logical_type, clock_rate, i, col.schema_idx, chunk_info, - list_bytes_per_row_est)); + list_bytes_per_row_est, + schema.type == BYTE_ARRAY and _strings_to_categorical)); } remaining_rows -= row_group_rows; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 776caa99ac9..bfc69264ab2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -16,6 +16,7 @@ #include "reader_impl_helpers.hpp" +#include "io/parquet/parquet.hpp" #include "io/utilities/row_selection.hpp" #include @@ -25,44 +26,35 @@ namespace cudf::io::parquet::detail { namespace { -ConvertedType logical_type_to_converted_type(thrust::optional const& logical) +thrust::optional converted_to_logical_type(SchemaElement const& schema) { - if (not logical.has_value()) { return UNKNOWN; } - switch (logical->type) { - case LogicalType::STRING: return UTF8; - case LogicalType::MAP: return MAP; - case LogicalType::LIST: return LIST; - case LogicalType::ENUM: return ENUM; - case LogicalType::DECIMAL: return DECIMAL; // TODO use decimal scale/precision - case LogicalType::DATE: return DATE; - case LogicalType::TIME: - if (logical->is_time_millis()) { - return TIME_MILLIS; - } else if (logical->is_time_micros()) { - return TIME_MICROS; - } - break; - case LogicalType::TIMESTAMP: - if (logical->is_timestamp_millis()) { - return TIMESTAMP_MILLIS; - } else if (logical->is_timestamp_micros()) { - return TIMESTAMP_MICROS; - } - break; - case LogicalType::INTEGER: - switch (logical->bit_width()) { - case 8: return logical->is_signed() ? INT_8 : UINT_8; - case 16: return logical->is_signed() ? INT_16 : UINT_16; - case 32: return logical->is_signed() ? INT_32 : UINT_32; - case 64: return logical->is_signed() ? INT_64 : UINT_64; - default: break; - } - case LogicalType::UNKNOWN: return NA; - case LogicalType::JSON: return JSON; - case LogicalType::BSON: return BSON; - default: break; + if (schema.converted_type.has_value()) { + switch (schema.converted_type.value()) { + case ENUM: // treat ENUM as UTF8 string + case UTF8: return LogicalType{LogicalType::STRING}; + case MAP: return LogicalType{LogicalType::MAP}; + case LIST: return LogicalType{LogicalType::LIST}; + case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; + case DATE: return LogicalType{LogicalType::DATE}; + case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; + case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; + case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; + case TIMESTAMP_MICROS: return LogicalType{TimestampType{true, TimeUnit::MICROS}}; + case UINT_8: return LogicalType{IntType{8, false}}; + case UINT_16: return LogicalType{IntType{16, false}}; + case UINT_32: return LogicalType{IntType{32, false}}; + case UINT_64: return LogicalType{IntType{64, false}}; + case INT_8: return LogicalType{IntType{8, true}}; + case INT_16: return LogicalType{IntType{16, true}}; + case INT_32: return LogicalType{IntType{32, true}}; + case INT_64: return LogicalType{IntType{64, true}}; + case JSON: return LogicalType{LogicalType::JSON}; + case BSON: return LogicalType{LogicalType::BSON}; + case INTERVAL: // there is no logical type for INTERVAL yet + default: return LogicalType{LogicalType::UNDEFINED}; + } } - return UNKNOWN; + return thrust::nullopt; } } // namespace @@ -74,76 +66,90 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - auto const physical = schema.type; - auto const logical_type = schema.logical_type; - auto converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; - - // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to - // converted_type if logical_type isn't set - // Logical type used for actual data interpretation; the legacy converted type - // is superseded by 'logical' type whenever available. - auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); } - - switch (converted_type.value_or(UNKNOWN)) { - case UINT_8: return type_id::UINT8; - case INT_8: return type_id::INT8; - case UINT_16: return type_id::UINT16; - case INT_16: return type_id::INT16; - case UINT_32: return type_id::UINT32; - case UINT_64: return type_id::UINT64; - case DATE: return type_id::TIMESTAMP_DAYS; - case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case TIMESTAMP_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MILLISECONDS; - case TIMESTAMP_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MICROSECONDS; - case DECIMAL: - if (physical == INT32) { return type_id::DECIMAL32; } - if (physical == INT64) { return type_id::DECIMAL64; } - if (physical == FIXED_LEN_BYTE_ARRAY) { - if (schema.type_length <= static_cast(sizeof(int32_t))) { - return type_id::DECIMAL32; + auto const physical = schema.type; + auto logical_type = schema.logical_type; + + // sanity check, but not worth failing over + if (schema.converted_type.has_value() and not logical_type.has_value()) { + CUDF_LOG_WARN("ConvertedType is specified but not LogicalType"); + logical_type = converted_to_logical_type(schema); + } + + if (logical_type.has_value()) { + switch (logical_type->type) { + case LogicalType::INTEGER: { + auto const is_signed = logical_type->is_signed(); + switch (logical_type->bit_width()) { + case 8: return is_signed ? type_id::INT8 : type_id::UINT8; + case 16: return is_signed ? type_id::INT16 : type_id::UINT16; + case 32: return is_signed ? type_id::INT32 : type_id::UINT32; + case 64: return is_signed ? type_id::INT64 : type_id::UINT64; + default: CUDF_FAIL("Invalid integer bitwidth"); } - if (schema.type_length <= static_cast(sizeof(int64_t))) { - return type_id::DECIMAL64; + } break; + + case LogicalType::DATE: return type_id::TIMESTAMP_DAYS; + + case LogicalType::TIME: + if (logical_type->is_time_millis()) { + return type_id::DURATION_MILLISECONDS; + } else if (logical_type->is_time_micros()) { + return type_id::DURATION_MICROSECONDS; + } else if (logical_type->is_time_nanos()) { + return type_id::DURATION_NANOSECONDS; } - if (schema.type_length <= static_cast(sizeof(__int128_t))) { - return type_id::DECIMAL128; + break; + + case LogicalType::TIMESTAMP: + if (timestamp_type_id != type_id::EMPTY) { + return timestamp_type_id; + } else if (logical_type->is_timestamp_millis()) { + return type_id::TIMESTAMP_MILLISECONDS; + } else if (logical_type->is_timestamp_micros()) { + return type_id::TIMESTAMP_MICROSECONDS; + } else if (logical_type->is_timestamp_nanos()) { + return type_id::TIMESTAMP_NANOSECONDS; } - } - if (physical == BYTE_ARRAY) { - CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + + case LogicalType::DECIMAL: { + int32_t const decimal_precision = logical_type->precision(); + if (physical == INT32) { return type_id::DECIMAL32; - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + } else if (physical == INT64) { return type_id::DECIMAL64; + } else if (physical == FIXED_LEN_BYTE_ARRAY) { + if (schema.type_length <= static_cast(sizeof(int32_t))) { + return type_id::DECIMAL32; + } else if (schema.type_length <= static_cast(sizeof(int64_t))) { + return type_id::DECIMAL64; + } else if (schema.type_length <= static_cast(sizeof(__int128_t))) { + return type_id::DECIMAL128; + } + } else if (physical == BYTE_ARRAY) { + CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + return type_id::DECIMAL32; + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + return type_id::DECIMAL64; + } else { + return type_id::DECIMAL128; + } } else { - return type_id::DECIMAL128; + CUDF_FAIL("Invalid representation of decimal type"); } - } - CUDF_FAIL("Invalid representation of decimal type"); - break; - - // maps are just List>. - case MAP: - case LIST: return type_id::LIST; - case NA: return type_id::STRING; - // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support - default: break; - } + } break; - if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) { - if (logical_type->is_timestamp_nanos()) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } else if (logical_type->is_time_nanos()) { - return type_id::DURATION_NANOSECONDS; + // maps are just List>. + case LogicalType::MAP: + case LogicalType::LIST: return type_id::LIST; + + // All null column that can't have its type deduced. + // Note: originally LogicalType::UNKNOWN was converted to ConvertedType::NA, and + // NA then became type_id::STRING, but with the following TODO: + // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support + case LogicalType::UNKNOWN: return type_id::STRING; + + default: break; } } @@ -208,6 +214,7 @@ void metadata::sanitize_schema() // This is a list of structs, so we need to mark this as a list, but also // add a struct child and move this element's children to the struct schema_elem.converted_type = LIST; + schema_elem.logical_type = LogicalType::LIST; schema_elem.repetition_type = OPTIONAL; auto const struct_node_idx = static_cast(schema.size()); @@ -216,7 +223,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = UNKNOWN; + struct_elem.converted_type = thrust::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); @@ -238,6 +245,11 @@ void metadata::sanitize_schema() } } + // convert ConvertedType to LogicalType for older files + if (schema_elem.converted_type.has_value() and not schema_elem.logical_type.has_value()) { + schema_elem.logical_type = converted_to_logical_type(schema_elem); + } + for (auto& child_idx : schema_elem.children_idx) { process(child_idx); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e39445108a6..4b7a64ac6ab 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -643,7 +643,7 @@ struct set_str_dict_index_count { __device__ void operator()(PageInfo const& page) { auto const& chunk = chunks[page.chunk_idx]; - if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && (chunk.data_type & 0x7) == BYTE_ARRAY && + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { // there is only ever one dictionary page per chunk, so this is safe to do in parallel. str_dict_index_count[page.chunk_idx] = page.num_input_values; @@ -659,7 +659,7 @@ struct set_str_dict_index_ptr { __device__ void operator()(size_t i) { auto& chunk = chunks[i]; - if ((chunk.data_type & 0x7) == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { + if (chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { chunk.str_dict_index = base + str_dict_index_offsets[i]; } } From 35f818b3e4bef8e331f083dadc9a4c45e2987a78 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 27 Mar 2024 13:39:41 -0500 Subject: [PATCH 3/4] Use `conda env create --yes` instead of `--force` (#15403) conda dropped support for the `--force` flag to `conda env create`. This changes that flag name to `--yes`. See https://github.com/conda/conda/blob/main/CHANGELOG.md#2430-2024-03-12 and https://github.com/rapidsai/miniforge-cuda/pull/63 for more info. --- ci/build_docs.sh | 2 +- ci/check_style.sh | 2 +- ci/test_cpp_common.sh | 4 ++-- ci/test_java.sh | 4 ++-- ci/test_notebooks.sh | 4 ++-- ci/test_python_common.sh | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 8e22f02b484..668d52e530b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -17,7 +17,7 @@ rapids-dependency-file-generator \ --file_key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n docs +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs conda activate docs rapids-print-env diff --git a/ci/check_style.sh b/ci/check_style.sh index b3890607f64..029cd305f1d 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -13,7 +13,7 @@ rapids-dependency-file-generator \ --file_key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n checks +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks conda activate checks RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index 163d381c1d4..e1b2a367187 100644 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ --file_key test_cpp \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test # Temporarily allow unbound variables for conda activation. set +u diff --git a/ci/test_java.sh b/ci/test_java.sh index 0863795162d..c93079742f0 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ --file_key test_java \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test export CMAKE_GENERATOR=Ninja diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index b746a18aed1..8be2d374bed 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. set -euo pipefail @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ --file_key test_notebooks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test # Temporarily allow unbound variables for conda activation. set +u diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index 1c330d47ac6..7559d970f6d 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Common setup steps shared by Python test jobs @@ -16,7 +16,7 @@ rapids-dependency-file-generator \ --file_key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" -rapids-mamba-retry env create --force -f "${ENV_YAML_DIR}/env.yaml" -n test +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test # Temporarily allow unbound variables for conda activation. set +u From aab6137c80c50eccc5007120f7140cfe6646b5e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Apr 2024 04:01:36 -0700 Subject: [PATCH 4/4] First pass at adding testing for pylibcudf (#15300) This PR adds tests of the `pylibcudf.copying` module along with establishing the infrastructure and best practices for writing pylibcudf tests going forward (and adding associated documentation). Resolves #15133 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Ashwin Srinath (https://github.com/shwina) - Jake Awe (https://github.com/AyodeAwe) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/15300 --- ci/test_python_cudf.sh | 8 + ci/test_wheel_cudf.sh | 8 + cpp/include/cudf/copying.hpp | 3 + cpp/src/copying/copy.cpp | 5 +- cpp/src/copying/copy_range.cu | 2 +- cpp/src/copying/scatter.cu | 11 +- docs/cudf/source/developer_guide/pylibcudf.md | 66 ++ docs/cudf/source/developer_guide/testing.md | 6 + python/cudf/cudf/_lib/cpp/copying.pxd | 42 +- python/cudf/cudf/_lib/pylibcudf/column.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 6 +- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 126 ++- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 1 + python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 8 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 5 + .../cudf/cudf/pylibcudf_tests/common/utils.py | 111 +++ python/cudf/cudf/pylibcudf_tests/conftest.py | 31 + python/cudf/cudf/pylibcudf_tests/pytest.ini | 8 + .../cudf/cudf/pylibcudf_tests/test_copying.py | 848 ++++++++++++++++++ 21 files changed, 1254 insertions(+), 54 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/common/utils.py create mode 100644 python/cudf/cudf/pylibcudf_tests/conftest.py create mode 100644 python/cudf/cudf/pylibcudf_tests/pytest.ini create mode 100644 python/cudf/cudf/pylibcudf_tests/test_copying.py diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index bacb54b3896..217dd2fd9a8 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -14,6 +14,14 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +rapids-logger "pytest pylibcudf" +pushd python/cudf/cudf/pylibcudf_tests +python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . +popd + rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 83f0b976128..a6f122491b0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -18,6 +18,14 @@ if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then rapids-logger "Run smoke tests for cudf" python ./ci/wheel_smoke_test_cudf.py else + rapids-logger "pytest pylibcudf" + pushd python/cudf/cudf/pylibcudf_tests + python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . + popd + rapids-logger "pytest cudf" pushd python/cudf/cudf/tests python -m pytest \ diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index b2cde82fada..df96efdaffc 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -253,6 +253,8 @@ std::unique_ptr empty_like(scalar const& input); * If the `mask_alloc` allocates a validity mask that mask is also uninitialized * and the validity bits and the null count should be set by the caller. * + * @throws cudf::data_type_error if input type is not of fixed width. + * * @param input Immutable view of input column to emulate * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN * @param mr Device memory resource used to allocate the returned column's device memory @@ -360,6 +362,7 @@ void copy_range_in_place(column_view const& source, * * @throws std::out_of_range for any invalid range. * @throws cudf::data_type_error if @p target and @p source have different types. + * @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary * * @param source The column to copy from inside the range * @param target The column to copy from outside the range diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 490a1ccb254..cb7d507de81 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,7 +122,8 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); + CUDF_EXPECTS( + is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error); mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable()); return std::make_unique(input.type(), diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 038646d8cf4..e10d7081a55 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -119,7 +119,7 @@ struct out_of_place_copy_range_dispatch { std::enable_if_t(), std::unique_ptr> operator()(Args...) { - CUDF_FAIL("Unsupported type for out of place copy."); + CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error); } }; diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 7931df4c9f0..3bc3979ec1b 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -144,7 +144,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto const scalar_impl = static_cast(&source.get()); auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); @@ -166,6 +168,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto result = lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); @@ -249,6 +254,10 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); + // For each field of `source`, copy construct a scalar from the field // and dispatch to the corresponding scalar scatterer diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 0120cbb286e..0b881b2b057 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -96,6 +96,72 @@ There are a couple of notable points from the snippet above: - The object returned from libcudf is immediately converted to a pylibcudf type. - `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case). +## Testing + +When writing pylibcudf tests, it is important to remember that all the APIs should be tested in the C++ layer in libcudf already. +The primary purpose of pylibcudf tests is to ensure the correctness of the _bindings_; the correctness of the underlying implementation should generally be validated in libcudf. +If pylibcudf tests uncover a libcudf bug, a suitable libcudf test should be added to cover this case rather than relying solely on pylibcudf testing. + +pylibcudf's ``conftest.py`` contains some standard parametrized dtype fixture lists that may in turn be used to parametrize other fixtures. +Fixtures allocating data should leverage these dtype lists wherever possible to simplify testing across the matrix of important types. +Where appropriate, new fixture lists may be added. + +To run tests as efficiently as possible, the test suite should make generous use of fixtures. +The simplest general structure to follow is for pyarrow array/table/scalar fixtures to be parametrized by one of the dtype list. +Then, a corresponding pylibcudf fixture may be created using a simple `from_arrow` call. +This approach ensures consistent global coverage across types for various tests. + +In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data. +This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above. +Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module. + +Here is an example demonstrating the above points: + +```python +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from cudf._lib import pylibcudf as plc +from utils import assert_column_eq + +# The pa_dtype fixture is defined in conftest.py. +@pytest.fixture(scope="module") +def pa_column(pa_dtype): + pa.array([1, 2, 3]) + + +@pytest.fixture(scope="module") +def column(pa_column): + return plc.interop.from_arrow(pa_column) + + +def test_foo(pa_column, column): + index = 1 + result = plc.foo(column) + expected = pa.foo(pa_column) + + assert_column_eq(result, expected) +``` + +Some guidelines on what should be tested: +- Tests SHOULD comprehensively cover the API, including all possible combinations of arguments required to ensure good test coverage. +- pylibcudf SHOULD NOT attempt to stress test large data sizes, and SHOULD instead defer to libcudf tests. + - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead. +- Nullable data should always be tested. +- Expected exceptions should be tested. Tests should be written from the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated. + - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd). + +Some guidelines on how best to use pytests. +- By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance. +- Where necessary, mutable fixtures should be named as such (e.g. `mutable_col`) and be of function scope. If possible, they can be implemented as simply making a copy of a corresponding module-scope immutable fixture to avoid duplicating the generation logic. + +Tests should be organized corresponding to pylibcudf modules, i.e. one test module for each pylibcudf module. + +The following sections of the cuDF Python testing guide also generally apply to pylibcudf unless superseded by any statements above: +- [](#test_parametrization) +- [](#xfailing_tests) +- [](#testing_warnings) + ## Miscellaneous Notes ### Cython Scoped Enums diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index a28a6b9192d..f12f809d5db 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -55,6 +55,8 @@ Typically, exception cases require specific assertions or other special logic, s The main exception to this rule is tests based on comparison to pandas. Such tests may test exceptional cases alongside more typical cases since the logic is generally identical. +(test_parametrization)= + ### Parametrization: custom fixtures and `pytest.mark.parametrize` When it comes to parametrizing tests written with `pytest`, @@ -140,6 +142,8 @@ def test_odds(): Other approaches are also possible, and the best solution should be discussed on a case-by-case basis during PR review. +(xfailing_tests)= + ### Tests with expected failures (`xfail`s) In some circumstances it makes sense to mark a test as _expected_ to @@ -218,6 +222,8 @@ This way, when the bug is fixed, the test suite will fail at this point (and we will remember to update the test). +(testing_warnings)= + ### Testing code that throws warnings Some code may be expected to throw warnings. diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index f3e5c0aec72..053e2299f22 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool @@ -33,19 +33,19 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& input, size_type offset, const scalar& fill_values - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] scatter ( const table_view& source_table, const column_view& scatter_map, const table_view& target_table, - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] scatter ( const vector[reference_wrapper[constscalar]]& source_scalars, const column_view& indices, const table_view& target, - ) except + + ) except +cudf_exception_handler cpdef enum class mask_allocation_policy(int32_t): NEVER @@ -54,22 +54,22 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[column] empty_like ( const column_view& input_column - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, mask_allocation_policy policy - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, size_type size, mask_allocation_policy policy - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] empty_like ( const table_view& input_table - ) except + + ) except +cudf_exception_handler cdef void copy_range_in_place ( const column_view& input_column, @@ -77,7 +77,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_range ( const column_view& input_column, @@ -85,68 +85,68 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except + + ) except +cudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, vector[size_type] indices - ) except + + ) except +cudf_exception_handler cdef vector[table_view] slice ( const table_view& input_table, vector[size_type] indices - ) except + + ) except +cudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, vector[size_type] splits - ) except + + ) except +cudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, vector[size_type] splits - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const column_view& rhs, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const scalar& rhs, const column_view boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const scalar& rhs, const column_view boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const table_view& input, const table_view& target, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const vector[reference_wrapper[constscalar]]& input, const table_view& target, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, size_type index - ) except + + ) except +cudf_exception_handler cpdef enum class sample_with_replacement(bool): FALSE diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index fc5cc77c9e7..66ccdb53d1a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -43,6 +43,7 @@ cdef class Column: cpdef gpumemoryview data(self) cpdef gpumemoryview null_mask(self) cpdef list children(self) + cpdef Column copy(self) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 3c5c53f99cf..2565e92d5c9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer @@ -274,6 +274,13 @@ cdef class Column: """The children of the column.""" return self._children + cpdef Column copy(self): + """Create a copy of the column.""" + cdef unique_ptr[column] c_result + with nogil: + c_result = move(make_unique[column](self.view())) + return Column.from_libcudf(move(c_result)) + cdef class ListColumnView: """Accessor for methods of a Column that are specific to lists.""" diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index 7b5f1e70ea3..0211d122c8e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -58,12 +58,12 @@ cpdef Column copy_range( size_type target_begin, ) -cpdef Column shift(Column input, size_type offset, Scalar fill_values) - -cpdef list split(ColumnOrTable input, list splits) +cpdef Column shift(Column input, size_type offset, Scalar fill_value) cpdef list slice(ColumnOrTable input, list indices) +cpdef list split(ColumnOrTable input, list splits) + cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index d78955dc325..125a4ffe65f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -54,6 +54,11 @@ cpdef Table gather( ------- pylibcudf.Table The result of the gather + + Raises + ------ + ValueError + If the gather_map contains nulls. """ cdef unique_ptr[table] c_result with nogil: @@ -92,6 +97,20 @@ cpdef Table scatter( ------- Table The result of the scatter + + Raises + ------ + ValueError + If any of the following occur: + - scatter_map contains null values. + - source is a Table and the number of columns in source does not match the + number of columns in target. + - source is a Table and the number of rows in source does not match the + number of elements in scatter_map. + - source is a List[Scalar] and the number of scalars does not match the + number of columns in target. + TypeError + If data types of the source and target columns do not match. """ cdef unique_ptr[table] c_result cdef vector[reference_wrapper[const scalar]] source_scalars @@ -207,6 +226,17 @@ cpdef Column copy_range_in_place( The index of the last element in input_column to copy. target_begin : int The index of the first element in target_column to overwrite. + + Raises + ------ + TypeError + If the operation is attempted on non-fixed width types since those would require + memory reallocations, or if the input and target columns have different types. + IndexError + If the indices accessed by the ranges implied by input_begin, input_end, and + target_begin are out of bounds. + ValueError + If source has null values and target is not nullable. """ # Need to initialize this outside the function call so that Cython doesn't @@ -251,6 +281,14 @@ cpdef Column copy_range( ------- pylibcudf.Column A copy of target_column with the specified range overwritten. + + Raises + ------ + IndexError + If the indices accessed by the ranges implied by input_begin, input_end, and + target_begin are out of bounds. + TypeError + If target and source have different types. """ cdef unique_ptr[column] c_result @@ -266,7 +304,7 @@ cpdef Column copy_range( return Column.from_libcudf(move(c_result)) -cpdef Column shift(Column input, size_type offset, Scalar fill_values): +cpdef Column shift(Column input, size_type offset, Scalar fill_value): """Shift the elements of input by offset. For details on the implementation, see :cpp:func:`shift`. @@ -285,6 +323,12 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): ------- pylibcudf.Column A copy of input shifted by offset. + + Raises + ------ + TypeError + If the fill_value is not of the same type as input, or if the input type is not + of fixed width or string type. """ cdef unique_ptr[column] c_result with nogil: @@ -292,37 +336,44 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): cpp_copying.shift( input.view(), offset, - dereference(fill_values.c_obj) + dereference(fill_value.c_obj) ) ) return Column.from_libcudf(move(c_result)) -cpdef list split(ColumnOrTable input, list splits): - """Split input into multiple. +cpdef list slice(ColumnOrTable input, list indices): + """Slice input according to indices. - For details on the implementation, see :cpp:func:`split`. + For details on the implementation, see :cpp:func:`slice`. Parameters ---------- - input : Union[Column, Table] - The column to split. - splits : List[int] - The indices at which to split the column. + input_column : Union[Column, Table] + The column or table to slice. + indices : List[int] + The indices to select from input. Returns ------- List[Union[Column, Table]] - The result of splitting input. + The result of slicing ``input``. + + Raises + ------ + ValueError + If indices size is not even or the values in any pair of lower/upper bounds are + strictly decreasing. + IndexError + When any of the indices don't belong to the range ``[0, input_column.size())``. """ - cdef vector[size_type] c_splits = splits + cdef vector[size_type] c_indices = indices cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i - if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = move(cpp_copying.slice(input.view(), c_indices)) return [ Column.from_column_view(c_col_result[i], input) @@ -330,7 +381,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) return [ Table.from_table_view(c_tbl_result[i], input) @@ -338,30 +389,31 @@ cpdef list split(ColumnOrTable input, list splits): ] -cpdef list slice(ColumnOrTable input, list indices): - """Slice input according to indices. +cpdef list split(ColumnOrTable input, list splits): + """Split input into multiple. - For details on the implementation, see :cpp:func:`slice`. + For details on the implementation, see :cpp:func:`split`. Parameters ---------- - input_column : Union[Column, Table] - The column or table to slice. - indices : List[int] - The indices to select from input. + input : Union[Column, Table] + The column to split. + splits : List[int] + The indices at which to split the column. Returns ------- List[Union[Column, Table]] - The result of slicing ``input``. + The result of splitting input. """ - cdef vector[size_type] c_indices = indices + cdef vector[size_type] c_splits = splits cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i + if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = move(cpp_copying.split(input.view(), c_splits)) return [ Column.from_column_view(c_col_result[i], input) @@ -369,7 +421,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) return [ Table.from_table_view(c_tbl_result[i], input) @@ -401,6 +453,15 @@ cpdef Column copy_if_else( ------- pylibcudf.Column The result of copying elements from lhs and rhs according to boolean_mask. + + Raises + ------ + TypeError + If lhs and rhs are not of the same type or if the boolean mask is not of type + bool. + ValueError + If boolean mask is not of the same length as lhs and rhs (whichever are + columns), or if lhs and rhs are not of the same length (if both are columns). """ cdef unique_ptr[column] result @@ -459,6 +520,16 @@ cpdef Table boolean_mask_scatter( ------- Table The result of the scatter + + Raises + ------ + ValueError + If input.num_columns() != target.num_columns(), boolean_mask.size() != + target.num_rows(), or if input is a Table and the number of `true` in + `boolean_mask` > input.num_rows(). + TypeError + If any input type does not match the corresponding target column's type, or + if boolean_mask.type() is not bool. """ cdef unique_ptr[table] result cdef vector[reference_wrapper[const scalar]] source_scalars @@ -502,6 +573,11 @@ cpdef Scalar get_element(Column input_column, size_type index): ------- pylibcudf.Scalar The element at index from input_column. + + Raises + ------ + IndexError + If index is out of bounds. """ cdef unique_ptr[scalar] c_output with nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index e7471033fc8..8dc41fccc0c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -140,6 +140,7 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): @from_arrow.register(pa.Array) +@from_arrow.register(pa.ChunkedArray) def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 327f3911489..7467bfccaa8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -12,6 +12,9 @@ cdef class Table: cdef table_view view(self) nogil + cpdef int num_columns(self) + cpdef int num_rows(self) + @staticmethod cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 793e6330244..1fa60ec2b6c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -77,6 +77,14 @@ cdef class Table: for i in range(tv.num_columns()) ]) + cpdef int num_columns(self): + """The number of columns in this table.""" + return len(self._columns) + + cpdef int num_rows(self): + """The number of rows in this table.""" + return self._columns[0].size() + cpdef list columns(self): """The columns in this table.""" return self._columns diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index f6ff6e5a2fc..d8b92283412 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -39,6 +39,11 @@ cdef class DataType: """Get the scale associated with this data type.""" return self.c_obj.scale() + def __eq__(self, other): + if not isinstance(other, DataType): + return False + return self.id() == other.id() and self.scale() == other.scale() + @staticmethod cdef DataType from_libcudf(data_type dt): """Create a DataType from a libcudf data_type. diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py new file mode 100644 index 00000000000..6636ab9e5f8 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Optional + +import pyarrow as pa +import pytest + +from cudf._lib import pylibcudf as plc + + +def metadata_from_arrow_array( + pa_array: pa.Array, +) -> Optional[plc.interop.ColumnMetadata]: + metadata = None + if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): + metadata = plc.interop.ColumnMetadata( + "", + # libcudf does not store field names, so just match pyarrow's. + [ + plc.interop.ColumnMetadata(pa_array.type.field(i).name) + for i in range(pa_array.type.num_fields) + ], + ) + return metadata + + +def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: + """Verify that the pylibcudf array and PyArrow array are equal.""" + # Nested types require children metadata to be passed to the conversion function. + plc_pa = plc.interop.to_arrow( + plc_column, metadata=metadata_from_arrow_array(pa_array) + ) + + if isinstance(plc_pa, pa.ChunkedArray): + plc_pa = plc_pa.combine_chunks() + if isinstance(pa_array, pa.ChunkedArray): + pa_array = pa_array.combine_chunks() + + assert plc_pa.equals(pa_array) + + +def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: + """Verify that the pylibcudf array and PyArrow array are equal.""" + plc_shape = (plc_table.num_rows(), plc_table.num_columns()) + assert plc_shape == pa_table.shape + + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + assert_column_eq(plc_col, pa_col) + + +def cudf_raises(expected_exception: BaseException, *args, **kwargs): + # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions + match = kwargs.get("match", None) + if match is None: + kwargs["match"] = "CUDF failure at" + return pytest.raises(expected_exception, *args, **kwargs) + + +# TODO: Consider moving these type utilities into pylibcudf.types itself. +def is_signed_integer(plc_dtype: plc.DataType): + return ( + plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value + ) + + +def is_unsigned_integer(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, + ) + + +def is_integer(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.INT8, + plc.TypeId.INT16, + plc.TypeId.INT32, + plc.TypeId.INT64, + ) + + +def is_floating(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.FLOAT32, + plc.TypeId.FLOAT64, + ) + + +def is_boolean(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.BOOL8 + + +def is_string(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.STRING + + +def is_fixed_width(plc_dtype: plc.DataType): + return ( + is_integer(plc_dtype) + or is_floating(plc_dtype) + or is_boolean(plc_dtype) + ) + + +# We must explicitly specify this type via a field to ensure we don't include +# nullability accidentally. +DEFAULT_STRUCT_TESTING_TYPE = pa.struct( + [pa.field("v", pa.int64(), nullable=False)] +) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py new file mode 100644 index 00000000000..6d8284fb3db --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Tell ruff it's OK that some imports occur after the sys.path.insert +# ruff: noqa: E402 +import os +import sys + +import pyarrow as pa +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) + +from utils import DEFAULT_STRUCT_TESTING_TYPE + + +# This fixture defines the standard set of types that all tests should default to +# running on. If there is a need for some tests to run on a different set of types, that +# type list fixture should also be defined below here if it is likely to be reused +# across modules. Otherwise it may be defined on a per-module basis. +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.string(), + pa.bool_(), + pa.list_(pa.int64()), + DEFAULT_STRUCT_TESTING_TYPE, + ], +) +def pa_type(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/pytest.ini b/python/cudf/cudf/pylibcudf_tests/pytest.ini new file mode 100644 index 00000000000..1761c0f011c --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/pytest.ini @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +xfail_strict = true +filterwarnings = + error + ignore:::.*xdist.* + ignore:::.*pytest.* diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py new file mode 100644 index 00000000000..0bf30f98636 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -0,0 +1,848 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import ( + DEFAULT_STRUCT_TESTING_TYPE, + assert_column_eq, + assert_table_eq, + cudf_raises, + is_fixed_width, + is_floating, + is_integer, + is_string, + metadata_from_arrow_array, +) + +from cudf._lib import pylibcudf as plc + + +# TODO: Test nullable data +@pytest.fixture(scope="module") +def pa_input_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([1, 2, 3], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["a", "b", "c"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([True, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[1], [2], [3]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def input_column(pa_input_column): + return plc.interop.from_arrow(pa_input_column) + + +@pytest.fixture(scope="module") +def pa_index_column(): + # Index column for testing gather/scatter, always integral. + return pa.array([1, 2, 3]) + + +@pytest.fixture(scope="module") +def index_column(pa_index_column): + return plc.interop.from_arrow(pa_index_column) + + +@pytest.fixture(scope="module") +def pa_target_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([4, 5, 6, 7, 8, 9], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([False, True, True, False, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array( + [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], + type=pa_type, + ) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def target_column(pa_target_column): + return plc.interop.from_arrow(pa_target_column) + + +@pytest.fixture +def mutable_target_column(target_column): + return target_column.copy() + + +@pytest.fixture(scope="module") +def pa_source_table(pa_input_column): + return pa.table([pa_input_column] * 3, [""] * 3) + + +@pytest.fixture(scope="module") +def source_table(pa_source_table): + return plc.interop.from_arrow(pa_source_table) + + +@pytest.fixture(scope="module") +def pa_target_table(pa_target_column): + return pa.table([pa_target_column] * 3, [""] * 3) + + +@pytest.fixture(scope="module") +def target_table(pa_target_table): + return plc.interop.from_arrow(pa_target_table) + + +@pytest.fixture(scope="module") +def pa_source_scalar(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.scalar(1, type=pa_type) + elif pa.types.is_string(pa_type): + return pa.scalar("a", type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.scalar(False, type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Longer list? + return pa.scalar([1], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.scalar({"v": 1}, type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def source_scalar(pa_source_scalar): + return plc.interop.from_arrow(pa_source_scalar) + + +@pytest.fixture(scope="module") +def pa_mask(pa_target_column): + return pa.array([True, False] * (len(pa_target_column) // 2)) + + +@pytest.fixture(scope="module") +def mask(pa_mask): + return plc.interop.from_arrow(pa_mask) + + +def test_gather(target_table, pa_target_table, index_column, pa_index_column): + result = plc.copying.gather( + target_table, + index_column, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + expected = pa_target_table.take(pa_index_column) + assert_table_eq(result, expected) + + +def test_gather_map_has_nulls(target_table): + gather_map = plc.interop.from_arrow(pa.array([0, 1, None])) + with cudf_raises(ValueError): + plc.copying.gather( + target_table, + gather_map, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + + +def _pyarrow_index_to_mask(indices, mask_size): + # Convert a list of indices to a boolean mask. + return pc.is_in(pa.array(range(mask_size)), pa.array(indices)) + + +def _pyarrow_boolean_mask_scatter_column(source, mask, target): + if isinstance(source, pa.Scalar): + # if_else requires array lengths to match exactly or the replacement must be a + # scalar, so we use this in the scalar case. + return pc.if_else(mask, target, source) + + if isinstance(source, pa.ChunkedArray): + source = source.combine_chunks() + if isinstance(target, pa.ChunkedArray): + target = target.combine_chunks() + + # replace_with_mask accepts a column whose size is the number of true values in + # the mask, so we can use it for columnar scatters. + return pc.replace_with_mask(target, mask, source) + + +def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): + # pyarrow equivalent of cudf's boolean_mask_scatter. + return pa.table( + [ + _pyarrow_boolean_mask_scatter_column(r, mask, v) + for v, r in zip(target_table, source) + ], + [""] * target_table.num_columns, + ) + + +def test_scatter_table( + source_table, + pa_source_table, + index_column, + pa_index_column, + target_table, + pa_target_table, +): + result = plc.copying.scatter( + source_table, + index_column, + target_table, + ) + + if pa.types.is_list( + dtype := pa_target_table[0].type + ) or pa.types.is_struct(dtype): + # pyarrow does not support scattering with list data. If and when they do, + # replace this hardcoding with their implementation. + with pytest.raises(pa.ArrowNotImplementedError): + _pyarrow_boolean_mask_scatter_table( + pa_source_table, + _pyarrow_index_to_mask( + pa_index_column, pa_target_table.num_rows + ), + pa_target_table, + ) + + if pa.types.is_list(dtype := pa_target_table[0].type): + expected = pa.table( + [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3 + ) + elif pa.types.is_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"v": 4}, + {"v": 1}, + {"v": 2}, + {"v": 3}, + {"v": 8}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = _pyarrow_boolean_mask_scatter_table( + pa_source_table, + _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_scatter_table_num_col_mismatch( + source_table, index_column, target_table +): + # Number of columns in source and target must match. + with cudf_raises(ValueError): + plc.copying.scatter( + plc.Table(source_table.columns()[:2]), + index_column, + target_table, + ) + + +def test_scatter_table_num_row_mismatch(source_table, target_table): + # Number of rows in source and scatter map must match. + with cudf_raises(ValueError): + plc.copying.scatter( + source_table, + plc.interop.from_arrow( + pa.array(range(source_table.num_rows() * 2)) + ), + target_table, + ) + + +def test_scatter_table_map_has_nulls(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.scatter( + source_table, + plc.interop.from_arrow(pa.array([None] * source_table.num_rows())), + target_table, + ) + + +def test_scatter_table_type_mismatch(source_table, index_column, target_table): + with cudf_raises(TypeError): + if is_integer( + dtype := target_table.columns()[0].type() + ) or is_floating(dtype): + pa_array = pa.array([True] * source_table.num_rows()) + else: + pa_array = pa.array([1] * source_table.num_rows()) + ncol = source_table.num_columns() + pa_table = pa.table([pa_array] * ncol, [""] * ncol) + plc.copying.scatter( + plc.interop.from_arrow(pa_table), + index_column, + target_table, + ) + + +def test_scatter_scalars( + source_scalar, + pa_source_scalar, + index_column, + pa_index_column, + target_table, + pa_target_table, +): + result = plc.copying.scatter( + [source_scalar] * target_table.num_columns(), + index_column, + target_table, + ) + + expected = _pyarrow_boolean_mask_scatter_table( + [pa_source_scalar] * target_table.num_columns(), + pc.invert( + _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows) + ), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_scatter_scalars_num_scalars_mismatch( + source_scalar, index_column, target_table +): + with cudf_raises(ValueError): + plc.copying.scatter( + [source_scalar] * (target_table.num_columns() - 1), + index_column, + target_table, + ) + + +def test_scatter_scalars_map_has_nulls(source_scalar, target_table): + with cudf_raises(ValueError): + plc.copying.scatter( + [source_scalar] * target_table.num_columns(), + plc.interop.from_arrow(pa.array([None, None])), + target_table, + ) + + +def test_scatter_scalars_type_mismatch(index_column, target_table): + with cudf_raises(TypeError): + if is_integer( + dtype := target_table.columns()[0].type() + ) or is_floating(dtype): + source_scalar = [plc.interop.from_arrow(pa.scalar(True))] + else: + source_scalar = [plc.interop.from_arrow(pa.scalar(1))] + plc.copying.scatter( + source_scalar * target_table.num_columns(), + index_column, + target_table, + ) + + +def test_empty_like_column(input_column): + result = plc.copying.empty_like(input_column) + assert result.type() == input_column.type() + + +def test_empty_like_table(source_table): + result = plc.copying.empty_like(source_table) + assert result.num_columns() == source_table.num_columns() + for icol, rcol in zip(source_table.columns(), result.columns()): + assert rcol.type() == icol.type() + + +@pytest.mark.parametrize("size", [None, 10]) +def test_allocate_like(input_column, size): + if is_fixed_width(input_column.type()): + result = plc.copying.allocate_like( + input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size + ) + assert result.type() == input_column.type() + assert result.size() == (input_column.size() if size is None else size) + else: + with pytest.raises(TypeError): + plc.copying.allocate_like( + input_column, + plc.copying.MaskAllocationPolicy.RETAIN, + size=size, + ) + + +def test_copy_range_in_place( + input_column, pa_input_column, mutable_target_column, pa_target_column +): + if not is_fixed_width(mutable_target_column.type()): + with pytest.raises(TypeError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + else: + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + expected = _pyarrow_boolean_mask_scatter_column( + pa_input_column, + _pyarrow_index_to_mask( + range(len(pa_input_column)), len(pa_target_column) + ), + pa_target_column, + ) + assert_column_eq(mutable_target_column, expected) + + +def test_copy_range_in_place_out_of_bounds( + input_column, mutable_target_column +): + if is_fixed_width(mutable_target_column.type()): + with cudf_raises(IndexError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 5, + 5 + input_column.size(), + 0, + ) + + +def test_copy_range_in_place_different_types(mutable_target_column): + if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range_in_place_null_mismatch( + pa_input_column, mutable_target_column +): + if is_fixed_width(mutable_target_column.type()): + pa_input_column = pc.if_else( + _pyarrow_index_to_mask([0], len(pa_input_column)), + pa_input_column, + pa.scalar(None, type=pa_input_column.type), + ) + input_column = plc.interop.from_arrow(pa_input_column) + with cudf_raises(ValueError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range( + input_column, pa_input_column, target_column, pa_target_column +): + if is_fixed_width(dtype := target_column.type()) or is_string(dtype): + result = plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + expected = _pyarrow_boolean_mask_scatter_column( + pa_input_column, + _pyarrow_index_to_mask( + range(len(pa_input_column)), len(pa_target_column) + ), + pa_target_column, + ) + assert_column_eq(result, expected) + else: + with pytest.raises(TypeError): + plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range_out_of_bounds(input_column, target_column): + with cudf_raises(IndexError): + plc.copying.copy_range( + input_column, + target_column, + 5, + 5 + input_column.size(), + 0, + ) + + +def test_copy_range_different_types(target_column): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + + +def test_shift( + target_column, pa_target_column, source_scalar, pa_source_scalar +): + shift = 2 + if is_fixed_width(dtype := target_column.type()) or is_string(dtype): + result = plc.copying.shift(target_column, shift, source_scalar) + expected = pa.concat_arrays( + [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] + ) + assert_column_eq(result, expected) + else: + with pytest.raises(TypeError): + plc.copying.shift(target_column, shift, source_scalar) + + +def test_shift_type_mismatch(target_column): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + fill_value = plc.interop.from_arrow(pa.scalar("a")) + else: + fill_value = plc.interop.from_arrow(pa.scalar(1)) + + with cudf_raises(TypeError): + plc.copying.shift(target_column, 2, fill_value) + + +def test_slice_column(target_column, pa_target_column): + bounds = list(range(6)) + upper_bounds = bounds[1::2] + lower_bounds = bounds[::2] + result = plc.copying.slice(target_column, bounds) + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + assert_column_eq(slice_, pa_target_column[lb:ub]) + + +def test_slice_column_wrong_length(target_column): + with cudf_raises(ValueError): + plc.copying.slice(target_column, list(range(5))) + + +def test_slice_column_decreasing(target_column): + with cudf_raises(ValueError): + plc.copying.slice(target_column, list(range(5, -1, -1))) + + +def test_slice_column_out_of_bounds(target_column): + with cudf_raises(IndexError): + plc.copying.slice(target_column, list(range(2, 8))) + + +def test_slice_table(target_table, pa_target_table): + bounds = list(range(6)) + upper_bounds = bounds[1::2] + lower_bounds = bounds[::2] + result = plc.copying.slice(target_table, bounds) + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + assert_table_eq(slice_, pa_target_table[lb:ub]) + + +def test_split_column(target_column, pa_target_column): + upper_bounds = [1, 3, 5] + lower_bounds = [0] + upper_bounds[:-1] + result = plc.copying.split(target_column, upper_bounds) + for lb, ub, split in zip(lower_bounds, upper_bounds, result): + assert_column_eq(split, pa_target_column[lb:ub]) + + +def test_split_column_decreasing(target_column): + with cudf_raises(ValueError): + plc.copying.split(target_column, list(range(5, -1, -1))) + + +def test_split_column_out_of_bounds(target_column): + with cudf_raises(IndexError): + plc.copying.split(target_column, list(range(5, 8))) + + +def test_split_table(target_table, pa_target_table): + upper_bounds = [1, 3, 5] + lower_bounds = [0] + upper_bounds[:-1] + result = plc.copying.split(target_table, upper_bounds) + for lb, ub, split in zip(lower_bounds, upper_bounds, result): + assert_table_eq(split, pa_target_table[lb:ub]) + + +def test_copy_if_else_column_column( + target_column, pa_target_column, pa_source_scalar, mask, pa_mask +): + pa_other_column = pa.concat_arrays( + [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]] + ) + other_column = plc.interop.from_arrow(pa_other_column) + + result = plc.copying.copy_if_else( + target_column, + other_column, + mask, + ) + + expected = pc.if_else( + pa_mask, + pa_target_column, + pa_other_column, + ) + assert_column_eq(result, expected) + + +def test_copy_if_else_wrong_type(target_column, mask): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow( + pa.array(["a"] * target_column.size()) + ) + else: + input_column = plc.interop.from_arrow( + pa.array([1] * target_column.size()) + ) + + with cudf_raises(TypeError): + plc.copying.copy_if_else(input_column, target_column, mask) + + +def test_copy_if_else_wrong_type_mask(target_column): + with cudf_raises(TypeError): + plc.copying.copy_if_else( + target_column, + target_column, + plc.interop.from_arrow( + pa.array([1.0, 2.0] * (target_column.size() // 2)) + ), + ) + + +def test_copy_if_else_wrong_size(target_column): + with cudf_raises(ValueError): + plc.copying.copy_if_else( + plc.interop.from_arrow(pa.array([1])), + target_column, + plc.interop.from_arrow( + pa.array([True, False] * (target_column.size() // 2)) + ), + ) + + +def test_copy_if_else_wrong_size_mask(target_column): + with cudf_raises(ValueError): + plc.copying.copy_if_else( + target_column, + target_column, + plc.interop.from_arrow(pa.array([True])), + ) + + +@pytest.mark.parametrize("array_left", [True, False]) +def test_copy_if_else_column_scalar( + target_column, + pa_target_column, + source_scalar, + pa_source_scalar, + array_left, + mask, + pa_mask, +): + args = ( + (target_column, source_scalar) + if array_left + else (source_scalar, target_column) + ) + result = plc.copying.copy_if_else( + *args, + mask, + ) + + pa_args = ( + (pa_target_column, pa_source_scalar) + if array_left + else (pa_source_scalar, pa_target_column) + ) + expected = pc.if_else( + pa_mask, + *pa_args, + ) + assert_column_eq(result, expected) + + +def test_boolean_mask_scatter_from_table( + source_table, + pa_source_table, + target_table, + pa_target_table, + mask, + pa_mask, +): + result = plc.copying.boolean_mask_scatter( + source_table, + target_table, + mask, + ) + + if pa.types.is_list( + dtype := pa_target_table[0].type + ) or pa.types.is_struct(dtype): + # pyarrow does not support scattering with list data. If and when they do, + # replace this hardcoding with their implementation. + with pytest.raises(pa.ArrowNotImplementedError): + _pyarrow_boolean_mask_scatter_table( + pa_source_table, pa_mask, pa_target_table + ) + + if pa.types.is_list(dtype := pa_target_table[0].type): + expected = pa.table( + [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3 + ) + elif pa.types.is_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"v": 1}, + {"v": 5}, + {"v": 2}, + {"v": 7}, + {"v": 3}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = _pyarrow_boolean_mask_scatter_table( + pa_source_table, pa_mask, pa_target_table + ) + + assert_table_eq(result, expected) + + +def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + plc.Table(source_table.columns()[:2]), + target_table, + plc.interop.from_arrow(pa.array([True, False] * 3)), + ) + + +def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + source_table, + target_table, + plc.interop.from_arrow(pa.array([True, False] * 2)), + ) + + +def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + plc.Table(source_table.columns()[:2]), + target_table, + plc.interop.from_arrow( + pa.array([True, False] * 2 + [False, False]) + ), + ) + + +def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): + if is_integer(dtype := target_table.columns()[0].type()) or is_floating( + dtype + ): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.boolean_mask_scatter( + plc.Table([input_column] * 3), target_table, mask + ) + + +def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): + with cudf_raises(TypeError): + plc.copying.boolean_mask_scatter( + source_table, + target_table, + plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)), + ) + + +def test_boolean_mask_scatter_from_scalars( + source_scalar, + pa_source_scalar, + target_table, + pa_target_table, + mask, + pa_mask, +): + result = plc.copying.boolean_mask_scatter( + [source_scalar] * 3, + target_table, + mask, + ) + + expected = _pyarrow_boolean_mask_scatter_table( + [pa_source_scalar] * target_table.num_columns(), + pc.invert(pa_mask), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_get_element(input_column, pa_input_column): + index = 1 + result = plc.copying.get_element(input_column, index) + + assert ( + plc.interop.to_arrow( + result, metadata_from_arrow_array(pa_input_column) + ).as_py() + == pa_input_column[index].as_py() + ) + + +def test_get_element_out_of_bounds(input_column): + with cudf_raises(IndexError): + plc.copying.get_element(input_column, 100)