diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9ac373db309..ccda2596031 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -91,6 +91,12 @@ repos: entry: '(category=|\s)DeprecationWarning[,)]' language: pygrep types_or: [python, cython] + # We need to exclude just the following file because a few APIs still need + # DeprecationWarning: https://github.com/pandas-dev/pandas/issues/54970 + exclude: | + (?x)^( + ^python/cudf/cudf/core/dtypes.py + ) - id: no-programmatic-xfail name: no-programmatic-xfail description: 'Enforce that pytest.xfail is not introduced (see dev docs for details)' diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 30744d99b54..fa4ef8ddf68 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -59,13 +59,13 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21,<1.25 +- numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 - nvcomp==3.0.5 - nvtx>=0.2.1 - packaging -- pandas>=1.3,<1.6.0dev0 +- pandas>=2.0,<2.1.5dev0 - pandoc - pip - pre-commit diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index ddd66611fde..a8be9d65c43 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -58,12 +58,12 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21,<1.25 +- numpy>=1.21 - numpydoc - nvcomp==3.0.5 - nvtx>=0.2.1 - packaging -- pandas>=1.3,<1.6.0dev0 +- pandas>=2.0,<2.1.5dev0 - pandoc - pip - pre-commit diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 4f39a9fe452..0dffdc10421 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -76,12 +76,11 @@ requirements: - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }} - python - typing_extensions >=4.0.0 - - pandas >=1.3,<1.6.0dev0 + - pandas >=2.0,<2.1.5dev0 - cupy >=12.0.0 # TODO: Pin to numba<0.58 until #14160 is resolved - numba >=0.57,<0.58 - # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - - numpy >=1.21,<1.25 + - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3cc4fda695f..2f351edd2b9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -388,6 +388,27 @@ __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, return packer.flush(); } +/** + * @brief Sets `s->cur` to point to the start of encoded page data. + * + * For V1 headers, this will be immediately after the repetition and definition level data. For V2, + * it will be at the next properly aligned location after the level data. The padding in V2 is + * needed for compressors that require aligned input.
+ */ +template <typename state_type> +inline void __device__ set_page_data_start(state_type* s) +{ + s->cur = s->page.page_data + s->page.max_hdr_size; + switch (s->page.page_type) { + case PageType::DATA_PAGE: + s->cur += s->page.level_bytes(); + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + break; + case PageType::DATA_PAGE_V2: s->cur += s->page.max_lvl_size; break; + } +} + } // anonymous namespace // blockDim {512,1,1}
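The helper above replaces offset arithmetic that was previously repeated in each encoding kernel (the `set_page_data_start(s)` call sites further down). As a rough illustration of the layout rule it encodes, here is a minimal Python sketch of the same arithmetic; the names are illustrative only, not cudf API:

```python
# Sketch of the V1 vs. V2 page layout handled by set_page_data_start().
RLE_LENGTH_FIELD_LEN = 4  # assumed: V1 level runs carry a 4-byte length prefix

def page_data_start(max_hdr_size, level_bytes, max_lvl_size,
                    v2, has_def_levels, has_rep_levels):
    cur = max_hdr_size
    if v2:
        # V2 reserves an aligned region (max_lvl_size is rounded up to
        # page_align) so the compressor input that follows is aligned.
        cur += max_lvl_size
    else:
        # V1 keeps level data inline, each run preceded by a length field.
        cur += level_bytes
        if has_def_levels:
            cur += RLE_LENGTH_FIELD_LEN
        if has_rep_levels:
            cur += RLE_LENGTH_FIELD_LEN
    return cur
```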
@@ -594,8 +615,13 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; + page_g.def_lvl_bytes = 0; + page_g.rep_lvl_bytes = 0; + page_g.max_lvl_size = 0; + page_g.comp_data_size = 0; page_g.max_hdr_size = MAX_V1_HDR_SIZE; page_g.max_data_size = ck_g.uniq_data_size; + page_g.data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; page_g.num_rows = ck_g.num_dict_entries; page_g.num_leaf_values = ck_g.num_dict_entries; @@ -689,12 +715,17 @@ CUDF_KERNEL void __launch_bounds__(128) page_size = 1 + max_RLE_page_size(ck_g.dict_rle_bits, values_in_page); } if (!t) { - page_g.num_fragments = fragments_in_chunk - page_start; - page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; - page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; - page_g.page_type = data_page_type; - page_g.hdr_size = 0; - page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics + page_g.num_fragments = fragments_in_chunk - page_start; + page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; + page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; + page_g.page_type = data_page_type; + page_g.hdr_size = 0; + page_g.def_lvl_bytes = 0; + page_g.rep_lvl_bytes = 0; + page_g.max_lvl_size = 0; + page_g.data_size = 0; + page_g.comp_data_size = 0; + page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) { @@ -716,13 +747,19 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.num_valid = num_valid; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); + if (write_v2_headers) { + page_g.max_lvl_size = + util::round_up_unsafe(def_level_size + rep_level_size, page_align); + } // get a different bound if using delta encoding if (is_use_delta) { auto const delta_len = delta_data_len(physical_type, type_id, page_g.num_leaf_values, page_size); page_size = max(page_size, delta_len); } - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + auto const max_data_size = + page_size + rle_pad + + (write_v2_headers ? page_g.max_lvl_size : def_level_size + rep_level_size); // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits<int32_t>::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); } @@ -739,7 +776,9 @@ CUDF_KERNEL void __launch_bounds__(128) page_offset += util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { - comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page + num_pages]; + // V2 does not include level data in compressed size estimate + comp_page_offset += page_g.max_hdr_size + page_g.max_lvl_size + + comp_page_sizes[ck_g.first_page + num_pages]; } page_headers_size += page_g.max_hdr_size; max_page_data_size = max(max_page_data_size, page_g.max_data_size); @@ -774,8 +813,10 @@ CUDF_KERNEL void __launch_bounds__(128) } pages[ck_g.first_page + num_pages] = page_g; } + // page_sizes should be the number of bytes to be compressed, so don't include level + // data for V2. if (not page_sizes.empty()) { - page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; + page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size - page_g.max_lvl_size; } if (page_grstats) { page_grstats[ck_g.first_page + num_pages] = pagestats_g; } } @@ -1429,10 +1470,6 @@ __device__ void finish_page_encode(state_buf* s, return thrust::reduce(thrust::seq, hist_start, hist_end, 0U); }; - // V2 does not compress rep and def level data - size_t const skip_comp_size = - write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; - // this will be true if max_rep > 0 (i.e. there are lists) if (s->page.rep_histogram != nullptr) { // for repetition we get hist[0] from num_rows, and can derive hist[max_rep_level] @@ -1489,10 +1526,17 @@ __device__ void finish_page_encode(state_buf* s, // FIXME(ets): this needs to do error propagation back to the host CUDF_UNREACHABLE("detected possible page data corruption"); } - s->page.max_data_size = actual_data_size; + if (s->page.is_v2()) { + auto const d_base = base + s->page.max_lvl_size; + s->page.data_size = static_cast<uint32_t>(end_ptr - d_base) + s->page.level_bytes(); + } else { + s->page.data_size = actual_data_size; + } if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + auto const c_base = base + s->page.max_lvl_size; + auto const bytes_to_compress = static_cast<uint32_t>(end_ptr - c_base); + comp_in[blockIdx.x] = {c_base, bytes_to_compress}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + s->page.max_lvl_size, 0}; // size is unused } pages[blockIdx.x] = s->page; @@ -1503,10 +1547,10 @@ __device__ void finish_page_encode(state_buf* s, } // copy uncompressed bytes over - if (skip_comp_size != 0 && not comp_in.empty()) { + if (s->page.is_v2() and not comp_in.empty()) { uint8_t* const src = s->page.page_data + s->page.max_hdr_size; uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { + for (int i = t; i < s->page.level_bytes(); i += block_size) { dst[i] = src[i]; } }
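Since V2 level data now bypasses the compressor, three distinct sizes are tracked per page. A short sketch of the bookkeeping with hypothetical byte counts (illustrative names, not cudf API):

```python
# Sketch: size accounting for one V2 data page.
def v2_page_sizes(level_bytes, encoded_value_bytes, compressed_value_bytes):
    # What comp_in hands to the compressor (levels excluded):
    bytes_to_compress = encoded_value_bytes
    # data_size reported in the page header (levels + values, uncompressed):
    uncompressed_size = level_bytes + encoded_value_bytes
    # comp_data_size: levels are copied through verbatim, values compressed.
    compressed_size = level_bytes + compressed_value_bytes
    return bytes_to_compress, uncompressed_size, compressed_size

# e.g. 100 bytes of levels, 1000 bytes of values compressing to 400:
assert v2_page_sizes(100, 1000, 400) == (1000, 1100, 500)
```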
@@ -1536,13 +1580,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; - // if V1 data page, need space for the RLE length fields - if (s->page.page_type == PageType::DATA_PAGE) { - if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - } + set_page_data_start(s); } __syncthreads(); @@ -1771,13 +1809,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; - // if V1 data page, need space for the RLE length fields - if (s->page.page_type == PageType::DATA_PAGE) { - if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - } + set_page_data_start(s); } __syncthreads(); @@ -1908,8 +1940,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + set_page_data_start(s); } __syncthreads(); @@ -2017,8 +2048,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + set_page_data_start(s); } __syncthreads(); @@ -2142,11 +2172,10 @@ CUDF_KERNEL void __launch_bounds__(decide_compression_block_size) auto const num_pages = ck_g[warp_id].num_pages; for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; - auto const page_data_size = curr_page.max_data_size; - auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; - auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; + auto const page_data_size = curr_page.data_size; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { + auto const lvl_bytes = curr_page.is_v2() ? curr_page.level_bytes() : 0; compressed_data_size += comp_res->bytes_written + lvl_bytes; if (comp_res->status != compression_status::SUCCESS) { atomicOr(&compression_error[warp_id], 1); @@ -2614,14 +2643,13 @@ CUDF_KERNEL void __launch_bounds__(128) EncodeStatistics(hdr_start, &chunk_stats[page_g.chunk_id], col_g.stats_dtype, scratch); page_g.chunk->ck_stat_size = static_cast<uint32_t>(hdr_end - hdr_start); } - uncompressed_page_size = page_g.max_data_size; + uncompressed_page_size = page_g.data_size; if (ck_g.is_compressed) { - auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; - auto const lvl_bytes = is_v2 ? page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; + auto const lvl_bytes = page_g.is_v2() ? page_g.level_bytes() : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast<uint32_t>(comp_results[blockIdx.x].bytes_written) + lvl_bytes; - page_g.max_data_size = compressed_page_size; + page_g.comp_data_size = compressed_page_size; } else { hdr_start = page_g.page_data; compressed_page_size = uncompressed_page_size; @@ -2708,19 +2736,26 @@ CUDF_KERNEL void __launch_bounds__(1024) if (t == 0) { page_g = first_page[page]; } __syncthreads(); - src = (ck_g.is_compressed) ?
page_g.compressed_data : page_g.page_data; + src = ck_g.is_compressed ? page_g.compressed_data : page_g.page_data; // Copy page header hdr_len = page_g.hdr_size; memcpy_block<1024, true>(dst, src, hdr_len, t); src += page_g.max_hdr_size; dst += hdr_len; - // Copy page data uncompressed_size += hdr_len; - data_len = page_g.max_data_size; + data_len = ck_g.is_compressed ? page_g.comp_data_size : page_g.data_size; + // Copy page data. For V2, the level data and page data are disjoint. + if (page_g.is_v2()) { + auto const lvl_len = page_g.level_bytes(); + memcpy_block<1024, true>(dst, src, lvl_len, t); + src += page_g.max_lvl_size; + dst += lvl_len; + data_len -= lvl_len; + } memcpy_block<1024, true>(dst, src, data_len, t); dst += data_len; __syncthreads(); - if (!t && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } + if (t == 0 && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } } if (t == 0) { chunks[blockIdx.x].bfr_size = uncompressed_size; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d58c7f95389..b215cd7a20b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -560,30 +560,41 @@ struct EncColumnChunk { * @brief Struct describing an encoder data page */ struct EncPage { - uint8_t* page_data; //!< Ptr to uncompressed page - uint8_t* compressed_data; //!< Ptr to compressed page - uint16_t num_fragments; //!< Number of fragments in page - PageType page_type; //!< Page type - Encoding encoding; //!< Encoding used for page data - EncColumnChunk* chunk; //!< Chunk that this page belongs to + // all pointers at the top to keep things properly aligned + uint8_t* page_data; //!< Ptr to uncompressed page + uint8_t* compressed_data; //!< Ptr to compressed page + EncColumnChunk* chunk; //!< Chunk that this page belongs to + compression_result* comp_res; //!< Ptr to compression result + uint32_t* def_histogram; //!< Histogram of counts for each definition level + uint32_t* rep_histogram; //!< Histogram of counts for each repetition level + // put this here in case it's ever made 64-bit + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run + // the rest can be 4 byte aligned uint32_t chunk_id; //!< Index in chunk array - uint32_t hdr_size; //!< Size of page header + uint32_t hdr_size; //!< Actual size of encoded page header uint32_t max_hdr_size; //!< Maximum size of page header - uint32_t max_data_size; //!< Maximum size of coded page data (excluding header) + uint32_t max_data_size; //!< Maximum size of encoded page data (excluding header) + uint32_t data_size; //!< Actual size of encoded page data (includes level data) + uint32_t comp_data_size; //!< Actual size of compressed page data uint32_t start_row; //!< First row of page uint32_t num_rows; //!< Rows in page uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) - encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run - uint32_t* def_histogram; //!< Histogram of counts for each definition level - uint32_t* rep_histogram; //!< Histogram of counts for each repetition level - uint32_t var_bytes_size; //!< Number of variable length bytes in the page (byte arrays only) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data + uint32_t max_lvl_size; //!< Maximum size of level data (V2 only, 0 for V1) + uint32_t num_nulls; //!< Number of null values uint32_t num_valid; //!< Number of valid leaf values + uint32_t var_bytes_size; //!< Number of variable length bytes in the page (byte arrays only) + // enums and smaller stuff down here + PageType page_type; //!< Page type + Encoding encoding; //!< Encoding used for page data + uint16_t num_fragments; //!< Number of fragments in page + + constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; } + + constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; } }; /** diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 93b225dca1b..0303439fb27 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2220,10 +2220,6 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { - // See issue #14781. Can remove this check once that is fixed. - CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD), - "V2 page headers cannot be used with ZSTD compression"); - _current_chunk_offset.resize(_out_sink.size()); // Write file header file_header_s fhdr; @@ -2405,7 +2401,8 @@ void writer::impl::write_parquet_data_to_sink( // skip dict pages if (enc_page.page_type == PageType::DICTIONARY_PAGE) { continue; } - int32_t this_page_size = enc_page.hdr_size + enc_page.max_data_size; + int32_t const this_page_size = + enc_page.hdr_size + (ck.is_compressed ? 
enc_page.comp_data_size : enc_page.data_size); // first_row_idx is relative to start of row group PageLocation loc{curr_pg_offset, this_page_size, enc_page.start_row - ck.start_row}; if (is_byte_arr) { var_bytes.push_back(enc_page.var_bytes_size); } diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 1a373ed92ae..25d58a96512 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -35,7 +35,7 @@ INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, TEST_P(ParquetV2Test, MultiColumn) { - constexpr auto num_rows = 50000; + constexpr auto num_rows = 50'000; auto const is_v2 = GetParam(); // auto col0_data = random_values(num_rows); @@ -84,6 +84,7 @@ TEST_P(ParquetV2Test, MultiColumn) cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::ZSTD) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -156,6 +157,7 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::ZSTD) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -197,6 +199,7 @@ TEST_P(ParquetV2Test, Strings) cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::ZSTD) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 2df34c7928b..34061cb7bf8 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1064,7 +1064,6 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) auto const expected = table_view{{col0, col1}}; auto const filepath = temp_env->get_temp_filepath("DictionaryAdaptiveTest.parquet"); - // no compression so we can easily read page data cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .compression(cudf::io::compression_type::ZSTD) @@ -1116,7 +1115,6 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) auto const expected = table_view{{col0, col1}}; auto const filepath = temp_env->get_temp_filepath("DictionaryAlwaysTest.parquet"); - // no compression so we can easily read page data cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .compression(cudf::io::compression_type::ZSTD) @@ -1428,21 +1426,6 @@ TEST_F(ParquetWriterTest, RowGroupMetadata) static_cast(num_rows * sizeof(column_type))); } -// See #14772. -// zStandard compression cannot currently be used with V2 page headers due to buffer -// alignment issues. -// TODO: Remove this test when #14781 is closed. 
-TEST_F(ParquetWriterTest, ZstdWithV2Header) -{ - auto const expected = table_view{}; - - cudf::io::parquet_writer_options const out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, expected) - .compression(cudf::io::compression_type::ZSTD) - .write_v2_headers(true); - EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); -} - ///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template diff --git a/dependencies.yaml b/dependencies.yaml index 5bceaa74af1..90b0527479a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -266,8 +266,7 @@ dependencies: - *cmake_ver - cython>=3.0.3 - *ninja - # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - - &numpy numpy>=1.21,<1.25 + - &numpy numpy>=1.21 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==14.0.1.* @@ -502,7 +501,7 @@ dependencies: packages: - fsspec>=0.6.0 - *numpy - - pandas>=1.3,<1.6.0dev0 + - pandas>=2.0,<2.1.5dev0 run_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 5b04335f475..035ee586822 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -454,12 +454,6 @@ def on_missing_reference(app, env, node, contnode): _prefixed_domain_objects[f"{prefix}{name}"] = name reftarget = node.get("reftarget") - if reftarget == "cudf.core.index.GenericIndex": - # We don't exposed docs for `cudf.core.index.GenericIndex` - # hence we would want the docstring & mypy references to - # use `cudf.Index` - node["reftarget"] = "cudf.Index" - return contnode if "namespacecudf" in reftarget: node["reftarget"] = "cudf" return contnode diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 0b37de00f6b..b2c9ddf9fe4 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -22,7 +22,7 @@ Finally we tie these pieces together to provide a more holistic view of the proj % class IndexedFrame % class SingleColumnFrame % class BaseIndex -% class GenericIndex +% class Index % class MultiIndex % class RangeIndex % class DataFrame @@ -42,8 +42,8 @@ Finally we tie these pieces together to provide a more holistic view of the proj % BaseIndex <|-- MultiIndex % Frame <|-- MultiIndex % -% BaseIndex <|-- GenericIndex -% SingleColumnFrame <|-- GenericIndex +% BaseIndex <|-- Index +% SingleColumnFrame <|-- Index % % @enduml @@ -89,31 +89,26 @@ While we've highlighted some exceptional cases of Indexes before, let's start wi In practice, `BaseIndex` does have concrete implementations of a small set of methods. However, currently many of these implementations are not applicable to all subclasses and will be eventually be removed. -Almost all indexes are subclasses of `GenericIndex`, a single-columned index with the class hierarchy: +Almost all indexes are subclasses of `Index`, a single-columned index with the class hierarchy: ```python -class GenericIndex(SingleColumnFrame, BaseIndex) +class Index(SingleColumnFrame, BaseIndex) ``` Integer, float, or string indexes are all composed of a single column of data. -Most `GenericIndex` methods are inherited from `Frame`, saving us the trouble of rewriting them. +Most `Index` methods are inherited from `Frame`, saving us the trouble of rewriting them. 
We now consider the three main exceptions to this model: - A `RangeIndex` is not backed by a column of data, so it inherits directly from `BaseIndex` alone. Wherever possible, its methods have special implementations designed to avoid materializing columns. - Where such an implementation is infeasible, we fall back to converting it to an `Int64Index` first instead. + Where such an implementation is infeasible, we fall back to converting it to an `Index` of `int64` + dtype first instead. - A `MultiIndex` is backed by _multiple_ columns of data. Therefore, its inheritance hierarchy looks like `class MultiIndex(Frame, BaseIndex)`. Some of its more `Frame`-like methods may be inherited, but many others must be reimplemented since in many cases a `MultiIndex` is not expected to behave like a `Frame`. -- Just like in pandas, `Index` itself can never be instantiated. - `pandas.Index` is the parent class for indexes, - but its constructor returns an appropriate subclass depending on the input data type and shape. - Unfortunately, mimicking this behavior requires overriding `__new__`, - which in turn makes shared initialization across inheritance trees much more cumbersome to manage. - To enable sharing constructor logic across different index classes, - we instead define `BaseIndex` as the parent class of all indexes. +- To enable sharing constructor logic across different index classes, + we define `BaseIndex` as the parent class of all indexes. `Index` inherits from `BaseIndex`, but it masquerades as a `BaseIndex` to match pandas. - This class should contain no implementations since it is simply a factory for other indexes. ## The Column layer diff --git a/docs/cudf/source/user_guide/api_docs/dataframe.rst b/docs/cudf/source/user_guide/api_docs/dataframe.rst index a8f3edf5a04..90227541e4a 100644 --- a/docs/cudf/source/user_guide/api_docs/dataframe.rst +++ b/docs/cudf/source/user_guide/api_docs/dataframe.rst @@ -105,13 +105,14 @@ Function application, GroupBy & window .. autosummary:: :toctree: api/ + DataFrame.agg DataFrame.apply DataFrame.applymap DataFrame.apply_chunks DataFrame.apply_rows - DataFrame.pipe - DataFrame.agg DataFrame.groupby + DataFrame.map + DataFrame.pipe DataFrame.rolling .. _api.dataframe.stats: @@ -232,7 +233,6 @@ Combining / comparing / joining / merging .. autosummary:: :toctree: api/ - DataFrame.append DataFrame.assign DataFrame.join DataFrame.merge diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst index 550a0ef1c89..80811efa33f 100644 --- a/docs/cudf/source/user_guide/api_docs/groupby.rst +++ b/docs/cudf/source/user_guide/api_docs/groupby.rst @@ -42,7 +42,6 @@ Computations / descriptive stats :toctree: api/ GroupBy.bfill - GroupBy.backfill GroupBy.count GroupBy.cumcount GroupBy.cummax @@ -63,7 +62,6 @@ Computations / descriptive stats GroupBy.ngroup GroupBy.nth GroupBy.nunique - GroupBy.pad GroupBy.prod GroupBy.shift GroupBy.size @@ -82,7 +80,6 @@ application to columns of a specific data type. .. autosummary:: :toctree: api/ - DataFrameGroupBy.backfill DataFrameGroupBy.bfill DataFrameGroupBy.count DataFrameGroupBy.cumcount @@ -96,7 +93,6 @@ application to columns of a specific data type. 
DataFrameGroupBy.idxmax DataFrameGroupBy.idxmin DataFrameGroupBy.nunique - DataFrameGroupBy.pad DataFrameGroupBy.quantile DataFrameGroupBy.shift DataFrameGroupBy.size diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst index b6da9af9b3e..9c84f206010 100644 --- a/docs/cudf/source/user_guide/api_docs/index_objects.rst +++ b/docs/cudf/source/user_guide/api_docs/index_objects.rst @@ -25,7 +25,6 @@ Properties Index.empty Index.has_duplicates Index.hasnans - Index.is_monotonic Index.is_monotonic_increasing Index.is_monotonic_decreasing Index.is_unique @@ -42,6 +41,7 @@ Modifying and computations .. autosummary:: :toctree: api/ + Index.all Index.any Index.copy Index.drop_duplicates @@ -61,6 +61,7 @@ Modifying and computations Index.where Index.take Index.unique + Index.nunique Compatibility with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -77,7 +78,9 @@ Missing values Index.fillna Index.dropna Index.isna + Index.isnull Index.notna + Index.notnull Memory usage ~~~~~~~~~~~~ @@ -143,6 +146,7 @@ Selecting .. autosummary:: :toctree: api/ + Index.get_indexer Index.get_level_values Index.get_loc Index.get_slice_bound @@ -168,9 +172,6 @@ Numeric Index RangeIndex.step RangeIndex.to_numpy RangeIndex.to_arrow - Int64Index - UInt64Index - Float64Index .. _api.categoricalindex: @@ -212,6 +213,7 @@ IntervalIndex components IntervalIndex.from_breaks IntervalIndex.values + IntervalIndex.get_indexer IntervalIndex.get_loc .. _api.multiindex: @@ -258,6 +260,7 @@ MultiIndex selecting .. autosummary:: :toctree: api/ + MultiIndex.get_indexer MultiIndex.get_loc MultiIndex.get_level_values diff --git a/docs/cudf/source/user_guide/api_docs/series.rst b/docs/cudf/source/user_guide/api_docs/series.rst index ebfc1e3f5d1..28931d567b4 100644 --- a/docs/cudf/source/user_guide/api_docs/series.rst +++ b/docs/cudf/source/user_guide/api_docs/series.rst @@ -158,7 +158,6 @@ Computations / descriptive stats Series.unique Series.nunique Series.is_unique - Series.is_monotonic Series.is_monotonic_increasing Series.is_monotonic_decreasing Series.value_counts @@ -226,7 +225,6 @@ Combining / comparing / joining / merging .. autosummary:: :toctree: api/ - Series.append Series.update Time Series-related diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index 22218d4d57a..03ce58ea9e3 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -158,6 +158,27 @@ module, which allow you to compare values up to a desired precision. Unlike Pandas, cuDF does not support duplicate column names. It is best to use unique strings for column names. +## Writing a DataFrame to Parquet with non-string column names + +When there is a DataFrame with non-string column names, pandas casts each +column name to `str` before writing to a Parquet file. `cudf` raises an +error by default if this is attempted. However, to achieve similar behavior +as pandas you can enable the `mode.pandas_compatible` option, which will +enable `cudf` to cast the column names to `str` just like pandas. + +```python +>>> import cudf +>>> df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) +>>> df.to_parquet("df.parquet") + +Traceback (most recent call last): +ValueError: Writing a Parquet file requires string column names +>>> cudf.set_option("mode.pandas_compatible", True) +>>> df.to_parquet("df.parquet") + +UserWarning: The DataFrame has column names of non-string type. 
They will be converted to strings on write. +``` + ## No true `"object"` data type In Pandas and NumPy, the `"object"` data type is used for diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index a70d2329625..7b2b71cf216 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -40,8 +40,8 @@ In addition to the above fixtures, we also provide the following more specialized fixtures: - rangeindex: Since RangeIndex always holds int64 data we cannot conflate - it with index_dtype_int64 (a true Int64Index), and it cannot hold nulls. - As a result, it is provided as a separate fixture. + it with index_dtype_int64 (a true Index with int64 dtype), and it + cannot hold nulls. As a result, it is provided as a separate fixture. """ import os diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 02274a5fdd1..e14815a1b0d 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # _setup_numba _must be called before numba.cuda is imported, because # it sets the numba config variable responsible for enabling @@ -41,22 +41,10 @@ BaseIndex, CategoricalIndex, DatetimeIndex, - Float32Index, - Float64Index, - GenericIndex, Index, - Int8Index, - Int16Index, - Int32Index, - Int64Index, IntervalIndex, RangeIndex, - StringIndex, TimedeltaIndex, - UInt8Index, - UInt16Index, - UInt32Index, - UInt64Index, interval_range, ) from cudf.core.missing import NA, NaT @@ -109,15 +97,8 @@ "DatetimeIndex", "Decimal32Dtype", "Decimal64Dtype", - "Float32Index", - "Float64Index", - "GenericIndex", "Grouper", "Index", - "Int16Index", - "Int32Index", - "Int64Index", - "Int8Index", "IntervalDtype", "IntervalIndex", "ListDtype", @@ -127,13 +108,8 @@ "RangeIndex", "Scalar", "Series", - "StringIndex", "StructDtype", "TimedeltaIndex", - "UInt16Index", - "UInt32Index", - "UInt64Index", - "UInt8Index", "api", "concat", "crosstab", diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 8ab7048cff0..5b49143fd5a 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import logging import random @@ -99,7 +99,7 @@ def set_rand_params(self, params): if dtype_val is not None: dtype_val = { col_name: "category" - if cudf.utils.dtypes.is_categorical_dtype(dtype) + if cudf.utils.dtypes._is_categorical_dtype(dtype) else pandas_dtypes_to_np_dtypes[dtype] for col_name, dtype in dtype_val.items() } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index 29e0aeb7050..bffd508b2ef 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import logging import random @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val): if dtype_val is not None and isinstance(dtype_val, abc.Mapping): processed_dtypes = {} for col_name, dtype in dtype_val.items(): - if cudf.utils.dtypes.is_categorical_dtype(dtype): + if cudf.utils.dtypes._is_categorical_dtype(dtype): processed_dtypes[col_name] = "category" else: processed_dtypes[col_name] = str( diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 359c9f8725b..0f0bc3ce81a 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.map cimport map @@ -434,7 +434,7 @@ def read_csv( if dtype is not None: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - if cudf.api.types.is_categorical_dtype(v): + if cudf.api.types._is_categorical_dtype(v): df._data[str(k)] = df._data[str(k)].astype(v) elif ( cudf.api.types.is_scalar(dtype) or @@ -442,11 +442,11 @@ def read_csv( np.dtype, pd.api.extensions.ExtensionDtype, type )) ): - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): df = df.astype(dtype) elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): - if cudf.api.types.is_categorical_dtype(col_dtype): + if cudf.api.types._is_categorical_dtype(col_dtype): col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) @@ -554,7 +554,7 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): if isinstance(dtype, str): dtype = "str" else: diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 2cbdf76030b..af2759e16f9 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -115,6 +115,7 @@ cpdef read_orc(object filepaths_or_buffers, ) cdef table_with_metadata c_result + cdef size_type nrows with nogil: c_result = move(libcudf_read_orc(c_orc_reader_options)) @@ -126,6 +127,12 @@ cpdef read_orc(object filepaths_or_buffers, skip_rows, num_rows) + if columns is not None and (isinstance(columns, list) and len(columns) == 0): + # When `columns=[]`, index needs to be + # established, but not the columns. 
+ nrows = c_result.tbl.get()[0].view().num_rows() + return {}, cudf.RangeIndex(nrows) + data, index = data_from_unique_ptr( move(c_result.tbl), col_names if columns is None else names, @@ -172,7 +179,6 @@ cdef tuple _get_index_from_metadata( range_idx = None if json_str != "": meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: index_col = meta['index_columns'] if isinstance(index_col[0], dict) and \ @@ -352,7 +358,8 @@ cdef orc_reader_options make_orc_reader_options( c_column_names.reserve(len(column_names)) for col in column_names: c_column_names.push_back(str(col).encode()) - opts.set_columns(c_column_names) + if len(column_names) > 0: + opts.set_columns(c_column_names) return opts diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 27efc5e1ecd..fab7d76c3c2 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -105,6 +105,7 @@ cdef class BufferArrayFromVector: def _parse_metadata(meta): file_is_range_index = False file_index_cols = None + file_column_dtype = None if 'index_columns' in meta and len(meta['index_columns']) > 0: file_index_cols = meta['index_columns'] @@ -112,7 +113,9 @@ def _parse_metadata(meta): if isinstance(file_index_cols[0], dict) and \ file_index_cols[0]['kind'] == 'range': file_is_range_index = True - return file_is_range_index, file_index_cols + if 'column_indexes' in meta and len(meta['column_indexes']) == 1: + file_column_dtype = meta['column_indexes'][0]["numpy_type"] + return file_is_range_index, file_index_cols, file_column_dtype cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, @@ -180,6 +183,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, cdef vector[unordered_map[string, string]] per_file_user_data = \ c_result.metadata.per_file_user_data + column_index_type = None index_col_names = None is_range_index = True for single_file in per_file_user_data: @@ -187,7 +191,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, meta = None if json_str != "": meta = json.loads(json_str) - file_is_range_index, index_col = _parse_metadata(meta) + file_is_range_index, index_col, column_index_type = _parse_metadata(meta) is_range_index &= file_is_range_index if not file_is_range_index and index_col is not None \ @@ -297,6 +301,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if use_pandas_metadata: df.index.names = index_col + # Set column dtype for empty types. 
+ if len(df._data.names) == 0 and column_index_type is not None: + df._data.label_dtype = cudf.dtype(column_index_type) return df @@ -355,9 +362,15 @@ def write_parquet( for i, name in enumerate(table._column_names, num_index_cols_meta): if not isinstance(name, str): - raise ValueError("parquet must have string column names") + if cudf.get_option("mode.pandas_compatible"): + tbl_meta.column_metadata[i].set_name(str(name).encode()) + else: + raise ValueError( + "Writing a Parquet file requires string column names" + ) + else: + tbl_meta.column_metadata[i].set_name(name.encode()) - tbl_meta.column_metadata[i].set_name(name.encode()) _set_col_metadata( table[name]._column, tbl_meta.column_metadata[i], diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 50a47b4f507..7ba717a0003 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -85,7 +85,12 @@ cpdef generate_pandas_metadata(table, index): # Columns for name, col in table._data.items(): - col_names.append(name) + if cudf.get_option("mode.pandas_compatible"): + # in pandas-compat mode, non-string column names are stringified. + col_names.append(str(name)) + else: + col_names.append(name) + if isinstance(col.dtype, cudf.CategoricalDtype): raise ValueError( "'category' column dtypes are currently not " diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index e2ea12a0e4d..206173919e1 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import sys from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union @@ -37,9 +37,7 @@ DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] -SeriesOrSingleColumnIndex = Union[ - "cudf.Series", "cudf.core.index.GenericIndex" -] +SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation AggType = Union[str, Callable] diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 72fc17f0286..a422eb82231 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,9 +1,10 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """Define common type operations.""" from __future__ import annotations +import warnings from collections import abc from functools import wraps from inspect import isclass @@ -15,9 +16,11 @@ from pandas.api import types as pd_types import cudf -from cudf.core._compat import PANDAS_GE_150 +from cudf.core._compat import PANDAS_LT_300 from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, + _is_categorical_dtype, + _is_interval_dtype, dtype, is_categorical_dtype, is_decimal32_dtype, @@ -105,13 +108,20 @@ def is_string_dtype(obj): Whether or not the array or dtype is of the string dtype. """ return ( - pd.api.types.is_string_dtype(obj) - # Reject all cudf extension types. - and not is_categorical_dtype(obj) - and not is_decimal_dtype(obj) - and not is_list_dtype(obj) - and not is_struct_dtype(obj) - and not is_interval_dtype(obj) + ( + isinstance(obj, (cudf.Index, cudf.Series)) + and obj.dtype == cudf.dtype("O") + ) + or (isinstance(obj, cudf.core.column.StringColumn)) + or ( + pd.api.types.is_string_dtype(obj) + # Reject all cudf extension types. 
+ and not _is_categorical_dtype(obj) + and not is_decimal_dtype(obj) + and not is_list_dtype(obj) + and not is_struct_dtype(obj) + and not _is_interval_dtype(obj) + ) ) @@ -455,6 +465,24 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: ) +def _is_datetime64tz_dtype(obj): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + return _wrap_pandas_is_dtype_api(pd_types.is_datetime64tz_dtype)(obj) + + +def is_datetime64tz_dtype(obj): + # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + warnings.warn( + "is_datetime64tz_dtype is deprecated and will be removed in a future " + "version.", + FutureWarning, + ) + return _is_datetime64tz_dtype(obj) + + def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: if isinstance( dtype_to_check, @@ -471,8 +499,9 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: pd.Float64Dtype, pd.BooleanDtype, pd.StringDtype, + pd.ArrowDtype, ), - ) or (PANDAS_GE_150 and isinstance(dtype_to_check, pd.ArrowDtype)): + ): return True elif isinstance(dtype_to_check, pd.CategoricalDtype): return _is_pandas_nullable_extension_dtype( @@ -497,10 +526,6 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api( pd_types.is_datetime64_ns_dtype ) -is_datetime64tz_dtype = _wrap_pandas_is_dtype_api( - pd_types.is_datetime64tz_dtype -) -is_extension_type = pd_types.is_extension_type is_extension_array_dtype = pd_types.is_extension_array_dtype is_int64_dtype = pd_types.is_int64_dtype is_period_dtype = pd_types.is_period_dtype @@ -518,7 +543,6 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: is_named_tuple = pd_types.is_named_tuple is_iterator = pd_types.is_iterator is_bool = pd_types.is_bool -is_categorical = pd_types.is_categorical is_complex = pd_types.is_complex is_float = pd_types.is_float is_hashable = pd_types.is_hashable diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d7d8e26db1b..babead9ca97 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -88,9 +88,9 @@ def astype(self, dtype, copy: bool = True): >>> import cudf >>> index = cudf.Index([1, 2, 3]) >>> index - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') >>> index.astype('float64') - Float64Index([1.0, 2.0, 3.0], dtype='float64') + Index([1.0, 2.0, 3.0], dtype='float64') """ raise NotImplementedError @@ -178,6 +178,91 @@ def str(self): def values(self): raise NotImplementedError + def get_indexer(self, target, method=None, limit=None, tolerance=None): + """ + Compute indexer and mask for new index given the current index. + + The indexer should be then used as an input to ndarray.take to align + the current data to the new index. + + Parameters + ---------- + target : Index + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + - default: exact matches only. + - pad / ffill: find the PREVIOUS index value if no exact match. + - backfill / bfill: use NEXT index value if no exact match. + - nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index + value. + tolerance : int or float, optional + Maximum distance from index value for inexact matches. The value + of the index at the matching location must satisfy the equation + ``abs(index[loc] - target) <= tolerance``.
+ + Returns + ------- + cupy.ndarray + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. + Missing values in the target are marked by -1. + + Examples + -------- + >>> import cudf + >>> index = cudf.Index(['c', 'a', 'b']) + >>> index + Index(['c', 'a', 'b'], dtype='object') + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1], dtype=int32) + """ + raise NotImplementedError + + def get_loc(self, key): + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + + Returns + ------- + int or slice or boolean mask + - If result is unique, return integer index + - If index is monotonic, loc is returned as a slice object + - Otherwise, a boolean mask is returned + + Examples + -------- + >>> import cudf + >>> unique_index = cudf.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + >>> monotonic_index = cudf.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + >>> non_monotonic_index = cudf.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True]) + >>> numeric_unique_index = cudf.Index([1, 2, 3]) + >>> numeric_unique_index.get_loc(3) + 2 + + **MultiIndex** + + >>> multi_index = cudf.MultiIndex.from_tuples([('a', 'd'), ('b', 'e'), ('b', 'f')]) + >>> multi_index + MultiIndex([('a', 'd'), + ('b', 'e'), + ('b', 'f')], + ) + >>> multi_index.get_loc('b') + slice(1, 3, None) + >>> multi_index.get_loc(('b', 'e')) + 1 + """ # noqa: E501 + def max(self): """The maximum value of the index.""" raise NotImplementedError @@ -186,9 +271,6 @@ def min(self): """The minimum value of the index.""" raise NotImplementedError - def get_loc(self, key, method=None, tolerance=None): - raise NotImplementedError - def __getitem__(self, key): raise NotImplementedError() @@ -231,7 +313,7 @@ def get_level_values(self, level): >>> import cudf >>> idx = cudf.Index(["a", "b", "c"]) >>> idx.get_level_values(0) - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') """ if level == self.name: @@ -278,31 +360,12 @@ def _clean_nulls_from_index(self): to `<NA>` as a preprocessing step to `__repr__` methods. This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` + to string dtype but it is the responsibility of the `__repr__` methods using this method to replace or handle representation of the actual types correctly. """ raise NotImplementedError - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "is_monotonic is deprecated and will be removed in a future " - "version. Use is_monotonic_increasing instead.", - FutureWarning, - ) - - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self): """Return boolean if values in the object are monotonically increasing.
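Stepping back from the hunks above: the new `get_indexer` is designed to feed a take-style gather, as its docstring notes. A small usage sketch with hypothetical values; `-1` entries must be masked out before gathering:

```python
import cudf

base = cudf.Index(['c', 'a', 'b'])
indexer = base.get_indexer(['a', 'b', 'x'])  # -> [ 1,  2, -1]

values = cudf.Series([10, 20, 30])  # data positionally aligned with `base`
aligned = values.take(indexer[indexer >= 0])  # rows for 'a' and 'b' only
```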
@@ -340,7 +403,7 @@ def hasnans(self): >>> import numpy as np >>> index = cudf.Index([1, 2, np.nan, 3, 4], nan_as_null=False) >>> index - Float64Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') + Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') >>> index.hasnans True >>> index = cudf.Index([1, 2, None, 3, 4]) >>> index - Int64Index([1, 2, <NA>, 3, 4], dtype='int64') + Index([1, 2, <NA>, 3, 4], dtype='int64') >>> index.hasnans True """ @@ -401,9 +464,9 @@ def set_names(self, names, level=None, inplace=False): >>> import cudf >>> idx = cudf.Index([1, 2, 3, 4]) >>> idx - Int64Index([1, 2, 3, 4], dtype='int64') + Index([1, 2, 3, 4], dtype='int64') >>> idx.set_names('quarter') - Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + Index([1, 2, 3, 4], dtype='int64', name='quarter') >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], ... [2018, 2019]]) >>> idx @@ -473,6 +536,7 @@ def union(self, other, sort=None): 2. `self` or `other` has length 0. * False : do not sort the result. + * True : Sort the result (which may raise TypeError). Returns ------- @@ -486,7 +550,7 @@ def union(self, other, sort=None): >>> idx1 = cudf.Index([1, 2, 3, 4]) >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + Index([1, 2, 3, 4, 5, 6], dtype='int64') MultiIndex case @@ -534,10 +598,10 @@ def union(self, other, sort=None): if not isinstance(other, BaseIndex): other = cudf.Index(other, name=self.name) - if sort not in {None, False}: + if sort not in {None, False, True}: raise ValueError( f"The 'sort' keyword only takes the values of " - f"None or False; {sort} was passed." + f"[None, False, True]; {sort} was passed." ) if cudf.get_option("mode.pandas_compatible"): @@ -564,12 +628,18 @@ def union(self, other, sort=None): common_dtype = cudf.utils.dtypes.find_common_type( [self.dtype, other.dtype] ) - return self._get_reconciled_name_object(other).astype(common_dtype) + res = self._get_reconciled_name_object(other).astype(common_dtype) + if sort: + return res.sort_values() + return res elif not len(self): common_dtype = cudf.utils.dtypes.find_common_type( [self.dtype, other.dtype] ) - return other._get_reconciled_name_object(self).astype(common_dtype) + res = other._get_reconciled_name_object(self).astype(common_dtype) + if sort: + return res.sort_values() + return res result = self._union(other, sort=sort) result.name = _get_result_name(self.name, other.name)
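With `sort=True` now accepted alongside `None` and `False`, the three modes differ as follows; a quick illustrative session (outputs abbreviated):

```python
import cudf

idx1 = cudf.Index([3, 1, 2])
idx2 = cudf.Index([2, 4])

idx1.union(idx2)              # sort=None: sort when values are comparable
idx1.union(idx2, sort=False)  # keep the unsorted, concatenated order
idx1.union(idx2, sort=True)   # always sort; may raise TypeError for
                              # incomparable (mixed-type) values
```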
@@ -590,6 +660,7 @@ def intersection(self, other, sort=False): * False : do not sort the result. * None : sort the result, except when `self` and `other` are equal or when the values cannot be compared. + * True : Sort the result (which may raise TypeError). Returns ------- @@ -602,7 +673,7 @@ def intersection(self, other, sort=False): >>> idx1 = cudf.Index([1, 2, 3, 4]) >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') + Index([3, 4], dtype='int64') MultiIndex case @@ -646,10 +717,10 @@ def intersection(self, other, sort=False): name=getattr(other, "name", self.name), ) - if sort not in {None, False}: + if sort not in {None, False, True}: raise ValueError( f"The 'sort' keyword only takes the values of " - f"None or False; {sort} was passed." + f"[None, False, True]; {sort} was passed." ) if not len(self) or not len(other) or self.equals(other): @@ -719,9 +790,9 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, <NA>, 4], dtype='int64') + Index([1, 2, <NA>, 4], dtype='int64') >>> index.fillna(3) - Int64Index([1, 2, 3, 4], dtype='int64') + Index([1, 2, 3, 4], dtype='int64') """ if downcast is not None: raise NotImplementedError( @@ -778,22 +849,12 @@ def to_frame(self, index=True, name=no_default): 1 Bear 2 Cow """ - if name is None: - warnings.warn( - "Explicitly passing `name=None` currently preserves " - "the Index's name or uses a default name of 0. This " - "behaviour is deprecated, and in the future `None` " - "will be used as the name of the " - "resulting DataFrame column.", - FutureWarning, - ) - name = no_default - if name is not no_default: - col_name = name - elif self.name is None: - col_name = 0 + + if name is no_default: + col_name = 0 if self.name is None else self.name else: - col_name = self.name + col_name = name + return cudf.DataFrame( {col_name: self._values}, index=self if index else None ) @@ -869,13 +930,13 @@ def to_pandas(self, *, nullable: bool = False): >>> import cudf >>> idx = cudf.Index([-3, 10, 15, 20]) >>> idx - Int64Index([-3, 10, 15, 20], dtype='int64') + Index([-3, 10, 15, 20], dtype='int64') >>> idx.to_pandas() - Int64Index([-3, 10, 15, 20], dtype='int64') + Index([-3, 10, 15, 20], dtype='int64') >>> type(idx.to_pandas()) - <class 'pandas.core.indexes.numeric.Int64Index'> + <class 'pandas.core.indexes.base.Index'> >>> type(idx) - <class 'cudf.core.index.GenericIndex'> + <class 'cudf.core.index.Index'> """ raise NotImplementedError @@ -900,7 +961,7 @@ def isin(self, values): -------- >>> idx = cudf.Index([1,2,3]) >>> idx - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') Check whether each index value in a list of values. @@ -970,17 +1031,17 @@ def append(self, other): >>> import cudf >>> idx = cudf.Index([1, 2, 10, 100]) >>> idx - Int64Index([1, 2, 10, 100], dtype='int64') + Index([1, 2, 10, 100], dtype='int64') >>> other = cudf.Index([200, 400, 50]) >>> other - Int64Index([200, 400, 50], dtype='int64') + Index([200, 400, 50], dtype='int64') >>> idx.append(other) - Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') + Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') append accepts list of Index objects >>> idx.append([other, other]) - Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') + Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') """ raise NotImplementedError
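Before moving on to `difference`, note that the `to_frame` rewrite above collapses the old deprecation dance into one rule: an explicit `name` wins, otherwise the index name is used, with `0` as the unnamed fallback. A sketch of the resulting behavior (hypothetical session):

```python
import cudf

idx = cudf.Index([1, 2, 3], name="animal")
idx.to_frame().columns             # ['animal']: index name kept
idx.to_frame(name="zoo").columns   # ['zoo']: explicit name wins
cudf.Index([1, 2]).to_frame().columns  # [0]: unnamed fallback
```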
@@ -1002,6 +1063,7 @@ def difference(self, other, sort=None): * None : Attempt to sort the result, but catch any TypeErrors from comparing incomparable elements. * False : Do not sort the result. + * True : Sort the result (which may raise TypeError). Returns ------- @@ -1012,35 +1074,45 @@ def difference(self, other, sort=None): >>> import cudf >>> idx1 = cudf.Index([2, 1, 3, 4]) >>> idx1 - Int64Index([2, 1, 3, 4], dtype='int64') + Index([2, 1, 3, 4], dtype='int64') >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx2 - Int64Index([3, 4, 5, 6], dtype='int64') + Index([3, 4, 5, 6], dtype='int64') >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') + Index([1, 2], dtype='int64') >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') + Index([2, 1], dtype='int64') """ + if not can_convert_to_column(other): raise TypeError("Input must be Index or array-like") - if sort not in {None, False}: + if sort not in {None, False, True}: raise ValueError( f"The 'sort' keyword only takes the values " - f"of None or False; {sort} was passed." + f"of [None, False, True]; {sort} was passed." ) other = cudf.Index(other, name=getattr(other, "name", self.name)) if not len(other): - return self._get_reconciled_name_object(other) + res = self._get_reconciled_name_object(other) + if sort: + return res.sort_values() + return res elif self.equals(other): - return self[:0]._get_reconciled_name_object(other) + res = self[:0]._get_reconciled_name_object(other) + if sort: + return res.sort_values() + return res res_name = _get_result_name(self.name, other.name) - if is_mixed_with_object_dtype(self, other): + if is_mixed_with_object_dtype(self, other) or len(other) == 0: difference = self.copy().unique() + difference.name = res_name + if sort is True: + return difference.sort_values() else: other = other.copy(deep=False) difference = cudf.core.index._index_from_data( @@ -1052,13 +1124,12 @@ def difference(self, other, sort=None): ) ._data ) + difference.name = res_name if self.dtype != other.dtype: difference = difference.astype(self.dtype) - difference.name = res_name - - if sort is None and len(other): + if sort in {None, True} and len(other): return difference.sort_values() return difference @@ -1415,7 +1486,7 @@ def _union(self, other, sort=None): ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) - if sort is None and len(other): + if sort in {None, True} and len(other): return union_result.sort_values() return union_result @@ -1430,7 +1501,7 @@ def _intersection(self, other, sort=None): ._data ) - if sort is None and len(other): + if sort in {None, True} and len(other): return intersection_result.sort_values() return intersection_result @@ -1474,18 +1545,18 @@ def sort_values( >>> import cudf >>> idx = cudf.Index([10, 100, 1, 1000]) >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + Index([10, 100, 1, 1000], dtype='int64') Sort values in ascending order (default behavior). >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + Index([1, 10, 100, 1000], dtype='int64') Sort values in descending order, and also get the indices `idx` was sorted by. >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], + (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], dtype=int32)) Sorting values in a MultiIndex: @@ -1562,7 +1633,7 @@ def join( names=['a', 'b']) >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs - Int64Index([1, 4, 3], dtype='int64', name='a') + Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') MultiIndex([(3, 4), (1, 2)], @@ -1649,12 +1720,12 @@ def rename(self, name, inplace=False): >>> import cudf >>> index = cudf.Index([1, 2, 3], name='one') >>> index - Int64Index([1, 2, 3], dtype='int64', name='one') + Index([1, 2, 3], dtype='int64', name='one') >>> index.name 'one' >>> renamed_index = index.rename('two') >>> renamed_index - Int64Index([1, 2, 3], dtype='int64', name='two') + Index([1, 2, 3], dtype='int64', name='two') >>> renamed_index.name 'two' """ @@ -1759,7 +1830,6 @@ def get_slice_bound( self, label, side: Literal["left", "right"], - kind: Literal["ix", "loc", "getitem", None] = None, ) -> int: """ Calculate slice bound that corresponds to given label. @@ -1770,20 +1840,12 @@ def get_slice_bound( ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} Returns ------- int Index of label. """ - if kind is not None: - # Do not remove until pandas 2.0 support is added.
@@ -1759,7 +1830,6 @@ def get_slice_bound(
        self,
        label,
        side: Literal["left", "right"],
-       kind: Literal["ix", "loc", "getitem", None] = None,
    ) -> int:
        """
        Calculate slice bound that corresponds to given label.
@@ -1770,20 +1840,12 @@
        Parameters
        ----------
        label : object
        side : {'left', 'right'}
-       kind : {'ix', 'loc', 'getitem'}

        Returns
        -------
        int
            Index of label.
        """
-       if kind is not None:
-           # Do not remove until pandas 2.0 support is added.
-           warnings.warn(
-               "'kind' argument in get_slice_bound is deprecated and will be "
-               "removed in a future version.",
-               FutureWarning,
-           )
        if side not in {"left", "right"}:
            raise ValueError(f"Invalid side argument {side}")
        if self.is_monotonic_increasing or self.is_monotonic_decreasing:
@@ -1868,9 +1930,9 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
        >>> data = [10, 20, 30, np.nan]
        >>> pdi = pd.Index(data)
        >>> cudf.Index.from_pandas(pdi)
-       Float64Index([10.0, 20.0, 30.0, <NA>], dtype='float64')
+       Index([10.0, 20.0, 30.0, <NA>], dtype='float64')
        >>> cudf.Index.from_pandas(pdi, nan_as_null=False)
-       Float64Index([10.0, 20.0, 30.0, nan], dtype='float64')
+       Index([10.0, 20.0, 30.0, nan], dtype='float64')
        """
        if nan_as_null is no_default:
            nan_as_null = (
@@ -2054,7 +2116,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
        --------
        >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e'])
        >>> idx.take([2, 0, 4, 3])
-       StringIndex(['c' 'a' 'e' 'd'], dtype='object')
+       Index(['c', 'a', 'e', 'd'], dtype='object')
        """

        if axis not in {0, "index"}:
@@ -2105,9 +2167,9 @@ def repeat(self, repeats, axis=None):
        --------
        >>> index = cudf.Index([10, 22, 33, 55])
        >>> index
-       Int64Index([10, 22, 33, 55], dtype='int64')
+       Index([10, 22, 33, 55], dtype='int64')
        >>> index.repeat(5)
-       Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33,
+       Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33,
                    33, 33, 33, 33, 55, 55, 55, 55, 55],
                   dtype='int64')
        """
diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py
index e257b7a1fa1..5aa685560c8 100644
--- a/python/cudf/cudf/core/_compat.py
+++ b/python/cudf/cudf/core/_compat.py
@@ -1,14 +1,14 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import pandas as pd
from packaging import version

PANDAS_VERSION = version.parse(pd.__version__)
-PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3")
-PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4")
-PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
-PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0")
-PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3")
+PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0")
PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0")
+PANDAS_GE_201 = PANDAS_VERSION >= version.parse("2.0.1")
PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0")
+PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4")
PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0")
+PANDAS_LT_203 = PANDAS_VERSION < version.parse("2.0.3")
+PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0")
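Note: a minimal sketch of how the reworked version flags are meant to be consumed; `_to_host_values` is a hypothetical helper name, but the same `PANDAS_GE_200` gate appears verbatim in the datetime and timedelta `to_pandas` hunks later in this diff:

    from cudf.core._compat import PANDAS_GE_200

    def _to_host_values(col):
        # pandas>=2.0 round-trips non-nanosecond resolutions natively
        if PANDAS_GE_200:
            return col.to_arrow()
        # pandas<2.0 only understands [ns], hence the defensive cast
        return col.astype("datetime64[ns]").to_arrow()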
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 25d58029d6b..33cec21caa5 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import warnings

import cupy as cp
@@ -13,9 +13,7 @@
from cudf.utils.dtypes import can_convert_to_column


-def factorize(
-    values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None
-):
+def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
    """Encode the input values as integer labels

    Parameters
@@ -24,14 +22,6 @@
        The data to be factorized.
    sort : bool, default True
        Sort uniques and shuffle codes to maintain the relationship.
-   na_sentinel : number, default -1
-       Value to indicate missing category.
-
-       .. deprecated:: 23.04
-
-           The na_sentinel argument is deprecated and will be removed in
-           a future version of cudf. Specify use_na_sentinel as
-           either True or False.
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NA values.
        If False, NA values will be encoded as non-negative
@@ -58,7 +48,7 @@
    >>> codes
    array([0, 1, 1], dtype=int8)
    >>> uniques
-   StringIndex(['a' 'c'], dtype='object')
+   Index(['a', 'c'], dtype='object')

    When ``use_na_sentinel=True`` (the default), missing values are indicated
    in the `codes` with the sentinel value ``-1`` and missing values are not
@@ -68,7 +58,7 @@
    >>> codes
    array([ 1, -1,  0,  2,  1], dtype=int8)
    >>> uniques
-   StringIndex(['a' 'b' 'c'], dtype='object')
+   Index(['a', 'b', 'c'], dtype='object')

    If NA is in the values, and we want to include NA in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.
@@ -78,21 +68,13 @@
    >>> codes
    array([ 0,  1,  0, -1], dtype=int8)
    >>> uniques
-   Float64Index([1.0, 2.0], dtype='float64')
+   Index([1.0, 2.0], dtype='float64')
    >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False)
    >>> codes
    array([1, 2, 1, 0], dtype=int8)
    >>> uniques
-   Float64Index([<NA>, 1.0, 2.0], dtype='float64')
+   Index([<NA>, 1.0, 2.0], dtype='float64')
    """
-   # TODO: Drop `na_sentinel` in the next release immediately after
-   # pandas 2.0 upgrade.
-   if na_sentinel is not None and use_na_sentinel is not None:
-       raise ValueError(
-           "Cannot specify both `na_sentinel` and `use_na_sentile`; "
-           f"got `na_sentinel={na_sentinel}` and "
-           f"`use_na_sentinel={use_na_sentinel}`"
-       )

    return_cupy_array = isinstance(values, cp.ndarray)

@@ -104,39 +86,14 @@
    values = as_column(values)

-   if na_sentinel is None:
-       na_sentinel = (
-           -1
-           if use_na_sentinel is None or use_na_sentinel
-           else Scalar(None, dtype=values.dtype)
-       )
-   else:
-       if na_sentinel is None:
-           msg = (
-               "Specifying `na_sentinel=None` is deprecated, specify "
-               "`use_na_sentinel=False` instead."
-           )
-       elif na_sentinel == -1:
-           msg = (
-               "Specifying `na_sentinel=-1` is deprecated, specify "
-               "`use_na_sentinel=True` instead."
-           )
-       else:
-           msg = (
-               "Specifying the specific value to use for `na_sentinel` is "
-               "deprecated and will be removed in a future version of cudf. "
-               "Specify `use_na_sentinel=True` to use the sentinel value -1, "
-               "and `use_na_sentinel=False` to encode NA values.",
-           )
-       # Do not remove until pandas 2.0 support is added.
- warnings.warn(msg, FutureWarning) - if size_hint: warnings.warn("size_hint is not applicable for cudf.factorize") - if use_na_sentinel is None or use_na_sentinel: + if use_na_sentinel: + na_sentinel = Scalar(-1) cats = values.dropna() else: + na_sentinel = Scalar(None, dtype=values.dtype) cats = values cats = cats.unique().astype(values.dtype) @@ -146,7 +103,7 @@ def factorize( labels = values._label_encoding( cats=cats, - na_sentinel=Scalar(na_sentinel), + na_sentinel=na_sentinel, dtype="int64" if get_option("mode.pandas_compatible") else None, ).values diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0ddb31efbfe..bbff72722ab 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -2,7 +2,6 @@ from __future__ import annotations -import warnings from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast @@ -22,6 +21,7 @@ from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( + find_common_type, is_mixed_with_object_dtype, min_signed_type, min_unsigned_type, @@ -62,7 +62,7 @@ class CategoricalAccessor(ColumnMethods): dtype: category Categories (3, int64): [1, 2, 3] >>> s.cat.categories - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') >>> s.cat.reorder_categories([3,2,1]) 0 1 1 2 @@ -105,7 +105,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex): super().__init__(parent=parent) @property - def categories(self) -> "cudf.core.index.GenericIndex": + def categories(self) -> "cudf.core.index.Index": """ The categories of this categorical. """ @@ -130,28 +130,14 @@ def ordered(self) -> bool: """ return self._column.ordered - def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: + def as_ordered(self) -> Optional[SeriesOrIndex]: """ Set the Categorical to be ordered. - Parameters - ---------- - inplace : bool, default False - Whether or not to add the categories inplace - or return a copy of this categorical with - added categories. - - .. deprecated:: 23.02 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Setting categories as ordered will always - return a new Categorical object. - Returns ------- Categorical - Ordered Categorical or None if inplace. + Ordered Categorical. Examples -------- @@ -177,48 +163,13 @@ def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: 6 10 dtype: category Categories (3, int64): [1 < 2 < 10] - >>> s.cat.as_ordered(inplace=True) - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1 < 2 < 10] """ - if inplace: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "The inplace parameter is deprecated and will be removed in a " - "future release. set_ordered will always return a new Series " - "in the future.", - FutureWarning, - ) - return self._return_or_inplace( - self._column.as_ordered(), inplace=inplace - ) + return self._return_or_inplace(self._column.as_ordered()) - def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: + def as_unordered(self) -> Optional[SeriesOrIndex]: """ Set the Categorical to be unordered. 
- Parameters - ---------- - inplace : bool, default False - Whether or not to set the ordered attribute - in-place or return a copy of this - categorical with ordered set to False. - - .. deprecated:: 23.02 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Setting categories as unordered will always - return a new Categorical object. - Returns ------- Categorical @@ -259,33 +210,11 @@ def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: 6 10 dtype: category Categories (3, int64): [1, 2, 10] - >>> s.cat.as_unordered(inplace=True) - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] """ - if inplace: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "The inplace parameter is deprecated and will be removed in a " - "future release. set_ordered will always return a new Series " - "in the future.", - FutureWarning, - ) - return self._return_or_inplace( - self._column.as_unordered(), inplace=inplace - ) - def add_categories( - self, new_categories: Any, inplace: bool = False - ) -> Optional[SeriesOrIndex]: + return self._return_or_inplace(self._column.as_unordered()) + + def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: """ Add new categories. @@ -297,23 +226,11 @@ def add_categories( ---------- new_categories : category or list-like of category The new categories to be included. - inplace : bool, default False - Whether or not to add the categories inplace - or return a copy of this categorical with - added categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Adding categories will always return a - new Categorical object. Returns ------- cat - Categorical with new categories added or - None if inplace. + Categorical with new categories added. Examples -------- @@ -334,22 +251,7 @@ def add_categories( 1 2 dtype: category Categories (2, int64): [1, 2] - >>> s.cat.add_categories([0, 3, 4], inplace=True) - >>> s - 0 1 - 1 2 - dtype: category - Categories (5, int64): [1, 2, 0, 3, 4] """ - if inplace: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "The `inplace` parameter in cudf.Series.cat.add_categories " - "is deprecated and will be removed in a future version of " - "cudf. Adding categories will always return a new " - "Categorical object.", - FutureWarning, - ) old_categories = self._column.categories new_categories = column.as_column( new_categories, @@ -364,8 +266,8 @@ def add_categories( f"type-cast new_categories to the same type as " f"existing categories." ) - common_dtype = np.find_common_type( - [old_categories.dtype, new_categories.dtype], [] + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] ) new_categories = new_categories.astype(common_dtype) @@ -379,12 +281,11 @@ def add_categories( if not out_col._categories_equal(new_categories): out_col = out_col._set_categories(new_categories) - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace(out_col) def remove_categories( self, removals: Any, - inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Remove the specified categories. @@ -397,23 +298,11 @@ def remove_categories( ---------- removals : category or list-like of category The categories which should be removed. 
- inplace : bool, default False - Whether or not to remove the categories - inplace or return a copy of this categorical - with removed categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Removing categories will always return a - new Categorical object. Returns ------- cat - Categorical with removed categories or None - if inplace. + Categorical with removed categories Examples -------- @@ -449,28 +338,7 @@ def remove_categories( 6 10 dtype: category Categories (3, int64): [1, 2, 10] - >>> s.cat.remove_categories([10], inplace=True) - >>> s - 0 - 1 1 - 2 1 - 3 2 - 4 - 5 2 - 6 - dtype: category - Categories (2, int64): [1, 2] """ - if inplace: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "The `inplace` parameter in " - "cudf.Series.cat.remove_categories is deprecated and " - "will be removed in a future version of cudf. " - "Removing categories will always return a new " - "Categorical object.", - FutureWarning, - ) cats = self.categories.to_series() removals = cudf.Series(removals, dtype=cats.dtype) @@ -487,14 +355,13 @@ def remove_categories( if not out_col._categories_equal(new_categories): out_col = out_col._set_categories(new_categories) - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace(out_col) def set_categories( self, new_categories: Any, ordered: bool = False, rename: bool = False, - inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Set the categories to the specified new_categories. @@ -529,23 +396,11 @@ def set_categories( Whether or not the `new_categories` should be considered as a rename of the old categories or as reordered categories. - inplace : bool, default False - Whether or not to reorder the categories in-place - or return a copy of this categorical with - reordered categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Setting categories will always return a - new Categorical object. Returns ------- cat Categorical with reordered categories - or None if inplace. Examples -------- @@ -569,38 +424,17 @@ def set_categories( 5 10 dtype: category Categories (2, int64): [1, 10] - >>> s.cat.set_categories([1, 10], inplace=True) - >>> s - 0 1 - 1 1 - 2 - 3 10 - 4 - 5 10 - dtype: category - Categories (2, int64): [1, 10] """ - if inplace: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "The `inplace` parameter in cudf.Series.cat.set_categories is " - "deprecated and will be removed in a future version of cudf. " - "Setting categories will always return a new Categorical " - "object.", - FutureWarning, - ) return self._return_or_inplace( self._column.set_categories( new_categories=new_categories, ordered=ordered, rename=rename - ), - inplace=inplace, + ) ) def reorder_categories( self, new_categories: Any, ordered: bool = False, - inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Reorder categories as specified in new_categories. @@ -616,23 +450,11 @@ def reorder_categories( Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. - inplace : bool, default False - Whether or not to reorder the categories - inplace or return a copy of this categorical - with reordered categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. 
- Reordering categories will always return a - new Categorical object. Returns ------- cat - Categorical with reordered categories or - None if inplace. + Categorical with reordered categories Raises ------ @@ -669,19 +491,8 @@ def reorder_categories( ValueError: items in new_categories are not the same as in old categories """ - if inplace: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "The `inplace` parameter in " - "cudf.Series.cat.reorder_categories is deprecated " - "and will be removed in a future version of cudf. " - "Reordering categories will always return a new " - "Categorical object.", - FutureWarning, - ) return self._return_or_inplace( self._column.reorder_categories(new_categories, ordered=ordered), - inplace=inplace, ) @@ -981,6 +792,7 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) + cats = col.categories if cats.dtype.kind in "biuf": cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 77302531206..9143c7f5e9e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,6 +4,7 @@ import builtins import pickle +import warnings from collections import abc from functools import cached_property from itertools import chain @@ -27,6 +28,7 @@ import pyarrow as pa import pyarrow.compute as pc from numba import cuda +from pandas.core.arrays.arrow.extension_types import ArrowIntervalType from typing_extensions import Self import rmm @@ -50,22 +52,22 @@ from cudf._lib.types import size_type_dtype from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( + _is_categorical_dtype, + _is_datetime64tz_dtype, + _is_interval_dtype, _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, is_bool_dtype, - is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_decimal_dtype, is_dtype_equal, is_integer_dtype, - is_interval_dtype, is_list_dtype, is_scalar, is_string_dtype, ) -from cudf.core._compat import PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_210 from cudf.core.abc import Serializable from cudf.core.buffer import ( Buffer, @@ -85,6 +87,7 @@ from cudf.utils.dtypes import ( _maybe_convert_to_default_type, cudf_dtype_from_pa_type, + find_common_type, get_time_unit, is_mixed_with_object_dtype, min_scalar_type, @@ -95,10 +98,10 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype -if PANDAS_GE_150: - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +if PANDAS_GE_210: + NumpyExtensionArray = pd.arrays.NumpyExtensionArray else: - from pandas.core.arrays._arrow_utils import ArrowIntervalType + NumpyExtensionArray = pd.arrays.PandasArray class ColumnBase(Column, Serializable, BinaryOperand, Reducible): @@ -965,7 +968,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: col = self if self.dtype == dtype: return col - if is_categorical_dtype(dtype): + if _is_categorical_dtype(dtype): return col.as_categorical_column(dtype) if ( @@ -984,7 +987,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: dtype = pandas_dtypes_to_np_dtypes.get(dtype, dtype) if _is_non_decimal_numeric_dtype(dtype): return col.as_numerical_column(dtype) - elif is_categorical_dtype(dtype): + elif _is_categorical_dtype(dtype): return col.as_categorical_column(dtype) elif cudf.dtype(dtype).type in { np.str_, @@ -1393,7 +1396,7 @@ def column_empty_like( if ( hasattr(column, "dtype") - and 
is_categorical_dtype(column.dtype) + and _is_categorical_dtype(column.dtype) and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) @@ -1941,8 +1944,8 @@ def as_column( new_dtype = dtype elif len(arbitrary) == 0: # If the column is empty, it has to be - # a `float64` dtype. - new_dtype = cudf.dtype("float64") + # a `str` dtype. + new_dtype = cudf.dtype("str") else: # If the null column is not empty, it has to # be of `object` dtype. @@ -2015,13 +2018,13 @@ def as_column( ) elif isinstance( arbitrary.dtype, pd.api.extensions.ExtensionDtype - ) and not isinstance(arbitrary, pd.arrays.PandasArray): + ) and not isinstance(arbitrary, NumpyExtensionArray): raise NotImplementedError( "Custom pandas ExtensionDtypes are not supported" ) elif arbitrary.dtype.kind in "fiubmM": # numpy dtype like - if isinstance(arbitrary, pd.arrays.PandasArray): + if isinstance(arbitrary, NumpyExtensionArray): arbitrary = np.array(arbitrary) arb_dtype = np.dtype(arbitrary.dtype) if arb_dtype.kind == "f" and arb_dtype.itemsize == 2: @@ -2035,17 +2038,8 @@ def as_column( arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length ) elif arbitrary.dtype.kind == "O": - if len(arbitrary) == 0: - # TODO: Can remove once empty constructor default becomes - # object instead of float. - return as_column( - pa.array([], type=pa.string()), - nan_as_null=nan_as_null, - dtype=dtype, - length=length, - ) - if isinstance(arbitrary, pd.arrays.PandasArray): - # infer_dtype does not handle PandasArray + if isinstance(arbitrary, NumpyExtensionArray): + # infer_dtype does not handle NumpyExtensionArray arbitrary = np.array(arbitrary, dtype=object) inferred_dtype = infer_dtype(arbitrary) if inferred_dtype in ("mixed-integer", "mixed-integer-float"): @@ -2269,9 +2263,9 @@ def as_column( np_type = None try: if dtype is not None: - if is_categorical_dtype(dtype) or is_interval_dtype(dtype): + if _is_categorical_dtype(dtype) or _is_interval_dtype(dtype): raise TypeError - if is_datetime64tz_dtype(dtype): + if _is_datetime64tz_dtype(dtype): raise NotImplementedError( "Use `tz_localize()` to construct " "timezone aware data." @@ -2413,13 +2407,13 @@ def as_column( except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e: if isinstance(e, MixedTypeError): raise TypeError(str(e)) - if is_categorical_dtype(dtype): + if _is_categorical_dtype(dtype): sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif np_type == np.str_: sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) - elif is_interval_dtype(dtype): + elif _is_interval_dtype(dtype): sr = pd.Series(arbitrary, dtype="interval") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif ( @@ -2473,7 +2467,11 @@ def _construct_array( ): # We may have date-like strings with timezones try: - pd_arbitrary = pd.to_datetime(arbitrary) + with warnings.catch_warnings(): + # Need to ignore userwarnings when + # datetime format cannot be inferred. 
+ warnings.simplefilter("ignore", UserWarning) + pd_arbitrary = pd.to_datetime(arbitrary) if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" @@ -2611,8 +2609,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: and np.issubdtype(dtyp, np.datetime64) for dtyp in not_null_col_dtypes ): - # Use NumPy to find a common dtype - common_dtype = np.find_common_type(not_null_col_dtypes, []) + common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype objs = [obj.astype(common_dtype) for obj in objs] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2ab2dd46c53..6682bbb333b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -23,12 +23,12 @@ ScalarLike, ) from cudf.api.types import ( + _is_datetime64tz_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_scalar, is_timedelta64_dtype, ) -from cudf.core._compat import PANDAS_GE_220 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_220 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion @@ -113,7 +113,13 @@ def infer_format(element: str, **kwargs) -> str: raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) - return fmt + if ".%f" not in fmt: + # For context read: + # https://github.com/pandas-dev/pandas/issues/52418 + # We cannot rely on format containing only %f + # c++/libcudf expects .%3f, .%6f, .%9f + # Logic below handles those cases well. + return fmt element_parts = element.split(".") if len(element_parts) != 2: @@ -323,11 +329,17 @@ def to_pandas( # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 + if PANDAS_GE_200: + host_values = self.to_arrow() + else: + # Pandas<2.0 supports only `datetime64[ns]`, hence the cast. + host_values = self.astype("datetime64[ns]").to_arrow() + # Pandas only supports `datetime64[ns]` dtype # and conversion to this type is necessary to make # arrow to pandas conversion happen for large values. 
return pd.Series( - self.astype("datetime64[ns]").to_arrow(), + host_values, copy=True, dtype=self.dtype, index=index, @@ -377,19 +389,30 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, np.datetime64): if np.isnat(other): - return cudf.Scalar(None, dtype=self.dtype) + other_time_unit = cudf.utils.dtypes.get_time_unit(other) + if other_time_unit not in {"s", "ms", "ns", "us"}: + other_time_unit = "ns" + + return cudf.Scalar( + None, dtype=f"datetime64[{other_time_unit}]" + ) other = other.astype(self.dtype) return cudf.Scalar(other) elif isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) + if np.isnat(other): + return cudf.Scalar( + None, + dtype="timedelta64[ns]" + if other_time_unit not in {"s", "ms", "ns", "us"} + else other.dtype, + ) + if other_time_unit not in {"s", "ms", "ns", "us"}: other = other.astype("timedelta64[s]") - if np.isnat(other): - return cudf.Scalar(None, dtype=other.dtype) - return cudf.Scalar(other) elif isinstance(other, str): try: @@ -484,7 +507,7 @@ def mean( skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def std( self, @@ -498,12 +521,30 @@ def std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], - ) + ).as_unit(self.time_unit) def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, + ).as_unit(self.time_unit) + + def cov(self, other: DatetimeColumn) -> float: + if not isinstance(other, DatetimeColumn): + raise TypeError( + f"cannot perform cov with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) + + def corr(self, other: DatetimeColumn) -> float: + if not isinstance(other, DatetimeColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").corr( + other.as_numerical_column("int64") ) def quantile( @@ -520,7 +561,9 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timestamp(result, unit=self.time_unit) + return pd.Timestamp(result, unit=self.time_unit).as_unit( + self.time_unit + ) return result.astype(self.dtype) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: @@ -529,7 +572,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if other is NotImplemented: return NotImplemented if isinstance(other, cudf.DateOffset): - return other._datetime_binop(self, op, reflect=reflect) + return other._datetime_binop(self, op, reflect=reflect).astype( + self.dtype + ) # We check this on `other` before reflection since we already know the # dtype of `self`. 
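Note: the `.as_unit(...)` calls added above pin scalar reductions to the column's own time resolution rather than letting `pd.Timestamp` default to nanoseconds. A minimal pandas-2.x sketch of the mechanism (illustrative, not part of the patch):

    >>> import pandas as pd
    >>> ts = pd.Timestamp(1, unit="ms")  # constructed the same way as in mean()/std()
    >>> ts.as_unit("ms").unit            # resolution now matches the source column
    'ms'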
@@ -585,12 +630,15 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if out_dtype is None: return NotImplemented - result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if cudf.get_option( + result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + if out_dtype != cudf.dtype(np.bool_) and op == "__add__": + return result_col + elif cudf.get_option( "mode.pandas_compatible" ) and out_dtype == cudf.dtype(np.bool_): - result = result.fillna(op == "__ne__") - return result + return result_col.fillna(op == "__ne__") + else: + return result_col def fillna( self, @@ -601,7 +649,6 @@ def fillna( if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) if is_scalar(fill_value): - # TODO: Add cast checking like TimedeltaColumn.fillna if not isinstance(fill_value, cudf.Scalar): fill_value = cudf.Scalar(fill_value, dtype=self.dtype) else: @@ -655,7 +702,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return False def _with_type_metadata(self, dtype): - if is_datetime64tz_dtype(dtype): + if _is_datetime64tz_dtype(dtype): return DatetimeTZColumn( data=self.base_data, dtype=dtype, diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 7227ef8ba3a..f5d527ad201 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -5,6 +5,7 @@ import pyarrow as pa import cudf +from cudf.api.types import _is_interval_dtype from cudf.core.column import StructColumn from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -95,7 +96,7 @@ def as_interval_column(self, dtype): else: # a user can directly input the string `interval` as the dtype # when creating an interval series or interval dataframe - if dtype == "interval": + if _is_interval_dtype(dtype): dtype = IntervalDtype( self.dtype.subtype, self.dtype.closed ) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 741d18c10db..0f5a0eb086b 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from __future__ import annotations

@@ -9,7 +9,7 @@
import cudf
from cudf.utils.utils import NotIterable

-ParentType = Union["cudf.Series", "cudf.core.index.GenericIndex"]
+ParentType = Union["cudf.Series", "cudf.core.index.Index"]


class ColumnMethods(NotIterable):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 0577e0f37ed..ae4ad9c5136 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -173,7 +173,12 @@ def __setitem__(self, key: Any, value: Any):
        if isinstance(key, slice):
            out = self._scatter_by_slice(key, device_value)
        else:
-           key = as_column(key)
+           key = as_column(
+               key,
+               dtype="float64"
+               if isinstance(key, list) and len(key) == 0
+               else None,
+           )
            if not isinstance(key, cudf.core.column.NumericalColumn):
                raise ValueError(f"Invalid scatter map type {key.dtype}.")
            out = self._scatter_by_column(key, device_value)
@@ -697,7 +702,7 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
                col_dtype if col_dtype.kind == "f" else np.dtype("int64")
            )
        elif reduction_op == "sum_of_squares":
-           col_dtype = np.find_common_type([col_dtype], [np.dtype("uint64")])
+           col_dtype = np.result_type(col_dtype, np.dtype("uint64"))

        return col_dtype
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index fcb993e1a78..2373f94ee97 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -697,9 +697,9 @@ def contains(
        >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]
        >>> idx = cudf.Index(data)
        >>> idx
-       StringIndex(['Mouse' 'dog' 'house and parrot' '23.0' None], dtype='object')
+       Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object')
        >>> idx.str.contains('23', regex=False)
-       GenericIndex([False, False, False, True, <NA>], dtype='bool')
+       Index([False, False, False, True, <NA>], dtype='bool')

        Returning 'house' or 'dog' when either expression occurs in a string.
@@ -2805,7 +2805,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:

        >>> idx = cudf.Index(['X 123', 'Y 999'])
        >>> idx
-       StringIndex(['X 123' 'Y 999'], dtype='object')
+       Index(['X 123', 'Y 999'], dtype='object')

        Which will create a MultiIndex:
@@ -2878,7 +2878,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:

        >>> idx = cudf.Index(['X 123', 'Y 999'])
        >>> idx
-       StringIndex(['X 123' 'Y 999'], dtype='object')
+       Index(['X 123', 'Y 999'], dtype='object')

        Which will create a MultiIndex:
@@ -3536,7 +3536,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex:

        >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat'])
        >>> index.str.count('a')
-       Int64Index([0, 0, 2, 1], dtype='int64')
+       Index([0, 0, 2, 1], dtype='int64')

        ..
pandas-compat:: **StringMethods.count** @@ -5593,6 +5593,21 @@ def data(self): ] return self._data + def all(self, skipna: bool = True) -> bool: + if skipna and self.null_count == self.size: + return True + elif not skipna and self.has_nulls(): + raise TypeError("boolean value of NA is ambiguous") + raise NotImplementedError("`all` not implemented for `StringColumn`") + + def any(self, skipna: bool = True) -> bool: + if not skipna and self.has_nulls(): + raise TypeError("boolean value of NA is ambiguous") + elif skipna and self.null_count == self.size: + return False + + raise NotImplementedError("`any` not implemented for `StringColumn`") + def data_array_view( self, *, mode="write" ) -> cuda.devicearray.DeviceNDArray: @@ -5733,6 +5748,10 @@ def as_datetime_column( self.apply_boolean_mask(self.notnull()).element_indexing(0) ) + if format.endswith("%z"): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) return self._as_datetime_or_timedelta_column(out_dtype, format) def as_timedelta_column( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6038a1a1e97..edf05fbb264 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -14,6 +14,7 @@ from cudf import _lib as libcudf from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.core._compat import PANDAS_GE_200 from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -152,6 +153,12 @@ def to_pandas( # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 + if PANDAS_GE_200: + host_values = self.to_arrow() + else: + # Pandas<2.0 supports only `timedelta64[ns]`, hence the cast. + host_values = self.astype("timedelta64[ns]").to_arrow() + # Pandas only supports `timedelta64[ns]` dtype # and conversion to this type is necessary to make # arrow to pandas conversion happen for large values. 
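Note: a doctest-style sketch of the `any`/`all` semantics introduced for string columns above, assuming the Series reductions dispatch straight to the column methods (illustrative, not part of the patch):

    >>> import cudf
    >>> s = cudf.Series([None, None], dtype="str")
    >>> s.any()  # skipna=True drops the nulls, leaving nothing truthy
    False
    >>> s.all()  # vacuously true once the nulls are skipped
    True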
@@ -159,7 +166,7 @@ def to_pandas( raise NotImplementedError(f"{nullable=} is not implemented.") return pd.Series( - self.astype("timedelta64[ns]").to_arrow(), + host_values, copy=True, dtype=self.dtype, index=index, @@ -252,7 +259,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: if isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) if np.isnat(other): - return cudf.Scalar(None, dtype=self.dtype) + return cudf.Scalar( + None, + dtype="timedelta64[ns]" + if other_time_unit not in {"s", "ms", "ns", "us"} + else self.dtype, + ) if other_time_unit not in {"s", "ms", "ns", "us"}: common_dtype = "timedelta64[s]" @@ -277,12 +289,8 @@ def fillna( return self.copy(deep=True) if is_scalar(fill_value): fill_value = cudf.Scalar(fill_value) - dtype = determine_out_dtype(self.dtype, fill_value.dtype) + dtype = self.dtype fill_value = fill_value.astype(dtype) - if self.dtype != dtype: - return cast( - Self, self.astype(dtype).fillna(fill_value, method) - ) else: fill_value = column.as_column(fill_value, nan_as_null=False) return super().fillna(fill_value, method) @@ -335,13 +343,13 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) @@ -360,7 +368,9 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timedelta(result, unit=self.time_unit) + return pd.Timedelta(result, unit=self.time_unit).as_unit( + self.time_unit + ) return result.astype(self.dtype) def sum( @@ -377,7 +387,7 @@ def sum( skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def std( self, @@ -391,6 +401,24 @@ def std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, + ).as_unit(self.time_unit) + + def cov(self, other: TimeDeltaColumn) -> float: + if not isinstance(other, TimeDeltaColumn): + raise TypeError( + f"cannot perform cov with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) + + def corr(self, other: TimeDeltaColumn) -> float: + if not isinstance(other, TimeDeltaColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").corr( + other.as_numerical_column("int64") ) def components(self, index=None) -> "cudf.DataFrame": @@ -422,79 +450,72 @@ def components(self, index=None) -> "cudf.DataFrame": 4 37 13 12 14 234 0 0 """ # noqa: E501 - return cudf.DataFrame( - data={ - "days": self - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns") - ), - "hours": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ) - ) - ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["h"], "ns") - ), - "minutes": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ) - ) - ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["m"], "ns") - ), - "seconds": ( - self - % cudf.Scalar( - np.timedelta64( - 
_unit_to_nanoseconds_conversion["m"], "ns" - ) - ) - ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ), - "milliseconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ) - ) + date_meta = { + "seconds": ["m", "s"], + "milliseconds": ["s", "ms"], + "microseconds": ["ms", "us"], + "nanoseconds": ["us", "ns"], + } + data = { + "days": self + // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) + ), + "hours": ( + self + % cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ms"], "ns") - ), - "microseconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["ms"], "ns" - ) - ) + ) + // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["h"], "ns" + ).astype(self.dtype) + ), + "minutes": ( + self + % cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["h"], "ns" + ).astype(self.dtype) ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ), - "nanoseconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["us"], "ns" - ) - ) + ) + // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["m"], "ns" + ).astype(self.dtype) + ), + } + keys_list = iter(date_meta.keys()) + for name in keys_list: + value = date_meta[name] + data[name] = ( + self + % cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion[value[0]], "ns" + ).astype(self.dtype) ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ), - }, + ) // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion[value[1]], "ns" + ).astype(self.dtype) + ) + if self._time_unit == value[1]: + break + + for name in keys_list: + res_col = cudf.core.column.full(len(self), 0, dtype="int64") + if self.nullable: + res_col = res_col.set_mask(self.mask) + data[name] = res_col + + return cudf.DataFrame( + data=data, index=index, ) @@ -508,7 +529,9 @@ def days(self) -> "cudf.core.column.NumericalColumn": NumericalColumn """ return self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns") + np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( + self.dtype + ) ) @property @@ -528,7 +551,9 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": return ( self % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns") + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ) ) // cudf.Scalar( np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") @@ -549,7 +574,10 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of microseconds. return ( - self % np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") + self + % np.timedelta64( + _unit_to_nanoseconds_conversion["s"], "ns" + ).astype(self.dtype) ) // cudf.Scalar( np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") ) @@ -569,6 +597,11 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": # performing division operation to extract the number # of nanoseconds. 
+ if self._time_unit != "ns": + res_col = cudf.core.column.full(len(self), 0, dtype="int64") + if self.nullable: + res_col = res_col.set_mask(self.mask) + return cast("cudf.core.column.NumericalColumn", res_col) return ( self % cudf.Scalar( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index d87580fcfac..33085bede78 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,6 @@ from __future__ import annotations import itertools -import warnings from collections import abc from functools import cached_property, reduce from typing import ( @@ -19,7 +18,6 @@ import numpy as np import pandas as pd -from packaging.version import Version from pandas.api.types import is_bool import cudf @@ -237,24 +235,10 @@ def _clear_cache(self): def to_pandas_index(self) -> pd.Index: """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: - # Using `from_frame()` instead of `from_tuples` - # prevents coercion of values to a different type - # (e.g., ''->NaT) - with warnings.catch_warnings(): - # Specifying `dtype="object"` here and passing that to - # `from_frame` is deprecated in pandas, but we cannot remove - # that without also losing compatibility with other current - # pandas behaviors like the NaT inference above. For now we - # must catch the warnings internally, but we will need to - # remove this when we implement compatibility with pandas 2.0, - # which will remove these compatibility layers. - assert Version(pd.__version__) < Version("2.0.0") - warnings.simplefilter("ignore") - result = pd.MultiIndex.from_frame( - pd.DataFrame( - self.names, columns=self.level_names, dtype="object" - ), - ) + result = pd.MultiIndex.from_tuples( + self.names, + names=self.level_names, + ) else: # Determine if we can return a RangeIndex if self.rangeindex: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index aea8d11945a..727d5135297 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -34,7 +34,6 @@ import pandas as pd import pyarrow as pa from nvtx import annotate -from packaging.version import Version from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing from typing_extensions import Self, assert_never @@ -57,6 +56,7 @@ is_string_dtype, ) from cudf.core import column, df_protocol, indexing_utils, reshape +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, @@ -84,6 +84,7 @@ from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.core.udf.row_function import _get_row_kernel +from cudf.errors import MixedTypeError from cudf.utils import applyutils, docutils, ioutils, queryutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -803,6 +804,7 @@ def __init__( if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): if columns is not None: + label_dtype = getattr(columns, "dtype", None) data = dict(zip(columns, [data])) rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -810,6 +812,7 @@ def __init__( else: data = dict(enumerate([data])) rangeindex = True + label_dtype = None new_df = DataFrame(data=data, index=index) self._data = new_df._data @@ -820,6 +823,11 @@ def __init__( else self._data._level_names ) self._data.rangeindex = rangeindex + 
self._data.label_dtype = ( + cudf.dtype(label_dtype) + if label_dtype is not None + else None + ) elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -923,7 +931,9 @@ def _init_from_series_list(self, data, columns, index): transpose = self.T else: - concat_df = cudf.concat(data, axis=1) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + concat_df = cudf.concat(data, axis=1) cols = concat_df._data.to_pandas_index() if cols.dtype == "object": @@ -1037,7 +1047,6 @@ def _init_from_dict_like( empty_column = functools.partial( cudf.core.column.column_empty, row_count=(0 if index is None else len(index)), - dtype=None, masked=index is not None, ) @@ -1328,13 +1337,13 @@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - # An explicit dtype is needed to avoid pandas warnings from - # empty sets of columns. This shouldn't be needed in pandas - # 2.0, we don't need to specify a dtype when we know we're not - # trying to match any columns so the default is fine. dtype = None - if len(mask) == 0: - assert Version(pd.__version__) < Version("2.0.0") + if len(mask) == 0 and not PANDAS_GE_200: + # An explicit dtype is needed to avoid pandas + # warnings from empty sets of columns. This + # shouldn't be needed in pandas 2.0, we don't + # need to specify a dtype when we know we're not + # trying to match any columns so the default is fine. dtype = "float64" mask = pd.Series(mask, dtype=dtype) if mask.dtype == "bool": @@ -1745,7 +1754,7 @@ def _concat( if empty_has_index and num_empty_input_frames == len(objs): out._index = cudf.RangeIndex(result_index_length) elif are_all_range_index and not ignore_index: - out._index = cudf.core.index.GenericIndex._concat( + out._index = cudf.core.index.Index._concat( [o._index for o in objs] ) @@ -1940,9 +1949,11 @@ def _get_renderable_dataframe(self): lower_left = self.tail(lower_rows).iloc[:, :left_cols] lower_right = self.tail(lower_rows).iloc[:, right_cols:] - upper = cudf.concat([upper_left, upper_right], axis=1) - lower = cudf.concat([lower_left, lower_right], axis=1) - output = cudf.concat([upper, lower]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + upper = cudf.concat([upper_left, upper_right], axis=1) + lower = cudf.concat([lower_left, lower_right], axis=1) + output = cudf.concat([upper, lower]) output = self._clean_nulls_from_dataframe(output) output._index = output._index._clean_nulls_from_index() @@ -2026,6 +2037,20 @@ def _make_operands_and_index_for_binop( rhs = {name: other for name in self._data} equal_columns = True elif isinstance(other, Series): + if ( + not can_reindex + and fn in cudf.utils.utils._EQUALITY_OPS + and ( + not self._data.to_pandas_index().equals( + other.index.to_pandas() + ) + ) + ): + raise ValueError( + "Can only compare DataFrame & Series objects " + "whose columns & index are same respectively, " + "please reindex." + ) rhs = dict(zip(other.index.to_pandas(), other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). 
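Note: with the equality-op guard added above, comparing a DataFrame against a Series whose index does not match the frame's columns now fails loudly instead of silently reindexing; a minimal sketch (illustrative, not part of the patch):

    >>> import cudf
    >>> df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})
    >>> df == cudf.Series([1, 3], index=["a", "c"])
    Traceback (most recent call last):
    ...
    ValueError: Can only compare DataFrame & Series objects whose columns & index are same respectively, please reindex.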
@@ -3516,7 +3541,7 @@ def rename( if index: if ( any(isinstance(item, str) for item in index.values()) - and type(self.index) != cudf.StringIndex + and type(self.index._values) != cudf.core.column.StringColumn ): raise NotImplementedError( "Implicit conversion of index to " @@ -3604,11 +3629,12 @@ def agg(self, aggs, axis=None): * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ - # TODO: Remove the typecasting below once issue #6846 is fixed - # link dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - df_normalized = self.astype(common_dtype) + if not is_bool_dtype(common_dtype) and any( + is_bool_dtype(dtype) for dtype in dtypes + ): + raise MixedTypeError("Cannot create a column with mixed types") if any(is_string_dtype(dt) for dt in dtypes): raise NotImplementedError( @@ -3626,17 +3652,17 @@ def agg(self, aggs, axis=None): # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization for agg in aggs: - result[agg] = getattr(df_normalized, agg)() + result[agg] = getattr(self, agg)() return result.T.sort_index(axis=1, ascending=True) elif isinstance(aggs, str): - if not hasattr(df_normalized, aggs): + if not hasattr(self, aggs): raise AttributeError( f"{aggs} is not a valid function for " f"'DataFrame' object" ) result = DataFrame() - result[aggs] = getattr(df_normalized, aggs)() + result[aggs] = getattr(self, aggs)() result = result.iloc[:, 0] result.name = None return result @@ -3648,15 +3674,16 @@ def agg(self, aggs, axis=None): "callable parameter is not implemented yet" ) elif all(isinstance(val, str) for val in aggs.values()): - result = cudf.Series(index=cols) + res = {} for key, value in aggs.items(): - col = df_normalized[key] + col = self[key] if not hasattr(col, value): raise AttributeError( f"{value} is not a valid function for " f"'Series' object" ) - result[key] = getattr(col, value)() + res[key] = getattr(col, value)() + result = cudf.Series(list(res.values()), index=res.keys()) elif all(isinstance(val, abc.Iterable) for val in aggs.values()): idxs = set() for val in aggs.values(): @@ -3672,7 +3699,7 @@ def agg(self, aggs, axis=None): ) result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): - col = df_normalized[key] + col = self[key] col_empty = column_empty( len(idxs), dtype=col.dtype, masked=True ) @@ -4547,6 +4574,39 @@ def applymap( This method applies a function that accepts and returns a scalar to every element of a DataFrame. + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + + Returns + ------- + DataFrame + Transformed DataFrame. + """ + # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + warnings.warn( + "DataFrame.applymap has been deprecated. Use DataFrame.map " + "instead.", + FutureWarning, + ) + return self.map(func=func, na_action=na_action, **kwargs) + + def map( + self, + func: Callable[[Any], Any], + na_action: Union[str, None] = None, + **kwargs, + ) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. 
+ Parameters ---------- func : callable @@ -5094,22 +5154,11 @@ def describe( percentiles=None, include=None, exclude=None, - datetime_is_numeric=False, ): """{docstring}""" if not include and not exclude: - default_include = [np.number] - if datetime_is_numeric: - default_include.append("datetime") - else: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "`datetime_is_numeric` is deprecated. Specify " - "`datetime_is_numeric=True` to silence this " - "warning and adopt the future behavior now.", - FutureWarning, - ) + default_include = [np.number, "datetime"] data_to_describe = self.select_dtypes(include=default_include) if data_to_describe._num_columns == 0: data_to_describe = self @@ -5130,7 +5179,6 @@ def describe( describe_series_list = [ data_to_describe[col].describe( percentiles=percentiles, - datetime_is_numeric=datetime_is_numeric, ) for col in data_to_describe._column_names ] @@ -5149,14 +5197,17 @@ def describe( None, ) - return cudf.concat( - [ - series.reindex(names, copy=False) - for series in describe_series_list - ], - axis=1, - sort=False, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + res = cudf.concat( + [ + series.reindex(names, copy=False) + for series in describe_series_list + ], + axis=1, + sort=False, + ) + return res @_cudf_nvtx_annotate def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: @@ -5936,7 +5987,7 @@ def make_false_column_like_self(): # Stats # @_cudf_nvtx_annotate - def _prepare_for_rowwise_op(self, method, skipna): + def _prepare_for_rowwise_op(self, method, skipna, numeric_only): """Prepare a DataFrame for CuPy-based row-wise operations.""" if method not in _cupy_nan_methods_map and any( @@ -5950,26 +6001,23 @@ def _prepare_for_rowwise_op(self, method, skipna): ) raise ValueError(msg) - is_pure_dt = all(is_datetime_dtype(dt) for dt in self.dtypes) - - if not is_pure_dt: + if numeric_only: filtered = self.select_dtypes(include=[np.number, np.bool_]) else: filtered = self.copy(deep=False) - common_dtype = find_common_type(filtered.dtypes) + is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) - if filtered._num_columns < self._num_columns: - # When we update our pandas compatibility target to 2.0, pandas - # will stop supporting numeric_only=None and users will have to - # specify True/False. At that time we should also top our implicit - # removal of non-numeric columns here. - assert Version(pd.__version__) < Version("2.0.0") - msg = ( - "Row-wise operations currently only support int, float " - "and bool dtypes. Non numeric columns are ignored." + common_dtype = find_common_type(filtered.dtypes) + if ( + not numeric_only + and is_string_dtype(common_dtype) + and any(not is_string_dtype(dt) for dt in filtered.dtypes) + ): + raise TypeError( + f"Cannot perform row-wise {method} across mixed-dtype columns," + " try type-casting all the columns to same dtype." ) - warnings.warn(msg) if not skipna and any(col.nullable for col in filtered._columns): mask = DataFrame( @@ -5991,7 +6039,7 @@ def _prepare_for_rowwise_op(self, method, skipna): return coerced, mask, common_dtype @_cudf_nvtx_annotate - def count(self, axis=0, level=None, numeric_only=False, **kwargs): + def count(self, axis=0, numeric_only=False): """ Count ``non-NA`` cells for each column or row. @@ -6019,8 +6067,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): .. pandas-compat:: **DataFrame.count** - Parameters currently not supported are `axis`, `level`, - `numeric_only`. 
+ Parameters currently not supported are `axis` and `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -6048,28 +6095,13 @@ def _reduce( self, op, axis=None, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - source = self - if numeric_only: - numeric_cols = ( - name - for name in self._data.names - if is_numeric_dtype(self._data[name].dtype) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return Series(index=cudf.Index([], dtype="str")) if axis is None: - if op in {"any", "all"}: - axis = 2 - else: + if op in {"sum", "product", "std", "var"}: # Do not remove until pandas 2.0 support is added. warnings.warn( f"In a future version, {type(self).__name__}" @@ -6080,58 +6112,91 @@ def _reduce( FutureWarning, ) axis = 0 + else: + axis = 2 elif axis is no_default: axis = 0 else: axis = source._get_axis_from_axis_arg(axis) + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if is_numeric_dtype(self._data[name].dtype) + ) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return Series( + index=self._data.to_pandas_index()[:0] + if axis == 0 + else source.index, + dtype="float64", + ) if axis in {0, 2}: + if axis == 2 and op in ("kurtosis", "kurt", "skew"): + # TODO: concat + op can probably be done in the general case + # for axis == 2. + # https://github.com/rapidsai/cudf/issues/14930 + return getattr(concat_columns(source._data.columns), op)( + **kwargs + ) try: result = [ getattr(source._data[col], op)(**kwargs) for col in source._data.names ] except AttributeError: - if numeric_only is None and op in _numeric_reduction_ops: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - f"The default value of numeric_only in DataFrame.{op} " - "is deprecated. In a future version, it will default " - "to False. In addition, specifying " - "'numeric_only=None' is deprecated. 
Select only valid " - "columns or specify the value of numeric_only to " - "silence this warning.", - FutureWarning, - ) - numeric_cols = ( - name + numeric_ops = ( + "mean", + "min", + "max", + "sum", + "product", + "prod", + "std", + "var", + "kurtosis", + "kurt", + "skew", + ) + + if op in numeric_ops: + if numeric_only: + try: + result = [ + getattr(source._data[col], op)(**kwargs) + for col in source._data.names + ] + except AttributeError: + raise NotImplementedError( + f"Not all column dtypes support op {op}" + ) + elif any( + not is_numeric_dtype(self._data[name].dtype) for name in self._data.names - if is_numeric_dtype(self._data[name].dtype) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - if axis == 2: - return getattr(column_empty(0), op)(**kwargs) - else: - return Series(index=cudf.Index([], dtype="str")) - try: - result = [ - getattr(source._data[col], op)(**kwargs) - for col in source._data.names - ] - except AttributeError: + ): raise TypeError( - f"Not all column dtypes support op {op}" + "Non numeric columns passed with " + "`numeric_only=False`, pass `numeric_only=True` " + f"to perform DataFrame.{op}" ) else: raise if axis == 2: - return getattr(as_column(result), op)(**kwargs) + return getattr(as_column(result, nan_as_null=False), op)( + **kwargs + ) else: source_dtypes = [c.dtype for c in source._data.columns] common_dtype = find_common_type(source_dtypes) - if is_object_dtype(common_dtype) and any( - not is_object_dtype(dtype) for dtype in source_dtypes + if ( + is_object_dtype(common_dtype) + and any( + not is_object_dtype(dtype) for dtype in source_dtypes + ) + or not is_bool_dtype(common_dtype) + and any(is_bool_dtype(dtype) for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to " @@ -6255,7 +6320,13 @@ def mode(self, axis=0, numeric_only=False, dropna=True): if len(mode_results) == 0: return DataFrame() - df = cudf.concat(mode_results, axis=1) + with warnings.catch_warnings(): + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.simplefilter("ignore", FutureWarning) + df = cudf.concat(mode_results, axis=1) + if isinstance(df, Series): df = df.to_frame() @@ -6264,14 +6335,14 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df @_cudf_nvtx_annotate - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def all(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self - return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + return super(DataFrame, obj).all(axis, skipna, **kwargs) @_cudf_nvtx_annotate - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def any(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self - return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + return super(DataFrame, obj).any(axis, skipna, **kwargs) @_cudf_nvtx_annotate def _apply_cupy_method_axis_1(self, method, *args, **kwargs): @@ -6298,12 +6369,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): "Row-wise operations currently do not support `level`." ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." 
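[Reviewer sketch of the stricter `numeric_only` semantics introduced above, using a hypothetical mixed-dtype frame.]

>>> df = cudf.DataFrame({"a": [1.0, 2.0], "b": ["x", "y"]})
>>> df.mean(numeric_only=True)   # non-numeric column "b" is dropped before reducing
a    1.5
dtype: float64
>>> df.mean(numeric_only=False)  # TypeError: Non numeric columns passed with
...                              # `numeric_only=False`, pass `numeric_only=True` ...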
- ) + numeric_only = kwargs.pop("numeric_only", False) min_count = kwargs.pop("min_count", None) if min_count not in (None, 0): @@ -6323,7 +6389,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): kwargs.pop("cast_to_int", None) prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna + method, skipna, numeric_only ) for col in prepared._data.names: if prepared._data[col].nullable: @@ -6753,11 +6819,11 @@ def stack(self, level=-1, dropna=True): cat 1.0 2.0 dog 3.0 4.0 >>> df_multi_level_cols2.stack() - height weight - cat kg 1.0 - m 2.0 - dog kg 3.0 - m 4.0 + weight height + cat kg 1.0 + m 2.0 + dog kg 3.0 + m 4.0 **Prescribing the level(s) to be stacked** @@ -6929,10 +6995,18 @@ def unnamed_group_generator(): else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) - unnamed_level_values = unnamed_level_values.unique().sort_values() + unnamed_level_values = unnamed_level_values.unique() data = ColumnAccessor( - dict(zip(unnamed_level_values, stacked)), + dict( + zip( + unnamed_level_values, + [ + stacked[i] + for i in unnamed_level_values.argsort().argsort() + ], + ) + ), isinstance(unnamed_level_values, pd.MultiIndex), unnamed_level_values.names, ) @@ -7060,7 +7134,7 @@ def keys(self): Columns: [0, 1, 2, 3] Index: [] >>> df.keys() - Int64Index([0, 1, 2, 3], dtype='int64') + Index([0, 1, 2, 3], dtype='int64') """ return self._data.to_pandas_index() @@ -7092,153 +7166,9 @@ def iterrows(self): "if you wish to iterate over each row." ) - @_cudf_nvtx_annotate - def append( - self, other, ignore_index=False, verify_integrity=False, sort=False - ): - """ - Append rows of `other` to the end of caller, returning a new object. - Columns in `other` that are not in the caller are added as new columns. - - Parameters - ---------- - other : DataFrame or Series/dict-like object, or list of these - The data to append. - ignore_index : bool, default False - If True, do not use the index labels. - sort : bool, default False - Sort columns ordering if the columns of - `self` and `other` are not aligned. - verify_integrity : bool, default False - This Parameter is currently not supported. - - Returns - ------- - DataFrame - - Notes - ----- - Iteratively appending rows to a cudf DataFrame can be more - computationally intensive than a single concatenate. A better solution - is to append those rows to a list and then concatenate the list with - the original DataFrame all at once. - - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - objects. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame([[1, 2], [3, 4]], columns=list('AB')) - >>> df - A B - 0 1 2 - 1 3 4 - >>> df2 = cudf.DataFrame([[5, 6], [7, 8]], columns=list('AB')) - >>> df2 - A B - 0 5 6 - 1 7 8 - >>> df.append(df2) - A B - 0 1 2 - 1 3 4 - 0 5 6 - 1 7 8 - - With `ignore_index` set to True: - - >>> df.append(df2, ignore_index=True) - A B - 0 1 2 - 1 3 4 - 2 5 6 - 3 7 8 - - The following, while not recommended methods for generating DataFrames, - show two ways to generate a DataFrame from multiple data sources. - Less efficient: - - >>> df = cudf.DataFrame(columns=['A']) - >>> for i in range(5): - ... df = df.append({'A': i}, ignore_index=True) - >>> df - A - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - - More efficient than above: - - >>> cudf.concat([cudf.DataFrame([i], columns=['A']) for i in range(5)], - ... ignore_index=True) - A - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - - .. 
pandas-compat:: - **DataFrame.append** - - * If a list of dict/series is passed and the keys are all contained - in the DataFrame's index, the order of the columns in the - resulting DataFrame will be unchanged. - * The `verify_integrity` parameter is not supported yet. - """ - if isinstance(other, dict): - if not ignore_index: - raise TypeError("Can only append a dict if ignore_index=True") - other = DataFrame(other) - elif isinstance(other, Series): - if other.name is None and not ignore_index: - raise TypeError( - "Can only append a Series if ignore_index=True " - "or if the Series has a name" - ) - - current_cols = self._data.to_pandas_index() - combined_columns = other.index.to_pandas() - if len(current_cols): - if cudf.utils.dtypes.is_mixed_with_object_dtype( - current_cols, combined_columns - ): - raise TypeError( - "cudf does not support mixed types, please type-cast " - "the column index of dataframe and index of series " - "to same dtypes." - ) - - combined_columns = current_cols.union( - combined_columns, sort=False - ) - - if sort: - combined_columns = combined_columns.sort_values() - - other = other.reindex(combined_columns, copy=False).to_frame().T - if not current_cols.equals(combined_columns): - self = self.reindex(columns=combined_columns) - elif ( - isinstance(other, list) - and other - and not isinstance(other[0], DataFrame) - ): - other = DataFrame(other) - cols = self._data.to_pandas_index() - if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all(): - other = other.reindex(columns=cols) - - return super()._append(other, ignore_index, verify_integrity, sort) - @_cudf_nvtx_annotate @copy_docstring(reshape.pivot) - def pivot(self, index, columns, values=None): + def pivot(self, *, columns, index=no_default, values=no_default): return cudf.core.reshape.pivot( self, index=index, columns=columns, values=values ) @@ -7325,7 +7255,7 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method="ffill", limit=None, freq=None + self, periods=1, fill_method=no_default, limit=no_default, freq=None ): """ Calculates the percent change between sequential elements @@ -7337,9 +7267,16 @@ def pct_change( Periods to shift for forming percent change. fill_method : str, default 'ffill' How to handle NAs before computing percent changes. + + .. deprecated:: 24.04 + All options of `fill_method` are deprecated + except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. + + .. deprecated:: 24.04 + `limit` is deprecated. freq : str, optional Increment to use from time series API. Not yet implemented. @@ -7348,17 +7285,44 @@ def pct_change( ------- DataFrame """ - if limit is not None: + if limit is not no_default: raise NotImplementedError("limit parameter not supported yet.") if freq is not None: raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + elif fill_method not in { + no_default, + None, + "ffill", + "pad", + "bfill", + "backfill", + }: raise ValueError( - "fill_method must be one of 'ffill', 'pad', " + "fill_method must be one of None, 'ffill', 'pad', " "'bfill', or 'backfill'." ) - data = self.fillna(method=fill_method, limit=limit) + if fill_method not in (no_default, None) or limit is not no_default: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." 
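[Reviewer note: with `DataFrame.append` deleted above, the replacement already recommended in the removed docstring is `cudf.concat`. A minimal sketch:]

>>> df = cudf.DataFrame([[1, 2], [3, 4]], columns=list("AB"))
>>> df2 = cudf.DataFrame([[5, 6], [7, 8]], columns=list("AB"))
>>> cudf.concat([df, df2], ignore_index=True)
   A  B
0  1  2
1  3  4
2  5  6
3  7  8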
+ warnings.warn( + "The 'fill_method' and 'limit' keywords in " + f"{type(self).__name__}.pct_change are deprecated and will be " + "removed in a future version. Either fill in any non-leading " + "NA values prior to calling pct_change or specify " + "'fill_method=None' to not fill NA values.", + FutureWarning, + ) + if fill_method is no_default: + fill_method = "ffill" + if limit is no_default: + limit = None + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( periods=periods, freq=freq @@ -7689,12 +7653,18 @@ def value_counts( >>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6], ... 'num_wings': [2, 0, 0, 0]}, ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 >>> df.value_counts() num_legs num_wings 4 0 2 2 2 1 6 0 1 - dtype: int64 + Name: count, dtype: int64 """ if subset: diff = set(subset) - set(self._data) @@ -7716,6 +7686,7 @@ def value_counts( # Pandas always returns MultiIndex even if only one column. if not isinstance(result.index, MultiIndex): result.index = MultiIndex._from_data(result._index._data) + result.name = "proportion" if normalize else "count" return result @@ -7899,14 +7870,14 @@ def from_pandas(obj, nan_as_null=no_default): >>> pidx = pd.Index([1, 2, 10, 20]) >>> pidx - Int64Index([1, 2, 10, 20], dtype='int64') + Index([1, 2, 10, 20], dtype='int64') >>> gidx = cudf.from_pandas(pidx) >>> gidx - Int64Index([1, 2, 10, 20], dtype='int64') + Index([1, 2, 10, 20], dtype='int64') >>> type(gidx) - + >>> type(pidx) - + Converting a Pandas MultiIndex to cuDF MultiIndex: @@ -8091,7 +8062,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.core.index.GenericIndex._concat(indexes) + merged_index = cudf.core.index.Index._concat(indexes) return merged_index.drop_duplicates() diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index e451b32df28..7892f8065d0 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -4,6 +4,7 @@ import operator import pickle import textwrap +import warnings from functools import cached_property from typing import Any, Callable, Dict, List, Tuple, Type, Union @@ -12,19 +13,15 @@ import pyarrow as pa from pandas.api import types as pd_types from pandas.api.extensions import ExtensionDtype +from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf from cudf._typing import Dtype -from cudf.core._compat import PANDAS_GE_150 +from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply -if PANDAS_GE_150: - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -else: - from pandas.core.arrays._arrow_utils import ArrowIntervalType - def dtype(arbitrary): """ @@ -172,7 +169,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None: self._ordered = ordered @property - def categories(self) -> "cudf.core.index.GenericIndex": + def categories(self) -> "cudf.core.index.Index": """ An ``Index`` containing the unique categories allowed. 
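[Reviewer sketch of the `value_counts` naming change above, which mirrors pandas 2.0: the result Series is now named according to whether `normalize` was requested.]

>>> df = cudf.DataFrame({"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]})
>>> df.value_counts().name
'count'
>>> df.value_counts(normalize=True).name
'proportion'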
@@ -181,7 +178,7 @@ def categories(self) -> "cudf.core.index.GenericIndex": >>> import cudf >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> dtype.categories - StringIndex(['b' 'a'], dtype='object') + Index(['b', 'a'], dtype='object') """ if self._categories is None: return cudf.core.index.as_index( @@ -223,11 +220,11 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> import pandas as pd >>> pd_dtype = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> pd_dtype - CategoricalDtype(categories=['b', 'a'], ordered=True) + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> cudf_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) >>> cudf_dtype - CategoricalDtype(categories=['b', 'a'], ordered=True) - """ + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) + """ # noqa: E501 return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) @@ -241,16 +238,17 @@ def to_pandas(self) -> pd.CategoricalDtype: >>> import cudf >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> dtype - CategoricalDtype(categories=['b', 'a'], ordered=True) + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> dtype.to_pandas() - CategoricalDtype(categories=['b', 'a'], ordered=True) - """ + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) + """ # noqa: E501 if self._categories is None: categories = None else: - if isinstance( - self._categories, (cudf.Float32Index, cudf.Float64Index) - ): + if self._categories.dtype in { + cudf.dtype("float32"), + cudf.dtype("float64"), + }: categories = self._categories.dropna().to_pandas() else: categories = self._categories.to_pandas() @@ -259,7 +257,7 @@ def to_pandas(self) -> pd.CategoricalDtype: def _init_categories(self, categories: Any): if categories is None: return categories - if len(categories) == 0 and not is_interval_dtype(categories): + if len(categories) == 0 and not _is_interval_dtype(categories): dtype = "object" # type: Any else: dtype = None @@ -962,19 +960,7 @@ def deserialize(cls, header: dict, frames: list): return klass(subtype, closed=closed) -def is_categorical_dtype(obj): - """Check whether an array-like or dtype is of the Categorical dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of a categorical dtype. - """ +def _is_categorical_dtype(obj): if obj is None: return False @@ -1018,13 +1004,41 @@ def is_categorical_dtype(obj): pd.Series, ), ): - return is_categorical_dtype(obj.dtype) + return _is_categorical_dtype(obj.dtype) if hasattr(obj, "type"): if obj.type is pd.CategoricalDtype.type: return True # TODO: A lot of the above checks are probably redundant and should be # farmed out to this function here instead. - return pd_types.is_categorical_dtype(obj) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return pd_types.is_categorical_dtype(obj) + + +def is_categorical_dtype(obj): + """Check whether an array-like or dtype is of the Categorical dtype. + + .. deprecated:: 24.04 + Use isinstance(dtype, cudf.CategoricalDtype) instead + + Parameters + ---------- + obj : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + bool + Whether or not the array-like or dtype is of a categorical dtype. + """ + # Do not remove until pandas 3.0 support is added. 
+ assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + warnings.warn( + "is_categorical_dtype is deprecated and will be removed in a future " + "version. Use isinstance(dtype, cudf.CategoricalDtype) instead", + DeprecationWarning, + ) + return _is_categorical_dtype(obj) def is_list_dtype(obj): @@ -1102,21 +1116,7 @@ def is_decimal_dtype(obj): ) -def is_interval_dtype(obj): - """Check whether an array-like or dtype is of the interval dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of the interval dtype. - """ - # TODO: Should there be any branch in this function that calls - # pd.api.types.is_interval_dtype? +def _is_interval_dtype(obj): return ( isinstance( obj, @@ -1139,6 +1139,27 @@ def is_interval_dtype(obj): ) +def is_interval_dtype(obj): + """Check whether an array-like or dtype is of the interval dtype. + + Parameters + ---------- + obj : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + bool + Whether or not the array-like or dtype is of the interval dtype. + """ + warnings.warn( + "is_interval_dtype is deprecated and will be removed in a " + "future version. Use `isinstance(dtype, cudf.IntervalDtype)` instead", + DeprecationWarning, + ) + return _is_interval_dtype(obj) + + def is_decimal32_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal32Dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1e6ff118626..79005193b4e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -31,7 +31,6 @@ import cudf from cudf import _lib as libcudf from cudf._typing import Dtype -from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -43,10 +42,8 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import BinaryOperand, Scannable -from cudf.core.window import Rolling from cudf.utils import ioutils -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column, find_common_type +from cudf.utils.dtypes import find_common_type from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf @@ -223,12 +220,12 @@ def size(self) -> int: >>> index = cudf.Index([]) >>> index - Float64Index([], dtype='float64') + Index([], dtype='float64') >>> index.size 0 >>> index = cudf.Index([1, 2, 3, 10]) >>> index - Int64Index([1, 2, 3, 10], dtype='int64') + Index([1, 2, 3, 10], dtype='int64') >>> index.size 4 @@ -615,115 +612,6 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ raise NotImplementedError - @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: - """ - Replace values where the condition is True. - - Parameters - ---------- - cond : bool Series/DataFrame, array-like - Where cond is False, keep the original value. - Where True, replace with corresponding value from other. - Callables are not supported. - other: scalar, list of scalars, Series/DataFrame - Entries where cond is True are replaced with - corresponding value from other. Callables are not - supported. Default is None. - - DataFrame expects only Scalar or array like with scalars or - dataframe with same dimension as self. 
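[Reviewer sketch of the dtype-predicate deprecations above; this assumes `is_categorical_dtype` remains re-exported through `cudf.api.types`.]

>>> import cudf
>>> dtype = cudf.CategoricalDtype(categories=["b", "a"])
>>> isinstance(dtype, cudf.CategoricalDtype)    # preferred spelling going forward
True
>>> cudf.api.types.is_categorical_dtype(dtype)  # still works, but now emits DeprecationWarning
True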
- - Series expects only scalar or series like with same length - inplace : bool, default False - Whether to perform the operation in place on the data. - - Returns - ------- - Same type as caller - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) - >>> df.mask(df % 2 == 0, [-1, -1]) - A B - 0 1 3 - 1 -1 5 - 2 5 -1 - - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.mask(ser > 2, 10) - 0 10 - 1 10 - 2 2 - 3 1 - 4 0 - dtype: int64 - >>> ser.mask(ser > 2) - 0 - 1 - 2 2 - 3 1 - 4 0 - dtype: int64 - """ - - if not hasattr(cond, "__invert__"): - # We Invert `cond` below and call `where`, so - # making sure the object supports - # `~`(inversion) operator or `__invert__` method - cond = cupy.asarray(cond) - - return self.where(cond=~cond, other=other, inplace=inplace) - - @_cudf_nvtx_annotate - def pipe(self, func, *args, **kwargs): - """ - Apply ``func(self, *args, **kwargs)``. - - Parameters - ---------- - func : function - Function to apply to the Series/DataFrame/Index. - ``args``, and ``kwargs`` are passed into ``func``. - Alternatively a ``(callable, data_keyword)`` tuple where - ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the Series/DataFrame/Index. - args : iterable, optional - Positional arguments passed into ``func``. - kwargs : mapping, optional - A dictionary of keyword arguments passed into ``func``. - - Returns - ------- - object : the return type of ``func``. - - Examples - -------- - Use ``.pipe`` when chaining together functions that expect - Series, DataFrames or GroupBy objects. Instead of writing - - >>> func(g(h(df), arg1=a), arg2=b, arg3=c) - - You can write - - >>> (df.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe(func, arg2=b, arg3=c) - ... ) - - If you have a function that takes the data as (say) the second - argument, pass a tuple indicating which keyword expects the - data. For example, suppose ``f`` takes its data as ``arg2``: - - >>> (df.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe((func, 'arg2'), arg1=a, arg3=c) - ... ) - """ - return cudf.core.common.pipe(self, func, *args, **kwargs) - @_cudf_nvtx_annotate def fillna( self, @@ -742,13 +630,15 @@ def fillna( are filled with values in corresponding indices. A dict can be used to provide different values to fill nulls in different columns. Cannot be used with ``method``. - method : {'ffill', 'bfill'}, default None Method to use for filling null values in the dataframe or series. `ffill` propagates the last non-null values forward to the next non-null value. `bfill` propagates backward with the next non-null value. Cannot be used with ``value``. + .. deprecated:: 24.04 + `method` is deprecated. + Returns ------- result : DataFrame, Series, or Index @@ -1225,7 +1115,7 @@ def isna(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx - Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') + Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.isna() array([False, False, True, True, False, False]) """ @@ -1304,7 +1194,7 @@ def notna(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx - Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') + Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.notna() array([ True, True, False, False, True, True]) """ @@ -1547,32 +1437,6 @@ def _get_sorted_inds( stable=True, ) - @_cudf_nvtx_annotate - def abs(self): - """ - Return a Series/DataFrame with absolute numeric value of each element. 
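[Reviewer sketch of the `fillna(method=...)` deprecation noted in the hunk above; output is illustrative.]

>>> s = cudf.Series([1, None, 3])
>>> s.fillna(method="ffill")   # deprecated per the 24.04 note above
>>> s.ffill()                  # preferred spelling
0    1
1    1
2    3
dtype: int64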
- - This function only applies to elements that are all numeric. - - Returns - ------- - DataFrame/Series - Absolute value of each element. - - Examples - -------- - Absolute numeric values in a Series - - >>> s = cudf.Series([-1.10, 2, -3.33, 4]) - >>> s.abs() - 0 1.10 - 1 2.00 - 2 3.33 - 3 4.00 - dtype: float64 - """ - return self._unaryop("abs") - @_cudf_nvtx_annotate def _is_sorted(self, ascending=None, null_position=None): """ @@ -1769,121 +1633,6 @@ def _apply_cupy_ufunc_to_operands( data[i][name] = as_column(out).set_mask(mask) return data - @_cudf_nvtx_annotate - def dot(self, other, reflect=False): - """ - Get dot product of frame and other, (binary operator `dot`). - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - other : Sequence, Series, or DataFrame - Any multiple element data structure, or list-like object. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. - - Returns - ------- - scalar, Series, or DataFrame - The result of the operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame([[1, 2, 3, 4], - ... [5, 6, 7, 8]]) - >>> df @ df.T - 0 1 - 0 30 70 - 1 70 174 - >>> s = cudf.Series([1, 1, 1, 1]) - >>> df @ s - 0 10 - 1 26 - dtype: int64 - >>> [1, 2, 3, 4] @ s - 10 - """ - # TODO: This function does not currently support nulls. - lhs = self.values - result_index = None - result_cols = None - if isinstance(self, cudf.Series) and isinstance( - other, (cudf.Series, cudf.DataFrame) - ): - common = self.index.union(other.index) - if len(common) > len(self.index) or len(common) > len(other.index): - raise ValueError("matrices are not aligned") - - lhs = self.reindex(index=common, copy=False).values - rhs = other.reindex(index=common, copy=False).values - if isinstance(other, cudf.DataFrame): - result_index = other._data.to_pandas_index() - elif isinstance(self, cudf.DataFrame) and isinstance( - other, (cudf.Series, cudf.DataFrame) - ): - common = self._data.to_pandas_index().union( - other.index.to_pandas() - ) - if len(common) > len(self._data.names) or len(common) > len( - other.index - ): - raise ValueError("matrices are not aligned") - - lhs = self.reindex(columns=common, copy=False) - result_index = lhs.index - - rhs = other.reindex(index=common, copy=False).values - lhs = lhs.values - if isinstance(other, cudf.DataFrame): - result_cols = other._data.to_pandas_index() - - elif isinstance( - other, (cupy.ndarray, np.ndarray) - ) or can_convert_to_column(other): - rhs = cupy.asarray(other) - else: - # TODO: This should raise an exception, not return NotImplemented, - # but __matmul__ relies on the current behavior. We should either - # move this implementation to __matmul__ and call it from here - # (checking for NotImplemented and raising NotImplementedError if - # that's what's returned), or __matmul__ should catch a - # NotImplementedError from here and return NotImplemented. The - # latter feels cleaner (putting the implementation in this method - # rather than in the operator) but will be slower in the (highly - # unlikely) case that we're multiplying a cudf object with another - # type of object that somehow supports this behavior. 
- return NotImplemented - if reflect: - lhs, rhs = rhs, lhs - - result = lhs.dot(rhs) - if len(result.shape) == 1: - return cudf.Series( - result, - index=self.index if result_index is None else result_index, - ) - if len(result.shape) == 2: - return cudf.DataFrame( - result, - index=self.index if result_index is None else result_index, - columns=result_cols, - ) - return result.item() - - @_cudf_nvtx_annotate - def __matmul__(self, other): - return self.dot(other) - - @_cudf_nvtx_annotate - def __rmatmul__(self, other): - return self.dot(other, reflect=True) - # Unary logical operators @_cudf_nvtx_annotate def __neg__(self): @@ -1923,10 +1672,9 @@ def _reduce(self, *args, **kwargs): @_cudf_nvtx_annotate def min( self, - axis=no_default, + axis=0, skipna=True, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): """ @@ -1938,12 +1686,10 @@ def min( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. + numeric_only: bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- @@ -1953,10 +1699,13 @@ def min( -------- >>> import cudf >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() + >>> min_series = df.min() + >>> min_series a 1 b 7 dtype: int64 + >>> min_series.min() + 1 .. pandas-compat:: **DataFrame.min, Series.min** @@ -1967,7 +1716,6 @@ def min( "min", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @@ -1975,10 +1723,9 @@ def min( @_cudf_nvtx_annotate def max( self, - axis=no_default, + axis=0, skipna=True, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): """ @@ -1990,12 +1737,10 @@ def max( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. + numeric_only: bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- @@ -2019,428 +1764,12 @@ def max( "max", axis=axis, skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - def sum( - self, - axis=no_default, - skipna=True, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. 
- - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - - .. pandas-compat:: - **DataFrame.sum, Series.sum** - - Parameters currently not supported are `level`, `numeric_only`. - """ - return self._reduce( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - @_cudf_nvtx_annotate - def product( - self, - axis=no_default, - skipna=True, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - - .. pandas-compat:: - **DataFrame.product, Series.product** - - Parameters currently not supported are level`, `numeric_only`. - """ - - return self._reduce( - # cuDF columns use "product" as the op name, but cupy uses "prod" - # and we need cupy if axis == 1. - "prod" if axis in {1, "columns"} else "product", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - # Alias for pandas compatibility. - prod = product - - @_cudf_nvtx_annotate - def mean( - self, - axis=no_default, - skipna=True, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._reduce( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - def std( - self, - axis=no_default, - skipna=True, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. 
If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - - .. pandas-compat:: - **DataFrame.std, Series.std** - - Parameters currently not supported are `level` and - `numeric_only` - """ - - return self._reduce( - "std", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - def var( - self, - axis=no_default, - skipna=True, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - - .. pandas-compat:: - **DataFrame.var, Series.var** - - Parameters currently not supported are `level` and - `numeric_only` - """ - return self._reduce( - "var", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, numeric_only=numeric_only, **kwargs, ) @_cudf_nvtx_annotate - def kurtosis( - self, - axis=no_default, - skipna=True, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher's definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - Series or scalar - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4]) - >>> series.kurtosis() - -1.1999999999999904 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.kurt() - a -1.2 - b -1.2 - dtype: float64 - - .. pandas-compat:: - **DataFrame.kurtosis** - - Parameters currently not supported are `level` and `numeric_only` - """ - if axis not in (0, "index", None, no_default): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._reduce( - "kurtosis", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - # Alias for kurtosis. - kurt = kurtosis - - @_cudf_nvtx_annotate - def skew( - self, - axis=no_default, - skipna=True, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna: bool, default True - Exclude NA/null values when computing the result. 
- - Returns - ------- - Series - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 6 - dtype: int64 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) - >>> df.skew() - a 0.00000 - b -0.37037 - dtype: float64 - - .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** - - Parameters currently not supported are `axis`, `level` and - `numeric_only` - """ - if axis not in (0, "index", None, no_default): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._reduce( - "skew", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - def all(self, axis=0, skipna=True, level=None, **kwargs): + def all(self, axis=0, skipna=True, **kwargs): """ Return whether all elements are True in DataFrame. @@ -2468,7 +1797,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `bool_only`, `level`. + Parameters currently not supported are `bool_only`. Examples -------- @@ -2495,12 +1824,11 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, - level=level, **kwargs, ) @_cudf_nvtx_annotate - def any(self, axis=0, skipna=True, level=None, **kwargs): + def any(self, axis=0, skipna=True, **kwargs): """ Return whether any elements is True in DataFrame. @@ -2528,7 +1856,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `bool_only`, `level`. + Parameters currently not supported are `bool_only`. Examples -------- @@ -2555,76 +1883,9 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, - level=level, **kwargs, ) - @_cudf_nvtx_annotate - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - - .. pandas-compat:: - **DataFrame.median, Series.median** - - Parameters currently not supported are `level` and `numeric_only`. - - .. pandas-compat:: - **DataFrame.median, Series.median** - - Parameters currently not supported are `level` and `numeric_only`. - """ - return self._reduce( - "median", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - - return cudf.io.json.to_json( - self, path_or_buf=path_or_buf, *args, **kwargs - ) - - @_cudf_nvtx_annotate - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - - cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - @_cudf_nvtx_annotate @ioutils.doc_to_dlpack() def to_dlpack(self): @@ -2632,32 +1893,9 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @_cudf_nvtx_annotate - def to_string(self): - r""" - Convert to string - - cuDF uses Pandas internals for efficient string formatting. 
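[Reviewer sketch of the reductions in this region after the `level` keyword removal; `axis=None` behavior follows the `_reduce` dispatch shown earlier and is illustrative.]

>>> df = cudf.DataFrame({"a": [True, True], "b": [True, False]})
>>> df.all()
a     True
b    False
dtype: bool
>>> df.any(axis=None)   # reduces over every element
True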
- Set formatting options using pandas string formatting options and - cuDF objects will print identically to Pandas objects. - - cuDF supports `null/None` as a value in any column type, which - is transparently supported during this output process. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2] - >>> df['val'] = [float(i + 10) for i in range(3)] - >>> df.to_string() - ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' - """ - return repr(self) - @_cudf_nvtx_annotate def __str__(self): - return self.to_string() + return repr(self) @_cudf_nvtx_annotate def __deepcopy__(self, memo): @@ -2667,184 +1905,6 @@ def __deepcopy__(self, memo): def __copy__(self): return self.copy(deep=False) - @_cudf_nvtx_annotate - def head(self, n=5): - """ - Return the first `n` rows. - This function returns the first `n` rows for the object based - on position. It is useful for quickly testing if your object - has the right type of data in it. - For negative values of `n`, this function returns all rows except - the last `n` rows, equivalent to ``df[:-n]``. - - Parameters - ---------- - n : int, default 5 - Number of rows to select. - - Returns - ------- - DataFrame or Series - The first `n` rows of the caller object. - - Examples - -------- - **Series** - - >>> ser = cudf.Series(['alligator', 'bee', 'falcon', - ... 'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) - >>> ser - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - 6 shark - 7 whale - 8 zebra - dtype: object - - Viewing the first 5 lines - - >>> ser.head() - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - dtype: object - - Viewing the first `n` lines (three in this case) - - >>> ser.head(3) - 0 alligator - 1 bee - 2 falcon - dtype: object - - For negative values of `n` - - >>> ser.head(-3) - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - dtype: object - - **DataFrame** - - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.head(2) - key val - 0 0 10.0 - 1 1 11.0 - """ - return self.iloc[:n] - - @_cudf_nvtx_annotate - def tail(self, n=5): - """ - Returns the last n rows as a new DataFrame or Series - - Examples - -------- - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.tail(2) - key val - 3 3 13.0 - 4 4 14.0 - - **Series** - - >>> import cudf - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.tail(2) - 3 1 - 4 0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - - @_cudf_nvtx_annotate - @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): - return Rolling( - self, - window, - min_periods=min_periods, - center=center, - axis=axis, - win_type=win_type, - ) - - @_cudf_nvtx_annotate - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls - - Returns - ------- - DataFrame or Series - - Examples - -------- - **Series** - - >>> import cudf, numpy as np - >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) - >>> series - 0 1.0 - 1 2.0 - 2 NaN - 3 - 4 10.0 - dtype: float64 - >>> series.nans_to_nulls() - 0 1.0 - 1 2.0 - 2 - 3 - 4 10.0 - dtype: float64 - - **DataFrame** - - >>> df = cudf.DataFrame() - >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False) - >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) - >>> df - a b - 0 1.0 - 1 3.14 - 
2 NaN NaN - >>> df.nans_to_nulls() - a b - 0 1.0 - 1 3.14 - 2 - """ - result_data = {} - for name, col in self._data.items(): - try: - result_data[name] = col.nans_to_nulls() - except AttributeError: - result_data[name] = col.copy() - return self._from_data_like_self(result_data) - @_cudf_nvtx_annotate def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" @@ -2894,10 +1954,12 @@ def _repeat( @_cudf_nvtx_annotate @_warn_no_dask_cudf def __dask_tokenize__(self): + from dask.base import normalize_token + return [ type(self), - self._dtypes, - self.to_pandas(), + normalize_token(self._dtypes), + normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index c4d92b84c99..1f08abdc7fc 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,7 +21,9 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like +from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor @@ -276,26 +278,22 @@ def __init__( self.grouping = _Grouping(obj, self._by, level) def __iter__(self): - if isinstance(self._by, list) and len(self._by) == 1: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "In a future version of cudf, a length 1 tuple will be " - "returned when iterating over a groupby with a grouper equal " - "to a list of length 1. To avoid this warning, do not supply " - "a list with a single grouper.", - FutureWarning, - ) group_names, offsets, _, grouped_values = self._grouped() if isinstance(group_names, cudf.BaseIndex): group_names = group_names.to_pandas() for i, name in enumerate(group_names): - yield name, grouped_values[offsets[i] : offsets[i + 1]] + yield (name,) if isinstance(self._by, list) and len( + self._by + ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]] @property def dtypes(self): """ Return the dtypes in this group. + .. deprecated:: 24.04 + Use `.dtypes` on base object instead. + Returns ------- pandas.DataFrame @@ -307,18 +305,24 @@ def dtypes(self): >>> df = cudf.DataFrame({'a': [1, 2, 3, 3], 'b': ['x', 'y', 'z', 'a'], ... 'c':[10, 11, 12, 12]}) >>> df.groupby("a").dtypes - b c + a b c a - 1 object int64 - 2 object int64 - 3 object int64 + 1 int64 object int64 + 2 int64 object int64 + 3 int64 object int64 """ + warnings.warn( + f"{type(self).__name__}.dtypes is deprecated and will be " + "removed in a future version. Check the dtypes on the " + "base object instead", + FutureWarning, + ) index = self.grouping.keys.unique().sort_values().to_pandas() obj_dtypes = self.obj._dtypes return pd.DataFrame( { name: [obj_dtypes[name]] * len(index) - for name in self.grouping.values._column_names + for name in self.obj._data.names }, index=index, ) @@ -341,6 +345,33 @@ def groups(self): zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) ) + @cached_property + def indices(self): + """ + Dict {group name -> group indices}. 
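[Reviewer sketch of the `GroupBy.__iter__` change above: a length-1 list grouper now yields 1-tuples instead of warning. `sorted` is used because group order is not guaranteed.]

>>> df = cudf.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
>>> sorted(name for name, _ in df.groupby("a"))    # scalar grouper: plain labels
[1, 2]
>>> sorted(name for name, _ in df.groupby(["a"]))  # length-1 list grouper: 1-tuples
[(1,), (2,)]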
+ + Examples + -------- + >>> import cudf + >>> data = [[10, 20, 30], [10, 30, 40], [40, 50, 30]] + >>> df = cudf.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 10 20 30 + 1 10 30 40 + 2 40 50 30 + >>> df.groupby(by=["a"]).indices + {10: array([0, 1]), 40: array([2])} + """ + group_names, offsets, _, grouped_values = self._grouped() + + return dict( + zip( + group_names.to_pandas(), + np.split(grouped_values.index.values, offsets[1:-1]), + ) + ) + @_cudf_nvtx_annotate def get_group(self, name, obj=None): """ @@ -376,6 +407,13 @@ def get_group(self, name, obj=None): """ if obj is None: obj = self.obj + else: + warnings.warn( + "obj is deprecated and will be removed in a future version. " + "Use ``df.iloc[gb.indices.get(name)]`` " + "instead of ``gb.get_group(name, obj=df)``.", + FutureWarning, + ) return obj.loc[self.groups[name].drop_duplicates()] @@ -558,7 +596,8 @@ def agg(self, func): orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces - # a Float64Index, while Pandas returns an Int64Index + # an Index with float64 dtype, while Pandas returns + # an Index with int64 dtype. # (GH: 6945) ( result_columns, @@ -592,7 +631,7 @@ def agg(self, func): # Structs lose their labels which we reconstruct here col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) - if agg_kind in {"COUNT", "SIZE"}: + if agg_kind in {"COUNT", "SIZE", "ARGMIN", "ARGMAX"}: data[key] = col.astype("int64") elif ( self.obj.empty @@ -642,15 +681,6 @@ def agg(self, func): how="left", ) result = result.take(indices) - if isinstance(result._index, cudf.CategoricalIndex): - # Needs re-ordering the categories in the order - # they are after grouping. - result._index = cudf.Index( - result._index._column.reorder_categories( - result._index._column._get_decategorized_column() - ), - name=result._index.name, - ) if not self._as_index: result = result.reset_index() @@ -882,10 +912,21 @@ def nth(self, n): """ Return the nth row from each group. """ - result = self.agg(lambda x: x.nth(n)).sort_index() - sizes = self.size().sort_index() - return result[sizes > n] + self.obj["__groupbynth_order__"] = range(0, len(self.obj)) + # We perform another groupby here to have the grouping columns + # be a part of dataframe columns. 
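[Reviewer sketch of the replacement suggested by the `get_group(obj=...)` deprecation message above, built on the new `indices` property; output is illustrative.]

>>> df = cudf.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
>>> gb = df.groupby("a")
>>> df.iloc[gb.indices.get(1)]   # replacement for the deprecated gb.get_group(1, obj=df)
   a   b
0  1  10
1  1  20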
+ result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) + sizes = self.size().reindex(result.index) + + result = result[sizes > n] + + result._index = self.obj.index.take( + result._data["__groupbynth_order__"] + ) + del result._data["__groupbynth_order__"] + del self.obj._data["__groupbynth_order__"] + return result @_cudf_nvtx_annotate def ngroup(self, ascending=True): @@ -1300,13 +1341,17 @@ def _post_process_chunk_results( # group is a row-like "Series" where the index labels # are the same as the original calling DataFrame if _is_row_of(chunk_results[0], self.obj): - result = cudf.concat(chunk_results, axis=1).T + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + result = cudf.concat(chunk_results, axis=1).T result.index = group_names result.index.names = self.grouping.names # When the UDF is like df.x + df.y, the result for each # group is the same length as the original group elif len(self.obj) == sum(len(chk) for chk in chunk_results): - result = cudf.concat(chunk_results) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + result = cudf.concat(chunk_results) index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column result.index = cudf.MultiIndex._from_data(index_data) @@ -1317,7 +1362,9 @@ def _post_process_chunk_results( f"type {type(chunk_results[0])}" ) else: - result = cudf.concat(chunk_results) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + result = cudf.concat(chunk_results) if self._group_keys: index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column @@ -1424,9 +1471,11 @@ def mult(df): dtype: int64 """ - if self.obj.empty: - res = self.obj.copy(deep=True) + if function in {"count", "size", "idxmin", "idxmax"}: + res = cudf.Series([], dtype="int64") + else: + res = self.obj.copy(deep=True) res.index = self.grouping.keys if function in {"sum", "product"}: # For `sum` & `product`, boolean types @@ -2116,30 +2165,6 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) - @_cudf_nvtx_annotate - def pad(self, limit=None): - """Forward fill NA values. - - .. deprecated:: 23.06 - `pad` is deprecated, use `ffill` instead. - - Parameters - ---------- - limit : int, default None - Unsupported - """ - - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "pad is deprecated and will be removed in a future version. " - "Use ffill instead.", - FutureWarning, - ) - return self._scan_fill("ffill", limit) - def ffill(self, limit=None): """Forward fill NA values. @@ -2154,29 +2179,6 @@ def ffill(self, limit=None): return self._scan_fill("ffill", limit) - @_cudf_nvtx_annotate - def backfill(self, limit=None): - """Backward fill NA values. - - .. deprecated:: 23.06 - `backfill` is deprecated, use `bfill` instead. - - Parameters - ---------- - limit : int, default None - Unsupported - """ - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "backfill is deprecated and will be removed in a future version. " - "Use bfill instead.", - FutureWarning, - ) - return self._scan_fill("bfill", limit) - def bfill(self, limit=None): """Backward fill NA values. 
@@ -2206,11 +2208,11 @@ def fillna( ---------- value : scalar, dict Value to use to fill the holes. Cannot be specified with method. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + method : {'bfill', 'ffill', None}, default None Method to use for filling holes in reindexed Series - - pad/ffill: propagate last valid observation forward to next valid - - backfill/bfill: use next valid observation to fill gap + - ffill: propagate last valid observation forward to next valid + - bfill: use next valid observation to fill gap axis : {0 or 'index', 1 or 'columns'} Unsupported inplace : bool, default False @@ -2240,11 +2242,19 @@ def fillna( raise ValueError("Cannot specify both 'value' and 'method'.") if method is not None: - if method not in {"pad", "ffill", "backfill", "bfill"}: - raise ValueError( - "Method can only be of 'pad', 'ffill'," - "'backfill', 'bfill'." - ) + if method not in {"ffill", "bfill"}: + raise ValueError("Method can only be of 'ffill', 'bfill'.") + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is " + "deprecated and will raise in a future version. " + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + ) + return getattr(self, method)(limit) values = self.obj.__class__._from_data( @@ -2319,7 +2329,12 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): @_cudf_nvtx_annotate def pct_change( - self, periods=1, fill_method="ffill", axis=0, limit=None, freq=None + self, + periods=1, + fill_method=no_default, + axis=0, + limit=no_default, + freq=None, ): """ Calculates the percent change between sequential elements in the group. Parameters ---------- periods : int, default 1 Periods to shift for forming percent change. fill_method : str, default 'ffill' How to handle NAs before computing percent changes. + + .. deprecated:: 24.04 + All options of `fill_method` are deprecated + except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. + + .. deprecated:: 24.04 + `limit` is deprecated. freq : str, optional Increment to use from time series API. Not yet implemented. @@ -2345,26 +2367,39 @@ def pct_change( """ if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - if limit is not None: + if limit is not no_default: raise NotImplementedError("limit parameter not supported yet.") if freq is not None: raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + elif fill_method not in {no_default, None, "ffill", "bfill"}: raise ValueError( - "fill_method must be one of 'ffill', 'pad', " - "'bfill', or 'backfill'." + "fill_method must be one of None, 'ffill', or 'bfill'." ) - if fill_method in ("pad", "backfill"): - alternative = "ffill" if fill_method == "pad" else "bfill" - # Do not remove until pandas 2.0 support is added. + if fill_method not in (no_default, None) or limit is not no_default: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( - f"{fill_method} is deprecated and will be removed in a future " - f"version. Use f{alternative} instead.", + "The 'fill_method' keyword being not None and the 'limit' " + f"keyword in {type(self).__name__}.pct_change are " + "deprecated and will be removed in a future version. 
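[Reviewer sketch of the non-deprecated spelling under the `GroupBy.pct_change` change above: passing `fill_method=None` opts out of NA-filling and avoids the FutureWarning.]

>>> gb = cudf.DataFrame({"a": [1, 1, 2], "b": [2.0, 3.0, 4.0]}).groupby("a")
>>> gb.pct_change(fill_method=None)   # no warning; NA values are left unfilled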
" + "Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' " + "to not fill NA values.", FutureWarning, ) - filled = self.fillna(method=fill_method, limit=limit) + if fill_method in (no_default, None): + fill_method = "ffill" + if limit is no_default: + limit = None + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + filled = self.fillna(method=fill_method, limit=limit) + fill_grp = filled.groupby(self.grouping) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d2e22b320f9..c8eedae200b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2,21 +2,18 @@ from __future__ import annotations -import math +import operator import pickle import warnings from functools import cache, cached_property from numbers import Number from typing import ( Any, - Dict, List, Literal, MutableMapping, Optional, - Sequence, Tuple, - Type, Union, cast, ) @@ -24,8 +21,10 @@ import cupy import numpy as np import pandas as pd +from typing_extensions import Self import cudf +from cudf import _lib as libcudf from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence from cudf._lib.search import search_sorted @@ -38,9 +37,9 @@ is_list_like, is_scalar, is_signed_integer_dtype, - is_string_dtype, ) from cudf.core._base_index import BaseIndex +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_300 from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -56,6 +55,7 @@ from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame +from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring @@ -69,8 +69,33 @@ from cudf.utils.utils import _warn_no_dask_cudf, search_range +class IndexMeta(type): + """Custom metaclass for Index that overrides instance/subclass tests.""" + + def __call__(cls, data, *args, **kwargs): + if cls is Index: + return as_index( + arbitrary=data, + *args, + **kwargs, + ) + return super().__call__(data, *args, **kwargs) + + def __instancecheck__(self, instance): + if self is cudf.Index: + return isinstance(instance, BaseIndex) + else: + return type.__instancecheck__(self, instance) + + def __subclasscheck__(self, subclass): + if self is cudf.Index: + return issubclass(subclass, BaseIndex) + else: + return type.__subclasscheck__(self, subclass) + + def _lexsorted_equal_range( - idx: Union[GenericIndex, cudf.MultiIndex], + idx: Union[Index, cudf.MultiIndex], key_as_table: Frame, is_sorted: bool, ) -> Tuple[int, int, Optional[ColumnBase]]: @@ -103,18 +128,13 @@ def _index_from_data(data: MutableMapping, name: Any = no_default): values = next(iter(data.values())) if isinstance(values, NumericalColumn): - try: - index_class_type: Type[ - Union[GenericIndex, cudf.MultiIndex] - ] = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex + index_class_type = Index elif isinstance(values, DatetimeColumn): index_class_type = DatetimeIndex elif isinstance(values, TimeDeltaColumn): index_class_type = TimedeltaIndex elif isinstance(values, StringColumn): - index_class_type = StringIndex + index_class_type = Index elif isinstance(values, CategoricalColumn): index_class_type = CategoricalIndex 
elif isinstance(values, (IntervalColumn, StructColumn)): @@ -224,8 +244,8 @@ def __init__( self._end = self._start + self._step * (len(self._range) - 1) def _copy_type_metadata( - self: RangeIndex, other: RangeIndex, *, override_dtypes=None - ) -> RangeIndex: + self, other: RangeIndex, *, override_dtypes=None + ) -> Self: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. return self @@ -338,7 +358,7 @@ def __contains__(self, item): return False @_cudf_nvtx_annotate - def copy(self, name=None, deep=False, dtype=None, names=None): + def copy(self, name=None, deep=False): """ Make a copy of this object. @@ -347,46 +367,11 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name : object optional (default: None), name of index deep : Bool (default: False) Ignored for RangeIndex - dtype : numpy dtype optional (default: None) - Target dtype for underlying range data - - .. deprecated:: 23.02 - - The `dtype` parameter is deprecated and will be removed in - a future version of cudf. Use the `astype` method instead. - - names : list-like optional (default: False) - Kept compatibility with MultiIndex. Should not be used. - - .. deprecated:: 23.04 - - The parameter `names` is deprecated and will be removed in - a future version of cudf. Use the `name` parameter instead. Returns ------- - New RangeIndex instance with same range, casted to new dtype + New RangeIndex instance with same range """ - if dtype is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter dtype is deprecated and will be removed in a " - "future version. Use the astype method instead.", - FutureWarning, - ) - - if names is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter names is deprecated and will be removed in a " - "future version. Use the name parameter instead.", - FutureWarning, - ) - - dtype = self.dtype if dtype is None else dtype - - if not np.issubdtype(dtype, np.signedinteger): - raise ValueError(f"Expected Signed Integer Type, Got {dtype}") name = self.name if name is None else name @@ -566,7 +551,7 @@ def __rmul__(self, other): def _as_int_index(self): # Convert self to an integer index. This method is used to perform ops # that are not defined directly on RangeIndex. - return _dtype_to_index[self.dtype.type]._from_data(self._data) + return cudf.Index._from_data(self._data) @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -575,49 +560,48 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ) @_cudf_nvtx_annotate - def get_loc(self, key, method=None, tolerance=None): - # We should not actually remove this code until we have implemented the - # get_indexers method as an alternative, see - # https://github.com/rapidsai/cudf/issues/12312 - if method is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - f"Passing method to {self.__class__.__name__}.get_loc is " - "deprecated and will raise in a future version.", - FutureWarning, + def get_indexer(self, target, limit=None, method=None, tolerance=None): + target_col = cudf.core.column.as_column(target) + if method is not None or not isinstance( + target_col, cudf.core.column.NumericalColumn + ): + # TODO: See if we can implement this without converting to + # Integer index. 
+ return self._as_int_index().get_indexer( + target=target, limit=limit, method=method, tolerance=tolerance ) - # Given an actual integer, - idx = (key - self._start) / self._step - idx_int_upper_bound = (self._stop - self._start) // self._step - if method is None: - if tolerance is not None: - raise ValueError( - "tolerance argument only valid if using pad, " - "backfill or nearest lookups" - ) + if self.step > 0: + start, stop, step = self.start, self.stop, self.step + else: + # Reversed + reverse = self._range[::-1] + start, stop, step = reverse.start, reverse.stop, reverse.step - if idx > idx_int_upper_bound or idx < 0: - raise KeyError(key) + target_array = target_col.values + locs = target_array - start + valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) + locs[~valid] = -1 + locs[valid] = locs[valid] / step - idx_int = (key - self._start) // self._step - if idx_int != idx: - raise KeyError(key) - return idx_int + if step != self.step: + # Reversed + locs[valid] = len(self) - 1 - locs[valid] + return locs - if (method == "ffill" and idx < 0) or ( - method == "bfill" and idx > idx_int_upper_bound - ): + @_cudf_nvtx_annotate + def get_loc(self, key): + if not is_scalar(key): + raise TypeError("Should be a scalar-like") + idx = (key - self._start) / self._step + idx_int_upper_bound = (self._stop - self._start) // self._step + if idx > idx_int_upper_bound or idx < 0: raise KeyError(key) - round_method = { - "ffill": math.floor, - "bfill": math.ceil, - "nearest": round, - }[method] - if tolerance is not None and (abs(idx) * self._step > tolerance): + idx_int = (key - self._start) // self._step + if idx_int != idx: raise KeyError(key) - return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + return idx_int @_cudf_nvtx_annotate def _union(self, other, sort=None): @@ -684,7 +668,7 @@ def _union(self, other, sort=None): ): result = type(self)(start_r, end_r + step_s / 2, step_s / 2) if result is not None: - if sort is None and not result.is_monotonic_increasing: + if sort in {None, True} and not result.is_monotonic_increasing: return result.sort_values() else: return result @@ -697,7 +681,7 @@ def _union(self, other, sort=None): ) @_cudf_nvtx_annotate - def _intersection(self, other, sort=False): + def _intersection(self, other, sort=None): if not isinstance(other, RangeIndex): return self._try_reconstruct_range_index( super()._intersection(other, sort=sort) @@ -739,7 +723,7 @@ def _intersection(self, other, sort=False): if (self.step < 0 and other.step < 0) is not (new_index.step < 0): new_index = new_index[::-1] - if sort is None: + if sort in {None, True}: new_index = new_index.sort_values() return self._try_reconstruct_range_index(new_index) @@ -799,13 +783,13 @@ def sort_values( @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return _dtype_to_index[self.dtype.type]._from_data( + return cudf.Index._from_data( {self.name: self._values.take(gather_map, nullify, check_bounds)} ) @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): - return _dtype_to_index[self.dtype.type]._from_data( + return cudf.Index._from_data( {self.name: self._values.apply_boolean_mask(boolean_mask)} ) @@ -813,7 +797,7 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return _dtype_to_index[self.dtype.type]._from_data( + return cudf.Index._from_data( {self.name: self._as_int_index()._split(splits)} ) @@ -979,7 +963,7 
@@ def __dask_tokenize__(self): return (type(self), self.start, self.stop, self.step) -class GenericIndex(SingleColumnFrame, BaseIndex): +class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): """ An array of orderable values that represent the indices of another Column @@ -1001,21 +985,6 @@ class GenericIndex(SingleColumnFrame, BaseIndex): @_cudf_nvtx_annotate def __init__(self, data, **kwargs): kwargs = _setdefault_name(data, **kwargs) - - # normalize the input - if isinstance(data, cudf.Series): - data = data._column - elif isinstance(data, column.ColumnBase): - data = data - else: - if isinstance(data, (list, tuple)): - if len(data) == 0: - data = np.asarray([], dtype="int64") - else: - data = np.asarray(data) - data = column.as_column(data) - assert isinstance(data, (NumericalColumn, StringColumn)) - name = kwargs.get("name") super().__init__({name: data}) @@ -1047,8 +1016,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # pandas returns numpy arrays when the outputs are boolean. for i, o in enumerate(out): # We explicitly _do not_ use isinstance here: we want only - # boolean GenericIndexes, not dtype-specific subclasses. - if type(o) is GenericIndex and o.dtype.kind == "b": + # boolean Indexes, not dtype-specific subclasses. + if type(o) is Index and o.dtype.kind == "b": out[i] = o.values return out[0] if ufunc.nout == 1 else tuple(out) @@ -1057,14 +1026,29 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @classmethod @_cudf_nvtx_annotate - def _from_data( - cls, data: MutableMapping, name: Any = no_default - ) -> GenericIndex: + def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out = super()._from_data(data=data) if name is not no_default: out.name = name return out + @classmethod + @_cudf_nvtx_annotate + def from_arrow(cls, obj): + try: + return cls(ColumnBase.from_arrow(obj)) + except TypeError: + # Try interpreting object as a MultiIndex before failing. + return cudf.MultiIndex.from_arrow(obj) + + @cached_property + def is_monotonic_increasing(self): + return super().is_monotonic_increasing + + @cached_property + def is_monotonic_decreasing(self): + return super().is_monotonic_decreasing + def _binaryop( self, other: Frame, @@ -1094,21 +1078,19 @@ def _binaryop( # pandas returns numpy arrays when the outputs are boolean. We # explicitly _do not_ use isinstance here: we want only boolean - # GenericIndexes, not dtype-specific subclasses. - if ( - isinstance(ret, (GenericIndex, cudf.Series)) - and ret.dtype.kind == "b" - ): + # Indexes, not dtype-specific subclasses. + if isinstance(ret, (Index, cudf.Series)) and ret.dtype.kind == "b": if ret._column.has_nulls(): ret = ret.fillna(op == "__ne__") + return ret.values return ret # Override just to make mypy happy. @_cudf_nvtx_annotate def _copy_type_metadata( - self: GenericIndex, other: GenericIndex, *, override_dtypes=None - ) -> GenericIndex: + self, other: Self, *, override_dtypes=None + ) -> Self: return super()._copy_type_metadata( other, override_dtypes=override_dtypes ) @@ -1122,6 +1104,19 @@ def _values(self): @_cudf_nvtx_annotate def _concat(cls, objs): non_empties = [index for index in objs if len(index)] + if len(objs) != len(non_empties): + # Do not remove until pandas-3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. 
In a future version, this will no longer exclude " + "empty items when determining the result dtype. " + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + ) if all(isinstance(obj, RangeIndex) for obj in non_empties): result = _concat_range_index(non_empties) else: @@ -1174,7 +1169,7 @@ def equals(self, other): return False @_cudf_nvtx_annotate - def copy(self, name=None, deep=False, dtype=None, names=None): + def copy(self, name=None, deep=False): """ Make a copy of this object. @@ -1185,107 +1180,26 @@ def copy(self, name=None, deep=False, dtype=None, names=None): deep : bool, default True Make a deep copy of the data. With ``deep=False`` the original data is used - dtype : numpy dtype, default None - Target datatype to cast into, use original dtype when None - - .. deprecated:: 23.02 - - The `dtype` parameter is deprecated and will be removed in - a future version of cudf. Use the `astype` method instead. - - names : list-like, default False - Kept compatibility with MultiIndex. Should not be used. - - .. deprecated:: 23.04 - - The parameter `names` is deprecated and will be removed in - a future version of cudf. Use the `name` parameter instead. Returns ------- - New index instance, casted to new dtype + New index instance. """ - if dtype is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter dtype is deprecated and will be removed in a " - "future version. Use the astype method instead.", - FutureWarning, - ) - - if names is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter names is deprecated and will be removed in a " - "future version. Use the name parameter instead.", - FutureWarning, - ) - - dtype = self.dtype if dtype is None else dtype name = self.name if name is None else name - col = self._values.astype(dtype) - return _index_from_data({name: col.copy(True) if deep else col}) + return _index_from_data( + {name: self._values.copy(True) if deep else self._values} + ) @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): return _index_from_data(super().astype({self.name: dtype}, copy)) @_cudf_nvtx_annotate - def get_loc(self, key, method=None, tolerance=None): - """Get integer location, slice or boolean mask for requested label. - - Parameters - ---------- - key : label - method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional - - default: exact matches only. - - pad / ffill: find the PREVIOUS index value if no exact match. - - backfill / bfill: use NEXT index value if no exact match. - - nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index - value. - tolerance : int or float, optional - Maximum distance from index value for inexact matches. The value - of the index at the matching location must satisfy the equation - ``abs(index[loc] - key) <= tolerance``. 
+ def get_indexer(self, target, method=None, limit=None, tolerance=None): + if is_scalar(target): + raise TypeError("Should be a sequence") - Returns - ------- - int or slice or boolean mask - - If result is unique, return integer index - - If index is monotonic, loc is returned as a slice object - - Otherwise, a boolean mask is returned - - Examples - -------- - >>> unique_index = cudf.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 - >>> monotonic_index = cudf.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - >>> non_monotonic_index = cudf.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True]) - >>> numeric_unique_index = cudf.Index([1, 2, 3]) - >>> numeric_unique_index.get_loc(3) - 2 - """ - # We should not actually remove this code until we have implemented the - # get_indexers method as an alternative, see - # https://github.com/rapidsai/cudf/issues/12312 - if method is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - f"Passing method to {self.__class__.__name__}.get_loc is " - "deprecated and will raise in a future version.", - FutureWarning, - ) - if tolerance is not None: - raise NotImplementedError( - "Parameter tolerance is not supported yet." - ) if method not in { None, "ffill", @@ -1299,6 +1213,9 @@ def get_loc(self, key, method=None, tolerance=None): f" or nearest. Got {method}" ) + if not self.is_unique: + raise ValueError("Cannot get index for a non-unique Index.") + is_sorted = ( self.is_monotonic_increasing or self.is_monotonic_decreasing ) @@ -1309,37 +1226,63 @@ def get_loc(self, key, method=None, tolerance=None): "is specified." ) - key_as_table = cudf.core.frame.Frame( - {"None": as_column(key, length=1)} + needle = as_column(target) + result = cudf.core.column.full( + len(needle), + fill_value=-1, + dtype=libcudf.types.size_type_dtype, + ) + + if not len(self): + return result.values + try: + lcol, rcol = _match_join_keys(needle, self._column, "inner") + except ValueError: + return result.values + + scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") + (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) + result_series = cudf.Series(result) + + if method in {"ffill", "bfill", "pad", "backfill"}: + result_series = _get_indexer_basic( + index=self, + positions=result_series, + method=method, + target_col=cudf.Series(needle), + tolerance=tolerance, + ) + elif method == "nearest": + result_series = _get_nearest_indexer( + index=self, + positions=result_series, + target_col=cudf.Series(needle), + tolerance=tolerance, + ) + elif method is not None: + raise ValueError( + f"{method=} is unsupported, only supported values are: " + "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}" + ) + + return result_series.to_cupy() + + @_cudf_nvtx_annotate + def get_loc(self, key): + if not is_scalar(key): + raise TypeError("Should be a scalar-like") + + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing ) + + target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, key_as_table, is_sorted + self, target_as_table, is_sorted ) if lower_bound == upper_bound: - # Key not found, apply method - if method in ("pad", "ffill"): - if lower_bound == 0: - raise KeyError(key) - return lower_bound - 1 - elif method in ("backfill", "bfill"): - if lower_bound == self._data.nrows: - raise KeyError(key) - return lower_bound - elif method == 
"nearest": - if lower_bound == self._data.nrows: - return lower_bound - 1 - elif lower_bound == 0: - return 0 - lower_val = self._column.element_indexing(lower_bound - 1) - upper_val = self._column.element_indexing(lower_bound) - return ( - lower_bound - 1 - if abs(lower_val - key) < abs(upper_val - key) - else lower_bound - ) - else: - raise KeyError(key) + raise KeyError(key) if lower_bound + 1 == upper_bound: # Search result is unique, return int. @@ -1371,7 +1314,9 @@ def __repr__(self): top = self[0:mr] bottom = self[-1 * mr :] - preprocess = cudf.concat([top, bottom]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + preprocess = cudf.concat([top, bottom]) else: preprocess = self @@ -1401,16 +1346,17 @@ def __repr__(self): output = output.replace("nan", str(cudf.NA)) elif preprocess._values.nullable: - output = repr(self._clean_nulls_from_index().to_pandas()) - - if not isinstance(self, StringIndex): + if isinstance(self._values, StringColumn): + output = repr(self.to_pandas(nullable=True)) + else: + output = repr(self._clean_nulls_from_index().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. # Note : The replacing of single quotes has - # to happen only in case of non-StringIndex types, + # to happen only in case of non-Index[string] types, # as we want to preserve single quotes in case - # of StringIndex and it is valid to have them. + # of Index[string] and it is valid to have them. output = output.replace("'", "") else: output = repr(preprocess.to_pandas()) @@ -1455,7 +1401,7 @@ def __getitem__(self, index): @_cudf_nvtx_annotate def dtype(self): """ - `dtype` of the underlying values in GenericIndex. + `dtype` of the underlying values in Index. 
""" return self._values.dtype @@ -1472,19 +1418,21 @@ def notna(self): notnull = notna def _is_numeric(self): - return False + return isinstance( + self._values, cudf.core.column.NumericalColumn + ) and self.dtype != cudf.dtype("bool") def _is_boolean(self): - return True + return self.dtype == cudf.dtype("bool") def _is_integer(self): - return False + return cudf.api.types.is_integer_dtype(self.dtype) def _is_floating(self): - return False + return cudf.api.types.is_float_dtype(self.dtype) def _is_object(self): - return False + return isinstance(self._values, cudf.core.column.StringColumn) def _is_categorical(self): return False @@ -1553,498 +1501,103 @@ def where(self, cond, other=None, inplace=False): def values(self): return self._column.values - def __contains__(self, item): - return item in self._values - - def _clean_nulls_from_index(self): - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return cudf.Index( - self._values.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - - def any(self): - return self._values.any() - - def to_pandas(self, *, nullable: bool = False) -> pd.Index: - return pd.Index( - self._values.to_pandas(nullable=nullable), name=self.name - ) - - def append(self, other): - if is_list_like(other): - to_concat = [self] - for obj in other: - if not isinstance(obj, BaseIndex): - raise TypeError("all inputs must be Index") - to_concat.append(obj) - else: - this = self - other = cudf.Index(other) - - if len(this) == 0 or len(other) == 0: - # we'll filter out empties later in ._concat - to_concat = [this, other] - else: - if is_mixed_with_object_dtype(this, other): - got_dtype = ( - other.dtype - if this.dtype == cudf.dtype("object") - else this.dtype - ) - raise TypeError( - f"cudf does not support appending an Index of " - f"dtype `{cudf.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ) - - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) - to_concat = [this, other] - - return self._concat(to_concat) - - def unique(self): - return cudf.core.index._index_from_data( - {self.name: self._values.unique()}, name=self.name - ) - - def isin(self, values): - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a {type(values).__name__}" - ) - - return self._values.isin(values).values - - def _indices_of(self, value): - """Return indices of value in index""" - return self._column.indices_of(value) - - @cache - @_warn_no_dask_cudf - def __dask_tokenize__(self): - # We can use caching, because an index is immutable - return super().__dask_tokenize__() - - -class NumericIndex(GenericIndex): - """Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Index - """ - - # Subclasses must define the dtype they are associated with. - _dtype: Union[None, Type[np.number]] = None - - @_cudf_nvtx_annotate - def __init__(self, data=None, dtype=None, copy=False, name=None): - # Do not remove until pandas 2.0 support is added. 
- warnings.warn( - f"cudf.{self.__class__.__name__} is deprecated and will be " - "removed from cudf in a future version. Use cudf.Index with the " - "appropriate dtype instead.", - FutureWarning, - ) - - dtype = type(self)._dtype - if copy: - data = column.as_column(data, dtype=dtype).copy() - - kwargs = _setdefault_name(data, name=name) - - data = column.as_column(data, dtype=dtype) - - super().__init__(data, **kwargs) - - def _is_numeric(self): - return True - - def _is_boolean(self): - return False - - def _is_integer(self): - return True - - def _is_floating(self): - return False - - def _is_object(self): - return False - - def _is_categorical(self): - return False - - def _is_interval(self): - return False - - -class Int8Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int8Index is a special case of Index with purely - integer(``int8``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - Int8Index - """ - - _dtype = np.int8 - - -class Int16Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int16Index is a special case of Index with purely - integer(``int16``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - Int16Index - """ - - _dtype = np.int16 - - -class Int32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int32Index is a special case of Index with purely - integer(``int32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - Int32Index - """ - - _dtype = np.int32 - - -class Int64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int64Index is a special case of Index with purely - integer(``int64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - Int64Index - """ - - _dtype = np.int64 - - -class UInt8Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt8Index is a special case of Index with purely - integer(``uint64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. 
- - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - UInt8Index - """ - - _dtype = np.uint8 - - -class UInt16Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt16Index is a special case of Index with purely - integer(``uint16``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - UInt16Index - """ - - _dtype = np.uint16 - - -class UInt32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt32Index is a special case of Index with purely - integer(``uint32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - UInt32Index - """ - - _dtype = np.uint32 - - -class UInt64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt64Index is a special case of Index with purely - integer(``uint64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - UInt64Index - """ - - _dtype = np.uint64 - - -class Float32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Float32Index is a special case of Index with purely - float(``float32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. 
- - Attributes - ---------- - None + def __contains__(self, item): + return item in self._values - Methods - ------- - None + def _clean_nulls_from_index(self): + if self._values.has_nulls(): + fill_value = ( + str(cudf.NaT) + if isinstance(self, (DatetimeIndex, TimedeltaIndex)) + else str(cudf.NA) + ) + return cudf.Index( + self._values.astype("str").fillna(fill_value), + name=self.name, + ) - Returns - ------- - Float32Index - """ + return self - _dtype = np.float32 + def any(self): + return self._values.any() - def _is_integer(self): - return False + def to_pandas(self, *, nullable: bool = False) -> pd.Index: + return pd.Index( + self._values.to_pandas(nullable=nullable), name=self.name + ) - def _is_floating(self): - return True + def append(self, other): + if is_list_like(other): + to_concat = [self] + for obj in other: + if not isinstance(obj, BaseIndex): + raise TypeError("all inputs must be Index") + to_concat.append(obj) + else: + this = self + other = cudf.Index(other) + if len(this) == 0 or len(other) == 0: + # we'll filter out empties later in ._concat + to_concat = [this, other] + else: + if is_mixed_with_object_dtype(this, other): + got_dtype = ( + other.dtype + if this.dtype == cudf.dtype("object") + else this.dtype + ) + raise TypeError( + f"cudf does not support appending an Index of " + f"dtype `{cudf.dtype('object')}` with an Index " + f"of dtype `{got_dtype}`, please type-cast " + f"either one of them to same dtypes." + ) -class Float64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Float64Index is a special case of Index with purely - float(``float64``) labels. + if isinstance(self._values, cudf.core.column.NumericalColumn): + if self.dtype != other.dtype: + this, other = numeric_normalize_types(self, other) + to_concat = [this, other] - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. + return self._concat(to_concat) - Attributes - ---------- - None + def unique(self): + return cudf.core.index._index_from_data( + {self.name: self._values.unique()}, name=self.name + ) - Methods - ------- - None + def isin(self, values): + if is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a {type(values).__name__}" + ) - Returns - ------- - Float64Index - """ + return self._values.isin(values).values - _dtype = np.float64 + def _indices_of(self, value): + """Return indices of value in index""" + return self._column.indices_of(value) - def _is_integer(self): - return False + @copy_docstring(StringMethods) # type: ignore + @property + @_cudf_nvtx_annotate + def str(self): + if isinstance(self._values, cudf.core.column.StringColumn): + return StringMethods(parent=self) + else: + raise AttributeError( + "Can only use .str accessor with string values!" + ) - def _is_floating(self): - return True + @cache + @_warn_no_dask_cudf + def __dask_tokenize__(self): + # We can use caching, because an index is immutable + return super().__dask_tokenize__() -class DatetimeIndex(GenericIndex): +class DatetimeIndex(Index): """ Immutable , ordered and sliceable sequence of datetime64 data, represented internally as int64. 
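
The typed numeric index subclasses deleted above (`Int8Index` through
`Float64Index`), along with `StringIndex` further below, collapse into a
plain `cudf.Index` carrying a dtype, while the `IndexMeta` metaclass keeps
`isinstance`/`issubclass` checks against `cudf.Index` working for every
index type. A short sketch of the resulting behavior (assumes cudf is
installed on a CUDA-capable machine):

    import cudf

    # Previously cudf.Int64Index([1, 2, 3]) and cudf.StringIndex(["a", "b"])
    idx_num = cudf.Index([1, 2, 3], dtype="int64")
    idx_str = cudf.Index(["a", "b"])

    # IndexMeta.__instancecheck__ reroutes to BaseIndex, so every concrete
    # index type still passes an isinstance check against cudf.Index.
    assert isinstance(cudf.RangeIndex(3), cudf.Index)
    assert isinstance(idx_str, cudf.Index)
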
@@ -2172,7 +1725,7 @@ def __init__( @_cudf_nvtx_annotate def _copy_type_metadata( self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None - ) -> GenericIndex: + ) -> Index: super()._copy_type_metadata(other, override_dtypes=override_dtypes) self._freq = _validate_freq(other._freq) return self @@ -2194,8 +1747,8 @@ def __getitem__(self, index): return value @_cudf_nvtx_annotate - def copy(self, name=None, deep=False, dtype=None, names=None): - idx_copy = super().copy(name=name, deep=deep, dtype=dtype, names=names) + def copy(self, name=None, deep=False): + idx_copy = super().copy(name=name, deep=deep) return idx_copy._copy_type_metadata(self) def searchsorted( @@ -2225,7 +1778,7 @@ def year(self): >>> datetime_index DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') >>> datetime_index.year - Int16Index([2000, 2001, 2002], dtype='int16') + Index([2000, 2001, 2002], dtype='int16') """ # noqa: E501 return self._get_dt_field("year") @@ -2244,7 +1797,7 @@ def month(self): >>> datetime_index DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') >>> datetime_index.month - Int16Index([1, 2, 3], dtype='int16') + Index([1, 2, 3], dtype='int16') """ # noqa: E501 return self._get_dt_field("month") @@ -2263,7 +1816,7 @@ def day(self): >>> datetime_index DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') >>> datetime_index.day - Int16Index([1, 2, 3], dtype='int16') + Index([1, 2, 3], dtype='int16') """ # noqa: E501 return self._get_dt_field("day") @@ -2284,7 +1837,7 @@ def hour(self): '2000-01-01 02:00:00'], dtype='datetime64[ns]') >>> datetime_index.hour - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("hour") @@ -2305,7 +1858,7 @@ def minute(self): '2000-01-01 00:02:00'], dtype='datetime64[ns]') >>> datetime_index.minute - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("minute") @@ -2326,7 +1879,7 @@ def second(self): '2000-01-01 00:00:02'], dtype='datetime64[ns]') >>> datetime_index.second - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("second") @@ -2347,7 +1900,7 @@ def microsecond(self): '2000-01-01 00:00:00.000002'], dtype='datetime64[ns]') >>> datetime_index.microsecond - Int32Index([0, 1, 2], dtype='int32') + Index([0, 1, 2], dtype='int32') """ # noqa: E501 return as_index( ( @@ -2379,7 +1932,7 @@ def nanosecond(self): '2000-01-01 00:00:00.000000002'], dtype='datetime64[ns]') >>> datetime_index.nanosecond - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("nanosecond") @@ -2401,7 +1954,7 @@ def weekday(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.weekday - Int16Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ return self._get_dt_field("weekday") @@ -2423,7 +1976,7 @@ def dayofweek(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.dayofweek - Int16Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ return self._get_dt_field("weekday") @@ -2446,7 +1999,7 @@ def dayofyear(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.dayofyear - Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') + Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ return self._get_dt_field("day_of_year") @@ -2469,7 +2022,7 @@ def day_of_year(self): 
'2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.day_of_year - Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') + Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ return self._get_dt_field("day_of_year") @@ -2504,7 +2057,7 @@ def quarter(self): Returns ------- - Int8Index + Index Integer indicating which quarter the date belongs to. Examples @@ -2513,7 +2066,7 @@ def quarter(self): >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", ... "1999-12-31 18:40:00"]) >>> gIndex.quarter - Int8Index([2, 4], dtype='int8') + Index([2, 4], dtype='int8') """ res = extract_quarter(self._values) return Index(res, dtype="int8") @@ -2544,13 +2097,17 @@ def isocalendar(self): def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - # TODO: no need to convert to nanos with Pandas 2.x - if isinstance(self.dtype, pd.DatetimeTZDtype): - nanos = self._values.astype( - pd.DatetimeTZDtype("ns", self.dtype.tz) - ) + + if PANDAS_GE_200: + nanos = self._values else: - nanos = self._values.astype("datetime64[ns]") + # no need to convert to nanos with Pandas 2.x + if isinstance(self.dtype, pd.DatetimeTZDtype): + nanos = self._values.astype( + pd.DatetimeTZDtype("ns", self.dtype.tz) + ) + else: + nanos = self._values.astype("datetime64[ns]") freq = ( self._freq._maybe_as_fast_pandas_offset() @@ -2563,7 +2120,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object - # but we need a NumericalColumn for GenericIndex.. + # but we need a NumericalColumn for Index.. # how should this be handled? out_column = column.build_column( data=out_column.base_data, @@ -2775,7 +2332,7 @@ def tz_convert(self, tz): return DatetimeIndex._from_data({self.name: result_col}) -class TimedeltaIndex(GenericIndex): +class TimedeltaIndex(Index): """ Immutable, ordered and sliceable sequence of timedelta64 data, represented internally as int64. @@ -2887,7 +2444,10 @@ def days(self): """ Number of days for each element. """ - return as_index(arbitrary=self._values.days, name=self.name) + # Need to specifically return `int64` to avoid overflow. + return as_index( + arbitrary=self._values.days, name=self.name, dtype="int64" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2895,7 +2455,9 @@ def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index(arbitrary=self._values.seconds, name=self.name) + return as_index( + arbitrary=self._values.seconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2903,7 +2465,9 @@ def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return as_index(arbitrary=self._values.microseconds, name=self.name) + return as_index( + arbitrary=self._values.microseconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2912,7 +2476,9 @@ def nanoseconds(self): Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. 
""" - return as_index(arbitrary=self._values.nanoseconds, name=self.name) + return as_index( + arbitrary=self._values.nanoseconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2938,7 +2504,7 @@ def _is_boolean(self): return False -class CategoricalIndex(GenericIndex): +class CategoricalIndex(Index): """ A categorical of orderable values that represent the indices of another Column @@ -3042,7 +2608,6 @@ def __init__( data = data.as_ordered() elif ordered is False and data.ordered is True: data = data.as_unordered() - super().__init__(data, **kwargs) @property # type: ignore @@ -3181,7 +2746,7 @@ def interval_range( return IntervalIndex(interval_col, closed=closed) -class IntervalIndex(GenericIndex): +class IntervalIndex(Index): """ Immutable index of intervals that are closed on the same side. @@ -3350,81 +2915,6 @@ def _clean_nulls_from_index(self): return self -class StringIndex(GenericIndex): - """String defined indices into another Column - - .. deprecated:: 23.06 - `StringIndex` is deprecated, use `Index` instead. - - Attributes - ---------- - _values: A StringColumn object or NDArray of strings - name: A string - """ - - @_cudf_nvtx_annotate - def __init__(self, values, copy=False, **kwargs): - # Do not remove until pandas 2.0 support is added. - warnings.warn( - f"cudf.{self.__class__.__name__} is deprecated and will be " - "removed from cudf in a future version. Use cudf.Index with the " - "appropriate dtype instead.", - FutureWarning, - ) - kwargs = _setdefault_name(values, **kwargs) - if isinstance(values, StringColumn): - values = values.copy(deep=copy) - elif isinstance(values, StringIndex): - values = values._values.copy(deep=copy) - else: - values = column.as_column(values, dtype="str") - if not is_string_dtype(values.dtype): - raise ValueError( - "Couldn't create StringIndex from passed in object" - ) - - super().__init__(values, **kwargs) - - @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.Index: - return pd.Index( - self.to_numpy(na_value=None), - name=self.name, - dtype=pd.StringDtype() if nullable else "object", - ) - - @_cudf_nvtx_annotate - def __repr__(self): - return ( - f"{self.__class__.__name__}({self._values.values_host}," - f" dtype='object'" - + ( - f", name={pd.io.formats.printing.default_pprint(self.name)}" - if self.name is not None - else "" - ) - + ")" - ) - - @copy_docstring(StringMethods) # type: ignore - @property - @_cudf_nvtx_annotate - def str(self): - return StringMethods(parent=self) - - def _clean_nulls_from_index(self): - if self._values.has_nulls(): - return self.fillna(str(cudf.NA)) - else: - return self - - def _is_boolean(self): - return False - - def _is_object(self): - return True - - @_cudf_nvtx_annotate def as_index( arbitrary, nan_as_null=None, copy=False, name=no_default, dtype=None @@ -3455,7 +2945,7 @@ def as_index( result : subclass of Index - CategoricalIndex for Categorical input. - DatetimeIndex for Datetime input. - - GenericIndex for all other inputs. + - Index for all other inputs. Notes ----- @@ -3469,6 +2959,13 @@ def as_index( * numpy array * pyarrow array * pandas.Categorical + + Returns + ------- + result : subclass of Index + - CategoricalIndex for Categorical input. + - DatetimeIndex for Datetime input. + - Index for all other inputs. 
""" if name is no_default: @@ -3523,131 +3020,12 @@ def as_index( return idx -_dtype_to_index: Dict[Any, Type[NumericIndex]] = { - np.int8: Int8Index, - np.int16: Int16Index, - np.int32: Int32Index, - np.int64: Int64Index, - np.uint8: UInt8Index, - np.uint16: UInt16Index, - np.uint32: UInt32Index, - np.uint64: UInt64Index, - np.float32: Float32Index, - np.float64: Float64Index, -} - - def _setdefault_name(values, **kwargs): if kwargs.get("name") is None: kwargs["name"] = getattr(values, "name", None) return kwargs -class IndexMeta(type): - """Custom metaclass for Index that overrides instance/subclass tests.""" - - def __instancecheck__(self, instance): - return isinstance(instance, BaseIndex) - - def __subclasscheck__(self, subclass): - return issubclass(subclass, BaseIndex) - - -class Index(BaseIndex, metaclass=IndexMeta): - """The basic object storing row labels for all cuDF objects. - - Parameters - ---------- - data : array-like (1-dimensional)/ DataFrame - If it is a DataFrame, it will return a MultiIndex - dtype : NumPy dtype (default: object) - If dtype is None, we find the dtype that best fits the data. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - tupleize_cols : bool (default: True) - When True, attempt to create a MultiIndex if possible. - tupleize_cols == False is not yet supported. - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Returns - ------- - Index - cudf Index - - Warnings - -------- - This class should not be subclassed. It is designed as a factory for - different subclasses of `BaseIndex` depending on the provided input. - If you absolutely must, and if you're intimately familiar with the - internals of cuDF, subclass `BaseIndex` instead. - - Examples - -------- - >>> import cudf - >>> cudf.Index([1, 2, 3], dtype="uint64", name="a") - UInt64Index([1, 2, 3], dtype='uint64', name='a') - """ - - @_cudf_nvtx_annotate - def __new__( - cls, - data=None, - dtype=None, - copy=False, - name=no_default, - tupleize_cols=True, - nan_as_null=True, - **kwargs, - ): - assert ( - cls is Index - ), "Index cannot be subclassed, extend BaseIndex instead." - if tupleize_cols is not True: - raise NotImplementedError( - "tupleize_cols != True is not yet supported" - ) - - res = as_index( - data, - copy=copy, - dtype=dtype, - name=name, - nan_as_null=nan_as_null, - **kwargs, - ) - if ( - isinstance(data, Sequence) - and not isinstance(data, range) - and len(data) == 0 - and dtype is None - and getattr(data, "dtype", None) is None - ): - return res.astype("str") - return res - - @classmethod - @_cudf_nvtx_annotate - def from_arrow(cls, obj): - try: - return cls(ColumnBase.from_arrow(obj)) - except TypeError: - # Try interpreting object as a MultiIndex before failing. 
- return cudf.MultiIndex.from_arrow(obj) - - @cached_property - def is_monotonic_increasing(self): - return super().is_monotonic_increasing - - @cached_property - def is_monotonic_decreasing(self): - return super().is_monotonic_decreasing - - @_cudf_nvtx_annotate def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ @@ -3708,6 +3086,74 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: return old_r, old_s, old_t +def _get_indexer_basic(index, positions, method, target_col, tolerance): + # `positions` will be modified in-place, so it is the + # responsibility of the caller to decide whether or not + # to make a copy of it before passing it to this method. + nonexact = positions == -1 + positions[nonexact] = index.searchsorted( + target_col[nonexact], + side="left" if method in {"pad", "ffill"} else "right", + ) + if method in {"pad", "ffill"}: + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + positions[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + positions[positions == len(index)] = np.int32(-1) + + if tolerance is not None: + distance = abs(index[positions] - target_col) + return positions.where(distance <= tolerance, -1) + return positions + + +def _get_nearest_indexer( + index: Index, + positions: cudf.Series, + target_col: cudf.core.column.ColumnBase, + tolerance: Union[int, float], +): + """ + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other. 
+ """ + left_indexer = _get_indexer_basic( + index=index, + positions=positions.copy(deep=True), + method="pad", + target_col=target_col, + tolerance=tolerance, + ) + right_indexer = _get_indexer_basic( + index=index, + positions=positions.copy(deep=True), + method="backfill", + target_col=target_col, + tolerance=tolerance, + ) + + left_distances = abs(index[left_indexer] - target_col) + right_distances = abs(index[right_indexer] - target_col) + + op = operator.lt if index.is_monotonic_increasing else operator.le + indexer = left_indexer.where( + op(left_distances, right_distances) | (right_indexer == -1), + right_indexer, + ) + + if tolerance is not None: + distance = abs(index[indexer] - target_col) + return indexer.where(distance <= tolerance, -1) + return indexer + + def _validate_freq(freq: Any) -> cudf.DateOffset: if isinstance(freq, str): return cudf.DateOffset._from_freqstr(freq) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6c0aba34970..0a0cefde9cd 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -48,6 +48,7 @@ is_scalar, ) from cudf.core._base_index import BaseIndex +from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column, full from cudf.core.column_accessor import ColumnAccessor @@ -65,8 +66,10 @@ _post_process_output_col, _return_arr_from_dtype, ) -from cudf.utils import docutils +from cudf.core.window import Rolling +from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig +from cudf.utils.docutils import copy_docstring from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf @@ -199,9 +202,18 @@ def _get_label_range_or_mask(index, start, stop, step): if start is not None and stop is not None: if start > stop: return slice(0, 0, None) - # TODO: Once Index binary ops are updated to support logical_and, - # can use that instead of using cupy. - boolean_mask = cp.logical_and((index >= start), (index <= stop)) + if (start in index) and (stop in index): + # when we have a non-monotonic datetime index, return + # values in the slice defined by index_of(start) and + # index_of(end) + start_loc = index.get_loc(start.to_datetime64()) + stop_loc = index.get_loc(stop.to_datetime64()) + 1 + return slice(start_loc, stop_loc) + else: + raise KeyError( + "Value based partial slicing on non-monotonic DatetimeIndexes " + "with non-existing keys is not allowed.", + ) elif start is not None: boolean_mask = index >= start else: @@ -495,6 +507,45 @@ def empty(self): """ return self.size == 0 + @_cudf_nvtx_annotate + @ioutils.doc_to_json() + def to_json(self, path_or_buf=None, *args, **kwargs): + """{docstring}""" + + return cudf.io.json.to_json( + self, path_or_buf=path_or_buf, *args, **kwargs + ) + + @_cudf_nvtx_annotate + @ioutils.doc_to_hdf() + def to_hdf(self, path_or_buf, key, *args, **kwargs): + """{docstring}""" + + cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + + @_cudf_nvtx_annotate + def to_string(self): + r""" + Convert to string + + cuDF uses Pandas internals for efficient string formatting. + Set formatting options using pandas string formatting options and + cuDF objects will print identically to Pandas objects. + + cuDF supports `null/None` as a value in any column type, which + is transparently supported during this output process. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2] + >>> df['val'] = [float(i + 10) for i in range(3)] + >>> df.to_string() + ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' + """ + return str(self) + def copy(self, deep: bool = True) -> Self: """Make a copy of this object's indices and data. @@ -589,11 +640,11 @@ def index(self, value): def replace( self, to_replace=None, - value=None, + value=no_default, inplace=False, limit=None, regex=False, - method=None, + method=no_default, ): """Replace values given in ``to_replace`` with ``value``. @@ -794,12 +845,30 @@ def replace( if regex: raise NotImplementedError("regex parameter is not implemented yet") - if method not in ("pad", None): - raise NotImplementedError( - "method parameter is not implemented yet" + if method is not no_default: + warnings.warn( + "The 'method' keyword in " + f"{type(self).__name__}.replace is deprecated and " + "will be removed in a future version.", + FutureWarning, ) + elif method not in {"pad", None, no_default}: + raise NotImplementedError("method parameter is not implemented") - if not (to_replace is None and value is None): + if ( + value is no_default + and method is no_default + and not is_dict_like(to_replace) + and regex is False + ): + warnings.warn( + f"{type(self).__name__}.replace without 'value' and with " + "non-dict-like 'to_replace' is deprecated " + "and will raise in a future version. " + "Explicitly specify the new values instead.", + FutureWarning, + ) + if not (to_replace is None and value is no_default): copy_data = {} ( all_na_per_column, @@ -959,6 +1028,892 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): output._copy_type_metadata(self, include_index=False) return self._mimic_inplace(output, inplace=inplace) + @_cudf_nvtx_annotate + def abs(self): + """ + Return a Series/DataFrame with absolute numeric value of each element. + + This function only applies to elements that are all numeric. + + Returns + ------- + DataFrame/Series + Absolute value of each element. + + Examples + -------- + Absolute numeric values in a Series + + >>> s = cudf.Series([-1.10, 2, -3.33, 4]) + >>> s.abs() + 0 1.10 + 1 2.00 + 2 3.33 + 3 4.00 + dtype: float64 + """ + return self._unaryop("abs") + + @_cudf_nvtx_annotate + def dot(self, other, reflect=False): + """ + Get dot product of frame and other, (binary operator `dot`). + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, + `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, + `@`. + + Parameters + ---------- + other : Sequence, Series, or DataFrame + Any multiple element data structure, or list-like object. + reflect : bool, default False + If ``True``, swap the order of the operands. See + https://docs.python.org/3/reference/datamodel.html#object.__ror__ + for more information on when this is necessary. + + Returns + ------- + scalar, Series, or DataFrame + The result of the operation. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame([[1, 2, 3, 4], + ... [5, 6, 7, 8]]) + >>> df @ df.T + 0 1 + 0 30 70 + 1 70 174 + >>> s = cudf.Series([1, 1, 1, 1]) + >>> df @ s + 0 10 + 1 26 + dtype: int64 + >>> [1, 2, 3, 4] @ s + 10 + """ + # TODO: This function does not currently support nulls. 
+ lhs = self.values + result_index = None + result_cols = None + if isinstance(self, cudf.Series) and isinstance( + other, (cudf.Series, cudf.DataFrame) + ): + common = self.index.union(other.index) + if len(common) > len(self.index) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + lhs = self.reindex(index=common, copy=False).values + rhs = other.reindex(index=common, copy=False).values + if isinstance(other, cudf.DataFrame): + result_index = other._data.to_pandas_index() + elif isinstance(self, cudf.DataFrame) and isinstance( + other, (cudf.Series, cudf.DataFrame) + ): + common = self._data.to_pandas_index().union( + other.index.to_pandas() + ) + if len(common) > len(self._data.names) or len(common) > len( + other.index + ): + raise ValueError("matrices are not aligned") + + lhs = self.reindex(columns=common, copy=False) + result_index = lhs.index + + rhs = other.reindex(index=common, copy=False).values + lhs = lhs.values + if isinstance(other, cudf.DataFrame): + result_cols = other._data.to_pandas_index() + + elif isinstance( + other, (cp.ndarray, np.ndarray) + ) or cudf.utils.dtypes.can_convert_to_column(other): + rhs = cp.asarray(other) + else: + # TODO: This should raise an exception, not return NotImplemented, + # but __matmul__ relies on the current behavior. We should either + # move this implementation to __matmul__ and call it from here + # (checking for NotImplemented and raising NotImplementedError if + # that's what's returned), or __matmul__ should catch a + # NotImplementedError from here and return NotImplemented. The + # latter feels cleaner (putting the implementation in this method + # rather than in the operator) but will be slower in the (highly + # unlikely) case that we're multiplying a cudf object with another + # type of object that somehow supports this behavior. + return NotImplemented + if reflect: + lhs, rhs = rhs, lhs + + result = lhs.dot(rhs) + if len(result.shape) == 1: + return cudf.Series( + result, + index=self.index if result_index is None else result_index, + ) + if len(result.shape) == 2: + return cudf.DataFrame( + result, + index=self.index if result_index is None else result_index, + columns=result_cols, + ) + return result.item() + + @_cudf_nvtx_annotate + def __matmul__(self, other): + return self.dot(other) + + @_cudf_nvtx_annotate + def __rmatmul__(self, other): + return self.dot(other, reflect=True) + + @_cudf_nvtx_annotate + def head(self, n=5): + """ + Return the first `n` rows. + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + DataFrame or Series + The first `n` rows of the caller object. + + Examples + -------- + **Series** + + >>> ser = cudf.Series(['alligator', 'bee', 'falcon', + ... 
'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) + >>> ser + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + dtype: object + + Viewing the first 5 lines + + >>> ser.head() + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + dtype: object + + Viewing the first `n` lines (three in this case) + + >>> ser.head(3) + 0 alligator + 1 bee + 2 falcon + dtype: object + + For negative values of `n` + + >>> ser.head(-3) + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + dtype: object + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.head(2) + key val + 0 0 10.0 + 1 1 11.0 + """ + return self.iloc[:n] + + @_cudf_nvtx_annotate + def tail(self, n=5): + """ + Returns the last n rows as a new DataFrame or Series + + Examples + -------- + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.tail(2) + key val + 3 3 13.0 + 4 4 14.0 + + **Series** + + >>> import cudf + >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser.tail(2) + 3 1 + 4 0 + """ + if n == 0: + return self.iloc[0:0] + + return self.iloc[-n:] + + @_cudf_nvtx_annotate + def pipe(self, func, *args, **kwargs): + """ + Apply ``func(self, *args, **kwargs)``. + + Parameters + ---------- + func : function + Function to apply to the Series/DataFrame. + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects the Series/DataFrame. + args : iterable, optional + Positional arguments passed into ``func``. + kwargs : mapping, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + + Examples + -------- + Use ``.pipe`` when chaining together functions that expect + Series, DataFrames or GroupBy objects. Instead of writing + + >>> func(g(h(df), arg1=a), arg2=b, arg3=c) + + You can write + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe(func, arg2=b, arg3=c) + ... ) + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``f`` takes its data as ``arg2``: + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe((func, 'arg2'), arg1=a, arg3=c) + ... ) + """ + return cudf.core.common.pipe(self, func, *args, **kwargs) + + @_cudf_nvtx_annotate + def sum( + self, + axis=no_default, + skipna=True, + dtype=None, + numeric_only=False, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. 
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.sum()
+        a    10
+        b    34
+        dtype: int64
+
+        .. pandas-compat::
+            **DataFrame.sum, Series.sum**
+
+            Parameters currently not supported are `level`, `numeric_only`.
+        """
+        return self._reduce(
+            "sum",
+            axis=axis,
+            skipna=skipna,
+            dtype=dtype,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+
+    @_cudf_nvtx_annotate
+    def product(
+        self,
+        axis=no_default,
+        skipna=True,
+        dtype=None,
+        numeric_only=False,
+        min_count=0,
+        **kwargs,
+    ):
+        """
+        Return product of the values in the DataFrame.
+
+        Parameters
+        ----------
+        axis: {index (0), columns(1)}
+            Axis for the function to be applied on.
+        skipna: bool, default True
+            Exclude NA/null values when computing the result.
+        dtype: data type
+            Data type to cast the result to.
+        numeric_only : bool, default False
+            If True, includes only float, int, boolean columns.
+            If False, will raise error in-case there are
+            non-numeric columns.
+        min_count: int, default 0
+            The required number of valid values to perform the operation.
+            If fewer than min_count non-NA values are present the result
+            will be NA.
+
+            The default being 0. This means the sum of an all-NA or empty
+            Series is 0, and the product of an all-NA or empty Series is 1.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.product()
+        a      24
+        b    5040
+        dtype: int64
+
+        .. pandas-compat::
+            **DataFrame.product, Series.product**
+
+            Parameters currently not supported are `level`, `numeric_only`.
+        """
+
+        return self._reduce(
+            # cuDF columns use "product" as the op name, but cupy uses "prod"
+            # and we need cupy if axis == 1.
+            "prod" if axis in {1, "columns"} else "product",
+            axis=axis,
+            skipna=skipna,
+            dtype=dtype,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+
+    # Alias for pandas compatibility.
+    prod = product
+
+    @_cudf_nvtx_annotate
+    def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
+        """
+        Return the mean of the values for the requested axis.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}
+            Axis for the function to be applied on.
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            If True, includes only float, int, boolean columns.
+            If False, will raise error in-case there are
+            non-numeric columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        mean : Series or DataFrame (if level specified)
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.mean()
+        a    2.5
+        b    8.5
+        dtype: float64
+        """
+        return self._reduce(
+            "mean",
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    def median(
+        self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
+    ):
+        """
+        Return the median of the values for the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on. For Series this
+            parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            If True, includes only float, int, boolean columns.
+            If False, will raise error in-case there are
+            non-numeric columns.
+
+        Returns
+        -------
+        scalar
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([10, 25, 3, 25, 24, 6])
+        >>> ser
+        0    10
+        1    25
+        2     3
+        3    25
+        4    24
+        5     6
+        dtype: int64
+        >>> ser.median()
+        17.0
+
+        .. pandas-compat::
+            **DataFrame.median, Series.median**
+
+            Parameters currently not supported are `level` and `numeric_only`.
+        """
+        return self._reduce(
+            "median",
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @_cudf_nvtx_annotate
+    def std(
+        self,
+        axis=no_default,
+        skipna=True,
+        ddof=1,
+        numeric_only=False,
+        **kwargs,
+    ):
+        """
+        Return sample standard deviation of the DataFrame.
+
+        Normalized by N-1 by default. This can be changed using
+        the `ddof` argument
+
+        Parameters
+        ----------
+        axis: {index (0), columns(1)}
+            Axis for the function to be applied on.
+        skipna: bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof: int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations
+            is N - ddof, where N represents the number of elements.
+        numeric_only : bool, default False
+            If True, includes only float, int, boolean columns.
+            If False, will raise error in-case there are
+            non-numeric columns.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.std()
+        a    1.290994
+        b    1.290994
+        dtype: float64
+
+        .. pandas-compat::
+            **DataFrame.std, Series.std**
+
+            Parameters currently not supported are `level` and
+            `numeric_only`
+        """
+
+        return self._reduce(
+            "std",
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @_cudf_nvtx_annotate
+    def var(
+        self,
+        axis=no_default,
+        skipna=True,
+        ddof=1,
+        numeric_only=False,
+        **kwargs,
+    ):
+        """
+        Return unbiased variance of the DataFrame.
+
+        Normalized by N-1 by default. This can be changed using the
+        ddof argument.
+
+        Parameters
+        ----------
+        axis: {index (0), columns(1)}
+            Axis for the function to be applied on.
+        skipna: bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof: int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is
+            N - ddof, where N represents the number of elements.
+        numeric_only : bool, default False
+            If True, includes only float, int, boolean columns.
+            If False, will raise error in-case there are
+            non-numeric columns.
+
+        Returns
+        -------
+        scalar
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.var()
+        a    1.666667
+        b    1.666667
+        dtype: float64
+
+        .. pandas-compat::
+            **DataFrame.var, Series.var**
+
+            Parameters currently not supported are `level` and
+            `numeric_only`
+        """
+        return self._reduce(
+            "var",
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @_cudf_nvtx_annotate
+    def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
+        """
+        Return Fisher's unbiased kurtosis of a sample.
+
+        Kurtosis obtained using Fisher's definition of
+        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+
+        Parameters
+        ----------
+        axis: {index (0), columns(1)}
+            Axis for the function to be applied on.
+        skipna: bool, default True
+            Exclude NA/null values when computing the result.
+ numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + + Returns + ------- + Series or scalar + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series.kurtosis() + -1.1999999999999904 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.kurt() + a -1.2 + b -1.2 + dtype: float64 + + .. pandas-compat:: + **DataFrame.kurtosis** + + Parameters currently not supported are `level` and `numeric_only` + """ + if axis not in (0, "index", None, no_default): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "kurtosis", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + + # Alias for kurtosis. + kurt = kurtosis + + @_cudf_nvtx_annotate + def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): + """ + Return unbiased Fisher-Pearson skew of a sample. + + Parameters + ---------- + skipna: bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + + Returns + ------- + Series + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 6 + dtype: int64 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) + >>> df.skew() + a 0.00000 + b -0.37037 + dtype: float64 + + .. pandas-compat:: + **DataFrame.skew, Series.skew, Frame.skew** + + The `axis` parameter is not currently supported. + """ + if axis not in (0, "index", None, no_default): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "skew", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + + @_cudf_nvtx_annotate + def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + """ + Replace values where the condition is True. + + Parameters + ---------- + cond : bool Series/DataFrame, array-like + Where cond is False, keep the original value. + Where True, replace with corresponding value from other. + Callables are not supported. + other: scalar, list of scalars, Series/DataFrame + Entries where cond is True are replaced with + corresponding value from other. Callables are not + supported. Default is None. + + DataFrame expects only Scalar or array like with scalars or + dataframe with same dimension as self. + + Series expects only scalar or series like with same length + inplace : bool, default False + Whether to perform the operation in place on the data. 
+
+        Returns
+        -------
+        Same type as caller
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]})
+        >>> df.mask(df % 2 == 0, [-1, -1])
+           A  B
+        0  1  3
+        1 -1  5
+        2  5 -1
+
+        >>> ser = cudf.Series([4, 3, 2, 1, 0])
+        >>> ser.mask(ser > 2, 10)
+        0    10
+        1    10
+        2     2
+        3     1
+        4     0
+        dtype: int64
+        >>> ser.mask(ser > 2)
+        0    <NA>
+        1    <NA>
+        2       2
+        3       1
+        4       0
+        dtype: int64
+        """
+
+        if not hasattr(cond, "__invert__"):
+            # We Invert `cond` below and call `where`, so
+            # making sure the object supports
+            # `~`(inversion) operator or `__invert__` method
+            cond = cp.asarray(cond)
+
+        return self.where(cond=~cond, other=other, inplace=inplace)
+
+    @_cudf_nvtx_annotate
+    @copy_docstring(Rolling)
+    def rolling(
+        self, window, min_periods=None, center=False, axis=0, win_type=None
+    ):
+        return Rolling(
+            self,
+            window,
+            min_periods=min_periods,
+            center=center,
+            axis=axis,
+            win_type=win_type,
+        )
+
+    @_cudf_nvtx_annotate
+    def nans_to_nulls(self):
+        """
+        Convert nans (if any) to nulls
+
+        Returns
+        -------
+        DataFrame or Series
+
+        Examples
+        --------
+        **Series**
+
+        >>> import cudf, numpy as np
+        >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False)
+        >>> series
+        0     1.0
+        1     2.0
+        2     NaN
+        3    <NA>
+        4    10.0
+        dtype: float64
+        >>> series.nans_to_nulls()
+        0     1.0
+        1     2.0
+        2    <NA>
+        3    <NA>
+        4    10.0
+        dtype: float64
+
+        **DataFrame**
+
+        >>> df = cudf.DataFrame()
+        >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False)
+        >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False)
+        >>> df
+              a     b
+        0   1.0  <NA>
+        1  <NA>  3.14
+        2   NaN   NaN
+        >>> df.nans_to_nulls()
+              a     b
+        0   1.0  <NA>
+        1  <NA>  3.14
+        2  <NA>  <NA>
+        """
+        result_data = {}
+        for name, col in self._data.items():
+            try:
+                result_data[name] = col.nans_to_nulls()
+            except AttributeError:
+                result_data[name] = col.copy()
+        return self._from_data_like_self(result_data)
+
     def _copy_type_metadata(
         self,
         other: Self,
@@ -989,7 +1944,7 @@ def _copy_type_metadata(
             self._index, cudf.core.index.CategoricalIndex
         ):
             self._index = cudf.Index(
-                cast(cudf.core.index.NumericIndex, self._index)._column,
+                cast("cudf.Index", self._index)._column,
                 name=self._index.name,
            )
         elif isinstance(other._index, cudf.MultiIndex) and not isinstance(
@@ -1046,6 +2001,14 @@ def interpolate(
                 f"`limit_direction` must be 'backward' for method `{method}`"
             )
 
+        if method.lower() in {"ffill", "bfill", "pad", "backfill"}:
+            warnings.warn(
+                f"{type(self).__name__}.interpolate with method={method} is "
+                "deprecated and will raise in a future version. "
+                "Use obj.ffill() or obj.bfill() instead.",
+                FutureWarning,
+            )
+
         data = self
 
         if not isinstance(data._index, cudf.RangeIndex):
@@ -1061,6 +2024,12 @@ def interpolate(
         interpolator = cudf.core.algorithms.get_column_interpolator(method)
         columns = {}
         for colname, col in data._data.items():
+            if isinstance(col, cudf.core.column.StringColumn):
+                warnings.warn(
+                    f"{type(self).__name__}.interpolate with object dtype is "
+                    "deprecated and will raise in a future version.",
+                    FutureWarning,
+                )
             if col.nullable:
                 col = col.astype("float64").fillna(np.nan)
 
@@ -1590,8 +2559,6 @@ def sort_index(
         idx = self.index
         if isinstance(idx, MultiIndex):
             if level is not None:
-                # Pandas doesn't handle na_position in case of MultiIndex.
- na_position = "first" if ascending is True else "last" if not is_list_like(level): level = [level] by = list(map(idx._get_level_label, level)) @@ -2119,6 +3086,17 @@ def _split(self, splits, keep_index=True): def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): # noqa: D102 + if method is not None: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is " + "deprecated and will raise in a future version. " + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + ) old_index = self._index ret = super().fillna(value, method, axis, inplace, limit) if inplace: @@ -2136,13 +3114,15 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. """ - return self.fillna( - method="bfill", - value=value, - axis=axis, - inplace=inplace, - limit=limit, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + return self.fillna( + method="bfill", + value=value, + axis=axis, + inplace=inplace, + limit=limit, + ) @_cudf_nvtx_annotate def backfill(self, value=None, axis=None, inplace=None, limit=None): @@ -2173,13 +3153,15 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. """ - return self.fillna( - method="ffill", - value=value, - axis=axis, - inplace=inplace, - limit=limit, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + return self.fillna( + method="ffill", + value=value, + axis=axis, + inplace=inplace, + limit=limit, + ) @_cudf_nvtx_annotate def pad(self, value=None, axis=None, inplace=None, limit=None): @@ -3337,6 +4319,13 @@ def first(self, offset): 2018-04-09 1 2018-04-11 2 """ + # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + warnings.warn( + "first is deprecated and will be removed in a future version. " + "Please create a mask and filter using `.loc` instead", + FutureWarning, + ) return self._first_or_last( offset, idx=0, @@ -3383,6 +4372,13 @@ def last(self, offset): 2018-04-13 3 2018-04-15 4 """ + # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + warnings.warn( + "last is deprecated and will be removed in a future version. " + "Please create a mask and filter using `.loc` instead", + FutureWarning, + ) return self._first_or_last( offset, idx=-1, @@ -3646,14 +4642,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): fname = ufunc.__name__ if ret is not None: - # pandas bitwise operations return bools if indexes are misaligned. - if "bitwise" in fname: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - if isinstance(other, self.__class__) and not self.index.equals( - other.index - ): - ret = ret.astype(bool) return ret # Attempt to dispatch all other functions to cupy. @@ -3754,28 +4742,6 @@ def repeat(self, repeats, axis=None): self._index_names, ) - def _append( - self, other, ignore_index=False, verify_integrity=False, sort=None - ): - # Note: Do not remove this function until pandas does. This warning is - # to clean up cudf but to match a deprecation in pandas - warnings.warn( - "The append method is deprecated and will be removed in a future " - "version. 
Use cudf.concat instead.", - FutureWarning, - ) - if verify_integrity not in (None, False): - raise NotImplementedError( - "verify_integrity parameter is not supported yet." - ) - - if is_list_like(other): - to_concat = [self, *other] - else: - to_concat = [self, other] - - return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) - def astype( self, dtype, @@ -5094,7 +6060,7 @@ def rank( self, axis=0, method="average", - numeric_only=None, + numeric_only=False, na_option="keep", ascending=True, pct=False, @@ -5117,7 +6083,7 @@ def rank( * max: highest rank in the group * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional + numeric_only : bool, default False For DataFrame objects, rank only numeric columns if set to True. na_option : {'keep', 'top', 'bottom'}, default 'keep' How to rank NaN values: @@ -5152,6 +6118,13 @@ def rank( source = self if numeric_only: + if isinstance( + source, cudf.Series + ) and not _is_non_decimal_numeric_dtype(self.dtype): + raise TypeError( + "Series.rank does not allow numeric_only=True with " + "non-numeric dtype." + ) numeric_cols = ( name for name in self._data.names @@ -5203,11 +6176,13 @@ def convert_dtypes( @_warn_no_dask_cudf def __dask_tokenize__(self): + from dask.base import normalize_token + return [ type(self), - self._dtypes, - self.index, - self.hash_values().values_host, + normalize_token(self._dtypes), + normalize_token(self.index), + normalize_token(self.hash_values().values_host), ] @@ -5298,7 +6273,7 @@ def _get_replacement_values_for_columns( "value argument must be scalar, list-like or Series" ) elif _is_series(to_replace): - if value is None: + if value is None or value is no_default: to_replace_columns = { col: as_column(to_replace.index) for col in columns_dtype_map } @@ -5329,7 +6304,7 @@ def _get_replacement_values_for_columns( "value" ) elif is_dict_like(to_replace): - if value is None: + if value is None or value is no_default: to_replace_columns = { col: list(to_replace.keys()) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 1071261044f..6a619945e75 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -1,7 +1,8 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from __future__ import annotations +import warnings from collections import abc from typing import TYPE_CHECKING, Any, Tuple, cast @@ -97,7 +98,7 @@ def _match_join_keys( common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind - else np.find_common_type([], (ltype, rtype)) + else np.result_type(ltype, rtype) ) elif ( np.issubdtype(ltype, np.datetime64) @@ -170,9 +171,11 @@ def _match_categorical_dtypes_both( return lcol, rcol.astype(ltype) else: # merge categories - merged_categories = cudf.concat( - [ltype.categories, rtype.categories] - ).unique() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + merged_categories = cudf.concat( + [ltype.categories, rtype.categories] + ).unique() common_type = cudf.CategoricalDtype( categories=merged_categories, ordered=False ) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 86f0c8465ba..1ef2915bc59 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,7 +2,6 @@ from __future__ import annotations import itertools -import warnings from typing import Any, ClassVar, List, Optional import cudf @@ -536,12 +535,10 @@ def _validate_merge_params( # modified in the size 0 case. and max(lhs._data.nlevels, 1) != max(rhs._data.nlevels, 1) ): - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "merging between different levels is deprecated and will be " - f"removed in a future version. ({lhs._data.nlevels} levels on " - f"the left, {rhs._data.nlevels} on the right)", - FutureWarning, + raise ValueError( + "Not allowed to merge between different levels. " + f"({lhs._data.nlevels} levels on " + f"the left, {rhs._data.nlevels} on the right)" ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6ac3797ecf4..a3f7be7b266 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -23,9 +23,14 @@ from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column -from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame -from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index +from cudf.core.index import ( + BaseIndex, + _get_indexer_basic, + _lexsorted_equal_range, + as_index, +) +from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -355,9 +360,6 @@ def name(self, value): def copy( self, names=None, - dtype=None, - levels=None, - codes=None, deep=False, name=None, ): @@ -371,36 +373,12 @@ def copy( ---------- names : sequence of objects, optional (default None) Names for each of the index levels. - dtype : object, optional (default None) - MultiIndex dtype, only supports None or object type - - .. deprecated:: 23.02 - - The `dtype` parameter is deprecated and will be removed in - a future version of cudf. Use the `astype` method instead. - - levels : sequence of arrays, optional (default None) - The unique labels for each level. Original values used if None. - - .. deprecated:: 23.02 - - The `levels` parameter is deprecated and will be removed in - a future version of cudf. - - codes : sequence of arrays, optional (default None) - Integers for each level designating which label at each location. - Original values used if None. - - .. 
deprecated:: 23.02 - - The `codes` parameter is deprecated and will be removed in - a future version of cudf. - deep : Bool (default False) If True, `._data`, `._levels`, `._codes` will be copied. Ignored if `levels` or `codes` are specified. name : object, optional (default None) - To keep consistent with `Index.copy`, should not be used. + Kept for compatibility with 1-dimensional Index. Should not + be used. Returns ------- @@ -414,8 +392,6 @@ def copy( ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], ... names=['Date', 'Symbol']) >>> idx2 = idx1.copy( - ... levels=[['day1', 'day2'], ['com1', 'com2']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], ... names=['col1', 'col2']) >>> df.index = idx1 @@ -429,58 +405,14 @@ def copy( >>> df.index = idx2 >>> df - Close - col1 col2 - day1 com1 3400.00 - com2 226.58 - day2 com1 3401.80 - com2 228.91 - + Close + col1 col2 + 2020-08-27 AMZN 3400.00 + MSFT 226.58 + 2020-08-28 AMZN 3401.80 + MSFT 228.91 """ - # TODO: Update message when set_levels is implemented. - # https://github.com/rapidsai/cudf/issues/12307 - if levels is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter levels is deprecated and will be removed in a " - "future version.", - FutureWarning, - ) - - # TODO: Update message when set_codes is implemented. - # https://github.com/rapidsai/cudf/issues/12308 - if codes is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter codes is deprecated and will be removed in a " - "future version.", - FutureWarning, - ) - - if dtype is not None: - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "parameter dtype is deprecated and will be removed in a " - "future version. Use the astype method instead.", - FutureWarning, - ) - - dtype = object if dtype is None else dtype - if not pd.api.types.is_object_dtype(dtype): - raise TypeError("Dtype for MultiIndex only supports object type.") - - # ._data needs to be rebuilt - if levels is not None or codes is not None: - if self._levels is None or self._codes is None: - self._compute_levels_and_codes() - levels = self._levels if levels is None else levels - codes = self._codes if codes is None else codes - names = self.names if names is None else names - - mi = MultiIndex(levels=levels, codes=codes, names=names, copy=deep) - return mi - mi = MultiIndex._from_data(self._data.copy(deep=deep)) if self._levels is not None: mi._levels = [idx.copy(deep=deep) for idx in self._levels] @@ -535,21 +467,7 @@ def __repr__(self): ) ) - if not PANDAS_GE_150: - # Need this whole `if` block, - # this is a workaround for the following issue: - # https://github.com/pandas-dev/pandas/issues/39984 - preprocess_pdf = pd.DataFrame( - { - name: col.to_pandas(nullable=(col.dtype.kind != "f")) - for name, col in preprocess._data.items() - } - ) - - preprocess_pdf.columns = preprocess.names - preprocess = pd.MultiIndex.from_frame(preprocess_pdf) - else: - preprocess = preprocess.to_pandas(nullable=True) + preprocess = preprocess.to_pandas(nullable=True) preprocess.values[:] = tuples_list else: preprocess = preprocess.to_pandas(nullable=True) @@ -635,7 +553,7 @@ def levels(self): (3, 12)], names=['a', 'b']) >>> midx.levels - [Int64Index([1, 2, 3], dtype='int64', name='a'), Int64Index([10, 11, 12], dtype='int64', name='b')] + [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] """ # noqa: E501 if self._levels is None: self._compute_levels_and_codes() @@ -769,13 +687,7 @@ def 
_compute_levels_and_codes(self): codes = {} for name, col in self._data.items(): - with warnings.catch_warnings(): - # TODO: Remove this filter when - # `na_sentinel` is removed from `factorize`. - # This is a filter to not let the warnings from - # `factorize` show up in other parts of public APIs. - warnings.simplefilter("ignore") - code, cats = cudf.Series._from_data({None: col}).factorize() + code, cats = cudf.Series._from_data({None: col}).factorize() cats.name = name codes[name] = code.astype(np.int64) levels.append(cats) @@ -792,15 +704,21 @@ def _compute_validity_mask(self, index, row_tuple, max_length): continue lookup[i] = cudf.Series(row) frame = cudf.DataFrame(dict(enumerate(index._data.columns))) - data_table = cudf.concat( - [ - frame, - cudf.DataFrame( - {"idx": cudf.Series(column.as_column(range(len(frame))))} - ), - ], - axis=1, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + data_table = cudf.concat( + [ + frame, + cudf.DataFrame( + { + "idx": cudf.Series( + column.as_column(range(len(frame))) + ) + } + ), + ], + axis=1, + ) # Sort indices in pandas compatible mode # because we want the indices to be fetched # in a deterministic order. @@ -1099,25 +1017,23 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further # modifications of the resulting DataFrame will affect the MultiIndex. - if name is None: - warnings.warn( - "Explicitly passing `name=None` currently preserves the " - "Index's name or uses a default name of 0. This behaviour " - "is deprecated, and in the future `None` will be used " - "as the name of the resulting DataFrame column.", - FutureWarning, - ) - name = no_default - - if name is not no_default: + if name is no_default: + column_names = [ + level if name is None else name + for level, name in enumerate(self.names) + ] + else: + if not is_list_like(name): + raise TypeError( + "'name' must be a list / sequence of column names." + ) if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " "number of levels on index." ) column_names = name - else: - column_names = self.names + all_none_names = None if not ( all_none_names := all(x is None for x in column_names) @@ -1201,11 +1117,11 @@ def _concat(cls, objs): obj.columns = colnames source_data = cudf.DataFrame._concat(source_data) - names = [None] * source_data._num_columns - objs = list(filter(lambda o: o.names is not None, objs)) - for o in range(len(objs)): - for i, name in enumerate(objs[o].names): - names[i] = names[i] or name + try: + # Only set names if all objs have the same names + (names,) = {o.names for o in objs} - {None} + except ValueError: + names = [None] * source_data._num_columns return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod @@ -1648,7 +1564,7 @@ def droplevel(self, level=-1): Dropping multiple levels: >>> idx.droplevel(["first", "second"]) - Int64Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') + Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ mi = self.copy(deep=False) mi._poplevels(level) @@ -1911,72 +1827,60 @@ def _level_index_from_level(self, level): return level @_cudf_nvtx_annotate - def get_loc(self, key, method=None, tolerance=None): - """ - Get location for a label or a tuple of labels. - - The location is returned as an integer/slice or boolean mask. 
- - Parameters - ---------- - key : label or tuple of labels (one for each level) - method : None - - Returns - ------- - loc : int, slice object or boolean mask - - If index is unique, search result is unique, return a single int. - - If index is monotonic, index is returned as a slice object. - - Otherwise, cudf attempts a best effort to convert the search - result into a slice object, and will return a boolean mask if - failed to do so. Notice this can deviate from Pandas behavior - in some situations. - - Examples - -------- - >>> import cudf - >>> mi = cudf.MultiIndex.from_tuples( - ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) - >>> mi.get_loc('b') - slice(1, 3, None) - >>> mi.get_loc(('b', 'e')) - 1 - >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) - >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas - slice(1, 4, 2) - - .. pandas-compat:: - **MultiIndex.get_loc** - - The return types of this function may deviates from the - method provided by Pandas. If the index is neither - lexicographically sorted nor unique, a best effort attempt is made - to coerce the found indices into a slice. For example: - - .. code-block:: - - >>> import pandas as pd - >>> import cudf - >>> x = pd.MultiIndex.from_tuples([ - ... (2, 1, 1), (1, 2, 3), (1, 2, 1), - ... (1, 1, 1), (1, 1, 1), (2, 2, 1), - ... ]) - >>> x.get_loc(1) - array([False, True, True, True, True, False]) - >>> cudf.from_pandas(x).get_loc(1) - slice(1, 5, 1) - """ + def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: raise NotImplementedError( "Parameter tolerance is not supported yet." ) - if method is not None: + if method == "nearest": raise NotImplementedError( - "only the default get_loc method is currently supported for" - " MultiIndex" + f"{method=} is not supported yet for MultiIndex." ) + result = cudf.core.column.full( + len(target), + fill_value=-1, + dtype=libcudf.types.size_type_dtype, + ) + if not len(self): + return result.values + try: + target = cudf.MultiIndex.from_tuples(target) + except TypeError: + return result.values + + join_keys = [ + _match_join_keys(lcol, rcol, "inner") + for lcol, rcol in zip(target._data.columns, self._data.columns) + ] + join_keys = map(list, zip(*join_keys)) + scatter_map, indices = libcudf.join.join( + *join_keys, + how="inner", + ) + (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) + result_series = cudf.Series(result) + + if method in {"ffill", "bfill", "pad", "backfill"}: + result_series = _get_indexer_basic( + index=self, + positions=result_series, + method=method, + target_col=target.to_frame(index=False)[ + list(range(0, self.nlevels)) + ], + tolerance=tolerance, + ) + elif method is not None: + raise ValueError( + f"{method=} is unsupported, only supported values are: " + "{['ffill'/'pad', 'bfill'/'backfill', None]}" + ) + + return result_series.to_cupy() + + @_cudf_nvtx_annotate + def get_loc(self, key): is_sorted = ( self.is_monotonic_increasing or self.is_monotonic_decreasing ) @@ -2078,7 +1982,7 @@ def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common # logic between MultiIndex._union & BaseIndex._union into - # GenericIndex._union. + # Index._union. 
other_df = other.copy(deep=True).to_frame(index=False) self_df = self.copy(deep=True).to_frame(index=False) col_names = list(range(0, self.nlevels)) @@ -2095,7 +1999,7 @@ def _union(self, other, sort=None): midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) midx.names = self.names if self.names == other.names else None - if sort is None and len(other): + if sort in {None, True} and len(other): return midx.sort_values() return midx @@ -2118,7 +2022,7 @@ def _intersection(self, other, sort=None): result_df = cudf.merge(self_df, other_df, how="inner") midx = self.__class__.from_frame(result_df, names=res_name) - if sort is None and len(other): + if sort in {None, True} and len(other): return midx.sort_values() return midx diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 5b0df97de71..1a79b122561 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,6 +15,7 @@ # limitations under the License. import pickle +import warnings import numpy as np import pandas as pd @@ -72,7 +73,9 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: ) # fill the gaps: - filled = upsampled.fillna(method=method) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + filled = upsampled.fillna(method=method) # filter the result to only include the values corresponding # to the bin labels: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3cbe58ed39c..2ea538d66a1 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import itertools import warnings @@ -14,6 +14,7 @@ from cudf._lib.types import size_type_dtype from cudf._typing import Dtype from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column.categorical import CategoricalColumn from cudf.utils.dtypes import min_unsigned_type @@ -35,7 +36,7 @@ def _align_objs(objs, how="outer", sort=None): A list of reindexed and aligned objects ready for concatenation """ - # Check if multiindex then check if indexes match. GenericIndex + # Check if multiindex then check if indexes match. Index # returns ndarray tuple of bools requiring additional filter. # Then check for duplicate index value. i_objs = iter(objs) @@ -86,7 +87,7 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None): else: index = indexes[0] if sort is None: - sort = not isinstance(index, cudf.StringIndex) + sort = not index._is_object() for other in indexes[1:]: index = index.union(other, sort=False) @@ -321,9 +322,23 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): df = cudf.DataFrame() _normalize_series_and_dataframe(objs, axis=axis) + any_empty = any(obj.empty for obj in objs) + if any_empty: + # Do not remove until pandas-3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. In a future version, this will no longer exclude " + "empty items when determining the result dtype. 
" + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + ) # Inner joins involving empty data frames always return empty dfs, but # We must delay returning until we have set the column names. - empty_inner = any(obj.empty for obj in objs) and join == "inner" + empty_inner = any_empty and join == "inner" objs = [obj for obj in objs if obj.shape != (0, 0)] @@ -418,11 +433,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): return result elif typ is cudf.Series: - objs = [obj for obj in objs if len(obj)] - if len(objs) == 0: - return cudf.Series() - elif len(objs) == 1 and not ignore_index: - return objs[0] + new_objs = [obj for obj in objs if len(obj)] + if len(new_objs) == 1 and not ignore_index: + return new_objs[0] else: return cudf.Series._concat( objs, axis=axis, index=None if ignore_index else True @@ -430,7 +443,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.core.index.GenericIndex._concat(objs) + return cudf.core.index.Index._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") @@ -612,7 +625,7 @@ def get_dummies( cats=None, sparse=False, drop_first=False, - dtype=no_default, + dtype="bool", ): """Returns a dataframe whose columns are the one hot encodings of all columns in `df` @@ -643,7 +656,7 @@ def get_dummies( columns. Note this is different from pandas default behavior, which encodes all columns with dtype object or categorical dtype : str, optional - Output dtype, default 'uint8' + Output dtype, default 'bool' Examples -------- @@ -651,15 +664,15 @@ def get_dummies( >>> df = cudf.DataFrame({"a": ["value1", "value2", None], "b": [0, 0, 0]}) >>> cudf.get_dummies(df) b a_value1 a_value2 - 0 0 1 0 - 1 0 0 1 - 2 0 0 0 + 0 0 True False + 1 0 False True + 2 0 False False >>> cudf.get_dummies(df, dummy_na=True) - b a_None a_value1 a_value2 - 0 0 0 1 0 - 1 0 0 0 1 - 2 0 1 0 0 + b a_ a_value1 a_value2 + 0 0 False True False + 1 0 False False True + 2 0 True False False >>> import numpy as np >>> df = cudf.DataFrame({"a":cudf.Series([1, 2, np.nan, None], @@ -672,11 +685,11 @@ def get_dummies( 3 >>> cudf.get_dummies(df, dummy_na=True, columns=["a"]) - a_1.0 a_2.0 a_nan a_null - 0 1 0 0 0 - 1 0 1 0 0 - 2 0 0 1 0 - 3 0 0 0 1 + a_ a_1.0 a_2.0 a_nan + 0 False True False False + 1 False False True False + 2 False False False True + 3 True False False False >>> series = cudf.Series([1, 2, None, 2, 4]) >>> series @@ -687,12 +700,12 @@ def get_dummies( 4 4 dtype: int64 >>> cudf.get_dummies(series, dummy_na=True) - null 1 2 4 - 0 0 1 0 0 - 1 0 0 1 0 - 2 1 0 0 0 - 3 0 0 1 0 - 4 0 0 0 1 + 1 2 4 + 0 False True False False + 1 False False True False + 2 True False False False + 3 False False True False + 4 False False False True """ if cats is None: @@ -703,16 +716,6 @@ def get_dummies( if drop_first: raise NotImplementedError("drop_first is not supported yet") - if dtype is no_default: - # Do not remove until pandas 2.0 support is added. 
- warnings.warn( - "Default `dtype` value will be changed to 'bool' in a future " - "release, please update `dtype='bool'` to adapt for " - "future behavior.", - FutureWarning, - ) - dtype = "uint8" - if isinstance(df, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -918,7 +921,7 @@ def as_tuple(x): ) -def pivot(data, index=None, columns=None, values=None): +def pivot(data, columns=None, index=no_default, values=no_default): """ Return reshaped DataFrame organized by the given index and column values. @@ -928,10 +931,10 @@ def pivot(data, index=None, columns=None, values=None): Parameters ---------- - index : column name, optional - Column used to construct the index of the result. columns : column name, optional Column used to construct the columns of the result. + index : column name, optional + Column used to construct the index of the result. values : column name or list of column names, optional Column(s) whose values are rearranged to produce the result. If not specified, all remaining columns of the DataFrame @@ -970,7 +973,7 @@ def pivot(data, index=None, columns=None, values=None): """ df = data values_is_list = True - if values is None: + if values is no_default: values = df._columns_view( col for col in df._column_names if col not in (index, columns) ) @@ -979,7 +982,7 @@ def pivot(data, index=None, columns=None, values=None): values = [values] values_is_list = False values = df._columns_view(values) - if index is None: + if index is no_default: index = df.index else: index = cudf.core.index.Index(df.loc[:, index]) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index db3120fc11f..3f51ecdf7dc 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -15,7 +15,6 @@ Literal, MutableMapping, Optional, - Sequence, Set, Tuple, Union, @@ -47,6 +46,7 @@ is_string_dtype, ) from cudf.core import indexing_utils +from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -276,6 +276,18 @@ def __setitem__(self, key, value): to_dtype = np.result_type(value.dtype, self._frame._column.dtype) value = value.astype(to_dtype) if to_dtype != self._frame._column.dtype: + # Do not remove until pandas-3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise in a future error of pandas. " + f"Value '{value}' has dtype incompatible with " + f"{self._frame._column.dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + ) self._frame._column._mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) @@ -363,6 +375,12 @@ def _loc_to_iloc(self, arg): arg = arg[0] if _is_scalar_or_zero_d_array(arg): index_dtype = self._frame.index.dtype + warn_msg = ( + "Series.__getitem__ treating keys as positions is deprecated. " + "In a future version, integer keys will always be treated " + "as labels (consistent with DataFrame behavior). 
To access " + "a value by position, use `ser.iloc[pos]`" + ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) and is_integer_dtype(index_dtype.categories.dtype) @@ -371,11 +389,19 @@ def _loc_to_iloc(self, arg): if isinstance(arg, cudf.Scalar) and is_integer_dtype( arg.dtype ): - found_index = arg.value - return found_index + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn(warn_msg, FutureWarning) + return arg.value elif is_integer(arg): - found_index = arg - return found_index + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn(warn_msg, FutureWarning) + return arg try: indices = self._frame.index._indices_of(arg) if (n := len(indices)) == 0: @@ -576,18 +602,6 @@ def __init__( copy=False, nan_as_null=True, ): - if ( - isinstance(data, Sequence) - and len(data) == 0 - and dtype is None - and getattr(data, "dtype", None) is None - ): - warnings.warn( - "The default dtype for empty Series will be 'object' instead " - "of 'float64' in a future version. Specify a dtype explicitly " - "to silence this warning.", - FutureWarning, - ) index_from_data = None name_from_data = None if data is None: @@ -957,82 +971,6 @@ def to_dict(self, into: type[dict] = dict) -> dict: """ return self.to_pandas().to_dict(into=into) - @_cudf_nvtx_annotate - def append(self, to_append, ignore_index=False, verify_integrity=False): - """Append values from another ``Series`` or array-like object. - If ``ignore_index=True``, the index is reset. - - Parameters - ---------- - to_append : Series or list/tuple of Series - Series to append with self. - ignore_index : boolean, default False. - If True, do not use the index. - verify_integrity : bool, default False - This Parameter is currently not supported. - - Returns - ------- - Series - A new concatenated series - - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - Series objects. 
- - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series([1, 2, 3]) - >>> s2 = cudf.Series([4, 5, 6]) - >>> s1 - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> s2 - 0 4 - 1 5 - 2 6 - dtype: int64 - >>> s1.append(s2) - 0 1 - 1 2 - 2 3 - 0 4 - 1 5 - 2 6 - dtype: int64 - - >>> s3 = cudf.Series([4, 5, 6], index=[3, 4, 5]) - >>> s3 - 3 4 - 4 5 - 5 6 - dtype: int64 - >>> s1.append(s3) - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - - With `ignore_index` set to True: - - >>> s1.append(s2, ignore_index=True) - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - """ - return super()._append(to_append, ignore_index, verify_integrity) - @_cudf_nvtx_annotate def reindex(self, *args, **kwargs): """ @@ -1169,7 +1107,9 @@ def reindex(self, *args, **kwargs): """, ) ) - def reset_index(self, level=None, drop=False, name=None, inplace=False): + def reset_index( + self, level=None, drop=False, name=no_default, inplace=False + ): if not drop and inplace: raise TypeError( "Cannot reset_index inplace on a Series " @@ -1177,7 +1117,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): ) data, index = self._reset_index(level=level, drop=drop) if not drop: - if name is None: + if name is no_default: name = 0 if self.name is None else self.name data[name] = data.pop(self.name) return cudf.core.dataframe.DataFrame._from_data(data, index) @@ -1469,7 +1409,9 @@ def __repr__(self): if max_rows not in (0, None) and len(self) > max_rows: top = self.head(int(max_rows / 2 + 1)) bottom = self.tail(int(max_rows / 2 + 1)) - preprocess = cudf.concat([top, bottom]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + preprocess = cudf.concat([top, bottom]) else: preprocess = self.copy() preprocess.index = preprocess.index._clean_nulls_from_index() @@ -1650,9 +1592,11 @@ def _concat(cls, objs, axis=0, index=True): if isinstance(objs[0].index, cudf.MultiIndex): index = cudf.MultiIndex._concat([o.index for o in objs]) else: - index = cudf.core.index.GenericIndex._concat( - [o.index for o in objs] - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + index = cudf.core.index.Index._concat( + [o.index for o in objs] + ) names = {obj.name for obj in objs} if len(names) == 1: @@ -2022,20 +1966,20 @@ def between(self, left, right, inclusive="both") -> Series: return self._from_data({self.name: lmask & rmask}, self._index) @_cudf_nvtx_annotate - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def all(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( "The bool_only parameter is not supported for Series." ) - return super().all(axis, skipna, level, **kwargs) + return super().all(axis, skipna, **kwargs) @_cudf_nvtx_annotate - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def any(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( "The bool_only parameter is not supported for Series." 
             )
-        return super().any(axis, skipna, level, **kwargs)
+        return super().any(axis, skipna, **kwargs)
 
     @_cudf_nvtx_annotate
     def to_pandas(
@@ -2380,8 +2324,8 @@ def argsort(
         return obj
 
     @_cudf_nvtx_annotate
-    def replace(self, to_replace=None, value=None, *args, **kwargs):
-        if is_dict_like(to_replace) and value is not None:
+    def replace(self, to_replace=None, value=no_default, *args, **kwargs):
+        if is_dict_like(to_replace) and value not in {None, no_default}:
             raise ValueError(
                 "Series.replace cannot use dict-like to_replace and non-None "
                 "value"
@@ -2641,7 +2585,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
     # Stats
     #
     @_cudf_nvtx_annotate
-    def count(self, level=None):
+    def count(self):
         """
         Return number of non-NA/null observations in the Series
 
@@ -2662,10 +2606,6 @@ def count(self, level=None):
 
             Parameters currently not supported is `level`.
         """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not implemented yet")
-
         return self.valid_count
 
     @_cudf_nvtx_annotate
@@ -3133,7 +3073,7 @@ def value_counts(
         3.0    3
         2.0    2
         1.0    1
-        dtype: int64
+        Name: count, dtype: int64
 
         The order of the counts can be changed by passing ``ascending=True``:
 
         >>> sr.value_counts(ascending=True)
         1.0    1
         2.0    2
         3.0    3
-        dtype: int64
+        Name: count, dtype: int64
 
         With ``normalize`` set to True, returns the relative frequency by
         dividing all values by the sum of values.
 
         >>> sr.value_counts(normalize=True)
         3.0    0.500000
         2.0    0.333333
         1.0    0.166667
-        dtype: float64
+        Name: proportion, dtype: float64
 
         To include ``NA`` value counts, pass ``dropna=False``:
 
@@ -3170,24 +3110,24 @@ def value_counts(
         2.0     2
         <NA>    2
         1.0     1
-        dtype: int64
+        Name: count, dtype: int64
 
         >>> s = cudf.Series([3, 1, 2, 3, 4, np.nan])
         >>> s.value_counts(bins=3)
         (2.0, 3.0]      2
         (0.996, 2.0]    2
         (3.0, 4.0]      1
-        dtype: int64
+        Name: count, dtype: int64
         """
         if bins is not None:
             series_bins = cudf.cut(self, bins, include_lowest=True)
-
+        result_name = "proportion" if normalize else "count"
         if dropna and self.null_count == len(self):
             return Series(
                 [],
                 dtype=np.int64,
-                name=self.name,
-                index=cudf.Index([], dtype=self.dtype),
+                name=result_name,
+                index=cudf.Index([], dtype=self.dtype, name=self.name),
             )
 
         if bins is not None:
@@ -3207,7 +3147,7 @@ def value_counts(
             res = res.reindex(self.dtype.categories).fillna(0)
             res._index = res._index.astype(self.dtype)
 
-        res.index.name = None
+        res.index.name = self.name
 
         if sort:
             res = res.sort_values(ascending=ascending)
@@ -3222,7 +3162,7 @@ def value_counts(
                 res.index._column, res.index.categories.dtype
             )
             res.index = int_index
-
+        res.name = result_name
         return res
 
     @_cudf_nvtx_annotate
@@ -3310,18 +3250,9 @@ def describe(
         percentiles=None,
         include=None,
         exclude=None,
-        datetime_is_numeric=False,
     ):
         """{docstring}"""
 
-        if not datetime_is_numeric:
-            # Do not remove until pandas 2.0 support is added.
-            warnings.warn(
-                "`datetime_is_numeric` is deprecated and will be removed in "
-                "a future release. 
Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - ) if percentiles is not None: if not all(0 <= x <= 1 for x in percentiles): raise ValueError( @@ -3603,7 +3534,7 @@ def keys(self): c 3 dtype: int64 >>> sr.keys() - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') """ return self.index @@ -3646,7 +3577,7 @@ def explode(self, ignore_index=False): @_cudf_nvtx_annotate def pct_change( - self, periods=1, fill_method="ffill", limit=None, freq=None + self, periods=1, fill_method=no_default, limit=no_default, freq=None ): """ Calculates the percent change between sequential elements @@ -3658,9 +3589,16 @@ def pct_change( Periods to shift for forming percent change. fill_method : str, default 'ffill' How to handle NAs before computing percent changes. + + .. deprecated:: 24.04 + All options of `fill_method` are deprecated + except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. + + .. deprecated:: 24.04 + `limit` is deprecated. freq : str, optional Increment to use from time series API. Not yet implemented. @@ -3669,17 +3607,44 @@ def pct_change( ------- Series """ - if limit is not None: + if limit is not no_default: raise NotImplementedError("limit parameter not supported yet.") if freq is not None: raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + elif fill_method not in { + no_default, + None, + "ffill", + "pad", + "bfill", + "backfill", + }: raise ValueError( - "fill_method must be one of 'ffill', 'pad', " + "fill_method must be one of None, 'ffill', 'pad', " "'bfill', or 'backfill'." ) + if fill_method not in (no_default, None) or limit is not no_default: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + "The 'fill_method' and 'limit' keywords in " + f"{type(self).__name__}.pct_change are deprecated and will be " + "removed in a future version. 
Either fill in any non-leading " + "NA values prior to calling pct_change or specify " + "'fill_method=None' to not fill NA values.", + FutureWarning, + ) - data = self.fillna(method=fill_method, limit=limit) + if fill_method is no_default: + fill_method = "ffill" + if limit is no_default: + limit = None + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) change = diff / data.shift(periods=periods, freq=freq) return change diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index b73f756d7dc..97779522b8b 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import warnings from typing import Any, Dict, Optional, Tuple, Union import cupy @@ -17,6 +16,7 @@ is_bool_dtype, is_integer, is_integer_dtype, + is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame @@ -41,21 +41,16 @@ def _reduce( self, op, axis=no_default, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only and not isinstance( - self._column, cudf.core.column.numerical_base.NumericalBaseColumn - ): - raise NotImplementedError( - f"Series.{op} does not implement numeric_only." + if numeric_only and not is_numeric_dtype(self._column): + raise TypeError( + f"Series.{op} does not allow numeric_only={numeric_only} " + "with non-numeric dtypes." ) try: return getattr(self._column, op)(**kwargs) @@ -164,7 +159,7 @@ def from_arrow(cls, array): >>> import cudf >>> import pyarrow as pa >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - StringIndex(['a' 'b' None], dtype='object') + Index(['a', 'b', None], dtype='object') >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) 0 a 1 b @@ -206,23 +201,14 @@ def to_arrow(self): @property # type: ignore @_cudf_nvtx_annotate - def is_monotonic(self): - """Return boolean if values in the object are monotonically increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. + def is_unique(self): + """Return boolean if values in the object are unique. Returns ------- bool """ - # Do not remove until pandas 2.0 support is added. - warnings.warn( - "is_monotonic is deprecated and will be removed in a future " - "version. Use is_monotonic_increasing instead.", - FutureWarning, - ) - - return self.is_monotonic_increasing + return self._column.is_unique @property # type: ignore @_cudf_nvtx_annotate @@ -259,21 +245,13 @@ def __cuda_array_interface__(self): raise AttributeError @_cudf_nvtx_annotate - def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): + def factorize(self, sort=False, use_na_sentinel=True): """Encode the input values as integer labels. Parameters ---------- sort : bool, default True Sort uniques and shuffle codes to maintain the relationship. - na_sentinel : number, default -1 - Value to indicate missing category. - - .. deprecated:: 23.04 - - The na_sentinel argument is deprecated and will be removed in - a future version of cudf. Specify use_na_sentinel as - either True or False. use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NA values. 
If False, NA values will be encoded as non-negative @@ -295,12 +273,11 @@ def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): >>> codes array([0, 0, 1], dtype=int8) >>> uniques - StringIndex(['a' 'c'], dtype='object') + Index(['a', 'c'], dtype='object') """ return cudf.core.algorithms.factorize( self, sort=sort, - na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel, ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 08b9529515d..faa7407daaf 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -80,7 +80,7 @@ def to_datetime( format: Optional[str] = None, exact: bool = True, unit: str = "ns", - infer_datetime_format: bool = False, + infer_datetime_format: bool = True, origin="unix", cache: bool = True, ): @@ -117,7 +117,7 @@ def to_datetime( origin(unix epoch start). Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. - infer_datetime_format : bool, default False + infer_datetime_format : bool, default True If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing @@ -162,6 +162,13 @@ def to_datetime( f"{errors=} is not implemented when arg is not scalar-like" ) + if infer_datetime_format in {None, False}: + warnings.warn( + "`infer_datetime_format` is deprecated and will " + "be removed in a future version of cudf.", + FutureWarning, + ) + if arg is None: return None @@ -941,10 +948,14 @@ def date_range( # FIXME: when `end_estim` is out of bound, but the actual `end` is not, # we shouldn't raise but compute the sequence as is. The trailing overflow # part should get trimmed at the end. - end_estim = ( - pd.Timestamp(start.value) - + periods * offset._maybe_as_fast_pandas_offset() - ).to_datetime64() + with warnings.catch_warnings(): + # Need to ignore userwarnings where nonzero nanoseconds + # are dropped in conversion during the binops + warnings.simplefilter("ignore", UserWarning) + end_estim = ( + pd.Timestamp(start.value) + + periods * offset._maybe_as_fast_pandas_offset() + ).to_datetime64() if "months" in offset.kwds or "years" in offset.kwds: # If `offset` is non-fixed frequency, resort to libcudf. 
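The `date_range` hunk above scopes its warning suppression to a single statement rather than disabling warnings process-wide. A minimal sketch of that pattern, using a hypothetical `_estimate_end` helper (not cudf's actual internals): pandas Timestamp-plus-DateOffset arithmetic can emit a UserWarning when nonzero nanoseconds are discarded, and the `catch_warnings` block keeps the filter from leaking to callers.

import warnings

import pandas as pd


def _estimate_end(start_value, periods, offset):
    # Hypothetical helper mirroring the patched date_range logic: ignore
    # the "Discarding nonzero nanoseconds" UserWarning for this expression
    # only, so unrelated warnings still reach the caller.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        return (pd.Timestamp(start_value) + periods * offset).to_datetime64()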
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 207fb469990..890e4ecc2f0 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -10,7 +10,6 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number from cudf.core import column -from cudf.core._compat import PANDAS_GE_150 from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column from cudf.core.mixins import Reducible @@ -217,21 +216,13 @@ def _apply_agg_column(self, source_column, agg_name): following_window = None window = self.window elif isinstance(self.window, BaseIndexer): - if PANDAS_GE_150: - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - step=None, - ) - else: - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - ) + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, + step=None, + ) start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") @@ -543,8 +534,6 @@ def _window_to_window_sizes(self, window): ) def _apply_agg(self, agg_name): - if agg_name == "count" and not self._time_window: - self.min_periods = 0 index = cudf.MultiIndex.from_frame( cudf.DataFrame( { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index fdda7194ab4..b2f3fd09146 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -103,6 +103,8 @@ def read_json( iotypes=(BytesIO, StringIO), allow_raw_text_input=True, storage_options=storage_options, + warn_on_raw_text_input=True, + warn_meta=("json", "read_json"), ) if isinstance(tmp_source, list): filepaths_or_buffers.extend(tmp_source) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bac919182c0..6c70b08384f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -794,13 +794,15 @@ def _parquet_to_frame( dtype=_dtype, ) - # Concatenate dfs and return. - # Assume we can ignore the index if it has no name. - return ( - cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) - if len(dfs) > 1 - else dfs[0] - ) + if len(dfs) > 1: + # Concatenate dfs and return. + # Assume we can ignore the index if it has no name. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + res = cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) + return res + else: + return dfs[0] @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index afcfc13a9c4..b7c8e92e8db 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -2,6 +2,7 @@ # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import copyreg +import importlib import pickle import sys @@ -17,7 +18,6 @@ _FastSlowAttribute, _FunctionProxy, _Unusable, - get_final_type_map, make_final_proxy_type as _make_final_proxy_type, make_intermediate_proxy_type as _make_intermediate_proxy_type, register_proxy_func, @@ -48,6 +48,22 @@ cudf.set_option("mode.pandas_compatible", True) +def _pandas_util_dir(): + # In pandas 2.0, pandas.util contains public APIs under + # __getattr__ but no __dir__ to find them + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py + return list(importlib.import_module("pandas.util").__dict__.keys()) + [ + "hash_array", + "hash_pandas_object", + "Appender", + "Substitution", + "cache_readonly", + ] + + +pd.util.__dir__ = _pandas_util_dir + + def make_final_proxy_type( name, fast_type, @@ -203,19 +219,6 @@ def Index__new__(cls, *args, **kwargs): }, ) -get_final_type_map()[cudf.StringIndex] = Index -get_final_type_map()[cudf.Int8Index] = Index -get_final_type_map()[cudf.Int8Index] = Index -get_final_type_map()[cudf.Int16Index] = Index -get_final_type_map()[cudf.Int32Index] = Index -get_final_type_map()[cudf.UInt8Index] = Index -get_final_type_map()[cudf.UInt16Index] = Index -get_final_type_map()[cudf.UInt32Index] = Index -get_final_type_map()[cudf.UInt64Index] = Index -get_final_type_map()[cudf.Float32Index] = Index -get_final_type_map()[cudf.GenericIndex] = Index - - RangeIndex = make_final_proxy_type( "RangeIndex", cudf.RangeIndex, @@ -471,17 +474,6 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) - -Int64Index = make_final_proxy_type( - "Int64Index", - cudf.Int64Index, - pd.core.indexes.numeric.Int64Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={"__init__": _DELETE}, -) - UInt8Dtype = make_final_proxy_type( "UInt8Dtype", _Unusable, @@ -518,16 +510,6 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) -UInt64Index = make_final_proxy_type( - "UInt64Index", - cudf.UInt64Index, - pd.core.indexes.numeric.UInt64Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={"__init__": _DELETE}, -) - IntervalIndex = make_final_proxy_type( "IntervalIndex", cudf.IntervalIndex, @@ -593,16 +575,6 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) -Float64Index = make_final_proxy_type( - "Float64Index", - cudf.Float64Index, - pd.core.indexes.numeric.Float64Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={"__init__": _DELETE}, -) - SeriesGroupBy = make_intermediate_proxy_type( "SeriesGroupBy", cudf.core.groupby.groupby.SeriesGroupBy, @@ -1273,8 +1245,6 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): pd.core.indexes.datetimelike.DatetimeTimedeltaMixin, pd.core.indexes.datetimelike.DatetimeIndexOpsMixin, pd.core.indexes.extension.NDArrayBackedExtensionIndex, - pd.core.indexes.numeric.IntegerIndex, - pd.core.indexes.numeric.NumericIndex, pd.core.generic.NDFrame, pd.core.indexes.accessors.PeriodProperties, pd.core.indexes.accessors.Properties, diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 9182246826f..e067d15af4c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ 
b/python/cudf/cudf/testing/_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import itertools import string @@ -19,7 +19,6 @@ import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes -from cudf.api.types import is_scalar from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string from cudf.core.udf.strings_typing import StringView, string_view, udf_string @@ -397,32 +396,6 @@ def assert_column_memory_ne( raise AssertionError("lhs and rhs hold the same memory.") -def _create_pandas_series_float64_default( - data=None, index=None, dtype=None, *args, **kwargs -): - # Wrapper around pd.Series using a float64 - # default dtype for empty data to silence warnings. - # TODO: Remove this in pandas-2.0 upgrade - if dtype is None and ( - data is None or (not is_scalar(data) and len(data) == 0) - ): - dtype = "float64" - return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) - - -def _create_cudf_series_float64_default( - data=None, index=None, dtype=None, *args, **kwargs -): - # Wrapper around cudf.Series using a float64 - # default dtype for empty data to silence warnings. - # TODO: Remove this in pandas-2.0 upgrade - if dtype is None and ( - data is None or (not is_scalar(data) and len(data) == 0) - ): - dtype = "float64" - return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs) - - parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 39fdac0f71a..fc253c5c197 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Union - import cupy as cp import numpy as np import pandas as pd @@ -11,7 +9,7 @@ import cudf from cudf._lib.unary import is_nan from cudf.api.types import ( - is_categorical_dtype, + _is_categorical_dtype, is_numeric_dtype, is_string_dtype, ) @@ -65,25 +63,17 @@ def _check_types( if not exact or exact == "equiv": if ( isinstance(left, cudf.RangeIndex) - and isinstance( - right, - ( - cudf.Int8Index, - cudf.Int16Index, - cudf.Int32Index, - cudf.Int64Index, - ), + and ( + isinstance(right, cudf.Index) + and hasattr(right, "dtype") + and right.dtype.kind == "i" ) ) or ( isinstance(right, cudf.RangeIndex) - and isinstance( - left, - ( - cudf.Int8Index, - cudf.Int16Index, - cudf.Int32Index, - cudf.Int64Index, - ), + and ( + isinstance(left, cudf.Index) + and hasattr(left, "dtype") + and left.dtype.kind == "i" ) ): return @@ -96,7 +86,7 @@ def _check_types( if ( exact and not isinstance(left, cudf.MultiIndex) - and is_categorical_dtype(left) + and _is_categorical_dtype(left) ): if left.dtype != right.dtype: raise_assert_detail( @@ -109,7 +99,6 @@ def assert_column_equal( right, check_dtype=True, check_column_type="equiv", - check_less_precise=False, check_exact=False, check_datetimelike_compat=False, check_categorical=True, @@ -137,8 +126,6 @@ Whether to check the columns class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. - check_less_precise : bool or int, default False - Not yet supported check_exact : bool, default False Whether to compare numbers exactly. 
check_datetimelike_compat : bool, default False @@ -157,8 +144,8 @@ """ if check_dtype is True: if ( - is_categorical_dtype(left) - and is_categorical_dtype(right) + _is_categorical_dtype(left) + and _is_categorical_dtype(right) and not check_categorical ): pass @@ -186,7 +173,7 @@ def assert_column_equal( return if check_exact and check_categorical: - if is_categorical_dtype(left) and is_categorical_dtype(right): + if _is_categorical_dtype(left) and _is_categorical_dtype(right): left_cat = left.categories right_cat = right.categories @@ -220,8 +207,8 @@ def assert_column_equal( if ( not check_dtype - and is_categorical_dtype(left) - and is_categorical_dtype(right) + and _is_categorical_dtype(left) + and _is_categorical_dtype(right) ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) @@ -271,7 +258,7 @@ def assert_column_equal( raise e else: columns_equal = False - if is_categorical_dtype(left) and is_categorical_dtype(right): + if _is_categorical_dtype(left) and _is_categorical_dtype(right): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: @@ -308,7 +295,6 @@ def assert_index_equal( right, exact="equiv", check_names: bool = True, - check_less_precise: Union[bool, int] = False, check_exact: bool = True, check_categorical: bool = True, check_order: bool = True, @@ -332,11 +318,9 @@ exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted - for Int8Index, Int16Index, Int32Index, Int64Index as well. + for Index with an int8/int16/int32/int64 dtype as well. check_names : bool, default True Whether to check the names attribute. - check_less_precise : bool or int, default False - Not yet supported check_exact : bool, default True Whether to compare numbers exactly. check_categorical : bool, default True @@ -420,7 +404,6 @@ exact=check_exact, check_names=check_names, check_exact=check_exact, - check_less_precise=check_less_precise, check_order=check_order, rtol=rtol, atol=atol, @@ -449,7 +432,6 @@ def assert_series_equal( check_dtype=True, check_index_type="equiv", check_series_type=True, - check_less_precise=False, check_names=True, check_exact=False, check_datetimelike_compat=False, @@ -481,8 +463,6 @@ Whether to check the series class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. - check_less_precise : bool or int, default False - Not yet supported check_names : bool, default True Whether to check that the names attribute for both the index and column attributes of the Series is identical. 
@@ -546,7 +526,6 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, @@ -559,7 +538,6 @@ def assert_series_equal( right._column, check_dtype=check_dtype, check_column_type=check_series_type, - check_less_precise=check_less_precise, check_exact=check_exact, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index 1ebafbcb654..7780f9853a2 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -7,6 +7,8 @@ import cudf from cudf.api import types +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_214 +from cudf.testing._utils import expect_warning_if @pytest.mark.parametrize( @@ -115,7 +117,7 @@ ), ) def test_is_categorical_dtype(obj, expect): - assert types.is_categorical_dtype(obj) == expect + assert types._is_categorical_dtype(obj) == expect @pytest.mark.parametrize( @@ -497,8 +499,8 @@ def test_is_integer(obj, expect): (pd.Series(dtype="int"), False), (pd.Series(dtype="float"), False), (pd.Series(dtype="complex"), False), - (pd.Series(dtype="str"), True), - (pd.Series(dtype="unicode"), True), + (pd.Series(dtype="str"), not PANDAS_GE_200), + (pd.Series(dtype="unicode"), not PANDAS_GE_200), (pd.Series(dtype="datetime64[s]"), False), (pd.Series(dtype="timedelta64[s]"), False), (pd.Series(dtype="category"), False), @@ -1035,9 +1037,13 @@ def test_is_decimal_dtype(obj, expect): ), ) def test_pandas_agreement(obj): - assert types.is_categorical_dtype(obj) == pd_types.is_categorical_dtype( - obj - ) + with expect_warning_if( + PANDAS_GE_210, DeprecationWarning if PANDAS_GE_214 else FutureWarning + ): + expected = pd_types.is_categorical_dtype(obj) + with pytest.warns(DeprecationWarning): + actual = types.is_categorical_dtype(obj) + assert expected == actual assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj) assert types.is_integer_dtype(obj) == pd_types.is_integer_dtype(obj) assert types.is_integer(obj) == pd_types.is_integer(obj) diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 32f3e39dd7c..38a34c206d7 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -1,8 +1,9 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
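The test_applymap.py changes below track the pandas 2.1 rename of DataFrame.applymap to DataFrame.map, where the old spelling now emits a FutureWarning. A small version-tolerant sketch of the same idea, using only public pandas API; the getattr fallback is illustrative, not a cudf utility.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
# Prefer DataFrame.map when it exists (pandas >= 2.1); otherwise fall back
# to the older applymap so the snippet also runs on pandas 2.0.
elementwise = getattr(df, "map", df.applymap)
print(elementwise(lambda x: x + 1))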
import pytest from cudf import NA, DataFrame +from cudf.core._compat import PANDAS_GE_210 from cudf.testing import _utils as utils @@ -29,8 +30,10 @@ def test_applymap_dataframe(data, func, na_action): gdf = DataFrame(data) pdf = gdf.to_pandas(nullable=True) - expect = pdf.applymap(func, na_action=na_action) - got = gdf.applymap(func, na_action=na_action) + with utils.expect_warning_if(PANDAS_GE_210): + expect = pdf.applymap(func, na_action=na_action) + with pytest.warns(FutureWarning): + got = gdf.applymap(func, na_action=na_action) utils.assert_eq(expect, got, check_dtype=False) @@ -41,8 +44,10 @@ def test_applymap_raise_cases(): def f(x, some_kwarg=0): return x + some_kwarg - with pytest.raises(NotImplementedError): - df.applymap(f, some_kwarg=1) + with pytest.warns(FutureWarning): + with pytest.raises(NotImplementedError): + df.applymap(f, some_kwarg=1) - with pytest.raises(ValueError): - df.applymap(f, na_action="some_invalid_option") + with pytest.warns(FutureWarning): + with pytest.raises(ValueError): + df.applymap(f, na_action="some_invalid_option") diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 758a8cbb535..58939f0ddd9 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -1,4 +1,5 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. + import numpy as np import pandas as pd import pytest @@ -65,13 +66,12 @@ def test_array_func_cudf_series(np_ar, func): [ lambda x: np.mean(x, axis=0), lambda x: np.sum(x, axis=0), - lambda x: np.var(x, ddof=1), + lambda x: np.var(x, ddof=1, axis=0), lambda x: np.dot(x, x.transpose()), lambda x: np.all(x), lambda x: np.any(x), - lambda x: np.product(x), - lambda x: np.product(x, axis=0), - lambda x: np.product(x, axis=1), + lambda x: np.prod(x, axis=0), + lambda x: np.prod(x, axis=1), ], ) def test_array_func_cudf_dataframe(pd_df, func): @@ -104,11 +104,7 @@ def test_array_func_missing_cudf_dataframe(pd_df, func): @pytest.mark.parametrize( "func", [ - lambda x: np.mean(x), - lambda x: np.sum(x), - lambda x: np.var(x, ddof=1), lambda x: np.unique(x), - lambda x: np.dot(x, x), ], ) def test_array_func_cudf_index(np_ar, func): diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 81950bb8bde..3ba0403d67c 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import operator import warnings @@ -10,8 +10,12 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 -from cudf.testing._utils import assert_eq, set_random_null_mask_inplace +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_LT_300 +from cudf.testing._utils import ( + assert_eq, + expect_warning_if, + set_random_null_mask_inplace, +) _UFUNCS = [ obj @@ -47,6 +51,21 @@ def _hide_ufunc_warnings(ufunc): category=RuntimeWarning, ) yield + elif name in { + "bitwise_and", + "bitwise_or", + "bitwise_xor", + }: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Operation between non boolean Series with different " + "indexes will no longer return a boolean result in " + "a future version. 
Cast both Series to object type " + "to maintain the prior behavior.", + category=FutureWarning, + ) + yield else: yield @@ -57,17 +76,14 @@ def test_ufunc_index(request, ufunc): fname = ufunc.__name__ request.applymarker( pytest.mark.xfail( - condition=( - fname in {"bitwise_and", "bitwise_or", "bitwise_xor"} - and not PANDAS_GE_150 - ), - reason="https://github.com/pandas-dev/pandas/issues/46769", + condition=not hasattr(cp, fname), + reason=f"cupy has no support for '{fname}'", ) ) request.applymarker( pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", + condition=fname == "matmul" and PANDAS_LT_300, + reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", ) ) @@ -165,6 +181,16 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): ) ) + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GE_200 + and fname.startswith("bitwise") + and indexed + and has_nulls, + reason="https://github.com/pandas-dev/pandas/issues/52500", + ) + ) + N = 100 # Avoid zeros in either array to skip division by 0 errors. Also limit the # scale to avoid issues with overflow, etc. We use ints because some @@ -207,7 +233,27 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): assert_eq(g, e, check_exact=False) else: if has_nulls: - expect[mask] = np.nan + with expect_warning_if( + PANDAS_GE_210 + and fname + in ( + "isfinite", + "isinf", + "isnan", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "signbit", + "equal", + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + ) + ): + expect[mask] = np.nan assert_eq(got, expect, check_exact=False) @@ -342,8 +388,9 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): request.applymarker( pytest.mark.xfail( condition=( - indexed - and fname + not PANDAS_GE_200 + and indexed + and fname in { "add", "arctan2", @@ -380,7 +427,7 @@ } ), reason=( - "pandas does not currently support misaligned " + "pandas<2.0 does not currently support misaligned " "indexes in DataFrames" ), ) @@ -433,5 +480,25 @@ assert_eq(g, e, check_exact=False) else: if has_nulls: - expect[mask] = np.nan + with expect_warning_if( + PANDAS_GE_210 + and fname + in ( + "isfinite", + "isinf", + "isnan", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "signbit", + "equal", + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + ) + ): + expect[mask] = np.nan assert_eq(got, expect, check_exact=False) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 9de7dac652c..3ebefa6e071 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,6 @@ import cudf from cudf import Series -from cudf.core._compat import PANDAS_GE_150 from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.index import as_index from cudf.testing import _utils as utils @@ -663,11 +662,11 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) - # Test with a GenericIndex + # Test with an Index pdf2 = pd.DataFrame( {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] ) - # Test with a GenericIndex in a different order + # Test with an Index in a different order pdf3 = pd.DataFrame( {"x": [4, 5, 6, 7], "y": [1, 2, 3, 
7], "z": [0, 5, 3, 7]}, index=[0, 3, 5, 3], @@ -1706,13 +1705,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): "minutes", "seconds", "microseconds", - pytest.param( - "nanoseconds", - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/36589", - ), - ), + "nanoseconds", ], ) @pytest.mark.parametrize( @@ -1724,7 +1717,7 @@ def test_datetime_dateoffset_binaryop( date_col, n_periods, frequency, dtype, op ): gsr = cudf.Series(date_col, dtype=dtype) - psr = gsr.to_pandas() # converts to nanos + psr = gsr.to_pandas() kwargs = {frequency: n_periods} @@ -1758,29 +1751,17 @@ def test_datetime_dateoffset_binaryop( {"months": 2, "years": 5}, {"microseconds": 1, "seconds": 1}, {"months": 2, "years": 5, "seconds": 923, "microseconds": 481}, - pytest.param( - {"milliseconds": 4}, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="Pandas gets the wrong answer for milliseconds", - ), - ), - pytest.param( - {"milliseconds": 4, "years": 2}, - marks=pytest_xfail( - reason="https://github.com/pandas-dev/pandas/issues/49897" - ), - ), - pytest.param( - {"nanoseconds": 12}, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="Pandas gets the wrong answer for nanoseconds", - ), - ), + {"milliseconds": 4}, + {"milliseconds": 4, "years": 2}, {"nanoseconds": 12}, ], ) +@pytest.mark.filterwarnings( + "ignore:Non-vectorized DateOffset:pandas.errors.PerformanceWarning" +) +@pytest.mark.filterwarnings( + "ignore:Discarding nonzero nanoseconds:UserWarning" +) @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") @@ -1816,13 +1797,7 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): "minutes", "seconds", "microseconds", - pytest.param( - "nanoseconds", - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/36589", - ), - ), + "nanoseconds", ], ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 52c50ec58a8..ad32ebce01b 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -11,30 +11,13 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_134 from cudf.testing._utils import ( NUMERIC_TYPES, assert_eq, assert_exceptions_equal, - expect_warning_if, ) -@contextmanager -def _hide_deprecated_pandas_categorical_inplace_warnings(function_name): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - ( - "The `inplace` parameter in " - f"pandas.Categorical.{function_name} is deprecated and will " - "be removed in a future version." - ), - category=FutureWarning, - ) - yield - - @contextmanager def _hide_cudf_safe_casting_warning(): with warnings.catch_warnings(): @@ -362,46 +345,30 @@ def test_categorical_set_categories_preserves_order(): ) -@pytest.mark.parametrize("inplace", [True, False]) -def test_categorical_as_ordered(pd_str_cat, inplace): +def test_categorical_as_ordered(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) assert cd_sr.cat.ordered is False assert cd_sr.cat.ordered == pd_sr.cat.ordered - # pandas internally uses a deprecated call to set_ordered(inplace=inplace) - # inside as_ordered. 
- with pytest.warns(FutureWarning): - pd_sr_1 = pd_sr.cat.as_ordered(inplace=inplace) - with expect_warning_if(inplace, FutureWarning): - cd_sr_1 = cd_sr.cat.as_ordered(inplace=inplace) - if inplace: - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr + pd_sr_1 = pd_sr.cat.as_ordered() + cd_sr_1 = cd_sr.cat.as_ordered() assert cd_sr_1.cat.ordered is True assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered assert str(cd_sr_1) == str(pd_sr_1) -@pytest.mark.parametrize("inplace", [True, False]) -def test_categorical_as_unordered(pd_str_cat, inplace): +def test_categorical_as_unordered(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) assert cd_sr.cat.ordered is True assert cd_sr.cat.ordered == pd_sr.cat.ordered - # pandas internally uses a deprecated call to set_ordered(inplace=inplace) - # inside as_unordered. - with pytest.warns(FutureWarning): - pd_sr_1 = pd_sr.cat.as_unordered(inplace=inplace) - with expect_warning_if(inplace, FutureWarning): - cd_sr_1 = cd_sr.cat.as_unordered(inplace=inplace) - if inplace: - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr + pd_sr_1 = pd_sr.cat.as_unordered() + cd_sr_1 = cd_sr.cat.as_unordered() assert cd_sr_1.cat.ordered is False assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered @@ -410,22 +377,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace): @pytest.mark.parametrize("from_ordered", [True, False]) @pytest.mark.parametrize("to_ordered", [True, False]) -@pytest.mark.parametrize( - "inplace", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", - ), - ), - False, - ], -) -def test_categorical_reorder_categories( - pd_str_cat, from_ordered, to_ordered, inplace -): +def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) @@ -433,39 +385,19 @@ def test_categorical_reorder_categories( assert str(pd_sr) == str(cd_sr) - kwargs = dict(ordered=to_ordered, inplace=inplace) + kwargs = dict( + ordered=to_ordered, + ) - with _hide_deprecated_pandas_categorical_inplace_warnings( - "reorder_categories" - ): - pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) - if inplace: - with pytest.warns(FutureWarning): - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr - else: - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) + pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) assert_eq(pd_sr_1, cd_sr_1) assert str(cd_sr_1) == str(pd_sr_1) -@pytest.mark.parametrize( - "inplace", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", - ), - ), - False, - ], -) -def test_categorical_add_categories(pd_str_cat, inplace): +def test_categorical_add_categories(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) @@ -473,18 +405,8 @@ def test_categorical_add_categories(pd_str_cat, inplace): assert str(pd_sr) == str(cd_sr) - with _hide_deprecated_pandas_categorical_inplace_warnings( - "add_categories" - ): - pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) - - if inplace: - with pytest.warns(FutureWarning): - cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) - pd_sr_1 = pd_sr - cd_sr_1 
= cd_sr - else: - cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) + pd_sr_1 = pd_sr.cat.add_categories(["d"]) + cd_sr_1 = cd_sr.cat.add_categories(["d"]) assert "d" in pd_sr_1.cat.categories.to_list() assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() @@ -492,20 +414,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): assert_eq(pd_sr_1, cd_sr_1) -@pytest.mark.parametrize( - "inplace", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", - ), - ), - False, - ], -) -def test_categorical_remove_categories(pd_str_cat, inplace): +def test_categorical_remove_categories(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) @@ -513,18 +422,8 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert str(pd_sr) == str(cd_sr) - with _hide_deprecated_pandas_categorical_inplace_warnings( - "remove_categories" - ): - pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) - - if inplace: - with pytest.warns(FutureWarning): - cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr - else: - cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) + pd_sr_1 = pd_sr.cat.remove_categories(["a"]) + cd_sr_1 = cd_sr.cat.remove_categories(["a"]) assert "a" not in pd_sr_1.cat.categories.to_list() assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() @@ -532,15 +431,12 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert_eq(pd_sr_1, cd_sr_1) # test using ordered operators - with _hide_deprecated_pandas_categorical_inplace_warnings( - "remove_categories" - ) as _, pytest.warns(FutureWarning) as _: - assert_exceptions_equal( - lfunc=cd_sr.to_pandas().cat.remove_categories, - rfunc=cd_sr.cat.remove_categories, - lfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), - rfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), - ) + assert_exceptions_equal( + lfunc=cd_sr.to_pandas().cat.remove_categories, + rfunc=cd_sr.cat.remove_categories, + lfunc_args_and_kwargs=([["a", "d"]], {}), + rfunc_args_and_kwargs=([["a", "d"]], {}), + ) def test_categorical_dataframe_slice_copy(): diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index c3623f495c0..8e8555b2005 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if cudf.api.types.is_categorical_dtype(col.dtype): + if cudf.api.types._is_categorical_dtype(col.dtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) elif cudf.api.types.is_string_dtype(col.dtype): @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): else: pd_series = series.to_pandas() - if cudf.api.types.is_categorical_dtype(col.dtype): + if cudf.api.types._is_categorical_dtype(col.dtype): # The cudf.Series is constructed from an already sliced column, whereas # the pandas.Series is constructed from the unsliced series and then # sliced, so the indexes should be different and we must ignore it. 
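Several hunks above switch the public is_categorical_dtype for a private _is_categorical_dtype, since pandas 2.1+ deprecates the public predicate (DeprecationWarning from 2.1.4, FutureWarning before, as test_pandas_agreement above expects). A rough sketch of the warning-free spelling under that assumption; _is_cat is an illustrative stand-in, not cudf's implementation.

import pandas as pd


def _is_cat(obj) -> bool:
    # Accept either an object with a .dtype or a dtype itself, then test
    # against pd.CategoricalDtype instead of the deprecated predicate.
    dtype = getattr(obj, "dtype", obj)
    return isinstance(dtype, pd.CategoricalDtype)


print(_is_cat(pd.Series(["a", "b"], dtype="category")))  # True
print(_is_cat(pd.Series([1, 2])))  # False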
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 99d4bdd9910..bf764b02faa 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -1,10 +1,11 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import pandas as pd import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.core.column_accessor import ColumnAccessor from cudf.testing._utils import assert_eq @@ -52,7 +53,15 @@ def test_to_pandas_simple(simple_data): Test that a ColumnAccessor converts to a correct pd.Index """ ca = ColumnAccessor(simple_data) - assert_eq(ca.to_pandas_index(), pd.DataFrame(simple_data).columns) + # We cannot return RangeIndex, while pandas returns RangeIndex. + # Pandas compares `inferred_type` which is `empty` for + # Index([], dtype='object'), and `integer` for RangeIndex() + # to ignore this `inferred_type` comparison, we pass exact=False. + assert_eq( + ca.to_pandas_index(), + pd.DataFrame(simple_data).columns, + exact=not PANDAS_GE_200, + ) def test_to_pandas_multiindex(mi_data): diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 466455eb48c..4b0e46bf286 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,5 +1,7 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +import warnings +from contextlib import contextmanager from decimal import Decimal import numpy as np @@ -7,10 +9,28 @@ import pytest import cudf as gd -from cudf.api.types import is_categorical_dtype -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.api.types import _is_categorical_dtype +from cudf.core._compat import PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import ( + assert_eq, + assert_exceptions_equal, + expect_warning_if, +) + + +@contextmanager +def _hide_concat_empty_dtype_warning(): + with warnings.catch_warnings(): + # Ignoring warnings in this test as warnings are + # being caught and validated in other tests. 
+ warnings.filterwarnings( + "ignore", + "The behavior of array concatenation with empty entries " + "is deprecated.", + category=FutureWarning, + ) + yield def make_frames(index=None, nulls="none"): @@ -62,8 +82,9 @@ def test_concat_dataframe(index, nulls, axis): df_empty1 = gdf_empty1.to_pandas() # DataFrame - res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() - sol = pd.concat([df, df2, df, df_empty1], axis=axis) + with _hide_concat_empty_dtype_warning(): + res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() + sol = pd.concat([df, df2, df, df_empty1], axis=axis) assert_eq( res, sol, @@ -365,7 +386,7 @@ def test_pandas_concat_compatibility_axis1_eq_index(): ps1 = s1.to_pandas() ps2 = s2.to_pandas() - with pytest.warns(FutureWarning): + with expect_warning_if(not PANDAS_GE_200): assert_exceptions_equal( lfunc=pd.concat, rfunc=gd.concat, @@ -472,8 +493,9 @@ def test_concat_series_dataframe_input(objs): pd_objs = objs gd_objs = [gd.from_pandas(obj) for obj in objs] - expected = pd.concat(pd_objs) - actual = gd.concat(gd_objs) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat(pd_objs) + actual = gd.concat(gd_objs) assert_eq( expected.fillna(-1), @@ -578,8 +600,8 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: for key, col in actual[actual.columns].items(): - if is_categorical_dtype(col.dtype): - if not is_categorical_dtype(expected[key].dtype): + if _is_categorical_dtype(col.dtype): + if not _is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = expected[key].fillna("-1").astype("str") @@ -596,7 +618,12 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual[key] = col.fillna(-1) assert_eq(expected, actual, check_dtype=False, check_index_type=True) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -801,13 +828,7 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): axis=axis, ) - if PANDAS_GE_150: - assert_eq(expected, actual, check_index_type=True) - else: - # special handling of check_index_type below - # required because: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -834,23 +855,24 @@ def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): gdf3 = gd.from_pandas(pdf3) gdf_empty1 = gd.from_pandas(pdf_empty1) - assert_eq( - pd.concat( - [pdf1, pdf2, pdf3, pdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - gd.concat( - [gdf1, gdf2, gdf3, gdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - check_index_type=False, - ) + with _hide_concat_empty_dtype_warning(): + assert_eq( + pd.concat( + [pdf1, pdf2, pdf3, pdf_empty1], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + gd.concat( + [gdf1, gdf2, gdf3, gdf_empty1], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + check_index_type=False, + ) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -874,13 +896,7 @@ def test_concat_join_one_df(ignore_index, sort, 
join, axis): [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - if PANDAS_GE_150: - assert_eq(expected, actual, check_index_type=True) - else: - # special handling of check_index_type below - # required because: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize( @@ -904,10 +920,6 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43584", -) def test_concat_join_no_overlapping_columns( pdf1, pdf2, ignore_index, sort, join, axis ): @@ -929,13 +941,7 @@ def test_concat_join_no_overlapping_columns( axis=axis, ) - if PANDAS_GE_150: - assert_eq(expected, actual, check_index_type=True) - else: - # special handling of check_index_type below - # required because: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("ignore_index", [False, True]) @@ -961,20 +967,21 @@ def test_concat_join_no_overlapping_columns_many_and_empty( gdf6 = gd.from_pandas(pdf6) gdf_empty = gd.from_pandas(pdf_empty) - expected = pd.concat( - [pdf4, pdf5, pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = gd.concat( - [gdf4, gdf5, gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf4, pdf5, pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf4, gdf5, gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) assert_eq( expected, actual, @@ -1033,20 +1040,21 @@ def test_concat_join_no_overlapping_columns_many_and_empty2( ): objs_gd = [gd.from_pandas(o) if o is not None else o for o in objs] - expected = pd.concat( - objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = gd.concat( - objs_gd, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + objs_gd, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) assert_eq(expected, actual, check_index_type=False) @@ -1069,24 +1077,27 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( gdf6 = gd.from_pandas(pdf6) gdf_empty = gd.from_pandas(pdf_empty) - expected = pd.concat( - [pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = gd.concat( - [gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=not PANDAS_GE_200, ) - # TODO: change `check_index_type` to 
`True` - # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/46675 - assert_eq(expected, actual, check_index_type=False) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -1097,7 +1108,7 @@ def test_concat_join_series(ignore_index, sort, join, axis): s1 = gd.Series(["a", "b", "c"]) s2 = gd.Series(["a", "b"]) s3 = gd.Series(["a", "b", "c", "d"]) - s4 = gd.Series() + s4 = gd.Series(dtype="str") ps1 = s1.to_pandas() ps2 = s2.to_pandas() @@ -1111,30 +1122,21 @@ def test_concat_join_series(ignore_index, sort, join, axis): ignore_index=ignore_index, axis=axis, ) - actual = gd.concat( - [s1, s2, s3, s4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - if PANDAS_GE_150: - assert_eq( - expected, - actual, - check_index_type=True, - ) - else: - # special handling of check_index_type required below: - # https://github.com/pandas-dev/pandas/issues/46675 - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq( - expected, - actual, - check_index_type=(axis == 0), + with expect_warning_if(axis == 1): + actual = gd.concat( + [s1, s2, s3, s4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, ) + assert_eq( + expected, + actual, + check_index_type=True, + ) + @pytest.mark.parametrize( "df", @@ -1201,8 +1203,8 @@ def test_concat_join_empty_dataframes( if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if is_categorical_dtype(col.dtype): - if not is_categorical_dtype(expected[key].dtype): + if _is_categorical_dtype(col.dtype): + if not _is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = ( @@ -1293,19 +1295,7 @@ def test_concat_join_empty_dataframes( ) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize( - "join", - [ - "inner", - pytest.param( - "outer", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/37937", - ), - ), - ], -) +@pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [1]) def test_concat_join_empty_dataframes_axis_1( df, other, ignore_index, axis, join, sort @@ -1315,16 +1305,25 @@ def test_concat_join_empty_dataframes_axis_1( gdf = gd.from_pandas(df) other_gd = [gdf] + [gd.from_pandas(o) for o in other] - expected = pd.concat( - other_pd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) - actual = gd.concat( - other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + other_pd, + ignore_index=ignore_index, + axis=axis, + join=join, + sort=sort, + ) + actual = gd.concat( + other_gd, + ignore_index=ignore_index, + axis=axis, + join=join, + sort=sort, + ) if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if is_categorical_dtype(col.dtype): + if _is_categorical_dtype(col.dtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") # if not expected.empty: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 318f81c2576..8171f3a1872 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
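Many tests in this patch wrap assertions in expect_warning_if from cudf.testing._utils so the same test body passes on pandas versions that do and do not warn. A minimal sketch of such a helper, assuming pytest is available; the real cudf signature may differ.

import contextlib

import pytest


@contextlib.contextmanager
def expect_warning_if(condition, category=FutureWarning):
    # Assert that `category` is emitted only when `condition` holds;
    # otherwise run the block with no warning expectation at all.
    if condition:
        with pytest.warns(category):
            yield
    else:
        yield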
import codecs import gzip @@ -17,7 +17,7 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_200 from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -247,11 +247,14 @@ def test_csv_reader_datetime(parse_dates): parse_dates=parse_dates, dayfirst=True, ) + # Need to used `date_format='mixed'`, + # https://github.com/pandas-dev/pandas/issues/53355 pdf = pd.read_csv( StringIO(buffer), names=["date1", "date2", "bad"], parse_dates=parse_dates, dayfirst=True, + date_format="mixed", ) assert_eq(gdf, pdf) @@ -369,6 +372,11 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): assert len(out.columns) == len(df_out.columns) assert len(out) == len(df_out) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + out["2"] = out["2"].astype("datetime64[ns]") assert_eq(df_out, out) @@ -587,12 +595,12 @@ def test_csv_reader_NaN_values(): header=None, na_values=custom_na_values, ) - assert gdf.dtypes[0] == "int8" + assert gdf.dtypes.iloc[0] == "int8" assert all(gdf["0"][idx] is cudf.NA for idx in range(len(gdf["0"]))) # data type detection should evaluate the column to object if some nulls gdf = read_csv(StringIO(all_cells), header=None) - assert gdf.dtypes[0] == np.dtype("object") + assert gdf.dtypes.iloc[0] == np.dtype("object") def test_csv_reader_thousands(tmpdir): @@ -1360,10 +1368,6 @@ def test_csv_reader_column_names(names): assert list(df) == list(names) -@pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/rapidsai/cudf/issues/10618", -) def test_csv_reader_repeated_column_name(): buffer = """A,A,A.1,A,A.2,A,A.4,A,A 1,2,3.1,4,a.2,a,a.4,a,a diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0664e7991b5..a0f6c4c3cfc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -12,6 +12,7 @@ import textwrap import warnings from collections import OrderedDict, defaultdict, namedtuple +from contextlib import contextmanager from copy import copy import cupy @@ -23,12 +24,8 @@ from packaging import version import cudf -from cudf.core._compat import ( - PANDAS_GE_134, - PANDAS_GE_150, - PANDAS_GE_200, - PANDAS_LT_140, -) +from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_LT_203 from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.errors import MixedTypeError @@ -37,7 +34,6 @@ ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, - _create_cudf_series_float64_default, assert_eq, assert_exceptions_equal, assert_neq, @@ -64,6 +60,46 @@ pytest_xfail = pytest.mark.skipif +@contextmanager +def _hide_ufunc_warnings(eval_str): + # pandas raises warnings for some inputs to the following ufuncs: + if any( + x in eval_str + for x in { + "arctanh", + "log", + } + ): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "invalid value encountered in", + category=RuntimeWarning, + ) + warnings.filterwarnings( + "ignore", + "divide by zero encountered in", + category=RuntimeWarning, + ) + yield + else: + yield + + +@contextmanager +def _hide_concat_empty_dtype_warning(): + with warnings.catch_warnings(): + # Ignoring warnings in this test as warnings are + # being caught and validated in other tests. 
+ warnings.filterwarnings( + "ignore", + "The behavior of array concatenation with empty " + "entries is deprecated.", + category=FutureWarning, + ) + yield + + def test_init_via_list_of_tuples(): data = [ (5, "cats", "jump", np.nan), @@ -271,7 +307,7 @@ def test_series_from_cupy_scalars(): @pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) @pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) -def test_append_index(a, b): +def test_concat_index(a, b): df = pd.DataFrame() df["a"] = a df["b"] = b @@ -280,19 +316,14 @@ def test_append_index(a, b): gdf["a"] = a gdf["b"] = b - # Check the default index after appending two columns(Series) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = df.a.append(df.b) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.a.append(gdf.b) + expected = pd.concat([df.a, df.b]) + actual = cudf.concat([gdf.a, gdf.b]) assert len(expected) == len(actual) assert_eq(expected.index, actual.index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = df.a.append(df.b, ignore_index=True) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.a.append(gdf.b, ignore_index=True) + expected = pd.concat([df.a, df.b], ignore_index=True) + actual = cudf.concat([gdf.a, gdf.b], ignore_index=True) assert len(expected) == len(actual) assert_eq(expected.index, actual.index) @@ -309,26 +340,9 @@ def test_append_index(a, b): {"a": [1, None, None], "b": [3, np.nan, np.nan]}, {1: ["a", "b", "c"], 2: ["q", "w", "u"]}, {1: ["a", np.nan, "c"], 2: ["q", None, "u"]}, - pytest.param( - {}, - marks=pytest_xfail( - reason="https://github.com/rapidsai/cudf/issues/11080" - ), - ), - pytest.param( - {1: [], 2: [], 3: []}, - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/rapidsai/cudf/issues/11080", - ), - ), - pytest.param( - [1, 2, 3], - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/rapidsai/cudf/issues/11080", - ), - ), + {}, + {1: [], 2: [], 3: []}, + [1, 2, 3], ], ) def test_axes(data): @@ -339,7 +353,7 @@ def test_axes(data): actual = csr.axes for e, a in zip(expected, actual): - assert_eq(e, a) + assert_eq(e, a, exact=not PANDAS_GE_200) def test_dataframe_truncate_axis_0(): @@ -1337,6 +1351,11 @@ def test_dataframe_setitem_from_masked_object(): def test_dataframe_append_to_empty(): pdf = pd.DataFrame() pdf["a"] = [] + if PANDAS_GE_200: + # TODO: Remove this workaround after + # the following bug is fixed: + # https://github.com/pandas-dev/pandas/issues/56679 + pdf["a"] = pdf["a"].astype("str") pdf["b"] = [1, 2, 3] gdf = cudf.DataFrame() @@ -1642,8 +1661,9 @@ def test_dataframe_concat_different_column_types(): "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = cudf.concat([df_1, df_2]) - expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) + with _hide_concat_empty_dtype_warning(): + got = cudf.concat([df_1, df_2]) + expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) # ignoring dtypes as pandas upcasts int to float # on concatenation with empty dataframes @@ -1669,15 +1689,21 @@ def test_concat_empty_dataframe(df_1, df_2): ], ) def test_concat_different_column_dataframe(df1_d, df2_d): - got = cudf.concat( - [cudf.DataFrame(df1_d), cudf.DataFrame(df2_d), cudf.DataFrame(df1_d)], - sort=False, - ) + with _hide_concat_empty_dtype_warning(): + got = 
cudf.concat( + [ + cudf.DataFrame(df1_d), + cudf.DataFrame(df2_d), + cudf.DataFrame(df1_d), + ], + sort=False, + ) pdf1 = pd.DataFrame(df1_d) pdf2 = pd.DataFrame(df2_d) - # pandas warns when trying to concatenate any empty float columns (or float + # pandas(lower than pandas 2.0 only) warns when trying to + # concatenate any empty float columns (or float # columns with all None values) with any non-empty bool columns. def is_invalid_concat(left, right): return ( @@ -1686,7 +1712,7 @@ def is_invalid_concat(left, right): and right.count() == 0 ) - cond = any( + cond = (not PANDAS_GE_200) and any( is_invalid_concat(pdf1[colname], pdf2[colname]) or is_invalid_concat(pdf2[colname], pdf1[colname]) for colname in set(pdf1) & set(pdf2) @@ -1710,8 +1736,9 @@ def is_invalid_concat(left, right): ) @pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) def test_concat_empty_series(ser_1, ser_2): - got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) - expect = pd.concat([ser_1, ser_2]) + with _hide_concat_empty_dtype_warning(): + got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) + expect = pd.concat([ser_1, ser_2]) assert_eq(got, expect, check_index_type=True) @@ -1832,18 +1859,7 @@ def test_nonmatching_index_setitem(nrows): assert_eq(gdf["c"].to_pandas(), gdf_series.to_pandas()) -@pytest.mark.parametrize( - "dtype", - [ - "int", - pytest.param( - "int64[pyarrow]", - marks=pytest.mark.skipif( - not PANDAS_GE_150, reason="pyarrow support only in >=1.5" - ), - ), - ], -) +@pytest.mark.parametrize("dtype", ["int", "int64[pyarrow]"]) def test_from_pandas(dtype): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype) df.columns.name = "custom_column_name" @@ -2670,8 +2686,6 @@ def test_unary_operators(func, pdf, gdf): def test_is_monotonic(gdf): pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) gdf = cudf.DataFrame.from_pandas(pdf) - with pytest.warns(FutureWarning): - assert not gdf.index.is_monotonic assert not gdf.index.is_monotonic_increasing assert not gdf.index.is_monotonic_decreasing @@ -2734,8 +2748,8 @@ def test_decimal_quantile(q, interpolation, decimal_type): def test_empty_quantile(): - pdf = pd.DataFrame({"x": []}) - df = cudf.DataFrame({"x": []}) + pdf = pd.DataFrame({"x": []}, dtype="float64") + df = cudf.DataFrame({"x": []}, dtype="float64") actual = df.quantile() expected = pdf.quantile() @@ -2806,6 +2820,12 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): pa_chunk_array = pa.chunked_array(np_list_data) expect = pd.Series(pa_chunk_array.to_pandas()) + if cudf.api.types.is_datetime64_dtype( + data_type + ) or cudf.api.types.is_timedelta64_dtype(data_type): + # Workaround for an Arrow Bug: + # https://github.com/apache/arrow/issues/34462 + expect = expect.astype(data_type) got = cudf.Series(pa_chunk_array) assert_eq(expect, got) @@ -2820,6 +2840,12 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): ) expect = pa_table.to_pandas() + if cudf.api.types.is_datetime64_dtype( + data_type + ) or cudf.api.types.is_timedelta64_dtype(data_type): + # Workaround for an Arrow Bug: + # https://github.com/apache/arrow/issues/34462 + expect = expect.astype(data_type) got = cudf.DataFrame.from_arrow(pa_table) assert_eq(expect, got) @@ -2981,7 +3007,7 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = _create_cudf_series_float64_default(data, nan_as_null=False) + sr = 
cudf.Series(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) @@ -3527,15 +3553,7 @@ def test_dataframe_empty_sort_index(): [2, 0, 1], ] ), - pytest.param( - pd.RangeIndex(2, -1, -1), - marks=[ - pytest_xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43591", - ) - ], - ), + pd.RangeIndex(2, -1, -1), ], ) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @@ -3596,8 +3614,16 @@ def test_dataframe_sort_index( @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_mulitindex_sort_index( - axis, level, ascending, inplace, ignore_index, na_position + request, axis, level, ascending, inplace, ignore_index, na_position ): + request.applymarker( + pytest.mark.xfail( + condition=axis in (1, "columns") + and ignore_index + and not (level is None and not ascending), + reason="https://github.com/pandas-dev/pandas/issues/56478", + ) + ) pdf = pd.DataFrame( { "b": [1.0, 3.0, np.nan], @@ -3609,17 +3635,14 @@ def test_dataframe_mulitindex_sort_index( ).set_index(["b", "a", 1]) gdf = cudf.DataFrame.from_pandas(pdf) - # ignore_index is supported in v.1.0 - expected = pdf.sort_index( axis=axis, level=level, ascending=ascending, inplace=inplace, na_position=na_position, + ignore_index=ignore_index, ) - if ignore_index is True: - expected = expected got = gdf.sort_index( axis=axis, level=level, @@ -3630,12 +3653,8 @@ def test_dataframe_mulitindex_sort_index( ) if inplace is True: - if ignore_index is True: - pdf = pdf.reset_index(drop=True) assert_eq(pdf, gdf) else: - if ignore_index is True: - expected = expected.reset_index(drop=True) assert_eq(expected, got) @@ -3855,8 +3874,8 @@ def test_dataframe_describe_exclude(): df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(exclude=["float"]) + + gdf_results = df.describe(exclude=["float"]) pdf_results = pdf.describe(exclude=["float"]) assert_eq(gdf_results, pdf_results) @@ -3871,8 +3890,7 @@ def test_dataframe_describe_include(): df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(include=["int"]) + gdf_results = df.describe(include=["int"]) pdf_results = pdf.describe(include=["int"]) assert_eq(gdf_results, pdf_results) @@ -3886,8 +3904,7 @@ def test_dataframe_describe_default(): df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe() + gdf_results = df.describe() pdf_results = pdf.describe() assert_eq(pdf_results, gdf_results) @@ -3904,8 +3921,7 @@ def test_series_describe_include_all(): df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(include="all") + gdf_results = df.describe(include="all") pdf_results = pdf.describe(include="all") assert_eq(gdf_results[["x", "y"]], pdf_results[["x", "y"]]) @@ -3926,8 +3942,7 @@ def test_dataframe_describe_percentiles(): df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(percentiles=sample_percentiles) + gdf_results = df.describe(percentiles=sample_percentiles) pdf_results = 
pdf.describe(percentiles=sample_percentiles) assert_eq(pdf_results, gdf_results) @@ -4130,15 +4145,7 @@ def test_dataframe_round_dict_decimal_validation(): [None, None], [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], [[1, True], [2, False], [3, False]], - pytest.param( - [["a", True], ["b", False], ["c", False]], - marks=[ - pytest_xfail( - reason="NotImplementedError: all does not " - "support columns of object dtype." - ) - ], - ), + [["a", True], ["b", False], ["c", False]], ], ) def test_all(data): @@ -4149,6 +4156,9 @@ def test_all(data): if np.array(data).ndim <= 1: pdata = pd.Series(data=data, dtype=dtype).replace([None], False) gdata = cudf.Series.from_pandas(pdata) + got = gdata.all() + expected = pdata.all() + assert_eq(got, expected) else: pdata = pd.DataFrame(data, columns=["a", "b"], dtype=dtype).replace( [None], False @@ -4161,12 +4171,9 @@ def test_all(data): expected = pdata.all(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.all(level="a") - - got = gdata.all() - expected = pdata.all() - assert_eq(got, expected) + got = gdata.all() + expected = pdata.all() + assert_eq(got, expected) @pytest.mark.parametrize( @@ -4186,21 +4193,13 @@ def test_all(data): [None, None], [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], [[1, True], [2, False], [3, False]], - pytest.param( - [["a", True], ["b", False], ["c", False]], - marks=[ - pytest_xfail( - reason="NotImplementedError: any does not " - "support columns of object dtype." - ) - ], - ), + [["a", True], ["b", False], ["c", False]], ], ) @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): # Provide a dtype when data is empty to avoid future pandas changes. - dtype = None if data else float + dtype = float if all(x is None for x in data) or len(data) < 1 else None if np.array(data).ndim <= 1: pdata = pd.Series(data=data, dtype=dtype) gdata = cudf.Series(data=data, dtype=dtype) @@ -4222,12 +4221,9 @@ def test_any(data, axis): expected = pdata.any(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.any(level="a") - - got = gdata.any(axis=axis) - expected = pdata.any(axis=axis) - assert_eq(got, expected) + got = gdata.any(axis=axis) + expected = pdata.any(axis=axis) + assert_eq(got, expected) @pytest.mark.parametrize("axis", [0, 1]) @@ -4272,8 +4268,7 @@ def test_empty_dataframe_describe(): gdf = cudf.from_pandas(pdf) expected = pdf.describe() - with pytest.warns(FutureWarning): - actual = gdf.describe() + actual = gdf.describe() assert_eq(expected, actual) @@ -4746,7 +4741,7 @@ def test_dataframe_fillna_preserves_column_rangeindex(): ) def test_series_values_host_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = _create_cudf_series_float64_default(data) + gds = cudf.Series(data=data, dtype=None if data else float) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -4769,7 +4764,7 @@ def test_series_values_host_property(data): ) def test_series_values_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = _create_cudf_series_float64_default(data) + gds = cudf.from_pandas(pds) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) @@ -5202,7 +5197,9 @@ def test_df_constructor_dtype(dtype): { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], - "c": [np.NaN, np.NaN, np.NaN, np.NaN], + "c": cudf.Series( + [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False + ), "d": cudf.Series([None, None, None, None], 
dtype="int64"), "e": [100, None, 200, None], "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), @@ -5222,38 +5219,39 @@ def test_df_constructor_dtype(dtype): "op", ["max", "min", "sum", "product", "mean", "var", "std"] ) @pytest.mark.parametrize("skipna", [True, False]) -def test_rowwise_ops(data, op, skipna): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_rowwise_ops(data, op, skipna, numeric_only): gdf = data pdf = gdf.to_pandas() - kwargs = {"axis": 1, "skipna": skipna} + kwargs = {"axis": 1, "skipna": skipna, "numeric_only": numeric_only} if op in ("var", "std"): kwargs["ddof"] = 0 - with expect_warning_if( - not all( - ( - (pdf[column].count() == 0) - if skipna - else (pdf[column].notna().count() == 0) - ) - or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) - for column in pdf + if not numeric_only and not all( + ( + (pdf[column].count() == 0) + if skipna + else (pdf[column].notna().count() == 0) ) + or cudf.api.types.is_numeric_dtype(pdf[column].dtype) + or cudf.api.types.is_bool_dtype(pdf[column].dtype) + for column in pdf ): + with pytest.raises(TypeError): + expected = getattr(pdf, op)(**kwargs) + with pytest.raises(TypeError): + got = getattr(gdf, op)(**kwargs) + else: expected = getattr(pdf, op)(**kwargs) - with expect_warning_if( - not all( - cudf.api.types.is_numeric_dtype(gdf[column].dtype) - or cudf.api.types.is_bool_dtype(gdf[column].dtype) - for column in gdf - ), - UserWarning, - ): got = getattr(gdf, op)(**kwargs) - assert_eq(expected, got, check_exact=False) + assert_eq( + expected, + got, + check_dtype=False, + check_index_type=False if len(got.index) == 0 else True, + ) @pytest.mark.parametrize( @@ -5283,67 +5281,18 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): @pytest.mark.parametrize( - "op,expected", + "op", [ - ( - "max", - cudf.Series( - [10.0, None, np.NaN, 2234.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "min", - cudf.Series( - [10.0, None, np.NaN, 13.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "sum", - cudf.Series( - [20.0, None, np.NaN, 2247.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "product", - cudf.Series( - [100.0, None, np.NaN, 29042.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "mean", - cudf.Series( - [10.0, None, np.NaN, 1123.5, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "var", - cudf.Series( - [0.0, None, np.NaN, 1233210.25, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "std", - cudf.Series( - [0.0, None, np.NaN, 1110.5, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), + "max", + "min", + "sum", + "product", + "mean", + "var", + "std", ], ) -def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): +def test_rowwise_ops_nullable_dtypes_partial_null(op): gdf = cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], @@ -5356,10 +5305,12 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): if op in ("var", "std"): got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) + expected = getattr(gdf.to_pandas(), op)(axis=1, ddof=0, skipna=False) else: got = getattr(gdf, op)(axis=1, skipna=False) + expected = getattr(gdf.to_pandas(), op)(axis=1, skipna=False) - assert_eq(got.null_count, expected.null_count) + assert_eq(got.null_count, 2) assert_eq(got, expected) @@ -5502,23 +5453,37 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): ) 
@pytest.mark.parametrize("op", ["max", "min"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_rowwise_ops_datetime_dtypes(data, op, skipna): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - with expect_warning_if( - not all(cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes), - UserWarning, - ): - got = getattr(gdf, op)(axis=1, skipna=skipna) - with expect_warning_if( - not all(pd.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes), - FutureWarning, + if not numeric_only and not all( + cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes ): - expected = getattr(pdf, op)(axis=1, skipna=skipna) - - assert_eq(got, expected) + with pytest.raises(TypeError): + got = getattr(gdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + with pytest.raises(TypeError): + expected = getattr(pdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + else: + got = getattr(gdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + expected = getattr(pdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + if got.dtype == cudf.dtype( + "datetime64[us]" + ) and expected.dtype == np.dtype("datetime64[ns]"): + # Workaround for a PANDAS-BUG: + # https://github.com/pandas-dev/pandas/issues/52524 + assert_eq(got.astype("datetime64[ns]"), expected) + else: + assert_eq(got, expected, check_dtype=False) @pytest.mark.parametrize( @@ -5702,39 +5667,32 @@ def test_cov_nans(): ], ) def test_df_sr_binop(gsr, colnames, op): - # Anywhere that the column names of the DataFrame don't match the index - # names of the Series will trigger a deprecated reindexing. Since this - # behavior is deprecated in pandas, this test is temporarily silencing - # those warnings until cudf updates to pandas 2.0 as its compatibility - # target, at which point a large number of the parametrizations can be - # removed altogether (along with this warnings filter). 
- with warnings.catch_warnings(): - assert version.parse(pd.__version__) < version.parse("2.0.0") - warnings.filterwarnings( - action="ignore", - category=FutureWarning, - message=( - "Automatic reindexing on DataFrame vs Series comparisons is " - "deprecated" - ), - ) - data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] - data = dict(zip(colnames, data)) + data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] + data = dict(zip(colnames, data)) - gsr = gsr.astype("float64") + gsr = gsr.astype("float64") - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas(nullable=True) + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas(nullable=True) - psr = gsr.to_pandas(nullable=True) + psr = gsr.to_pandas(nullable=True) + try: expect = op(pdf, psr) + except ValueError: + with pytest.raises(ValueError): + op(gdf, gsr) + with pytest.raises(ValueError): + op(psr, pdf) + with pytest.raises(ValueError): + op(gsr, gdf) + else: got = op(gdf, gsr).to_pandas(nullable=True) - assert_eq(expect, got, check_dtype=False) + assert_eq(expect, got, check_dtype=False, check_like=True) expect = op(psr, pdf) got = op(gsr, gdf).to_pandas(nullable=True) - assert_eq(expect, got, check_dtype=False) + assert_eq(expect, got, check_dtype=False, check_like=True) @pytest_unmark_spilling @@ -6677,12 +6635,22 @@ def test_dataframe_init_1d_list(data, columns): expect = pd.DataFrame(data, columns=columns) actual = cudf.DataFrame(data, columns=columns) - assert_eq(expect, actual, check_index_type=len(data) != 0) + assert_eq( + expect, + actual, + check_index_type=len(data) != 0, + check_column_type=not PANDAS_GE_200 and len(data) == 0, + ) expect = pd.DataFrame(data, columns=None) actual = cudf.DataFrame(data, columns=None) - assert_eq(expect, actual, check_index_type=len(data) != 0) + assert_eq( + expect, + actual, + check_index_type=len(data) != 0, + check_column_type=not PANDAS_GE_200 and len(data) == 0, + ) @pytest.mark.parametrize( @@ -6784,7 +6752,13 @@ def test_dataframe_init_from_arrays_cols(data, cols, index): None, ], ) -def test_dataframe_assign_scalar(col_data, assign_val): +def test_dataframe_assign_scalar(request, col_data, assign_val): + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GE_200 and len(col_data) == 0, + reason="https://github.com/pandas-dev/pandas/issues/56679", + ) + ) pdf = pd.DataFrame({"a": col_data}) gdf = cudf.DataFrame({"a": col_data}) @@ -6859,7 +6833,7 @@ def test_dataframe_info_basic(): str_cmp = textwrap.dedent( """\ - StringIndex: 10 entries, a to 1111 + Index: 10 entries, a to 1111 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- @@ -6932,7 +6906,7 @@ def test_dataframe_info_verbose_mem_usage(): str_cmp = textwrap.dedent( """\ - StringIndex: 3 entries, sdfdsf to dsfdf + Index: 3 entries, sdfdsf to dsfdf Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- @@ -7538,7 +7512,11 @@ def test_dataframe_from_dict_cp_np_arrays( def test_dataframe_keys(df): gdf = cudf.from_pandas(df) - assert_eq(df.keys(), gdf.keys()) + assert_eq( + df.keys(), + gdf.keys(), + exact=not (PANDAS_GE_200 and len(gdf.columns) == 0), + ) @pytest.mark.parametrize( @@ -7631,22 +7609,42 @@ def test_series_keys(ps): ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_append_dataframe(df, other, sort, ignore_index): +def test_dataframe_concat_dataframe(df, other, sort, ignore_index): pdf = df other_pd = other gdf = 
cudf.from_pandas(df) other_gd = cudf.from_pandas(other) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, sort=sort, ignore_index=ignore_index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf, other_pd], sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf, other_gd], sort=sort, ignore_index=ignore_index + ) + + # In empty dataframe cases, Pandas & cudf differ in columns + # creation, pandas creates RangeIndex(0, 0) + # whereas cudf creates an empty Index([], dtype="object"). + check_column_type = ( + False if len(expected.columns) == len(df.columns) == 0 else True + ) if expected.shape != df.shape: - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=check_column_type, + ) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=check_column_type, + ) @pytest_unmark_spilling @@ -7678,33 +7676,24 @@ def test_dataframe_append_dataframe(df, other, sort, ignore_index): "other", [ pd.Series([10, 11, 23, 234, 13]), - pytest.param( - pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="pandas bug: " - "https://github.com/pandas-dev/pandas/issues/35092", - ), - ), + pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), {1: 1}, {0: 10, 1: 100, 2: 102}, ], ) @pytest.mark.parametrize("sort", [False, True]) -def test_dataframe_append_series_dict(df, other, sort): +def test_dataframe_concat_series(df, other, sort): pdf = df - other_pd = other - gdf = cudf.from_pandas(df) - if isinstance(other, pd.Series): - other_gd = cudf.from_pandas(other) + + if isinstance(other, dict): + other_pd = pd.Series(other) else: - other_gd = other + other_pd = other + other_gd = cudf.from_pandas(other_pd) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, ignore_index=True, sort=sort) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, ignore_index=True, sort=sort) + expected = pd.concat([pdf, other_pd], ignore_index=True, sort=sort) + actual = cudf.concat([gdf, other_gd], ignore_index=True, sort=sort) if expected.shape != df.shape: # Ignore the column type comparison because pandas incorrectly @@ -7721,20 +7710,18 @@ def test_dataframe_append_series_dict(df, other, sort): assert_eq(expected, actual, check_index_type=not gdf.empty) -def test_dataframe_append_series_mixed_index(): +def test_dataframe_concat_series_mixed_index(): df = cudf.DataFrame({"first": [], "d": []}) + pdf = df.to_pandas() + sr = cudf.Series([1, 2, 3, 4]) + psr = sr.to_pandas() - with pytest.raises( - TypeError, - match=re.escape( - "cudf does not support mixed types, please type-cast " - "the column index of dataframe and index of series " - "to same dtypes." 
- ), - ): - with pytest.warns(FutureWarning, match="append method is deprecated"): - df.append(sr, ignore_index=True) + assert_eq( + cudf.concat([df, sr], ignore_index=True), + pd.concat([pdf, psr], ignore_index=True), + check_dtype=False, + ) @pytest_unmark_spilling @@ -7859,24 +7846,41 @@ def test_dataframe_append_series_mixed_index(): ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_append_dataframe_lists(df, other, sort, ignore_index): +def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): pdf = df other_pd = other gdf = cudf.from_pandas(df) - other_gd = [ - cudf.from_pandas(o) if isinstance(o, pd.DataFrame) else o - for o in other - ] + other_gd = [cudf.from_pandas(o) for o in other] + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf] + other_pd, sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf] + other_gd, sort=sort, ignore_index=ignore_index + ) + + # In some cases, Pandas creates an empty Index([], dtype="object") for + # columns whereas cudf creates a RangeIndex(0, 0). + check_column_type = ( + False if len(expected.columns) == len(df.columns) == 0 else True + ) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, sort=sort, ignore_index=ignore_index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) if expected.shape != df.shape: - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=check_column_type, + ) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=check_column_type, + ) @pytest.mark.parametrize( @@ -7890,7 +7894,8 @@ def test_dataframe_append_dataframe_lists(df, other, sort, ignore_index): def test_dataframe_bfill(df, alias): gdf = cudf.from_pandas(df) - actual = getattr(df, alias)() + with expect_warning_if(PANDAS_GE_200 and alias == "backfill"): + actual = getattr(df, alias)() with expect_warning_if(alias == "backfill"): expected = getattr(gdf, alias)() assert_eq(expected, actual) @@ -7907,7 +7912,8 @@ def test_dataframe_bfill(df, alias): def test_dataframe_ffill(df, alias): gdf = cudf.from_pandas(df) - actual = getattr(df, alias)() + with expect_warning_if(PANDAS_GE_200 and alias == "pad"): + actual = getattr(df, alias)() with expect_warning_if(alias == "pad"): expected = getattr(gdf, alias)() assert_eq(expected, actual) @@ -7956,20 +7962,20 @@ def test_dataframe_ffill(df, alias): ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_append_lists(df, other, sort, ignore_index): +def test_dataframe_concat_lists(df, other, sort, ignore_index): pdf = df - other_pd = other + other_pd = [pd.DataFrame(o) for o in other] gdf = cudf.from_pandas(df) - other_gd = [ - cudf.from_pandas(o) if isinstance(o, pd.DataFrame) else o - for o in other - ] + other_gd = [cudf.from_pandas(o) for o in other_pd] - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, sort=sort, ignore_index=ignore_index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) + with 
_hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf] + other_pd, sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf] + other_gd, sort=sort, ignore_index=ignore_index + ) if expected.shape != df.shape: assert_eq( @@ -7979,20 +7985,21 @@ def test_dataframe_append_lists(df, other, sort, ignore_index): check_column_type=not gdf.empty, ) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=PANDAS_GE_200 and len(gdf.columns) != 0, + ) -def test_dataframe_append_error(): +def test_dataframe_concat_series_without_name(): df = cudf.DataFrame({"a": [1, 2, 3]}) - ps = cudf.Series([1, 2, 3]) + pdf = df.to_pandas() + gs = cudf.Series([1, 2, 3]) + ps = gs.to_pandas() - with pytest.raises( - TypeError, - match="Can only append a Series if ignore_index=True " - "or if the Series has a name", - ): - with pytest.warns(FutureWarning, match="append method is deprecated"): - df.append(ps) + assert_eq(pd.concat([pdf, ps]), cudf.concat([df, gs])) def test_cudf_arrow_array_error(): @@ -8272,6 +8279,7 @@ def test_dataframe_init_with_columns(data, columns, request): gdf, check_index_type=len(pdf.index) != 0, check_dtype=not (pdf.empty and len(pdf.columns)), + check_column_type=not PANDAS_GE_200, ) @@ -8365,7 +8373,12 @@ def test_dataframe_init_from_series_list(data, ignore_dtype, columns, request): check_index_type=True, ) else: - assert_eq(expected, actual, check_index_type=True) + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=not PANDAS_GE_200, + ) @pytest_unmark_spilling @@ -8464,7 +8477,7 @@ def test_dataframe_init_from_series_list_with_index( actual = actual.sort_index(axis=1) assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) else: - assert_eq(expected, actual) + assert_eq(expected, actual, check_column_type=not PANDAS_GE_200) @pytest.mark.parametrize( @@ -8604,8 +8617,8 @@ def test_dataframe_iterrows_itertuples(): def test_describe_misc_include(df, include): pdf = df.to_pandas() - expected = pdf.describe(include=include, datetime_is_numeric=True) - actual = df.describe(include=include, datetime_is_numeric=True) + expected = pdf.describe(include=include) + actual = df.describe(include=include) for col in expected.columns: if expected[col].dtype == np.dtype("object"): @@ -8671,8 +8684,8 @@ def test_describe_misc_include(df, include): def test_describe_misc_exclude(df, exclude): pdf = df.to_pandas() - expected = pdf.describe(exclude=exclude, datetime_is_numeric=True) - actual = df.describe(exclude=exclude, datetime_is_numeric=True) + expected = pdf.describe(exclude=exclude) + actual = df.describe(exclude=exclude) for col in expected.columns: if expected[col].dtype == np.dtype("object"): @@ -8720,8 +8733,18 @@ def test_describe_misc_exclude(df, exclude): ) @pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) -def test_dataframe_mode(df, numeric_only, dropna): +def test_dataframe_mode(request, df, numeric_only, dropna): pdf = df.to_pandas() + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GE_200 + and PANDAS_LT_203 + and numeric_only is False + and "b" in df.columns + and df["b"].dtype == np.dtype("timedelta64[s]"), + reason="https://github.com/pandas-dev/pandas/issues/53497", + ) + ) expected = pdf.mode(numeric_only=numeric_only, dropna=dropna) actual = df.mode(numeric_only=numeric_only, dropna=dropna) @@ -9065,7 +9088,12 @@ def assert_local_eq(actual, 
df, expected, host_columns): check_index_type=check_index_type, ) else: - assert_eq(expected, actual, check_index_type=check_index_type) + assert_eq( + expected, + actual, + check_index_type=check_index_type, + check_column_type=not PANDAS_GE_200, + ) if df.empty and columns is None and not PANDAS_GE_200: request.node.add_marker( @@ -9098,17 +9126,8 @@ def test_dataframe_constructor_column_index_only(): @pytest.mark.parametrize( "data", [ - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, - {"a": [1.0, 2.0, 3.0], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, - {"a": [1, 2, 3], "b": [3, 4, 5], "c": [True, True, False]}, - {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, - { - "a": [1.0, 2.0, 3.0], - "b": [True, True, False], - "c": [False, True, False], - }, - {"a": [1, 2, 3], "b": [3, 4, 5], "c": [2.0, 3.0, 4.0]}, - {"a": [1, 2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, + {"a": [1, 2.5, 3], "b": [3, 4.5, 5], "c": [2.0, 3.0, 4.0]}, + {"a": [1, 2.2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, ], ) @pytest.mark.parametrize( @@ -9133,14 +9152,36 @@ def test_agg_for_dataframes(data, aggs): expect = pdf.agg(aggs).sort_index() got = gdf.agg(aggs).sort_index() - assert_eq(expect, got, check_dtype=False) + + assert_eq(expect, got, check_dtype=True) + + +@pytest_unmark_spilling +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, + {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, + ], +) +@pytest.mark.parametrize( + "aggs", + [ + ["min", "sum", "max"], + "sum", + {"a": "sum", "b": "min", "c": "max"}, + ], +) +def test_agg_for_dataframes_error(data, aggs): + gdf = cudf.DataFrame(data) + + with pytest.raises(TypeError): + gdf.agg(aggs) @pytest.mark.parametrize("aggs", [{"a": np.sum, "b": np.min, "c": np.max}]) def test_agg_for_unsupported_function(aggs): - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) with pytest.raises(NotImplementedError): gdf.agg(aggs) @@ -9148,9 +9189,7 @@ def test_agg_for_unsupported_function(aggs): @pytest.mark.parametrize("aggs", ["asdf"]) def test_agg_for_dataframe_with_invalid_function(aggs): - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) with pytest.raises( AttributeError, @@ -9161,9 +9200,7 @@ def test_agg_for_dataframe_with_invalid_function(aggs): @pytest.mark.parametrize("aggs", [{"a": "asdf"}]) def test_agg_for_series_with_invalid_function(aggs): - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) with pytest.raises( AttributeError, @@ -9496,16 +9533,7 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): pdf = pd.DataFrame(data, index=p_index, columns=labels) gdf = cudf.from_pandas(pdf) - if PANDAS_GE_134: - expect = pdf.explode(label_to_explode, ignore_index) - else: - # https://github.com/pandas-dev/pandas/issues/43314 - if isinstance(label_to_explode, int): - pdlabel_to_explode = [label_to_explode] - else: - pdlabel_to_explode = label_to_explode - expect = pdf.explode(pdlabel_to_explode, ignore_index) - + expect = pdf.explode(label_to_explode, ignore_index) got = gdf.explode(label_to_explode, ignore_index) assert_eq(expect, got, check_dtype=False) @@ -10044,30 +10072,33 @@ def 
test_dataframe_rename_duplicate_column(): ], ) @pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +@pytest.mark.parametrize( + "fill_method", ["ffill", "bfill", "pad", "backfill", no_default] +) def test_dataframe_pct_change(data, periods, fill_method): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - actual = gdf.pct_change(periods=periods, fill_method=fill_method) - expected = pdf.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if(fill_method is not no_default): + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if( + PANDAS_GE_210 + and (fill_method is not no_default or pdf.isna().any().any()) + ): + expected = pdf.pct_change(periods=periods, fill_method=fill_method) assert_eq(expected, actual) -def test_mean_timeseries(): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_mean_timeseries(numeric_only): gdf = cudf.datasets.timeseries() + if not numeric_only: + gdf = gdf.select_dtypes(include="number") pdf = gdf.to_pandas() - expected = pdf.mean(numeric_only=True) - actual = gdf.mean(numeric_only=True) - - assert_eq(expected, actual) - - with pytest.warns(FutureWarning): - expected = pdf.mean() - with pytest.warns(FutureWarning): - actual = gdf.mean() + expected = pdf.mean(numeric_only=numeric_only) + actual = gdf.mean(numeric_only=numeric_only) assert_eq(expected, actual) @@ -10082,19 +10113,15 @@ def test_mean_timeseries(): } ], ) -def test_std_different_dtypes(data): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_std_different_dtypes(data, numeric_only): gdf = cudf.DataFrame(data) + if not numeric_only: + gdf = gdf.select_dtypes(include="number") pdf = gdf.to_pandas() - expected = pdf.std(numeric_only=True) - actual = gdf.std(numeric_only=True) - - assert_eq(expected, actual) - - with pytest.warns(FutureWarning): - expected = pdf.std() - with pytest.warns(FutureWarning): - actual = gdf.std() + expected = pdf.std(numeric_only=numeric_only) + actual = gdf.std(numeric_only=numeric_only) assert_eq(expected, actual) @@ -10114,7 +10141,7 @@ def test_empty_numeric_only(data): pdf = gdf.to_pandas() expected = pdf.prod(numeric_only=True) actual = gdf.prod(numeric_only=True) - assert_eq(expected, actual) + assert_eq(expected, actual, check_dtype=True) @pytest.fixture(params=[0, 10], ids=["empty", "10"]) @@ -10188,7 +10215,8 @@ def df_eval(request): ) def test_dataframe_eval(df_eval, expr, dtype): df_eval = df_eval.astype(dtype) - expect = df_eval.to_pandas().eval(expr) + with _hide_ufunc_warnings(expr): + expect = df_eval.to_pandas().eval(expr) got = df_eval.eval(expr) # In the specific case where the evaluated expression is a unary function # of a single column with no nesting, pandas will retain the name. 
This @@ -10198,7 +10226,8 @@ def test_dataframe_eval(df_eval, expr, dtype): # Test inplace if re.search("[^=><]=[^=]", expr) is not None: pdf_eval = df_eval.to_pandas() - pdf_eval.eval(expr, inplace=True) + with _hide_ufunc_warnings(expr): + pdf_eval.eval(expr, inplace=True) df_eval.eval(expr, inplace=True) assert_eq(pdf_eval, df_eval) @@ -10830,7 +10859,7 @@ def test_dataframe_contains(name, contains, other_names): assert (contains in pdf) == expectation assert (contains in gdf) == expectation elif pd.api.types.is_float_dtype(gdf.columns.dtype): - # In some cases, the columns are converted to a Float64Index based on + # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. expectation = contains is np.nan and (name is None or name is np.nan) assert (contains in pdf) == expectation diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index 98f801d0cba..320c221fcb2 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import numpy as np @@ -52,6 +52,6 @@ def test_make_bool(): n = 10 state = np.random.RandomState(12) arr = gd.datasets.make_bool(n, state) - assert np.alltrue(np.isin(arr, [True, False])) + assert np.all(np.isin(arr, [True, False])) assert arr.size == n assert arr.dtype == bool diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 0ccc4fc06ab..24d8aa052e8 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2,6 +2,7 @@ import datetime import operator +import warnings import cupy as cp import numpy as np @@ -12,7 +13,7 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 +from cudf.core._compat import PANDAS_EQ_200, PANDAS_GE_200, PANDAS_GE_210 from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -191,8 +192,8 @@ def test_dt_series(data, field): pd_data = pd.Series(data.copy()) gdf_data = Series(pd_data) base = getattr(pd_data.dt, field) - test = getattr(gdf_data.dt, field).to_pandas().astype("int64") - assert_eq(base, test) + test = getattr(gdf_data.dt, field) + assert_eq(base, test, check_dtype=False) @pytest.mark.parametrize("data", [data1(), data2()]) @@ -200,7 +201,7 @@ def test_dt_series(data, field): def test_dt_index(data, field): pd_data = data.copy() gdf_data = DatetimeIndex(pd_data) - assert_eq(getattr(gdf_data, field), getattr(pd_data, field)) + assert_eq(getattr(gdf_data, field), getattr(pd_data, field), exact=False) def test_setitem_datetime(): @@ -614,47 +615,23 @@ def test_datetime_dataframe(): ], ) @pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize("infer_datetime_format", [True, False]) -def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): +def test_cudf_to_datetime(data, dayfirst): pd_data = data - is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) - is_string_data = ( - gd_data.ndim == 1 - and not gd_data.empty - and gd_data.dtype.kind == "O" - ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data - is_string_data = isinstance(gd_data, list) and isinstance( - 
next(iter(gd_data), None), str - ) - if dayfirst and not infer_datetime_format and is_string_data: - # Note: pandas<2.0 also does not respect dayfirst=True correctly - # for object data - with pytest.raises(NotImplementedError): - cudf.to_datetime( - gd_data, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - ) + expected = pd.to_datetime(pd_data, dayfirst=dayfirst) + actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) + + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) else: - expected = pd.to_datetime( - pd_data, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - ) - actual = cudf.to_datetime( - gd_data, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - ) - assert_eq(actual, expected) + assert_eq(actual, expected, check_exact=False) @pytest.mark.parametrize( @@ -693,12 +670,14 @@ def test_to_datetime_errors(data): else: gd_data = pd_data - assert_exceptions_equal( - pd.to_datetime, - cudf.to_datetime, - ([pd_data],), - ([gd_data],), - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + assert_exceptions_equal( + pd.to_datetime, + cudf.to_datetime, + ([pd_data],), + ([gd_data],), + ) def test_to_datetime_not_implemented(): @@ -743,7 +722,10 @@ def test_to_datetime_units(data, unit): expected = pd.to_datetime(pd_data, unit=unit) actual = cudf.to_datetime(gd_data, unit=unit) - assert_eq(actual, expected) + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, expected, exact=False, check_exact=False) @pytest.mark.parametrize( @@ -793,14 +775,19 @@ def test_to_datetime_format(data, format, infer_datetime_format): else: gd_data = pd_data - expected = pd.to_datetime( - pd_data, format=format, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, format=format, infer_datetime_format=infer_datetime_format - ) + with expect_warning_if(True, UserWarning): + expected = pd.to_datetime( + pd_data, format=format, infer_datetime_format=infer_datetime_format + ) + with expect_warning_if(not infer_datetime_format): + actual = cudf.to_datetime( + gd_data, format=format, infer_datetime_format=infer_datetime_format + ) - assert_eq(actual, expected) + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, expected, check_exact=False) def test_to_datetime_data_out_of_range_for_format(): @@ -864,7 +851,8 @@ def test_datetime_scalar_timeunit_cast(timeunit): gs = Series(testscalar) ps = pd.Series(testscalar) - assert_eq(ps, gs) + + assert_eq(ps, gs, check_dtype=False) gdf = DataFrame() gdf["a"] = np.arange(5) @@ -874,7 +862,8 @@ def test_datetime_scalar_timeunit_cast(timeunit): pdf["a"] = np.arange(5) pdf["b"] = testscalar - assert_eq(pdf, gdf) + assert gdf["b"].dtype == cudf.dtype("datetime64[s]") + assert_eq(pdf, gdf, check_dtype=True) @pytest.mark.parametrize( @@ -926,6 +915,7 @@ def test_str_to_datetime_error(): np.datetime64("2005-02-25"), np.datetime64("2005-02-25T03:30"), np.datetime64("nat"), + # TODO: https://github.com/pandas-dev/pandas/issues/52295 ], ) @pytest.mark.parametrize("data_dtype", DATETIME_TYPES) @@ -1295,9 +1285,8 @@ def test_datetime_reductions(data, op, dtype): def test_datetime_infer_format(data, timezone, dtype): ts_data = np.datetime_as_string(data, timezone=timezone) sr = cudf.Series(ts_data) + psr = pd.Series(ts_data) if timezone == "naive": - psr = pd.Series(ts_data) - expected = psr.astype(dtype) actual = 
sr.astype(dtype) @@ -1505,26 +1494,12 @@ def test_is_month_start(data, dtype): date_range_test_periods = [1, 10, 100] date_range_test_freq = [ {"months": 3, "years": 1}, - pytest.param( - {"hours": 10, "days": 57, "nanoseconds": 3}, - marks=pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="Pandas ignoring nanoseconds component. " - "https://github.com/pandas-dev/pandas/issues/44393", - ), - ), + {"hours": 10, "days": 57, "nanoseconds": 3}, "83D", "17h", "-680T", "110546s", - pytest.param( - "110546789L", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="Pandas DateOffset ignores milliseconds. " - "https://github.com/pandas-dev/pandas/issues/43371", - ), - ), + "110546789L", "110546789248U", ] @@ -1573,7 +1548,8 @@ def test_date_range_start_end_freq(request, start, end, freq): request.applymarker( pytest.mark.xfail( condition=( - isinstance(freq, dict) + not PANDAS_GE_200 + and isinstance(freq, dict) and freq.get("hours", None) == 10 and freq.get("days", None) == 57 and freq.get("nanoseconds", None) == 3 @@ -1627,7 +1603,8 @@ def test_date_range_start_freq_periods(request, start, freq, periods): request.applymarker( pytest.mark.xfail( condition=( - isinstance(freq, dict) + not PANDAS_GE_200 + and isinstance(freq, dict) and freq.get("hours", None) == 10 and freq.get("days", None) == 57 and freq.get("nanoseconds", None) == 3 @@ -1665,7 +1642,8 @@ def test_date_range_end_freq_periods(request, end, freq, periods): request.applymarker( pytest.mark.xfail( condition=( - "nanoseconds" in freq + not PANDAS_GE_210 + and "nanoseconds" in freq and periods != 1 and end == "1970-01-01 00:00:00" ), @@ -1981,7 +1959,22 @@ def test_error_values(): @pytest.mark.parametrize( "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) -def test_ceil(data, time_type, resolution): +def test_ceil(request, data, time_type, resolution): + alias_map = {"L": "ms", "U": "us", "N": "ns"} + request.applymarker( + pytest.mark.xfail( + condition=( + PANDAS_EQ_200 + and resolution in {"L", "ms", "U", "us", "N"} + and np.dtype( + f"datetime64[{alias_map.get(resolution, resolution)}]" + ) + > np.dtype(time_type) + ), + reason="https://github.com/pandas-dev/pandas/issues/52761", + strict=True, + ) + ) gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -2011,7 +2004,23 @@ def test_ceil(data, time_type, resolution): @pytest.mark.parametrize( "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) -def test_floor(data, time_type, resolution): +def test_floor(request, data, time_type, resolution): + alias_map = {"L": "ms", "U": "us", "N": "ns"} + request.applymarker( + pytest.mark.xfail( + condition=( + PANDAS_EQ_200 + and resolution in {"L", "ms", "U", "us", "N"} + and np.dtype( + f"datetime64[{alias_map.get(resolution, resolution)}]" + ) + > np.dtype(time_type) + ), + reason="https://github.com/pandas-dev/pandas/issues/52761", + strict=True, + ) + ) + gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -2041,7 +2050,23 @@ def test_floor(data, time_type, resolution): @pytest.mark.parametrize( "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) -def test_round(data, time_type, resolution): +def test_round(request, data, time_type, resolution): + alias_map = {"L": "ms", "U": "us", "N": "ns"} + request.applymarker( + pytest.mark.xfail( + condition=( + PANDAS_EQ_200 + and resolution in {"L", "ms", "U", "us", "N"} + and np.dtype( + f"datetime64[{alias_map.get(resolution, resolution)}]" + ) + > 
np.dtype(time_type) + ), + reason="https://github.com/pandas-dev/pandas/issues/52761", + strict=True, + ) + ) + gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -2081,8 +2106,10 @@ def test_first(idx, offset): p = pd.Series(range(len(idx)), dtype="int64", index=idx) g = cudf.from_pandas(p) - expect = p.first(offset=offset) - got = g.first(offset=offset) + with expect_warning_if(PANDAS_GE_210): + expect = p.first(offset=offset) + with pytest.warns(FutureWarning): + got = g.first(offset=offset) assert_eq(expect, got) @@ -2111,8 +2138,10 @@ def test_first_start_at_end_of_month(idx, offset): p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) - expect = p.first(offset=offset) - got = g.first(offset=offset) + with expect_warning_if(PANDAS_GE_210): + expect = p.first(offset=offset) + with pytest.warns(FutureWarning): + got = g.first(offset=offset) assert_eq(expect, got) @@ -2148,8 +2177,10 @@ def test_last(idx, offset): p = pd.Series(range(len(idx)), dtype="int64", index=idx) g = cudf.from_pandas(p) - expect = p.last(offset=offset) - got = g.last(offset=offset) + with expect_warning_if(PANDAS_GE_210): + expect = p.last(offset=offset) + with pytest.warns(FutureWarning): + got = g.last(offset=offset) assert_eq(expect, got) @@ -2306,7 +2337,7 @@ def test_format_timezone_not_implemented(code): @pytest.mark.parametrize("tz", ["UTC-3", "+01:00"]) def test_utc_offset_not_implemented(tz): - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, ValueError)): cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) @@ -2423,46 +2454,3 @@ def test_dateimeindex_from_noniso_string(): def test_to_datetime_errors_non_scalar_not_implemented(errors): with pytest.raises(NotImplementedError): cudf.to_datetime([1, ""], unit="s", errors=errors) - - -@pytest.mark.parametrize( - "box", [list, pd.Index, cudf.Index, pd.Series, cudf.Series] -) -@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) -def test_to_datetime_arraylike_utc_true(box, dtype): - pd_data = [1, 2] - cudf_data = box(pd_data) - if box is not list: - cudf_data = cudf_data.astype(dtype) - if box is cudf.Series or box is pd.Series: - pd_data = pd.Series(pd_data) - result = cudf.to_datetime(cudf_data, utc=True) - expected = pd.to_datetime(pd_data, utc=True) - assert_eq(result, expected) - - -@pytest.mark.xfail( - raises=TypeError, - reason="libcudf.copying.get_element doesn't understand pd.DatetimeTZDtype", -) -def test_to_datetime_scalar_utc_true(): - data = pd.Timestamp(2020, 1, 1) - with cudf.option_context("mode.pandas_compatible", True): - result = cudf.to_datetime(data, utc=True) - expected = pd.Timestamp(year=2020, month=1, day=1, tz="UTC") - assert_eq(result, expected) - - -def test_to_datetime_dataframe_utc_true(): - data = cudf.DataFrame([[2020, 1, 1]], columns=["year", "month", "day"]) - result = cudf.to_datetime(data, utc=True) - expected = pd.Series([datetime.datetime(2020, 1, 1)]).dt.tz_localize("UTC") - assert_eq(result, expected) - - -def test_datetimeindex_dtype_np_dtype(): - dtype = np.dtype("datetime64[ns]") - data = [1] - gdti = cudf.DatetimeIndex(data, dtype=dtype) - pdti = pd.DatetimeIndex(data, dtype=dtype) - assert_eq(gdti, pdti) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index bffbade14d8..a22b678ebe6 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 from 
cudf.core.buffer import as_buffer from cudf.core.column import as_column, build_column from cudf.core.df_protocol import ( @@ -278,9 +277,5 @@ def test_NA_mixed_dtype(): assert_df_unique_dtype_cols(data_mixed) -@pytest.mark.skipif( - not PANDAS_GE_150, - reason="Pandas versions < 1.5.0 do not support interchange protocol", -) def test_from_cpu_df(pandas_df): cudf.from_dataframe(pandas_df, allow_copy=True) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index ac104b7e513..f1acd7b4320 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -5,10 +5,7 @@ import pytest import cudf -from cudf.testing._utils import ( - _create_pandas_series_float64_default, - assert_eq, -) +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( @@ -24,7 +21,7 @@ @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 6e24099f1a8..0efd8d9781c 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -1,12 +1,12 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd import pyarrow as pa import pytest +from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf.core._compat import PANDAS_GE_150 from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, @@ -20,11 +20,6 @@ from cudf.testing._utils import assert_eq from cudf.utils.dtypes import np_to_pa_dtype -if PANDAS_GE_150: - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -else: - from pandas.core.arrays._arrow_utils import ArrowIntervalType - def test_cdt_basic(): psr = pd.Series(["a", "b", "a", "c"], dtype="category") diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index ad513ea3cd5..447b2b3c4f5 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -9,11 +9,7 @@ import cudf from cudf import concat -from cudf.testing._utils import ( - _create_pandas_series_float64_default, - assert_eq, - assert_exceptions_equal, -) +from cudf.testing._utils import assert_eq, assert_exceptions_equal # TODO: PANDAS 1.0 support # Revisit drop_duplicates() tests to update parameters like ignore_index. 
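# A minimal sketch (not part of the patch) of why the
# _create_*_series_float64_default helpers can be dropped, assuming
# pandas>=2.0: constructing an empty Series now defaults to object dtype
# with no FutureWarning, so tests that still need float64 request it
# explicitly instead of routing through a compatibility helper.
import pandas as pd

print(pd.Series([]).dtype)                   # object under pandas>=2.0
print(pd.Series([], dtype="float64").dtype)  # float64, stated explicitly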
@@ -62,7 +58,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = _create_pandas_series_float64_default(data) + pds = pd.Series(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b757f8acb6e..a0b86d735cc 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -19,7 +19,8 @@ import cudf from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.core.udf.utils import UDFError, precompiled @@ -559,9 +560,12 @@ def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) + with expect_warning_if( + func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning + ): + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) @pytest.mark.parametrize("dtype", ["float64"]) @@ -645,7 +649,8 @@ def func(group): with pytest.raises(UDFError, match=m): run_groupby_apply_jit_test(dataset, func, keys) return - run_groupby_apply_jit_test(dataset, func, keys) + with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): + run_groupby_apply_jit_test(dataset, func, keys) @pytest.mark.parametrize("dtype", ["int32", "int64"]) @@ -660,7 +665,8 @@ def test_groupby_apply_jit_correlation_zero_variance(dtype): def func(group): return group["b"].corr(group["c"]) - run_groupby_apply_jit_test(data, func, ["a"]) + with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): + run_groupby_apply_jit_test(data, func, ["a"]) @pytest.mark.parametrize("op", unary_ops) @@ -898,10 +904,6 @@ def test_groupby_2keys_agg(nelem, func): # https://github.com/pandas-dev/pandas/issues/40685 is resolved. # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) -@pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43209", -) def test_groupby_agg_decimal(num_groups, nelem_per_group, func): # The number of digits after the decimal to use. decimal_digits = 2 @@ -1188,13 +1190,7 @@ def test_advanced_groupby_levels(): @pytest.mark.parametrize( "func", [ - pytest.param( - lambda df: df.groupby(["x", "y", "z"]).sum(), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/32464", - ), - ), + lambda df: df.groupby(["x", "y", "z"]).sum(), lambda df: df.groupby(["x", "y"]).sum(), lambda df: df.groupby(["x", "y"]).agg("sum"), lambda df: df.groupby(["y"]).sum(), @@ -1226,8 +1222,7 @@ def test_groupby_unsupported_columns(): ) pdf["b"] = pd_cat gdf = cudf.from_pandas(pdf) - with pytest.warns(FutureWarning): - pdg = pdf.groupby("x").sum() + pdg = pdf.groupby("x").sum(numeric_only=True) # cudf does not yet support numeric_only, so our default is False (unlike # pandas, which defaults to inferring and throws a warning about it). 
gdg = gdf.groupby("x").sum() @@ -1400,7 +1395,7 @@ def test_groupby_multi_agg_hash_groupby(agg): @pytest.mark.parametrize( - "agg", ["min", "max", "idxmax", "idxmax", "sum", "prod", "count", "mean"] + "agg", ["min", "max", "idxmax", "idxmin", "sum", "prod", "count", "mean"] ) def test_groupby_nulls_basic(agg): check_dtype = agg not in _index_type_aggs @@ -1438,11 +1433,12 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # Pandas' null semantics. Should we change it? - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), - check_dtype=check_dtype, - ) + with expect_warning_if(agg in {"idxmax", "idxmin"}): + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)().fillna(0), + getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), + check_dtype=check_dtype, + ) def test_groupby_nulls_in_index(): @@ -1536,7 +1532,7 @@ def test_groupby_index_type(): df["string_col"] = ["a", "b", "c"] df["counts"] = [1, 2, 3] res = df.groupby(by="string_col").counts.sum() - assert isinstance(res.index, cudf.StringIndex) + assert res.index.dtype == cudf.dtype("object") @pytest.mark.parametrize( @@ -1786,15 +1782,11 @@ def test_grouping(grouper): ) gdf = cudf.from_pandas(pdf) - # There's no easy way to validate that the same warning is thrown by both - # cudf and pandas here because it's only thrown upon iteration, so we - # settle for catching warnings on the whole block. - with expect_warning_if(isinstance(grouper, list) and len(grouper) == 1): - for pdf_group, gdf_group in zip( - pdf.groupby(grouper), gdf.groupby(grouper) - ): - assert pdf_group[0] == gdf_group[0] - assert_eq(pdf_group[1], gdf_group[1]) + for pdf_group, gdf_group in zip( + pdf.groupby(grouper), gdf.groupby(grouper) + ): + assert pdf_group[0] == gdf_group[0] + assert_eq(pdf_group[1], gdf_group[1]) @pytest.mark.parametrize("agg", [lambda x: x.count(), "count"]) @@ -2120,14 +2112,19 @@ def test_groupby_list_columns_excluded(): ) gdf = cudf.from_pandas(pdf) - # cudf does not yet support numeric_only, so our default is False, but - # pandas defaults to inferring and throws a warning about it, so we need to - # catch that. pandas future behavior will match ours by default (at which - # point supporting numeric_only=True will be the open feature request). - with pytest.warns(FutureWarning): - pandas_result = pdf.groupby("a").mean() - with pytest.warns(FutureWarning): - pandas_agg_result = pdf.groupby("a").agg("mean") + if PANDAS_GE_200: + pandas_result = pdf.groupby("a").mean(numeric_only=True) + pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) + else: + # cudf does not yet support numeric_only, so our default is False, but + # pandas defaults to inferring and throws a warning about it, so + # we need to catch that. pandas future behavior will match ours + # by default (at which point supporting numeric_only=True will + # be the open feature request). 
+ with pytest.warns(FutureWarning): + pandas_result = pdf.groupby("a").mean() + with pytest.warns(FutureWarning): + pandas_agg_result = pdf.groupby("a").agg("mean") assert_groupby_results_equal( pandas_result, gdf.groupby("a").mean(), check_dtype=False @@ -2257,11 +2254,16 @@ def test_groupby_apply_return_series_dataframe(func, args): ) def test_groupby_no_keys(pdf): gdf = cudf.from_pandas(pdf) + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": not PANDAS_GE_200} + else: + kwargs = {} assert_groupby_results_equal( pdf.groupby([]).max(), gdf.groupby([]).max(), check_dtype=False, - check_index_type=False, # Int64Index v/s Float64Index + check_index_type=False, # Int64 v/s Float64 + **kwargs, ) @@ -2271,10 +2273,15 @@ def test_groupby_no_keys(pdf): ) def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": not PANDAS_GE_200} + else: + kwargs = {} assert_groupby_results_equal( pdf.groupby([], group_keys=False).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), - check_index_type=False, # Int64Index v/s Float64Index + check_index_type=False, # Int64 v/s Float64 + **kwargs, ) @@ -2805,16 +2812,18 @@ def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) gs = cudf.from_pandas(ps) - expect = ps.groupby(by).fillna(**args) + with expect_warning_if(PANDAS_GE_210 and "method" in args): + expect = ps.groupby(by).fillna(**args) if isinstance(by, pd.Grouper): by = cudf.Grouper(level=by.level) - got = gs.groupby(by).fillna(**args) + with expect_warning_if("method" in args): + got = gs.groupby(by).fillna(**args) assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize("nelem", [10, 100, 1000]) -@pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill"]) +@pytest.mark.parametrize("method", ["ffill", "bfill"]) def test_groupby_fillna_method(nelem, method): t = rand_dataframe( dtypes_meta=[ @@ -2852,8 +2861,9 @@ def test_groupby_fillna_method(nelem, method): pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(key_col).fillna(method=method) - with expect_warning_if(method in {"pad", "backfill"}): + with expect_warning_if(PANDAS_GE_210): + expect = pdf.groupby(key_col).fillna(method=method) + with pytest.warns(FutureWarning): got = gdf.groupby(key_col).fillna(method=method) assert_groupby_results_equal( @@ -2954,7 +2964,13 @@ def test_groupby_freq_week(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("label", [None, "left", "right"]) @@ -2981,7 +2997,13 @@ def test_groupby_freq_day(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("label", [None, "left", "right"]) @@ -3008,7 +3030,13 @@ def test_groupby_freq_min(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) 
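# A minimal sketch (not part of the patch) of the time-bucketed groupby the
# freq tests above exercise; the column names and values are illustrative,
# and cudf.Grouper mirrors pd.Grouper for this usage.
import pandas as pd

df = pd.DataFrame(
    {
        "Publish date": pd.date_range("2000-01-01", periods=6, freq="12h"),
        "views": [10, 20, 30, 40, 50, 60],
    }
)
out = df.groupby(
    pd.Grouper(key="Publish date", freq="1D", label="left", closed="left")
).mean()
print(out)  # one mean per daily bucket, labeled by the left bin edge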
@pytest.mark.parametrize("label", [None, "left", "right"]) @@ -3035,7 +3063,13 @@ def test_groupby_freq_s(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize( @@ -3081,11 +3115,20 @@ def test_groupby_get_group(pdf, group, name, obj): else: gobj = obj - expected = pdf.groupby(group).get_group(name=name, obj=obj) - actual = gdf.groupby(group).get_group(name=name, obj=gobj) + pgb = pdf.groupby(group) + ggb = gdf.groupby(group) + with expect_warning_if(obj is not None): + expected = pgb.get_group(name=name, obj=obj) + with expect_warning_if(obj is not None): + actual = ggb.get_group(name=name, obj=gobj) assert_groupby_results_equal(expected, actual) + expected = pdf.iloc[pgb.indices.get(name)] + actual = gdf.iloc[ggb.indices.get(name)] + + assert_eq(expected, actual) + @pytest.mark.parametrize( "by", @@ -3194,16 +3237,22 @@ def test_groupby_transform_maintain_index(by): ], ) @pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", no_default, None]) def test_groupby_pct_change(data, gkey, periods, fill_method): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - with expect_warning_if(fill_method in ("pad", "backfill")): + with expect_warning_if(fill_method not in (no_default, None)): actual = gdf.groupby(gkey).pct_change( periods=periods, fill_method=fill_method ) - with expect_warning_if(fill_method in ("pad", "backfill")): + with expect_warning_if( + PANDAS_GE_210 + and ( + fill_method not in (no_default, None) + or (fill_method is not None and pdf.isna().any().any()) + ) + ): expected = pdf.groupby(gkey).pct_change( periods=periods, fill_method=fill_method ) @@ -3239,20 +3288,7 @@ def test_groupby_pct_change_empty_columns(): assert_eq(expected, actual) -@pytest.mark.parametrize( - "group_keys", - [ - None, - pytest.param( - True, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/pull/34998", - ), - ), - False, - ], -) +@pytest.mark.parametrize("group_keys", [None, True, False]) @pytest.mark.parametrize("by", ["A", ["A", "B"]]) def test_groupby_group_keys(group_keys, by): gdf = cudf.DataFrame( @@ -3313,8 +3349,12 @@ def test_groupby_dtypes(groups): {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]} ) pdf = df.to_pandas() + with expect_warning_if(PANDAS_GE_210): + expected = pdf.groupby(groups).dtypes + with pytest.warns(FutureWarning): + actual = df.groupby(groups).dtypes - assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes) + assert_eq(expected, actual) @pytest.mark.parametrize("index_names", ["a", "b", "c", ["b", "c"]]) @@ -3515,11 +3555,12 @@ def test_head_tail_empty(): expected = pdf.groupby(pd.Series(values)).head() got = df.groupby(cudf.Series(values)).head() - assert_eq(expected, got) + assert_eq(expected, got, check_column_type=not PANDAS_GE_200) expected = pdf.groupby(pd.Series(values)).tail() got = df.groupby(cudf.Series(values)).tail() - assert_eq(expected, got) + + assert_eq(expected, got, check_column_type=not PANDAS_GE_200) @pytest.mark.parametrize( @@ -3644,8 +3685,9 @@ def test_categorical_grouping_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): actual = 
gdf.groupby("key", sort=False).sum() - expected = pdf.groupby("key", sort=False).sum() - + with pytest.warns(FutureWarning): + # observed param deprecation. + expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index e0a369d8d91..996b651b9fe 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -15,11 +15,11 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype -from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200 from cudf.core.index import ( CategoricalIndex, DatetimeIndex, - GenericIndex, + Index, RangeIndex, as_index, ) @@ -30,10 +30,7 @@ OTHER_TYPES, SERIES_OR_INDEX_NAMES, SIGNED_INTEGER_TYPES, - SIGNED_TYPES, UNSIGNED_TYPES, - _create_cudf_series_float64_default, - _create_pandas_series_float64_default, assert_column_memory_eq, assert_column_memory_ne, assert_eq, @@ -135,11 +132,8 @@ def test_index_comparision(): [ lambda x: x.min(), lambda x: x.max(), - lambda x: x.sum(), - lambda x: x.mean(), lambda x: x.any(), lambda x: x.all(), - lambda x: x.prod(), ], ) def test_reductions(func): @@ -212,9 +206,9 @@ def test_pandas_as_index(): gdf_category_index = as_index(pdf_category_index) # Check instance types - assert isinstance(gdf_int_index, GenericIndex) - assert isinstance(gdf_uint_index, GenericIndex) - assert isinstance(gdf_float_index, GenericIndex) + assert isinstance(gdf_int_index, Index) + assert isinstance(gdf_uint_index, Index) + assert isinstance(gdf_float_index, Index) assert isinstance(gdf_datetime_index, DatetimeIndex) assert isinstance(gdf_category_index, CategoricalIndex) @@ -318,90 +312,69 @@ def test_set_index_as_property(): @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) -def test_index_copy_range(name, dtype, deep=True): +def test_index_copy_range(name, deep=True): cidx = cudf.RangeIndex(1, 5) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype,", ["datetime64[ns]", "int64"]) -def test_index_copy_datetime(name, dtype, deep=True): +def test_index_copy_datetime(name, deep=True): cidx = cudf.DatetimeIndex(["2001", "2002", "2003"]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", ["category", "object"]) -def test_index_copy_string(name, dtype, deep=True): +def test_index_copy_string(name, deep=True): cidx = cudf.Index(["a", "b", "c"]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) 
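Note: the test_index_copy_* rewrites above and below drop the dtype argument because pandas 2.0 removed it from Index.copy (in 1.x it only emitted a FutureWarning). A minimal sketch of the replacement pattern, assuming pandas >= 2.0:

    import pandas as pd

    idx = pd.Index([1, 2, 3], name="x")
    # copy() now only copies/renames; casting is a separate, explicit step.
    copied = idx.copy(name="y", deep=True).astype("int32")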
@pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize( - "dtype", - NUMERIC_TYPES + ["datetime64[ns]", "timedelta64[ns]"] + OTHER_TYPES, -) -def test_index_copy_integer(name, dtype, deep=True): +def test_index_copy_integer(name, deep=True): """Test for NumericIndex Copy Casts""" cidx = cudf.Index([1, 2, 3]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", SIGNED_TYPES) -def test_index_copy_float(name, dtype, deep=True): +def test_index_copy_float(name, deep=True): """Test for NumericIndex Copy Casts""" cidx = cudf.Index([1.0, 2.0, 3.0]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["category"]) -def test_index_copy_category(name, dtype, deep=True): +def test_index_copy_category(name, deep=True): cidx = cudf.core.index.CategoricalIndex([1, 2, 3]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_column_memory_ne(cidx._values, cidx_copy._values) assert_eq(pidx_copy, cidx_copy) @@ -426,12 +399,12 @@ def test_index_copy_deep(idx, deep, copy_on_write): original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) if ( - isinstance(idx, cudf.StringIndex) + isinstance(idx._values, cudf.core.column.StringColumn) or not deep or (cudf.get_option("copy_on_write") and not deep) ): # StringColumn is immutable hence, deep copies of a - # StringIndex will share the same StringColumn. + # Index with string dtype will share the same StringColumn. 
# When `copy_on_write` is turned on, Index objects will # have unique column object but they all point to same @@ -538,15 +511,11 @@ def test_empty_df_head_tail_index(n): None, ), (pd.Index(range(5)), pd.Index(range(4)) > 0, None, ValueError), - pytest.param( + ( pd.Index(range(5)), pd.Index(range(5)) > 1, 10, None, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_133, - reason="https://github.com/pandas-dev/pandas/issues/43240", - ), ), ( pd.Index(np.arange(10)), @@ -683,7 +652,7 @@ def test_index_where(data, condition, other, error): gs_other = other if error is None: - if isinstance(ps.dtype, pd.CategoricalDtype): + if hasattr(ps, "dtype") and isinstance(ps.dtype, pd.CategoricalDtype): expect = ps.where(ps_condition, other=ps_other) got = gs.where(gs_condition, other=gs_other) np.testing.assert_array_equal( @@ -823,7 +792,7 @@ def test_index_to_series(data): [2], ], ) -@pytest.mark.parametrize("sort", [None, False]) +@pytest.mark.parametrize("sort", [None, False, True]) @pytest.mark.parametrize( "name_data,name_other", [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], @@ -861,8 +830,8 @@ def test_index_difference_sort_error(): assert_exceptions_equal( pdi.difference, gdi.difference, - ([pdi], {"sort": True}), - ([gdi], {"sort": True}), + ([pdi], {"sort": "A"}), + ([gdi], {"sort": "A"}), ) @@ -1011,8 +980,8 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(_create_pandas_series_float64_default(pd_other)) - actual = gd_data.equals(_create_cudf_series_float64_default(gd_other)) + expected = pd_data.equals(pd.Series(pd_other)) + actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) expected = pd_data.astype("category").equals(pd_other) @@ -1058,16 +1027,19 @@ def test_index_append(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = cudf.Index(data) + gd_other = cudf.Index(other) if cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): gd_data = gd_data.astype("str") gd_other = gd_other.astype("str") - expected = pd_data.append(pd_other) - - actual = gd_data.append(gd_other) + with expect_warning_if( + (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype + ): + expected = pd_data.append(pd_other) + with expect_warning_if(len(data) == 0 or len(other) == 0): + actual = gd_data.append(gd_other) if len(data) == 0 and len(other) == 0: # Pandas default dtype to "object" for empty list # cudf default dtype to "float" for empty list @@ -1083,10 +1055,12 @@ def test_index_empty_append_name_conflict(): non_empty = cudf.Index([1], name="bar") expected = cudf.Index([1]) - result = non_empty.append(empty) + with pytest.warns(FutureWarning): + result = non_empty.append(empty) assert_eq(result, expected) - result = empty.append(non_empty) + with pytest.warns(FutureWarning): + result = empty.append(non_empty) assert_eq(result, expected) @@ -1257,8 +1231,13 @@ def test_index_append_list(data, other): gd_data = cudf.from_pandas(data) gd_other = [cudf.from_pandas(i) for i in other] - expected = pd_data.append(pd_other) - actual = gd_data.append(gd_other) + with expect_warning_if( + (len(data) == 0 or any(len(d) == 0 for d in other)) + and (any(d.dtype != data.dtype for d in other)) + ): + expected = pd_data.append(pd_other) + with expect_warning_if(len(data) == 0 or any(len(d) == 0 for d in other)): + actual = 
gd_data.append(gd_other) assert_eq(expected, actual) @@ -1279,91 +1258,33 @@ def test_index_basic(data, dtype, name): @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) def test_integer_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.Int64Index(data, dtype=dtype, name=name) - # Int8Index - with pytest.warns(FutureWarning): - gindex = cudf.Int8Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int8") - - # Int16Index - with pytest.warns(FutureWarning): - gindex = cudf.Int16Index(data, dtype=dtype, name=name) + pindex = pd.Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int16") - - # Int32Index - with pytest.warns(FutureWarning): - gindex = cudf.Int32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int32") - - # Int64Index - with pytest.warns(FutureWarning): - gindex = cudf.Int64Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", UNSIGNED_TYPES) def test_unsigned_integer_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.UInt64Index(data, dtype=dtype, name=name) - # UInt8Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt8Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint8") - - # UInt16Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt16Index(data, dtype=dtype, name=name) + pindex = pd.Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint16") - - # UInt32Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint32") - - # UInt64Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt64Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", FLOAT_TYPES) def test_float_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.Float64Index(data, dtype=dtype, name=name) - # Float32Index - with pytest.warns(FutureWarning): - gindex = cudf.Float32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("float32") - - # Float64Index - with pytest.warns(FutureWarning): - gindex = cudf.Float64Index(data, dtype=dtype, name=name) + pindex = pd.Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("float64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @@ -1412,6 +1333,9 @@ def test_categorical_index_basic(data, categories, dtype, ordered, name): [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], names=("number1", "color2"), ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), ], ) @pytest.mark.parametrize( @@ -1425,6 +1349,9 @@ 
def test_categorical_index_basic(data, categories, dtype, ordered, name): [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], names=("number1", "color2"), ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), ], ) def test_multiindex_append(data, other): @@ -1552,7 +1479,7 @@ def test_index_fillna(data, fill_value): assert_eq( pdi.fillna(fill_value), gdi.fillna(fill_value), exact=False - ) # Int64Index v/s Float64Index + ) # Int64 v/s Float64 @pytest.mark.parametrize( @@ -1590,7 +1517,13 @@ def test_index_from_arrow(data): arrow_array = pa.Array.from_pandas(pdi) expected_index = pd.Index(arrow_array.to_pandas()) gdi = cudf.Index.from_arrow(arrow_array) - + if PANDAS_GE_200 and gdi.dtype == cudf.dtype("datetime64[s]"): + # Arrow bug: + # https://github.com/apache/arrow/issues/33321 + # arrow cannot convert non-nanosecond + # resolution to appropriate type in pandas. + # Hence need to type-cast. + expected_index = expected_index.astype(gdi.dtype) assert_eq(expected_index, gdi) @@ -1780,20 +1713,16 @@ def test_index_set_names_error(idx, level, names): "idx", [pd.Index([1, 3, 6]), pd.Index([6, 1, 3])], # monotonic # non-monotonic ) -@pytest.mark.parametrize("key", list(range(0, 8))) +@pytest.mark.parametrize("key", [list(range(0, 8))]) @pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -def test_get_loc_single_unique_numeric(idx, key, method): +def test_get_indexer_single_unique_numeric(idx, key, method): pi = idx gi = cudf.from_pandas(pi) if ( - (key not in pi and method is None) # `method` only applicable to monotonic index - or (not pi.is_monotonic_increasing and method is not None) - # Get key before the first element is KeyError - or (key == 0 and method in "ffill") - # Get key after the last element is KeyError - or (key == 7 and method in "bfill") + not pi.is_monotonic_increasing + and method is not None ): assert_exceptions_equal( lfunc=pi.get_loc, @@ -1802,10 +1731,9 @@ def test_get_loc_single_unique_numeric(idx, key, method): rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + with expect_warning_if(not PANDAS_GE_200 and method is not None): + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) @@ -1814,30 +1742,55 @@ def test_get_loc_single_unique_numeric(idx, key, method): "idx", [pd.RangeIndex(3, 100, 4)], ) -@pytest.mark.parametrize("key", list(range(1, 110, 3))) -@pytest.mark.parametrize("method", [None, "ffill"]) -def test_get_loc_rangeindex(idx, key, method): +@pytest.mark.parametrize( + "key", + [ + list(range(1, 20, 3)), + list(range(20, 35, 3)), + list(range(35, 77, 3)), + list(range(77, 110, 3)), + ], +) +@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) +@pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20]) +def test_get_indexer_rangeindex(idx, key, method, tolerance): pi = idx gi = cudf.from_pandas(pi) + expected = pi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) + got = gi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [pd.RangeIndex(3, 100, 4)], +) +@pytest.mark.parametrize("key", list(range(1, 110, 3))) +def test_get_loc_rangeindex(idx, key): + pi = idx + gi = 
cudf.from_pandas(pi) if ( - (key not in pi and method is None) + (key not in pi) # Get key before the first element is KeyError - or (key < pi.start and method in "ffill") + or (key < pi.start) # Get key after the last element is KeyError - or (key >= pi.stop and method in "bfill") + or (key >= pi.stop) ): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_loc(key) + got = gi.get_loc(key) assert_eq(expected, got) @@ -1850,8 +1803,7 @@ def test_get_loc_rangeindex(idx, key, method): ], ) @pytest.mark.parametrize("key", [0, 3, 6, 7]) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_single_duplicate_numeric(idx, key, method): +def test_get_loc_single_duplicate_numeric(idx, key): pi = idx gi = cudf.from_pandas(pi) @@ -1859,14 +1811,44 @@ def test_get_loc_single_duplicate_numeric(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.Index([-1, 2, 3, 6]), # monotonic + pd.Index([6, 1, 3, 4]), # non-monotonic + ], +) +@pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) +@pytest.mark.parametrize("tolerance", [None, 1, 2]) +def test_get_indexer_single_duplicate_numeric(idx, key, method, tolerance): + pi = idx + gi = cudf.from_pandas(pi) + + if not pi.is_monotonic_increasing and method is not None: + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) + got = gi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) assert_eq(expected, got) @@ -1875,31 +1857,43 @@ def test_get_loc_single_duplicate_numeric(idx, key, method): "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] ) @pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_loc_single_unique_string(idx, key, method): +def test_get_loc_single_unique_string(idx, key): pi = idx gi = cudf.from_pandas(pi) - if ( - (key not in pi and method is None) - # `method` only applicable to monotonic index - or (not pi.is_monotonic_increasing and method is not None) - # Get key before the first element is KeyError - or (key == "a" and method == "ffill") - # Get key after the last element is KeyError - or (key == "z" and method == "bfill") - ): + if key not in pi: assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = 
gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] +) +@pytest.mark.parametrize("key", [["a", "f", "n", "z"], ["p", "p", "b"]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_single_unique_string(idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + + if not pi.is_monotonic_increasing and method is not None: + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) @@ -1908,8 +1902,7 @@ def test_get_loc_single_unique_string(idx, key, method): "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["m", "f", "m", "q"])] ) @pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_single_duplicate_string(idx, key, method): +def test_get_loc_single_duplicate_string(idx, key): pi = idx gi = cudf.from_pandas(pi) @@ -1917,14 +1910,39 @@ def test_get_loc_single_duplicate_string(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["a", "f", "m", "q"])] +) +@pytest.mark.parametrize("key", [["a"], ["f", "n", "z"]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_single_duplicate_string(idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + + if ( + # `method` only applicable to monotonic index + (not pi.is_monotonic_increasing and method is not None) + or not pi.is_unique + ): + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) @@ -1944,8 +1962,7 @@ def test_get_loc_single_duplicate_string(idx, key, method): ], ) @pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_multi_numeric(idx, key, method): +def test_get_loc_multi_numeric(idx, key): pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -1953,18 +1970,42 @@ def test_get_loc_multi_numeric(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_loc(key) + 
got = gi.get_loc(key) assert_eq(expected, got) +@pytest.mark.parametrize( + "idx", + [ + pd.MultiIndex.from_tuples( + [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)] + ), + pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] + ), + pd.MultiIndex.from_tuples( + [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 1)] + ), + ], +) +@pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_numeric(idx, key, method): + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + @pytest.mark.parametrize( "idx", [ @@ -1983,8 +2024,7 @@ def test_get_loc_multi_numeric(idx, key, method): ((9, 9, 9), None), ], ) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_multi_numeric_deviate(idx, key, result, method): +def test_get_loc_multi_numeric_deviate(idx, key, result): pi = idx gi = cudf.from_pandas(pi) @@ -2000,17 +2040,48 @@ def test_get_loc_multi_numeric_deviate(idx, key, result, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), ) else: expected = result - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + got = gi.get_loc(key) assert_eq(expected, got) +@pytest.mark.parametrize( + "idx", + [ + pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] + ) + ], +) +@pytest.mark.parametrize( + "key", + [ + ((1, 2, 3),), + ((2, 1, 1),), + ((9, 9, 9),), + ], +) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_numeric_deviate(request, idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + request.applymarker( + pytest.mark.xfail( + condition=method is not None and key == ((1, 2, 3),), + reason="https://github.com/pandas-dev/pandas/issues/53452", + ) + ) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + @pytest.mark.parametrize( "idx", [ @@ -2069,8 +2140,7 @@ def test_get_loc_multi_numeric_deviate(idx, key, result, method): @pytest.mark.parametrize( "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] ) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_multi_string(idx, key, method): +def test_get_loc_multi_string(idx, key): pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2078,18 +2148,101 @@ def test_get_loc_multi_string(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_loc(key) + got = gi.get_loc(key) assert_eq(expected, got) +@pytest.mark.parametrize( + "idx", + [ + pd.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ] 
+ ), + pd.MultiIndex.from_tuples( + [ + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + pd.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + ], +) +@pytest.mark.parametrize( + "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] +) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_string(idx, key, method): + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx1", + [ + lambda: cudf.Index(["a", "b", "c"]), + lambda: cudf.RangeIndex(0, 10), + lambda: cudf.Index([1, 2, 3], dtype="category"), + lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), + lambda: cudf.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + ], +) +@pytest.mark.parametrize( + "idx2", + [ + lambda: cudf.Index(["a", "b", "c"]), + lambda: cudf.RangeIndex(0, 10), + lambda: cudf.Index([1, 2, 3], dtype="category"), + lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), + ], +) +def test_get_indexer_invalid(idx1, idx2): + idx1 = idx1() + idx2 = idx2() + assert_eq( + idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) + ) + + @pytest.mark.parametrize( "objs", [ @@ -2144,7 +2297,7 @@ def test_range_index_concat(objs): ), ], ) -@pytest.mark.parametrize("sort", [None, False]) +@pytest.mark.parametrize("sort", [None, False, True]) def test_union_index(idx1, idx2, sort): expected = idx1.union(idx2, sort=sort) @@ -2203,7 +2356,7 @@ def test_union_unsigned_vs_signed(dtype1, dtype2): (pd.Index([]), pd.Index([1, 2], dtype="category")), ], ) -@pytest.mark.parametrize("sort", [None, False]) +@pytest.mark.parametrize("sort", [None, False, True]) @pytest.mark.parametrize("pandas_compatible", [True, False]) def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected = idx1.intersection(idx2, sort=sort) @@ -2352,7 +2505,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ], ) def test_isin_index(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) @@ -2595,7 +2748,9 @@ def test_rangeindex_join_user_option(default_integer_bitwidth): actual = idx1.join(idx2, how="inner", sort=True) expected = idx1.to_pandas().join(idx2.to_pandas(), how="inner", sort=True) assert actual.dtype == cudf.dtype(f"int{default_integer_bitwidth}") - assert_eq(expected, actual) + # exact=False to ignore dtype comparison, + # because `default_integer_bitwidth` is cudf only option + assert_eq(expected, actual, exact=False) def test_rangeindex_where_user_option(default_integer_bitwidth): @@ -2656,7 +2811,8 @@ def test_index_methods(index, func): if func == "append": expected = pidx.append(other=pidx) - actual = gidx.append(other=gidx) + with expect_warning_if(len(gidx) == 0): + actual = gidx.append(other=gidx) else: expected = getattr(pidx, func)() actual = getattr(gidx, func)() @@ -2901,10 +3057,9 @@ def test_index_to_frame(data, data_name, index, name): pidx = pd.Index(data, name=data_name) gidx = cudf.from_pandas(pidx) - with expect_warning_if(name is None): + with expect_warning_if(not PANDAS_GE_200 and name is None): expected = 
pidx.to_frame(index=index, name=name) - with expect_warning_if(name is None): - actual = gidx.to_frame(index=index, name=name) + actual = gidx.to_frame(index=index, name=name) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 27e84f179b6..1cdaa3c52a7 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from datetime import datetime from itertools import combinations @@ -9,11 +9,13 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_210 from cudf.testing import _utils as utils from cudf.testing._utils import ( INTEGER_TYPES, assert_eq, assert_exceptions_equal, + expect_warning_if, ) index_dtypes = INTEGER_TYPES @@ -151,8 +153,10 @@ def test_series_get_item_iloc_defer(arg): ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) gs = cudf.from_pandas(ps) - expect = ps[arg] - got = gs[arg] + with expect_warning_if(PANDAS_GE_210 and not isinstance(arg, str)): + expect = ps[arg] + with expect_warning_if(not isinstance(arg, str)): + got = gs[arg] assert_eq(expect, got) @@ -163,7 +167,7 @@ def test_series_iloc_defer_cudf_scalar(): for t in index_dtypes: arg = cudf.Scalar(1, dtype=t) - got = gs[arg] + got = gs.iloc[arg] expect = 2 assert_eq(expect, got) @@ -926,8 +930,17 @@ def test_series_setitem_basics(key, value, nulls): elif nulls == "all": psr[:] = None gsr = cudf.from_pandas(psr) - psr[key] = value - gsr[key] = value + with expect_warning_if( + PANDAS_GE_210 + and isinstance(value, list) + and len(value) == 0 + and nulls == "none" + ): + psr[key] = value + with expect_warning_if( + isinstance(value, list) and len(value) == 0 and not len(key) == 0 + ): + gsr[key] = value assert_eq(psr, gsr, check_dtype=False) @@ -970,8 +983,17 @@ def test_series_setitem_iloc(key, value, nulls): elif nulls == "all": psr[:] = None gsr = cudf.from_pandas(psr) - psr.iloc[key] = value - gsr.iloc[key] = value + with expect_warning_if( + PANDAS_GE_210 + and isinstance(value, list) + and len(value) == 0 + and nulls == "none" + ): + psr.iloc[key] = value + with expect_warning_if( + isinstance(value, list) and len(value) == 0 and not len(key) == 0 + ): + gsr.iloc[key] = value assert_eq(psr, gsr, check_dtype=False) @@ -990,8 +1012,12 @@ def test_series_setitem_iloc(key, value, nulls): def test_series_setitem_dtype(key, value): psr = pd.Series([1, 2, 3], dtype="int32") gsr = cudf.from_pandas(psr) - psr[key] = value - gsr[key] = value + + with expect_warning_if(isinstance(value, (float, list))): + psr[key] = value + with expect_warning_if(isinstance(value, (float, list))): + gsr[key] = value + assert_eq(psr, gsr) @@ -1252,15 +1278,15 @@ def test_iloc_categorical_index(index): @pytest.mark.parametrize( "sli", [ - slice("2001", "2020"), slice("2001", "2002"), slice("2002", "2001"), - slice(None, "2020"), slice("2001", None), ], ) @pytest.mark.parametrize("is_dataframe", [True, False]) def test_loc_datetime_index(sli, is_dataframe): + sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) + if is_dataframe is True: pd_data = pd.DataFrame( {"a": [1, 2, 3]}, @@ -1273,13 +1299,32 @@ def test_loc_datetime_index(sli, is_dataframe): ) gd_data = cudf.from_pandas(pd_data) - expect = pd_data.loc[sli] got = gd_data.loc[sli] - assert_eq(expect, got) +@pytest.mark.parametrize( + "sli", + [ + slice("2001", "2020"), + slice(None, "2020"), + ], +) +def 
test_loc_datetime_index_slice_not_in(sli): + pd_data = pd.Series( + [1, 2, 3], + pd.Series(["2001", "2009", "2002"], dtype="datetime64[ns]"), + ) + gd_data = cudf.from_pandas(pd_data) + with pytest.raises(KeyError): + assert_eq(pd_data.loc[sli], gd_data.loc[sli]) + + with pytest.raises(KeyError): + sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) + assert_eq(pd_data.loc[sli], gd_data.loc[sli]) + + @pytest.mark.parametrize( "gdf_kwargs", [ @@ -1584,9 +1629,12 @@ def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): actual = gdf.loc[[0, 2], ["x", "y"]] = cudf.DataFrame( {"b": [10, 20], "y": [30, 40]}, index=cudf.Index([0, 2]) ) - expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( - {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) - ) + with pytest.warns(FutureWarning): + # Seems to be a false warning from pandas, + # but nevertheless catching it. + expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( + {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) + ) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index c0b085a5097..5ad542546aa 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -1,9 +1,14 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import pytest import cudf -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.core._compat import PANDAS_GE_210 +from cudf.testing._utils import ( + assert_eq, + assert_exceptions_equal, + expect_warning_if, +) @pytest.mark.parametrize( @@ -49,8 +54,10 @@ def test_interpolate_series(data, method, axis): gsr = cudf.Series(data) psr = gsr.to_pandas() - expect = psr.interpolate(method=method, axis=axis) - got = gsr.interpolate(method=method, axis=axis) + with expect_warning_if(PANDAS_GE_210 and psr.dtype == "object"): + expect = psr.interpolate(method=method, axis=axis) + with expect_warning_if(gsr.dtype == "object"): + got = gsr.interpolate(method=method, axis=axis) assert_eq(expect, got, check_dtype=psr.dtype != "object") @@ -87,8 +94,10 @@ def test_interpolate_series_values_or_index(data, index, method): gsr = cudf.Series(data, index=index) psr = gsr.to_pandas() - expect = psr.interpolate(method=method) - got = gsr.interpolate(method=method) + with expect_warning_if(PANDAS_GE_210 and gsr.dtype == "object"): + expect = psr.interpolate(method=method) + with expect_warning_if(gsr.dtype == "object"): + got = gsr.interpolate(method=method) assert_eq(expect, got, check_dtype=psr.dtype != "object") @@ -100,12 +109,12 @@ def test_interpolate_series_values_or_index(data, index, method): {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}, {"axis": 0, "method": "linear"}, ), - ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}), - ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}), - ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "backward"}), ( {"A": [1, 2, 3]}, - {"method": "backfill", "limit_direction": "forward"}, + {"method": "backfill", "limit_direction": "backward"}, ), ], ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index ece676329bc..8b912fe28bc 100644 --- 
a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -182,8 +183,8 @@ def test_dataframe_join_suffix(): assert list(expect.columns) == list(got.columns) assert_eq(expect.index.values, got.index.values) - got_sorted = got.sort_values(by=list(got.columns), axis=0) - expect_sorted = expect.sort_values(by=list(expect.columns), axis=0) + got_sorted = got.sort_values(by=["b_left", "c", "b_right"], axis=0) + expect_sorted = expect.sort_values(by=["b_left", "c", "b_right"], axis=0) for k in expect_sorted.columns: _check_series(expect_sorted[k].fillna(-1), got_sorted[k].fillna(-1)) @@ -785,7 +786,7 @@ def test_join_datetimes_index(dtype): assert gdf["d"].dtype == cudf.dtype(dtype) - assert_join_results_equal(pdf, gdf, how="inner") + assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False) def test_join_with_different_names(): @@ -979,7 +980,7 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) + exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) exp_join_data = [1, 2] exp_other_data = ["a", "b"] @@ -1009,7 +1010,7 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) + exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) if dtype_l != dtype_r: exp_join_data = [1, 2, 3, 4.5] @@ -1050,7 +1051,7 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) + exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) exp_join_data = [1, 2, 3] exp_other_data = ["a", "b", "c"] @@ -1933,7 +1934,11 @@ def test_string_join_key(str_data, num_keys, how): gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data - + if PANDAS_GE_200 and len(other_data) == 0: + # TODO: Remove this workaround after + # the following bug is fixed: + # https://github.com/pandas-dev/pandas/issues/56679 + pdf["a"] = pdf["a"].astype("str") pdf2 = pdf.copy() gdf2 = gdf.copy() @@ -2009,6 +2014,11 @@ def test_string_join_non_key(str_data, num_cols, how): gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data + if PANDAS_GE_200 and len(other_data) == 0: + # TODO: Remove this workaround after + # the following bug is fixed: + # https://github.com/pandas-dev/pandas/issues/56679 + pdf["a"] = pdf["a"].astype("str") pdf2 = pdf.copy() gdf2 = gdf.copy() @@ -2147,15 +2157,21 @@ def test_join_multiindex_empty(): lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}, index=["a", "b", "c"]) lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) rhs = pd.DataFrame(index=["a", "c", "d"]) - with pytest.warns(FutureWarning): - expect = lhs.join(rhs, how="inner") - - lhs = cudf.from_pandas(lhs) - rhs = cudf.from_pandas(rhs) - with 
pytest.warns(FutureWarning): - got = lhs.join(rhs, how="inner") - - assert_join_results_equal(expect, got, how="inner") + g_lhs = cudf.from_pandas(lhs) + g_rhs = cudf.from_pandas(rhs) + if PANDAS_GE_200: + assert_exceptions_equal( + lfunc=lhs.join, + rfunc=g_lhs.join, + lfunc_args_and_kwargs=([rhs], {"how": "inner"}), + rfunc_args_and_kwargs=([g_rhs], {"how": "inner"}), + check_exception_type=False, + ) + else: + with pytest.warns(FutureWarning): + _ = lhs.join(rhs, how="inner") + with pytest.raises(ValueError): + _ = g_lhs.join(g_rhs, how="inner") def test_join_on_index_with_duplicate_names(): @@ -2223,11 +2239,18 @@ def test_index_join_return_indexers_notimplemented(): @pytest.mark.parametrize("how", ["inner", "outer"]) -def test_index_join_names(how): +def test_index_join_names(request, how): idx1 = cudf.Index([10, 1, 2, 4, 2, 1], name="a") idx2 = cudf.Index([-10, 2, 3, 1, 2], name="b") + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/57065", + ) + ) + pidx1 = idx1.to_pandas() + pidx2 = idx2.to_pandas() - expected = idx1.to_pandas().join(idx2.to_pandas(), how=how) + expected = pidx1.join(pidx2, how=how) actual = idx1.join(idx2, how=how) assert_join_results_equal(actual, expected, how=how) @@ -2261,5 +2284,5 @@ def test_merge_timedelta_types(dtype1, dtype2): if isinstance(actual.index, cudf.RangeIndex) and isinstance(expected.index, pd.Index) else True, - check_dtype=True, + check_dtype=len(actual) > 0, ) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 47f5b99acf7..ec980adc334 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
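Note: the test_joining.py hunks above swap np.find_common_type for np.result_type, since the former is deprecated as of NumPy 1.25. A minimal sketch of the equivalence for the two-dtype case these tests exercise, assuming NumPy >= 1.25:

    import numpy as np

    # result_type applies the same promotion rules that
    # np.find_common_type([], [np.dtype("int32"), np.dtype("float32")]) did:
    assert np.result_type(
        np.dtype("int32"), np.dtype("float32")
    ) == np.dtype("float64")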
 import copy
 import gzip
@@ -13,11 +13,13 @@
 import pytest
 import cudf
+from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
     assert_eq,
+    expect_warning_if,
 )
@@ -94,6 +96,8 @@ def json_files(request, tmp_path_factory, pdf):
     )
     if index is False and orient == "table":
         pytest.skip("'index=False' isn't valid when 'orient' is 'table'")
+    if index is True and orient not in ("split", "table", "index", "columns"):
+        pytest.skip("'index=True' is only valid when 'orient' is 'split', 'table', 'index' or 'columns'")
     fname_df = tmp_path_factory.mktemp("json") / "test_df.json"
     fname_series = tmp_path_factory.mktemp("json") / "test_series.json"
     pdf.to_json(fname_df, index=index, compression=compression, orient=orient)
@@ -212,6 +216,18 @@ def test_cudf_json_writer_read(gdf_writer_types):
     if pdf2.empty:
         pdf2.reset_index(drop=True, inplace=True)
         pdf2.columns = pdf2.columns.astype("object")
+    if PANDAS_GE_200:
+        # Pandas moved to a consistent datetime parsing format:
+        # https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#datetimes-are-now-parsed-with-a-consistent-format
+        for unit in ["s", "ms"]:
+            if f"col_datetime64[{unit}]" in pdf2.columns:
+                pdf2[f"col_datetime64[{unit}]"] = (
+                    pd.to_datetime(
+                        pdf2[f"col_datetime64[{unit}]"], format="mixed"
+                    )
+                    .dt.tz_localize(None)
+                    .astype(f"datetime64[{unit}]")
+                )
     assert_eq(pdf2, gdf2)
@@ -325,8 +341,16 @@ def json_input(request, tmp_path_factory):
 @pytest.mark.filterwarnings("ignore:Using CPU")
 @pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"])
 def test_json_lines_basic(json_input, engine):
-    cu_df = cudf.read_json(json_input, engine=engine, lines=True)
-    pd_df = pd.read_json(json_input, lines=True)
+    with expect_warning_if(
+        isinstance(json_input, str) and not json_input.endswith(".json")
+    ):
+        cu_df = cudf.read_json(json_input, engine=engine, lines=True)
+    with expect_warning_if(
+        isinstance(json_input, str)
+        and PANDAS_GE_210
+        and not json_input.endswith(".json")
+    ):
+        pd_df = pd.read_json(json_input, lines=True)
     assert all(cu_df.dtypes == ["int64", "int64", "int64"])
     for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
@@ -340,7 +364,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
     tmp_file1 = tmpdir.join("MultiInputs1.json")
     tmp_file2 = tmpdir.join("MultiInputs2.json")
-    pdf = pd.read_json(json_input, lines=True)
+    with expect_warning_if(
+        isinstance(json_input, str)
+        and PANDAS_GE_210
+        and not json_input.endswith(".json")
+    ):
+        pdf = pd.read_json(json_input, lines=True)
     pdf.to_json(tmp_file1, compression="infer", lines=True, orient="records")
     pdf.to_json(tmp_file2, compression="infer", lines=True, orient="records")
@@ -355,7 +384,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
 @pytest.mark.parametrize("engine", ["auto", "cudf"])
 def test_json_read_directory(tmpdir, json_input, engine):
-    pdf = pd.read_json(json_input, lines=True)
+    with expect_warning_if(
+        isinstance(json_input, str)
+        and PANDAS_GE_210
+        and not json_input.endswith(".json")
+    ):
+        pdf = pd.read_json(json_input, lines=True)
     pdf.to_json(
         tmpdir.join("MultiInputs1.json"),
         compression="infer",
@@ -387,37 +421,47 @@ def test_json_read_directory(tmpdir, json_input, engine):
 def test_json_lines_byte_range(json_input):
     # include the first row and half of the second row
     # should parse the first two rows
-    df = cudf.read_json(
-        copy.deepcopy(json_input), lines=True, byte_range=(0, 15)
+    will_warn = isinstance(json_input, str) and not json_input.endswith(
+        ".json"
     )
+    with
expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(0, 15) + ) assert df.shape == (2, 3) # include half of the second row and half of the third row # should parse only the third row - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 10) - ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 10) + ) assert df.shape == (1, 3) # include half of the second row and entire third row # should parse only the third row - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 0) - ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 0) + ) assert df.shape == (1, 3) # include half of the second row till past the end of the file # should parse only the third row - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(10, 50) - ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(10, 50) + ) assert df.shape == (1, 3) def test_json_lines_dtypes(json_input): - df = cudf.read_json( - json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} - ) + with expect_warning_if( + isinstance(json_input, str) and not json_input.endswith(".json") + ): + df = cudf.read_json( + json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} + ) assert all(df.dtypes == ["float64", "int64", "int16"]) @@ -457,32 +501,32 @@ def test_json_engine_selection(): json = "[1, 2, 3]" # should use the cudf engine - df = cudf.read_json(json, lines=True) + df = cudf.read_json(StringIO(json), lines=True) # column names are strings when parsing with cudf for col_name in df.columns: assert isinstance(col_name, str) # should use the pandas engine - df = cudf.read_json(json, lines=False, engine="pandas") + df = cudf.read_json(StringIO(json), lines=False, engine="pandas") # column names are ints when parsing with pandas for col_name in df.columns: assert isinstance(col_name, int) # should use the pandas engine - df = cudf.read_json(json, lines=True, engine="pandas") + df = cudf.read_json(StringIO(json), lines=True, engine="pandas") # column names are ints when parsing with pandas for col_name in df.columns: assert isinstance(col_name, int) # should raise an exception with pytest.raises(ValueError): - cudf.read_json(json, lines=False, engine="cudf_legacy") + cudf.read_json(StringIO(json), lines=False, engine="cudf_legacy") def test_json_bool_values(): buffer = "[true,1]\n[false,false]\n[true,true]" - cu_df = cudf.read_json(buffer, lines=True) - pd_df = pd.read_json(buffer, lines=True) + cu_df = cudf.read_json(StringIO(buffer), lines=True) + pd_df = pd.read_json(StringIO(buffer), lines=True) # types should be ['bool', 'int64'] np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -491,7 +535,7 @@ def test_json_bool_values(): np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) cu_df = cudf.read_json( - buffer, lines=True, dtype={"0": "bool", "1": "long"} + StringIO(buffer), lines=True, dtype={"0": "bool", "1": "long"} ) np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -509,7 +553,7 @@ def test_json_bool_values(): ], ) def test_json_null_literal(buffer): - df = cudf.read_json(buffer, lines=True, engine="cudf_legacy") + df = cudf.read_json(StringIO(buffer), lines=True, engine="cudf_legacy") # first column contains a null field, type should be set to float # second column contains only empty 
fields, type should be set to int8 @@ -521,7 +565,7 @@ def test_json_null_literal(buffer): def test_json_bad_protocol_string(): - test_string = '{"field": "s3://path"}' + test_string = StringIO('{"field": "s3://path"}') expect = pd.DataFrame([{"field": "s3://path"}]) got = cudf.read_json(test_string, lines=True) @@ -735,7 +779,7 @@ def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine): def test_default_float_bitwidth(default_float_bitwidth): # Test that float columns in json are _inferred_ as 32 bit columns. df = cudf.read_json( - '{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}', + StringIO('{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}'), engine="cudf", lines=True, orient="records", @@ -1218,7 +1262,7 @@ def test_json_round_trip_gzip(): @pytest.mark.parametrize("lines", [True, False]) def test_json_array_of_arrays(data, lines): data = data if lines else "[" + data.replace("\n", ",") + "]" - pdf = pd.read_json(data, orient="values", lines=lines) + pdf = pd.read_json(StringIO(data), orient="values", lines=lines) df = cudf.read_json( StringIO(data), engine="cudf", @@ -1312,8 +1356,8 @@ def _replace_with_nulls(df, replace_items): # both json lines and json string tested. json_string = "[" + jsonl_string.replace("\n", ",") + "]" - pdf = pd.read_json(jsonl_string, orient="records", lines=True) - pdf2 = pd.read_json(json_string, orient="records", lines=False) + pdf = pd.read_json(StringIO(jsonl_string), orient="records", lines=True) + pdf2 = pd.read_json(StringIO(json_string), orient="records", lines=False) assert_eq(pdf, pdf2) # replace list elements with None if it has dict and non-dict # in above test cases, these items are mixed with dict/list items diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 53d972d96c5..53919a95115 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -1,7 +1,8 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
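Note: the test_monotonic.py hunks below delete the is_monotonic assertions because pandas 2.0 removed that alias; only the explicit properties remain. A minimal sketch, assuming pandas >= 2.0:

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    # .is_monotonic is gone; spell out the direction instead.
    assert idx.is_monotonic_increasing
    assert not idx.is_monotonic_decreasing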
""" -Tests related to is_unique and is_monotonic attributes +Tests related to is_unique, is_monotonic_increasing & +is_monotonic_decreasing attributes """ import numpy as np import pandas as pd @@ -9,13 +10,8 @@ import cudf from cudf import Index, MultiIndex, Series -from cudf.core.index import ( - CategoricalIndex, - DatetimeIndex, - GenericIndex, - RangeIndex, -) -from cudf.testing._utils import assert_eq, expect_warning_if +from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) @@ -28,11 +24,6 @@ def test_range_index(testrange): ) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -51,15 +42,10 @@ def test_range_index(testrange): ], ) def test_generic_index(testlist): - index = GenericIndex(testlist) + index = Index(testlist) index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -78,11 +64,6 @@ def test_string_index(testlist): index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -97,11 +78,6 @@ def test_categorical_index(testlist): index_pd = pd.CategoricalIndex(raw_cat) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -141,11 +117,6 @@ def test_datetime_index(testlist): index_pd = pd.DatetimeIndex(testlist) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -168,11 +139,6 @@ def test_series(testlist): series_pd = pd.Series(testlist) assert series.is_unique == series_pd.is_unique - with pytest.warns(FutureWarning): - expect = series_pd.index.is_monotonic - with pytest.warns(FutureWarning): - got = series.index.is_monotonic - assert got == expect assert series.is_monotonic_increasing == series_pd.is_monotonic_increasing assert series.is_monotonic_decreasing == series_pd.is_monotonic_decreasing @@ -197,11 +163,6 @@ def test_multiindex(): gdf = cudf.from_pandas(pdf) assert pdf.index.is_unique == gdf.index.is_unique - with pytest.warns(FutureWarning): - expect = pdf.index.is_monotonic - with pytest.warns(FutureWarning): - got = 
gdf.index.is_monotonic - assert got == expect assert ( pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing ) @@ -236,11 +197,6 @@ def test_multiindex_tuples(testarr): index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1]) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -255,15 +211,12 @@ def test_multiindex_tuples(testarr): ], ) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["loc", "getitem", None]) -def test_get_slice_bound(testlist, side, kind): - index = GenericIndex(testlist) +def test_get_slice_bound(testlist, side): + index = Index(testlist) index_pd = pd.Index(testlist) for label in testlist: - with pytest.warns(FutureWarning): - expect = index_pd.get_slice_bound(label, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = index.get_slice_bound(label, side, kind) + expect = index_pd.get_slice_bound(label, side) + got = index.get_slice_bound(label, side) assert got == expect @@ -273,16 +226,13 @@ def test_get_slice_bound(testlist, side, kind): [[-1, 0, 5, 10, 11], [-1, 0, 1, 2], [2, 3, 4, 5], [-1, 0, 1], [2, 3, 4]], ) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["getitem", "loc"]) -def test_rangeindex_get_slice_bound_basic(bounds, indices, side, kind): +def test_rangeindex_get_slice_bound_basic(bounds, indices, side): start, stop = bounds pd_index = pd.RangeIndex(start, stop) cudf_index = RangeIndex(start, stop) for idx in indices: - with pytest.warns(FutureWarning): - expect = pd_index.get_slice_bound(idx, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = cudf_index.get_slice_bound(idx, side, kind) + expect = pd_index.get_slice_bound(idx, side) + got = cudf_index.get_slice_bound(idx, side) assert expect == got @@ -295,31 +245,25 @@ def test_rangeindex_get_slice_bound_basic(bounds, indices, side, kind): [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], ) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["getitem", "loc"]) -def test_rangeindex_get_slice_bound_step(bounds, label, side, kind): +def test_rangeindex_get_slice_bound_step(bounds, label, side): start, stop, step = bounds pd_index = pd.RangeIndex(start, stop, step) cudf_index = RangeIndex(start, stop, step) - with pytest.warns(FutureWarning): - expect = pd_index.get_slice_bound(label, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = cudf_index.get_slice_bound(label, side, kind) + expect = pd_index.get_slice_bound(label, side) + got = cudf_index.get_slice_bound(label, side) assert expect == got @pytest.mark.parametrize("label", [1, 3, 5, 7, 9, 11]) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["loc", "getitem", None]) -def test_get_slice_bound_missing(label, side, kind): +def test_get_slice_bound_missing(label, side): mylist = [2, 4, 6, 8, 10] - index = GenericIndex(mylist) + index = Index(mylist) index_pd = pd.Index(mylist) - with pytest.warns(FutureWarning): - expect = index_pd.get_slice_bound(label, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = index.get_slice_bound(label, side, kind) + expect = 
index_pd.get_slice_bound(label, side) + got = index.get_slice_bound(label, side) assert got == expect @@ -327,12 +271,10 @@ def test_get_slice_bound_missing(label, side, kind): @pytest.mark.parametrize("side", ["left", "right"]) def test_get_slice_bound_missing_str(label, side): mylist = ["b", "d", "f"] - index = GenericIndex(mylist) + index = Index(mylist) index_pd = pd.Index(mylist) - with pytest.warns(FutureWarning): - got = index.get_slice_bound(label, side, "getitem") - with pytest.warns(FutureWarning): - expect = index_pd.get_slice_bound(label, side, "getitem") + got = index.get_slice_bound(label, side) + expect = index_pd.get_slice_bound(label, side) assert got == expect diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 78bce89f2a8..e15b3f6db40 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -726,15 +726,8 @@ def test_multiindex_equals(): } ], ) -@pytest.mark.parametrize( - "levels", - [[["2000-01-01", "2000-01-02", "2000-01-03"], ["A", "B", "C"]], None], -) -@pytest.mark.parametrize( - "codes", [[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], None] -) @pytest.mark.parametrize("names", [["X", "Y"]]) -def test_multiindex_copy_sem(data, levels, codes, names): +def test_multiindex_copy_sem(data, names): """Test semantic equality for MultiIndex.copy""" gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -743,12 +736,10 @@ def test_multiindex_copy_sem(data, levels, codes, names): pdf = pdf.groupby(["Date", "Symbol"], sort=True).mean() gmi = gdf.index - with expect_warning_if(levels is not None or codes is not None): - gmi_copy = gmi.copy(levels=levels, codes=codes, names=names) + gmi_copy = gmi.copy(names=names) pmi = pdf.index - with expect_warning_if(levels is not None or codes is not None): - pmi_copy = pmi.copy(levels=levels, codes=codes, names=names) + pmi_copy = pmi.copy(names=names) for glv, plv in zip(gmi_copy.levels, pmi_copy.levels): assert all(glv.values_host == plv.values) @@ -1962,13 +1953,13 @@ def test_multiindex_to_frame_allow_duplicates( ): gidx = cudf.from_pandas(pidx) - if ( + if name is None or ( ( len(pidx.names) != len(set(pidx.names)) and not all(x is None for x in pidx.names) ) and not allow_duplicates - and (name is None or name is no_default) + and name is no_default ): assert_exceptions_equal( pidx.to_frame, @@ -1998,22 +1989,20 @@ def test_multiindex_to_frame_allow_duplicates( ) or (isinstance(name, list) and len(name) != len(set(name))): # cudf doesn't have the ability to construct dataframes # with duplicate column names - with expect_warning_if(name is None): - with pytest.raises(ValueError): - gidx.to_frame( - index=index, - name=name, - allow_duplicates=allow_duplicates, - ) + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) else: - with expect_warning_if(name is None): + with expect_warning_if(not PANDAS_GE_200 and name is None): expected = pidx.to_frame( index=index, name=name, allow_duplicates=allow_duplicates ) - with expect_warning_if(name is None): - actual = gidx.to_frame( - index=index, name=name, allow_duplicates=allow_duplicates - ) + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index fee5cc0ad21..2139e7b9860 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ 
b/python/cudf/cudf/tests/test_numerical.py @@ -5,7 +5,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import NUMERIC_TYPES, assert_eq from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -267,12 +266,7 @@ def test_to_numeric_downcast_large_float_pd_bug(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - if PANDAS_GE_150: - assert_eq(expected, got) - else: - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( @@ -350,12 +344,7 @@ def test_to_numeric_downcast_string_large_float(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - if PANDAS_GE_150: - assert_eq(expected, got) - else: - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): - assert_eq(expected, got) + assert_eq(expected, got) else: expected = pd.Series([np.inf, -np.inf]) with pytest.warns( diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index baaca0b806f..cd0055ad78b 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from string import ascii_lowercase @@ -22,20 +22,13 @@ (range(10), [1, 2, 3, 4, 5] * 2), ], ) -def test_get_dummies(data, index): - gdf = cudf.DataFrame({"x": data}, index=index) +@pytest.mark.parametrize("dtype", ["bool", "uint8"]) +def test_get_dummies(data, index, dtype): pdf = pd.DataFrame({"x": data}, index=index) + gdf = cudf.from_pandas(pdf) - encoded_expected = pd.get_dummies(pdf, prefix="test") - with pytest.warns(FutureWarning): - encoded_actual = cudf.get_dummies(gdf, prefix="test") - - assert_eq( - encoded_expected, - encoded_actual, - check_dtype=len(data) != 0, - ) - encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=np.uint8) + encoded_expected = pd.get_dummies(pdf, prefix="test", dtype=dtype) + encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=dtype) assert_eq( encoded_expected, @@ -55,8 +48,7 @@ def test_onehot_get_dummies_multicol(n_cols): pdf = pd.DataFrame(data) encoded_expected = pd.get_dummies(pdf, prefix="test") - with pytest.warns(FutureWarning): - encoded_actual = cudf.get_dummies(gdf, prefix="test") + encoded_actual = cudf.get_dummies(gdf, prefix="test") assert_eq(encoded_expected, encoded_actual) @@ -64,17 +56,13 @@ def test_onehot_get_dummies_multicol(n_cols): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("dummy_na", [True, False]) def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na): - pdf = pd.DataFrame({"a": [0, 1, np.nan]}) - df = cudf.DataFrame.from_pandas(pdf, nan_as_null=nan_as_null) + df = cudf.DataFrame({"a": [0, 1, np.nan]}, nan_as_null=nan_as_null) + pdf = df.to_pandas(nullable=nan_as_null) expected = pd.get_dummies(pdf, dummy_na=dummy_na, columns=["a"]) - with pytest.warns(FutureWarning): - actual = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"]) + got = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"]) - if dummy_na and nan_as_null: - actual = actual.rename(columns={"a_": "a_nan"})[expected.columns] - - assert_eq(expected, actual) + assert_eq(expected, got, check_like=True) 
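The get_dummies rewrites above follow pandas 2.0's change of the default dummy dtype from uint8 to bool, which is why the test now parametrizes over both dtypes and the FutureWarning wrappers around cudf.get_dummies are gone. A quick illustration, assuming pandas >= 2.0:

    import pandas as pd

    s = pd.Series(["a", "b", "a"])
    bool_dummies = pd.get_dummies(s)                  # bool columns on pandas >= 2.0
    uint8_dummies = pd.get_dummies(s, dtype="uint8")  # opt back into the old dtype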
@pytest.mark.parametrize( @@ -109,10 +97,9 @@ def test_get_dummies_prefix_sep(prefix, prefix_sep): encoded_expected = pd.get_dummies( pdf, prefix=prefix, prefix_sep=prefix_sep ) - with pytest.warns(FutureWarning): - encoded_actual = cudf.get_dummies( - gdf, prefix=prefix, prefix_sep=prefix_sep - ) + encoded_actual = cudf.get_dummies( + gdf, prefix=prefix, prefix_sep=prefix_sep + ) assert_eq(encoded_expected, encoded_actual) @@ -126,8 +113,7 @@ def test_get_dummies_with_nan(): df.to_pandas(nullable=True), dummy_na=True, columns=["a"] ) - with pytest.warns(FutureWarning): - actual = cudf.get_dummies(df, dummy_na=True, columns=["a"]) + actual = cudf.get_dummies(df, dummy_na=True, columns=["a"]) assert_eq(expected, actual) @@ -166,9 +152,6 @@ def test_get_dummies_array_like_with_nan(): ser.to_pandas(nullable=True), dummy_na=True, prefix="a", prefix_sep="_" ) - with pytest.warns(FutureWarning): - actual = cudf.get_dummies( - ser, dummy_na=True, prefix="a", prefix_sep="_" - ) + actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_") assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 6b7f86098a0..4f293c9860e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -159,7 +159,7 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): pdf = orcfile.read().to_pandas(date_as_object=False) gdf = cudf.read_orc(path, use_index=use_index) - assert_eq(pdf, gdf, check_categorical=False) + assert_eq(pdf, gdf, check_categorical=False, check_exact=False) def test_orc_reader_strings(datadir): @@ -576,7 +576,7 @@ def test_int_overflow(tmpdir): # The number of rows and the large element trigger delta encoding num_rows = 513 - df = cudf.DataFrame({"a": [None] * num_rows}, dtype="int32") + df = cudf.DataFrame({"a": [None] * num_rows}, dtype="int64") df["a"][0] = 1024 * 1024 * 1024 df["a"][num_rows - 1] = 1 df.to_orc(file_path) @@ -1671,16 +1671,7 @@ def run_orc_columns_and_index_param(index_obj, index, columns): expected = pd.read_orc(buffer, columns=columns) got = cudf.read_orc(buffer, columns=columns) - if columns: - # TODO: Remove workaround after this issue is fixed: - # https://github.com/pandas-dev/pandas/issues/47944 - assert_eq( - expected.sort_index(axis=1), - got.sort_index(axis=1), - check_index_type=True, - ) - else: - assert_eq(expected, got, check_index_type=True) + assert_eq(expected, got, check_index_type=True) @pytest.mark.parametrize("index_obj", [None, [10, 11, 12], ["x", "y", "z"]]) @@ -1827,7 +1818,7 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): with expect_warning_if(engine == "pyarrow", UserWarning): got = cudf.read_orc(buffer, engine=engine) - assert_eq(negative_timestamp_df, got) + assert_eq(negative_timestamp_df, got, check_dtype=False) def test_orc_writer_negative_timestamp(negative_timestamp_df): @@ -1836,8 +1827,10 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): buffer = BytesIO() negative_timestamp_df.to_orc(buffer) - assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False) + assert_eq( + negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False + ) def test_orc_reader_apache_negative_timestamp(datadir): diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index 9b5a8c19cf5..da506a8d5b2 100644 --- 
a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ import numpy as np import pandas as pd -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, Index, Series from cudf._lib.copying import pack, unpack from cudf.testing._utils import assert_eq @@ -52,7 +52,7 @@ def check_packed_equality(df): assert_packed_frame_equality(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_equality(sortvaldf) @@ -120,7 +120,7 @@ def check_packed_unique_pointers(df): assert_packed_frame_unique_pointers(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_unique_pointers(sortvaldf) @@ -188,7 +188,7 @@ def check_packed_pickled_equality(df): assert_packed_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_picklable(sortvaldf) # out-of-band buffers = [] @@ -261,7 +261,7 @@ def check_packed_serialized_equality(df): assert_packed_frame_serializable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_serializable(sortvaldf) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 007349ab551..b4e24bd1617 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -21,7 +21,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.core._compat import PANDAS_LT_153 +from cudf.core._compat import PANDAS_GE_200 from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -31,7 +31,6 @@ from cudf.testing._utils import ( TIMEDELTA_TYPES, assert_eq, - assert_exceptions_equal, set_random_null_mask_inplace, ) @@ -209,10 +208,13 @@ def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): ) test_pdf.columns.name = None - # Randomly but reproducibly mark subset of rows as invalid - random.seed(1337) - mask = random.sample(range(nrows), nvalids) - test_pdf[test_pdf.index.isin(mask)] = np.NaN + if nvalids: + # Randomly but reproducibly mark subset of rows as invalid + random.seed(1337) + mask = random.sample(range(nrows), nvalids) + test_pdf[test_pdf.index.isin(mask)] = np.NaN + if dtype: + test_pdf = test_pdf.astype(dtype) return test_pdf @@ -289,7 +291,7 @@ def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine): expect = expect.reset_index(drop=True) got = got.reset_index(drop=True) - assert_eq(expect, got) + assert_eq(expect, got, check_column_type=not PANDAS_GE_200) @pytest.mark.parametrize("has_null", [False, True]) @@ -660,7 +662,7 @@ def test_parquet_reader_select_columns(datadir): def test_parquet_reader_invalids(tmpdir): - test_pdf = make_pdf(nrows=1000, nvalids=1000 // 4, dtype=np.int64) + test_pdf = make_pdf(nrows=1000, nvalids=1000 // 4, dtype="Int64") fname = tmpdir.join("invalids.parquet") test_pdf.to_parquet(fname, engine="pyarrow") @@ -668,7 +670,7 @@ def test_parquet_reader_invalids(tmpdir): expect = 
pd.read_parquet(fname) got = cudf.read_parquet(fname) - assert_eq(expect, got) + assert_eq(expect, got.to_pandas(nullable=True)) def test_parquet_reader_filenotfound(tmpdir): @@ -755,8 +757,8 @@ def create_parquet_source(df, src_type, fname): "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] ) def test_parquet_reader_multiple_files(tmpdir, src): - test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2) - test_pdf2 = make_pdf(nrows=500) + test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2, dtype="float64") + test_pdf2 = make_pdf(nrows=500, dtype="float64") expect = pd.concat([test_pdf1, test_pdf2]) src1 = create_parquet_source(test_pdf1, src, tmpdir.join("multi1.parquet")) @@ -1605,14 +1607,23 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): expect = pdf got = pd.read_parquet(gdf_fname) - + if PANDAS_GE_200: + # https://github.com/pandas-dev/pandas/issues/52412 + assert got["col_datetime64[ms]"].dtype == np.dtype("datetime64[ns]") + assert got["col_datetime64[us]"].dtype == np.dtype("datetime64[ns]") + got["col_datetime64[ms]"] = got["col_datetime64[ms]"].astype( + "datetime64[ms]" + ) + got["col_datetime64[us]"] = got["col_datetime64[us]"].astype( + "datetime64[us]" + ) # verify INT96 timestamps were converted back to the same data. assert_eq(expect, got, check_categorical=False) def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2) - test_pdf2 = make_pdf(nrows=20) + test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") + test_pdf2 = make_pdf(nrows=20, dtype="float64") expect = pd.concat([test_pdf1, test_pdf2]) tmpdir.mkdir("multi_part") @@ -1895,6 +1906,15 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) + if PANDAS_GE_200 and isinstance(got_pd["c"].dtype, pd.CategoricalDtype): + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["c"] = got_pd["c"].astype( + pd.CategoricalDtype( + categories=got_pd["c"].dtype.categories.astype("int64"), + ordered=got_pd["c"].dtype.ordered, + ) + ) assert_eq(got_pd, got_cudf) # If filename is specified, check that it is correct @@ -1942,6 +1962,15 @@ def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["a"] = got_pd["a"].astype( + pd.CategoricalDtype( + categories=got_pd["a"].dtype.categories.astype("int64"), + ordered=got_pd["a"].dtype.ordered, + ) + ) assert_eq(got_pd, got_cudf) @@ -1982,7 +2011,15 @@ def test_parquet_writer_chunked_max_file_size( # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) - + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["a"] = got_pd["a"].astype( + pd.CategoricalDtype( + categories=got_pd["a"].dtype.categories.astype("int64"), + ordered=got_pd["a"].dtype.ordered, + ) + ) assert_eq( got_pd.sort_values(["b"]).reset_index(drop=True), got_cudf.sort_values(["b"]).reset_index(drop=True), @@ -2028,6 +2065,15 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + 
got_pd["a"] = got_pd["a"].astype( + pd.CategoricalDtype( + categories=got_pd["a"].dtype.categories.astype("int64"), + ordered=got_pd["a"].dtype.ordered, + ) + ) assert_eq(got_pd, got_cudf) @@ -2135,6 +2181,15 @@ def test_read_parquet_partitioned_filtered( filters = [[("a", "==", 10)], [("c", "==", 1)]] got = cudf.read_parquet(read_path, filters=filters) expect = pd.read_parquet(read_path, filters=filters) + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + expect["c"] = expect["c"].astype( + pd.CategoricalDtype( + categories=expect["c"].dtype.categories.astype("int64"), + ordered=expect["c"].dtype.ordered, + ) + ) assert_eq(expect, got) @@ -2353,7 +2408,12 @@ def run_parquet_index(pdf, index): expected = pd.read_parquet(pandas_buffer) actual = cudf.read_parquet(cudf_buffer) - assert_eq(expected, actual, check_index_type=True) + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize( @@ -2552,43 +2612,37 @@ def test_parquet_writer_list_statistics(tmpdir): ] }, # Struct of Lists - pytest.param( - { - "Real estate records": [ - None, - { - "Status": "NRI", - "Ownerships": { - "land_unit": [None, 2, None], - "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], - }, - }, - { - "Status": None, - "Ownerships": { - "land_unit": [4, 5], - "flats": [[7, 8], []], - }, + { + "Real estate records": [ + None, + { + "Status": "NRI", + "Ownerships": { + "land_unit": [None, 2, None], + "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], }, - { - "Status": "RI", - "Ownerships": {"land_unit": None, "flats": [[]]}, + }, + { + "Status": None, + "Ownerships": { + "land_unit": [4, 5], + "flats": [[7, 8], []], }, - {"Status": "RI", "Ownerships": None}, - { - "Status": None, - "Ownerships": { - "land_unit": [7, 8, 9], - "flats": [[], [], []], - }, + }, + { + "Status": "RI", + "Ownerships": {"land_unit": None, "flats": [[]]}, + }, + {"Status": "RI", "Ownerships": None}, + { + "Status": None, + "Ownerships": { + "land_unit": [7, 8, 9], + "flats": [[], [], []], }, - ] - }, - marks=pytest.mark.xfail( - condition=PANDAS_LT_153, - reason="pandas assertion fixed in pandas 1.5.3", - ), - ), + }, + ] + }, ], ) def test_parquet_writer_nested(tmpdir, data): @@ -2616,21 +2670,37 @@ def test_parquet_writer_decimal(decimal_type, data): buff = BytesIO() gdf.to_parquet(buff) - got = pd.read_parquet(buff, use_nullable_dtypes=True) + got = pd.read_parquet(buff, dtype_backend="numpy_nullable") assert_eq(gdf["val"].to_pandas(nullable=True), got["val"]) assert_eq(gdf["dec_val"].to_pandas(), got["dec_val"]) def test_parquet_writer_column_validation(): - df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) + cudf_parquet = BytesIO() + pandas_parquet = BytesIO() + df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]}) pdf = df.to_pandas() - assert_exceptions_equal( - lfunc=df.to_parquet, - rfunc=pdf.to_parquet, - lfunc_args_and_kwargs=(["cudf.parquet"],), - rfunc_args_and_kwargs=(["pandas.parquet"],), - ) + with cudf.option_context("mode.pandas_compatible", True): + with pytest.warns(UserWarning): + df.to_parquet(cudf_parquet) + + if PANDAS_GE_200: + with pytest.warns(UserWarning): + pdf.to_parquet(pandas_parquet) + + assert_eq( + pd.read_parquet(cudf_parquet), + cudf.read_parquet(pandas_parquet), + ) + assert_eq( + cudf.read_parquet(cudf_parquet), + pd.read_parquet(pandas_parquet), + ) + + with cudf.option_context("mode.pandas_compatible", False): + with pytest.raises(ValueError): + 
df.to_parquet(cudf_parquet) def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): @@ -2652,11 +2722,23 @@ def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): got = pd.read_parquet(fname) nullable = num_rows > 0 + + if not PANDAS_GE_200: + # BUG in pre-2.0.1: + # https://github.com/pandas-dev/pandas/issues/52449 + gdf["col_datetime64[ms]"] = gdf["col_datetime64[ms]"].astype( + "datetime64[ns]" + ) + gdf["col_datetime64[us]"] = gdf["col_datetime64[us]"].astype( + "datetime64[ns]" + ) + if nullable: gdf = gdf.drop(columns="col_datetime64[ms]") gdf = gdf.drop(columns="col_datetime64[us]") got = got.drop(columns="col_datetime64[ms]") got = got.drop(columns="col_datetime64[us]") + assert_eq(gdf.to_pandas(nullable=nullable), got) @@ -2753,7 +2835,7 @@ def postprocess(val): fname = datadir / "one_level_list2.parquet" expect = pd.read_parquet(fname) - expect = expect.applymap(postprocess) + expect = expect.map(postprocess) got = cudf.read_parquet(fname) assert_eq(expect, got, check_dtype=False) @@ -2958,7 +3040,9 @@ def test_parquet_roundtrip_time_delta(): ) buffer = BytesIO() df.to_parquet(buffer) - assert_eq(df, cudf.read_parquet(buffer)) + # TODO: Remove `check_dtype` once following issue is fixed in arrow: + # https://github.com/apache/arrow/issues/33321 + assert_eq(df, cudf.read_parquet(buffer), check_dtype=not PANDAS_GE_200) def test_parquet_reader_malformed_file(datadir): diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 71c1f206a64..13a07ef8adc 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import pickle @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, RangeIndex, Series +from cudf import DataFrame, Index, RangeIndex, Series from cudf.core.buffer import as_buffer from cudf.testing._utils import assert_eq @@ -22,7 +22,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) + assert isinstance(sortvaldf.index, (Index, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band buffers = [] @@ -80,7 +80,7 @@ def test_memory_usage_dataframe(): def test_pickle_index(): nelem = 10 - idx = GenericIndex(np.arange(nelem), name="a") + idx = Index(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) assert (idx == out).all() diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index f8a8903b518..1a5f25e320f 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
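One small rename above is worth calling out: the parquet test switches expect.applymap(postprocess) to expect.map(postprocess) because pandas 2.1 renames DataFrame.applymap to DataFrame.map and deprecates the old name. A sketch, assuming pandas >= 2.1:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    stringified = df.map(str)  # elementwise apply; the 2.1+ spelling
    # df.applymap(str) still works but raises a FutureWarning on pandas >= 2.1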
from itertools import chain, combinations_with_replacement, product @@ -55,13 +55,18 @@ def test_rank_all_arguments( assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs)) assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs)) if numeric_only: - with pytest.warns(FutureWarning): - expect = pdf["str"].rank(**kwargs) - got = gdf["str"].rank(**kwargs) - assert expect.empty == got.empty - expected = pdf.select_dtypes(include=np.number) - else: - expected = pdf.copy(deep=True) + assert_exceptions_equal( + lfunc=pdf["str"].rank, + rfunc=gdf["str"].rank, + lfunc_args_and_kwargs=( + [], + kwargs, + ), + rfunc_args_and_kwargs=( + [], + kwargs, + ), + ) actual = gdf.rank(**kwargs) expected = pdf.rank(**kwargs) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 47968ec1d97..1a38cb3dd22 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from decimal import Decimal @@ -355,12 +355,14 @@ def test_any_all_axis_none(data, op): def test_reductions_axis_none_warning(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) pdf = df.to_pandas() - with pytest.warns(FutureWarning): + with expect_warning_if( + op in {"sum", "product", "std", "var"}, + FutureWarning, + ): actual = getattr(df, op)(axis=None) with expect_warning_if( - op in {"kurt", "kurtosis", "skew", "min", "max", "mean", "median"}, + op in {"sum", "product", "std", "var"}, FutureWarning, ): expected = getattr(pdf, op)(axis=None) - assert_eq(expected, actual, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e52bbe54072..6db1c97b9fd 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,13 +8,14 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, assert_exceptions_equal, + expect_warning_if, ) @@ -166,18 +167,12 @@ def test_series_replace_with_nulls(): "c": ["abc", "def", ".", None, None], } ), - pytest.param( - cudf.DataFrame( - { - "a": ["one", "two", None, "three"], - "b": ["one", None, "two", "three"], - }, - dtype="category", - ), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/46672", - ), + cudf.DataFrame( + { + "a": ["one", "two", None, "three"], + "b": ["one", None, "two", "three"], + }, + dtype="category", ), cudf.DataFrame( { @@ -348,8 +343,10 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): # Explicitly using nans_as_nulls=True gdata = cudf.from_pandas(pdata, nan_as_null=True) - expected = pdata.fillna(method=method, inplace=inplace) - actual = gdata.fillna(method=method, inplace=inplace) + with expect_warning_if(PANDAS_GE_210): + expected = pdata.fillna(method=method, inplace=inplace) + with pytest.warns(FutureWarning): + actual = gdata.fillna(method=method, inplace=inplace) if inplace: expected = pdata @@ -665,8 +662,10 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace): # Explicitly using nans_as_nulls=True gdata = cudf.from_pandas(pdata, nan_as_null=True) - expected = pdata.fillna(method=method, 
inplace=inplace) - actual = gdata.fillna(method=method, inplace=inplace) + with expect_warning_if(PANDAS_GE_210): + expected = pdata.fillna(method=method, inplace=inplace) + with pytest.warns(FutureWarning): + actual = gdata.fillna(method=method, inplace=inplace) if inplace: expected = pdata @@ -1006,8 +1005,9 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): pd.Series(["one", "two", "three"], dtype="category"), {"to_replace": "one", "value": "two", "inplace": True}, marks=pytest.mark.xfail( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", + condition=PANDAS_GE_200, + reason="https://github.com/pandas-dev/pandas/issues/43232" + "https://github.com/pandas-dev/pandas/issues/53358", ), ), ( @@ -1057,8 +1057,10 @@ def test_replace_inplace(pframe, replace_args): assert_eq(gpu_frame, pandas_frame) assert_eq(gpu_copy, cpu_copy) - gpu_frame.replace(**replace_args) - pandas_frame.replace(**replace_args) + with expect_warning_if(len(replace_args) == 0): + gpu_frame.replace(**replace_args) + with expect_warning_if(len(replace_args) == 0): + pandas_frame.replace(**replace_args) assert_eq(gpu_frame, pandas_frame) assert_eq(gpu_copy, cpu_copy) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index efc738eec1f..8f65bd26bd1 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -294,39 +294,40 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): [ ( cudf.Index([1, 2, 3, None]), - "Int64Index([1, 2, 3, ], dtype='int64')", + "Index([1, 2, 3, ], dtype='int64')", ), ( cudf.Index([None, 2.2, 3.324342, None]), - "Float64Index([, 2.2, 3.324342, ], dtype='float64')", + "Index([, 2.2, 3.324342, ], dtype='float64')", ), ( cudf.Index([None, None, None], name="hello"), - "StringIndex([None None None], dtype='object', name='hello')", + "Index([, , ], dtype='object', name='hello')", ), ( cudf.Index([None, None, None], dtype="float", name="hello"), - "Float64Index([, , ], dtype='float64', name='hello')", + "Index([, , ], dtype='float64', name='hello')", ), ( cudf.Index([None], dtype="float64", name="hello"), - "Float64Index([], dtype='float64', name='hello')", + "Index([], dtype='float64', name='hello')", ), ( cudf.Index([None], dtype="int8", name="hello"), - "Int8Index([], dtype='int8', name='hello')", + "Index([], dtype='int8', name='hello')", ), ( cudf.Index([None] * 50, dtype="object"), - "StringIndex([None None None None None None None None " - "None None None None None None\n None None None None None None " - "None None None None None None None None\n None None None None " - "None None None None None None None None None None\n None None " - "None None None None None None], dtype='object')", + "Index([, , , , , , , , , " + ", , ,\n , , , , , , , " + ", , , , ,\n , , , , " + ", , , , , , , ,\n , " + ", , , , , , , , , , " + ",\n , ],\n dtype='object')", ), ( cudf.Index([None] * 20, dtype="uint32"), - "UInt32Index([, , , , , , , , " + "Index([, , , , , , , , " ",\n , , , , , , , , " ",\n , ],\n dtype='uint32')", ), @@ -334,7 +335,7 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): cudf.Index( [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" ), - "Int16Index([, 111, 22, 33, , 23, 34, 2343, ], " + "Index([, 111, 22, 33, , 23, 34, 2343, ], " "dtype='int16')", ), ( @@ -468,7 +469,7 @@ def test_dataframe_null_index_repr(df, pandas_special_case): actual_repr = repr(gdf) if pandas_special_case: - # Pandas inconsistently 
print StringIndex null values + # Pandas inconsistently prints Index null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. @@ -542,7 +543,7 @@ def test_series_null_index_repr(sr, pandas_special_case): actual_repr = repr(gsr) if pandas_special_case: - # Pandas inconsistently print StringIndex null values + # Pandas inconsistently prints Index null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index f0101803995..6281d54aa60 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -1,10 +1,11 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.testing._utils import assert_eq @@ -14,6 +15,7 @@ def assert_resample_results_equal(lhs, rhs, **kwargs): rhs.sort_index(), check_dtype=False, check_freq=False, + check_index_type=not PANDAS_GE_200, **kwargs, ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b437c82bf6e..b49a921e812 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -76,7 +76,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars) # pandas' melt makes the 'variable' column of 'object' type (string) # cuDF's melt makes it Categorical because it doesn't support strings - expect["variable"] = expect["variable"].astype("category") + expect["variable"] = expect["variable"].astype( + got["variable"].dtype.to_pandas() + ) assert_eq(expect, got) @@ -490,14 +492,8 @@ def test_pivot_simple(index, column, data): pdf = pd.DataFrame({"index": index, "column": column, "data": data}) gdf = cudf.from_pandas(pdf) - # In pandas 2.0 this will be a failure because pandas will require all of - # these as keyword arguments. Matching that check in cudf is a bit - # cumbersome and not worth the effort to match the warning, so this code - # just catches pandas's warning (rather than updating the signature) so - # that when it starts failing we know to update our impl of pivot.
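As the comment being deleted here anticipated, pandas 2.0 made the arguments of DataFrame.pivot keyword-only, so the call sites just below switch from positional to keyword form. A sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame(
        {"index": [1, 2], "column": ["a", "b"], "data": [0.5, 1.5]}
    )
    out = df.pivot(index="index", columns="column")  # works on pandas 1.x and 2.x
    # df.pivot("index", "column") raises TypeError on pandas >= 2.0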
- with pytest.warns(FutureWarning): - expect = pdf.pivot("index", "column") - got = gdf.pivot("index", "column") + expect = pdf.pivot(columns="column", index="index") + got = gdf.pivot(columns="column", index="index") check_index_and_columns = expect.shape != (0, 0) assert_eq( diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 91643f21155..9c3c9d1082c 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,17 +8,14 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 -from cudf.testing._utils import ( - _create_pandas_series_float64_default, - assert_eq, -) +from cudf.core._compat import PANDAS_GE_200 +from cudf.testing._utils import assert_eq from cudf.testing.dataset_generator import rand_dataframe @contextmanager def _hide_pandas_rolling_min_periods_warning(agg): - if agg == "count": + if not PANDAS_GE_200 and agg == "count": with pytest.warns( FutureWarning, match="min_periods=None will default to the size of window " @@ -58,8 +55,8 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = _create_pandas_series_float64_default(data, index=index) - gsr = cudf.Series(psr) + psr = pd.Series(data, index=index) + gsr = cudf.from_pandas(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): expect = getattr( @@ -314,7 +311,7 @@ def test_rolling_getitem_window(): ) @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = _create_pandas_series_float64_default(data, index=index) + psr = pd.Series(data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): @@ -483,7 +480,7 @@ def test_rolling_custom_index_support(): from pandas.api.indexers import BaseIndexer class CustomIndexer(BaseIndexer): - def custom_get_window_bounds( + def get_window_bounds( self, num_values, min_periods, center, closed, step=None ): start = np.empty(num_values, dtype=np.int64) @@ -499,24 +496,6 @@ def custom_get_window_bounds( return start, end - if PANDAS_GE_150: - - def get_window_bounds( - self, num_values, min_periods, center, closed, step - ): - return self.custom_get_window_bounds( - num_values, min_periods, center, closed, step - ) - - else: - - def get_window_bounds( - self, num_values, min_periods, center, closed - ): - return self.custom_get_window_bounds( - num_values, min_periods, center, closed - ) - use_expanding = [True, False, True, False, True] indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index b92f84b677c..cdce17eeb76 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -1,9 +1,9 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
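The rolling-window change above can drop its version shim because pandas 1.5 added the step parameter to BaseIndexer.get_window_bounds and this branch now requires pandas 2.x anyway. A self-contained sketch of a custom indexer with the modern signature; the expanding behavior is illustrative, not the test's exact logic:

    import numpy as np
    import pandas as pd
    from pandas.api.indexers import BaseIndexer

    class ExpandingIndexer(BaseIndexer):
        # pandas >= 1.5 always passes `step`, so one signature suffices.
        def get_window_bounds(
            self, num_values, min_periods, center, closed, step=None
        ):
            start = np.zeros(num_values, dtype=np.int64)
            end = np.arange(1, num_values + 1, dtype=np.int64)
            return start, end

    # Each window spans from row 0 to the current row, i.e. an expanding sum.
    print(pd.Series(range(5)).rolling(ExpandingIndexer(window_size=1)).sum())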
import os import socket from contextlib import contextmanager -from io import BytesIO +from io import BytesIO, StringIO import numpy as np import pandas as pd @@ -433,7 +433,7 @@ def test_read_json(s3_base, s3so): storage_options=s3so, ) - expect = pd.read_json(buffer, lines=True) + expect = pd.read_json(StringIO(buffer), lines=True) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 87efe6bbbcc..4e2a9f581c3 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -195,8 +195,8 @@ def test_serialize_range_index(): def test_serialize_generic_index(): - index = cudf.core.index.GenericIndex(cudf.Series(np.arange(10))) - outindex = cudf.core.index.GenericIndex.deserialize(*index.serialize()) + index = cudf.core.index.Index(cudf.Series(np.arange(10))) + outindex = cudf.core.index.Index.deserialize(*index.serialize()) assert_eq(index, outindex) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 7dcbf859f08..14006f90b45 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -14,14 +14,12 @@ import pytest import cudf -from cudf.core._compat import PANDAS_LT_140 +from cudf.api.extensions import no_default from cudf.errors import MixedTypeError from cudf.testing._utils import ( NUMERIC_TYPES, SERIES_OR_INDEX_NAMES, TIMEDELTA_TYPES, - _create_cudf_series_float64_default, - _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -99,17 +97,16 @@ def test_series_init_dict_lists(data): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_basic(data, others, ignore_index): +def test_series_concat_basic(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = pd.Series(others) other_gs = cudf.Series(others) - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + assert_eq(expected, actual) @@ -146,17 +143,15 @@ def test_series_append_basic(data, others, ignore_index): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_basic_str(data, others, ignore_index): +def test_series_concat_basic_str(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = pd.Series(others) other_gs = cudf.Series(others) - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) assert_eq(expected, actual) @@ -199,21 +194,20 @@ def test_series_append_basic_str(data, others, ignore_index): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_series_with_index(data, others, ignore_index): +def test_series_concat_series_with_index(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = others other_gs = cudf.from_pandas(others) - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = 
gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + assert_eq(expected, actual) -def test_series_append_error_mixed_types(): +def test_series_concat_error_mixed_types(): gsr = cudf.Series([1, 2, 3, 4]) other = cudf.Series(["a", "b", "c", "d"]) @@ -222,16 +216,14 @@ def test_series_append_error_mixed_types(): match="cudf does not support mixed types, please type-cast " "both series to same dtypes.", ): - with pytest.warns(FutureWarning): - gsr.append(other) + cudf.concat([gsr, other]) with pytest.raises( TypeError, match="cudf does not support mixed types, please type-cast " "both series to same dtypes.", ): - with pytest.warns(FutureWarning): - gsr.append([gsr, other, gsr, other]) + cudf.concat([gsr, gsr, other, gsr, other]) @pytest.mark.parametrize( @@ -282,35 +274,32 @@ def test_series_append_error_mixed_types(): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_list_series_with_index(data, others, ignore_index): +def test_series_concat_list_series_with_index(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = others other_gs = [cudf.from_pandas(obj) for obj in others] - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr] + other_ps, ignore_index=ignore_index) + actual = cudf.concat([gsr] + other_gs, ignore_index=ignore_index) + assert_eq(expected, actual) -def test_series_append_existing_buffers(): +def test_series_concat_existing_buffers(): a1 = np.arange(10, dtype=np.float64) gs = cudf.Series(a1) # Add new buffer a2 = cudf.Series(np.arange(5)) - with pytest.warns(FutureWarning): - gs = gs.append(a2) + gs = cudf.concat([gs, a2]) assert len(gs) == 15 np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) # Ensure appending to previous buffer a3 = cudf.Series(np.arange(3)) - with pytest.warns(FutureWarning): - gs = gs.append(a3) + gs = cudf.concat([gs, a3]) assert len(gs) == 18 a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) np.testing.assert_equal(gs.to_numpy(), a4) @@ -318,13 +307,11 @@ def test_series_append_existing_buffers(): # Appending different dtype a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) - with pytest.warns(FutureWarning): - gs = a5.append(a6) + gs = cudf.concat([a5, a6]) np.testing.assert_equal( gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) ) - with pytest.warns(FutureWarning): - gs = cudf.Series(a6).append(a5) + gs = cudf.concat([cudf.Series(a6), a5]) np.testing.assert_equal( gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) ) @@ -402,8 +389,8 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = _create_pandas_series_float64_default(data) - gsr = _create_cudf_series_float64_default(data) + psr = pd.Series(data) + gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) @@ -412,8 +399,7 @@ def test_series_size(data): def test_series_describe_numeric(dtype): ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) gs = cudf.from_pandas(ps) - with pytest.warns(FutureWarning): - actual = gs.describe() + actual = gs.describe() expected = ps.describe() assert_eq(expected, actual, check_dtype=True) @@ -430,9 +416,8 @@ def 
test_series_describe_datetime(dtype): # Treating datetimes as categoricals is deprecated in pandas and will # be removed in future. Future behavior is treating datetime as numeric. - expected = ps.describe(datetime_is_numeric=True) - with pytest.warns(FutureWarning): - actual = gs.describe() + expected = ps.describe() + actual = gs.describe() assert_eq(expected.astype("str"), actual) @@ -443,8 +428,7 @@ def test_series_describe_timedelta(dtype): gs = cudf.from_pandas(ps) expected = ps.describe() - with pytest.warns(FutureWarning): - actual = gs.describe() + actual = gs.describe() assert_eq(actual, expected.astype("str")) @@ -469,8 +453,7 @@ def test_series_describe_other_types(ps): gs = cudf.from_pandas(ps) expected = ps.describe() - with pytest.warns(FutureWarning): - actual = gs.describe() + actual = gs.describe() if len(ps) == 0: assert_eq(expected.fillna("a").astype("str"), actual.fillna("a")) @@ -478,29 +461,6 @@ def test_series_describe_other_types(ps): assert_eq(expected.astype("str"), actual) -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0]) -def test_series_factorize(data, na_sentinel): - gsr = _create_cudf_series_float64_default(data) - psr = gsr.to_pandas() - - with pytest.warns(FutureWarning): - expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel) - with pytest.warns(FutureWarning): - actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) - - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats.values, actual_cats.to_pandas().values) - - @pytest.mark.parametrize( "data", [ @@ -512,7 +472,7 @@ def test_series_factorize(data, na_sentinel): ) @pytest.mark.parametrize("use_na_sentinel", [True, False]) def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = _create_cudf_series_float64_default(data) + gsr = cudf.Series(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize( @@ -536,7 +496,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel): ) @pytest.mark.parametrize("sort", [True, False]) def test_series_factorize_sort(data, sort): - gsr = _create_cudf_series_float64_default(data) + gsr = cudf.Series(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize(sort=sort) @@ -1357,15 +1317,7 @@ def test_series_raises_float16(data): pd.RangeIndex(0, 3, 1), [3.0, 1.0, np.nan], ["a", "z", None], - pytest.param( - pd.RangeIndex(4, -1, -2), - marks=[ - pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43591", - ) - ], - ), + pd.RangeIndex(4, -1, -2), ], ) @pytest.mark.parametrize("axis", [0, "index"]) @@ -1473,7 +1425,7 @@ def test_nullable_bool_dtype_series(data, bool_dtype): @pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("original_name", [None, "original_ser"]) -@pytest.mark.parametrize("name", [None, "ser"]) +@pytest.mark.parametrize("name", [None, "ser", no_default]) @pytest.mark.parametrize("inplace", [True, False]) def test_reset_index(level, drop, inplace, original_name, name): midx = pd.MultiIndex.from_tuples( @@ -1488,10 +1440,8 @@ def test_reset_index(level, drop, inplace, original_name, name): "test_reset_index_dup_level_name_exceptions" ) - with expect_warning_if(name is None and not drop): - expect = ps.reset_index( - level=level, drop=drop, 
name=name, inplace=inplace - ) + expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) + got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) if inplace: expect = ps @@ -1516,10 +1466,7 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): "test_reset_index_dup_level_name_exceptions" ) - with expect_warning_if(name is None and not drop): - expect = ps.reset_index( - level=level, drop=drop, inplace=inplace, name=name - ) + expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) if inplace: expect = ps @@ -1545,8 +1492,7 @@ def test_reset_index_named(drop, inplace, original_name, name): "test_reset_index_dup_level_name_exceptions" ) - with expect_warning_if(name is None and not drop): - expect = ps.reset_index(drop=drop, inplace=inplace, name=name) + expect = ps.reset_index(drop=drop, inplace=inplace, name=name) got = gs.reset_index(drop=drop, inplace=inplace, name=name) if inplace: @@ -1708,7 +1654,7 @@ def test_series_nunique_index(data): ], ) def test_axes(data): - csr = _create_cudf_series_float64_default(data) + csr = cudf.Series(data) psr = csr.to_pandas() expected = psr.axes @@ -1786,7 +1732,7 @@ def test_series_truncate_datetimeindex(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = _create_pandas_series_float64_default(data, index=index) + psr = pd.Series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -1846,7 +1792,7 @@ def test_fill_new_category(): ], ) def test_isin_datetime(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1875,7 +1821,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1904,7 +1850,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -2125,7 +2071,7 @@ def test_series_to_dict(into): ], ) def test_series_hasnans(data): - gs = _create_cudf_series_float64_default(data, nan_as_null=False) + gs = cudf.Series(data, nan_as_null=False) ps = gs.to_pandas(nullable=True) # Check type to avoid mixing Python bool and NumPy bool @@ -2198,13 +2144,14 @@ def test_series_init_dict_with_index(data, index): "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] ) def test_series_init_scalar_with_index(data, index): - pandas_series = _create_pandas_series_float64_default(data, index=index) - cudf_series = _create_cudf_series_float64_default(data, index=index) + pandas_series = pd.Series(data, index=index) + cudf_series = cudf.Series(data, index=index) assert_eq( pandas_series, cudf_series, - check_index_type=False if data is None and index is None else True, + check_index_type=data is not None or index is not None, + check_dtype=data is not None, ) @@ -2348,15 +2295,12 @@ def test_series_round_builtin(data, digits): assert_eq(expected, actual) -def test_series_empty_warning(): - with pytest.warns(FutureWarning): - expected = pd.Series([]) - with pytest.warns(FutureWarning): - actual = cudf.Series([]) - assert_eq(expected, actual) +def test_series_empty_dtype(): + expected = 
pd.Series([]) + actual = cudf.Series([]) + assert_eq(expected, actual, check_dtype=True) -@pytest.mark.filterwarnings("ignore::FutureWarning") # tested above @pytest.mark.parametrize("data", [None, {}, []]) def test_series_empty_index_rangeindex(data): expected = cudf.RangeIndex(0) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 6e1e53fc869..de0826d61e9 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -1,12 +1,16 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.core._compat import PANDAS_GE_210 +from cudf.testing._utils import ( + assert_eq, + assert_exceptions_equal, + expect_warning_if, +) @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) @@ -226,22 +230,12 @@ def test_categorical_setitem_invalid(): ps = pd.Series([1, 2, 3], dtype="category") gs = cudf.Series([1, 2, 3], dtype="category") - if PANDAS_GE_150: - assert_exceptions_equal( - lfunc=ps.__setitem__, - rfunc=gs.__setitem__, - lfunc_args_and_kwargs=([0, 5], {}), - rfunc_args_and_kwargs=([0, 5], {}), - ) - else: - # Following workaround is needed because: - # https://github.com/pandas-dev/pandas/issues/46646 - with pytest.raises( - ValueError, - match="Cannot setitem on a Categorical with a new category, set " - "the categories first", - ): - gs[0] = 5 + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + ) def test_series_slice_setitem_list(): @@ -310,13 +304,12 @@ def test_series_setitem_upcasting(dtype, indices): # column dtype. 
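The setitem test below encodes pandas 2.1's deprecation of silent upcasting: assigning a value the column's dtype cannot hold losslessly now emits a FutureWarning before the column is upcast, which is what the expect_warning_if(PANDAS_GE_210 and dtype != np.float64) guards check. A hedged sketch, assuming pandas >= 2.1:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0], dtype="float32")
    # FutureWarning on pandas >= 2.1, then the column upcasts to float64;
    # older pandas upcast silently.
    s[0] = np.float64(np.pi)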
new_value = np.float64(np.pi) col_ref = cr._column - sr[indices] = new_value - cr[indices] = new_value - if PANDAS_GE_150: - assert_eq(sr, cr) - else: - # pandas bug, incorrectly fails to upcast from float32 to float64 - assert_eq(sr.values, cr.values) + with expect_warning_if(PANDAS_GE_210 and dtype != np.float64): + sr[indices] = new_value + with expect_warning_if(dtype != np.float64): + cr[indices] = new_value + assert_eq(sr, cr) + if dtype == np.float64: # no-op type cast should not modify backing column assert col_ref == cr._column diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 8152c1bc03c..f30c14373bf 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -346,7 +346,7 @@ def _check_scatter_by_map(dfs, col): with pytest.warns(UserWarning): df.scatter_by_map("a", map_size=1, debug=True) # Bad map_size - # Test GenericIndex + # Test Index df2 = df.set_index("c") generic_result = df2.scatter_by_map("b", map_size, keep_index=keep) _check_scatter_by_map(generic_result, df2["b"]) @@ -391,6 +391,6 @@ def test_dataframe_scatter_by_map_7513(ids): def test_dataframe_scatter_by_map_empty(): - df = DataFrame({"a": [], "b": []}) + df = DataFrame({"a": [], "b": []}, dtype="float64") scattered = df.scatter_by_map(df["a"]) assert len(scattered) == 0 diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 8ff4dc73c4c..b35dd28c4ec 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -8,10 +8,10 @@ import pytest import cudf +from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_GE_210 from cudf.datasets import randomdata from cudf.testing._utils import ( - _create_cudf_series_float64_default, - _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -221,8 +221,8 @@ def test_approx_quantiles_int(): ], ) def test_misc_quantiles(data, q): - pdf_series = _create_pandas_series_float64_default(data) - gdf_series = _create_cudf_series_float64_default(data) + pdf_series = pd.Series(data, dtype="float64" if len(data) == 0 else None) + gdf_series = cudf.from_pandas(pdf_series) expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) actual = gdf_series.quantile(q) @@ -232,37 +232,39 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), - cudf.Series(np.zeros(100)), - cudf.Series(np.repeat(np.nan, 100)), - cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), - cudf.Series( - [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False - ), - cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([], dtype="float64"), - cudf.Series([-3]), + {"data": np.random.normal(-100, 100, 1000)}, + {"data": np.random.randint(-50, 50, 1000)}, + {"data": (np.zeros(100))}, + {"data": np.repeat(np.nan, 100)}, + {"data": np.array([1.123, 2.343, np.nan, 0.0])}, + { + "data": [5, 10, 53, None, np.nan, None, 12, 43, -423], + "nan_as_null": False, + }, + {"data": [1.1032, 2.32, 43.4, 13, -312.0], "index": [0, 4, 3, 19, 6]}, + {"data": [], "dtype": "float64"}, + {"data": [-3]}, ], ) @pytest.mark.parametrize("null_flag", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) def test_kurtosis_series(data, null_flag, numeric_only): - pdata = data.to_pandas() + gs = cudf.Series(**data) + ps = gs.to_pandas() - if 
null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None + if null_flag and len(gs) > 2: + gs.iloc[[0, 2]] = None + ps.iloc[[0, 2]] = None - got = data.kurtosis(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurtosis(numeric_only=numeric_only) - np.testing.assert_array_almost_equal(got, expected) + got = gs.kurtosis(numeric_only=numeric_only) + expected = ps.kurtosis(numeric_only=numeric_only) - got = data.kurt(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=numeric_only) - np.testing.assert_array_almost_equal(got, expected) + assert_eq(got, expected) + + got = gs.kurt(numeric_only=numeric_only) + expected = ps.kurt(numeric_only=numeric_only) + + assert_eq(got, expected) @pytest.mark.parametrize("op", ["skew", "kurt"]) @@ -270,13 +272,12 @@ def test_kurt_skew_error(op): gs = cudf.Series(["ab", "cd"]) ps = gs.to_pandas() - with pytest.warns(FutureWarning): - assert_exceptions_equal( - getattr(gs, op), - getattr(ps, op), - lfunc_args_and_kwargs=([], {"numeric_only": True}), - rfunc_args_and_kwargs=([], {"numeric_only": True}), - ) + assert_exceptions_equal( + getattr(gs, op), + getattr(ps, op), + lfunc_args_and_kwargs=([], {"numeric_only": True}), + rfunc_args_and_kwargs=([], {"numeric_only": True}), + ) @pytest.mark.parametrize( @@ -306,8 +307,8 @@ def test_skew_series(data, null_flag, numeric_only): got = data.skew(numeric_only=numeric_only) expected = pdata.skew(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) + + assert_eq(got, expected) @pytest.mark.parametrize("dtype", params_dtypes) @@ -352,14 +353,31 @@ def test_series_median(dtype, num_na): ], ) @pytest.mark.parametrize("periods", range(-5, 5)) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) -def test_series_pct_change(data, periods, fill_method): +@pytest.mark.parametrize( + "fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None] +) +def test_series_pct_change(request, data, periods, fill_method): cs = cudf.Series(data) ps = cs.to_pandas() - + request.applymarker( + pytest.mark.xfail( + condition=( + len(cs) == 0 and periods == 0 and fill_method is no_default + ), + reason="https://github.com/pandas-dev/pandas/issues/57056", + ) + ) if np.abs(periods) <= len(cs): - got = cs.pct_change(periods=periods, fill_method=fill_method) - expected = ps.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if(fill_method not in (no_default, None)): + got = cs.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if( + PANDAS_GE_210 + and ( + fill_method not in (no_default, None) + or (fill_method is not None and ps.isna().any()) + ) + ): + expected = ps.pct_change(periods=periods, fill_method=fill_method) np.testing.assert_array_almost_equal( got.to_numpy(na_value=np.nan), expected ) @@ -464,7 +482,7 @@ def test_corr1d(data1, data2, method): # Spearman allows for size 1 samples, but will error if all data in a # sample is identical since the covariance is zero and so the correlation # coefficient is not defined. 
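# (editor's aside) Minimal sketch of the edge case described above, in plain
# pandas with illustrative values: a zero-variance sample leaves the
# correlation coefficient undefined, so NaN comes back for either method.
import pandas as pd

const = pd.Series([3.0, 3.0, 3.0])  # zero variance
vary = pd.Series([1.0, 2.0, 4.0])
print(const.corr(vary, method="pearson"))   # nan: `const` has zero variance
print(const.corr(vary, method="spearman"))  # nan: every rank in `const` is tied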
- cond = (is_singular and method == "pearson") or ( + cond = ((is_singular or is_identical) and method == "pearson") or ( is_identical and not is_singular and method == "spearman" ) if method == "spearman": @@ -522,14 +540,16 @@ def test_df_corr(method): ) @pytest.mark.parametrize("skipna", [True, False]) def test_nans_stats(data, ops, skipna): - psr = _create_pandas_series_float64_default(data) - gsr = _create_cudf_series_float64_default(data, nan_as_null=False) + psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) + gsr = cudf.from_pandas(psr) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - gsr = _create_cudf_series_float64_default(data, nan_as_null=False) + gsr = cudf.Series( + data, dtype="float64" if len(data) == 0 else None, nan_as_null=False + ) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only # testing for `skipna=True` when `nan_as_null=False` @@ -558,28 +578,28 @@ def test_min_count_ops(data, ops, skipna, min_count): @pytest.mark.parametrize( - "gsr", + "data1", [ - cudf.Series([1, 2, 3, 4], dtype="datetime64[ns]"), - cudf.Series([1, 2, 3, 4], dtype="timedelta64[ns]"), + [1, 2, 3, 4], + [10, 1, 3, 5], ], ) -def test_cov_corr_invalid_dtypes(gsr): - psr = gsr.to_pandas() +@pytest.mark.parametrize( + "data2", + [ + [1, 2, 3, 4], + [10, 1, 3, 5], + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_cov_corr_datetime_timedelta(data1, data2, dtype): + gsr1 = cudf.Series(data1, dtype=dtype) + gsr2 = cudf.Series(data2, dtype=dtype) + psr1 = gsr1.to_pandas() + psr2 = gsr2.to_pandas() - assert_exceptions_equal( - lfunc=psr.corr, - rfunc=gsr.corr, - lfunc_args_and_kwargs=([psr],), - rfunc_args_and_kwargs=([gsr],), - ) - - assert_exceptions_equal( - lfunc=psr.cov, - rfunc=gsr.cov, - lfunc_args_and_kwargs=([psr],), - rfunc_args_and_kwargs=([gsr],), - ) + assert_eq(psr1.corr(psr2), gsr1.corr(gsr2)) + assert_eq(psr1.cov(psr2), gsr1.cov(gsr2)) @pytest.mark.parametrize( @@ -591,30 +611,26 @@ def test_cov_corr_invalid_dtypes(gsr): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis_df(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_kurtosis_df(data, null_flag, numeric_only): + if not numeric_only: + data = data.select_dtypes(include="number") pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - with pytest.warns(FutureWarning): - got = data.kurtosis() + got = data.kurtosis(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - with pytest.warns(FutureWarning): - expected = pdata.kurtosis() - np.testing.assert_array_almost_equal(got, expected) - with pytest.warns(FutureWarning): - got = data.kurt() - got = got if np.isscalar(got) else got.to_numpy() - with pytest.warns(FutureWarning): - expected = pdata.kurt() + expected = pdata.kurtosis(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt(numeric_only=True) + got = data.kurt(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=True) + + expected = pdata.kurt(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) @@ -627,21 +643,17 @@ def test_kurtosis_df(data, null_flag): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew_df(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def 
test_skew_df(data, null_flag, numeric_only): + if not numeric_only: + data = data.select_dtypes(include="number") pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - with pytest.warns(FutureWarning): - got = data.skew() - with pytest.warns(FutureWarning): - expected = pdata.skew() - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) - - got = data.skew(numeric_only=True) - expected = pdata.skew(numeric_only=True) + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 4c5598b547e..b2bf687ba06 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -15,9 +15,8 @@ import cudf from cudf import concat -from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.string import StringColumn -from cudf.core.index import StringIndex, as_index +from cudf.core.index import Index, as_index from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -1074,8 +1073,7 @@ def test_string_index(): pdf.index = stringIndex gdf.index = stringIndex assert_eq(pdf, gdf) - with pytest.warns(FutureWarning): - stringIndex = StringIndex(["a", "b", "c", "d", "e"], name="name") + stringIndex = Index(["a", "b", "c", "d", "e"], name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) @@ -1722,13 +1720,7 @@ def test_strings_filling_tests(data, width, fillchar): ["A,,B", "1,,5", "3,00,0"], ["Linda van der Berg", "George Pitt-Rivers"], ["³", "⅕", ""], - pytest.param( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/20868", - ), - ), + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], [" ", "\t\r\n ", ""], ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], ], @@ -1864,7 +1856,11 @@ def test_string_count(data, pat, flags): ps.str.count(pat=pat, flags=flags), check_dtype=False, ) - assert_eq(as_index(gs).str.count(pat=pat), pd.Index(ps).str.count(pat=pat)) + assert_eq( + cudf.Index(gs).str.count(pat=pat), + pd.Index(ps).str.count(pat=pat), + exact=False, + ) @pytest.mark.parametrize( @@ -2230,7 +2226,11 @@ def test_string_str_rindex(data, sub, er): if er is None: assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) - assert_eq(pd.Index(ps).str.rindex(sub), as_index(gs).str.rindex(sub)) + assert_eq( + pd.Index(ps).str.rindex(sub), + as_index(gs).str.rindex(sub), + exact=False, + ) try: ps.str.rindex(sub) @@ -3477,3 +3477,21 @@ def test_str_iterate_error(): s = cudf.Series(["abc", "xyz"]) with pytest.raises(TypeError): iter(s.str) + + +def test_string_reduction_error(): + s = cudf.Series([None, None], dtype="str") + ps = s.to_pandas(nullable=True) + assert_exceptions_equal( + s.any, + ps.any, + lfunc_args_and_kwargs=([], {"skipna": False}), + rfunc_args_and_kwargs=([], {"skipna": False}), + ) + + assert_exceptions_equal( + s.all, + ps.all, + lfunc_args_and_kwargs=([], {"skipna": False}), + rfunc_args_and_kwargs=([], {"skipna": False}), + ) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 12f1ace7867..18fe1700e25 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -9,6 +9,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -702,31 +703,31 @@ def test_timedelta_dt_properties(data, dtype): gsr = cudf.Series(data, dtype=dtype) psr = gsr.to_pandas() - def local_assert(expected, actual): + def local_assert(expected, actual, **kwargs): if gsr.isnull().any(): - assert_eq(expected, actual.astype("float")) + assert_eq(expected, actual.astype("float"), **kwargs) else: - assert_eq(expected, actual) + assert_eq(expected, actual, **kwargs) expected_days = psr.dt.days actual_days = gsr.dt.days - local_assert(expected_days, actual_days) + local_assert(expected_days, actual_days, check_dtype=False) expected_seconds = psr.dt.seconds actual_seconds = gsr.dt.seconds - local_assert(expected_seconds, actual_seconds) + local_assert(expected_seconds, actual_seconds, check_dtype=False) expected_microseconds = psr.dt.microseconds actual_microseconds = gsr.dt.microseconds - local_assert(expected_microseconds, actual_microseconds) + local_assert(expected_microseconds, actual_microseconds, check_dtype=False) expected_nanoseconds = psr.dt.nanoseconds actual_nanoseconds = gsr.dt.nanoseconds - local_assert(expected_nanoseconds, actual_nanoseconds) + local_assert(expected_nanoseconds, actual_nanoseconds, check_dtype=False) @pytest.mark.parametrize( @@ -1323,7 +1324,11 @@ def test_numeric_to_timedelta(data, dtype, timedelta_dtype): psr = sr.to_pandas() actual = sr.astype(timedelta_dtype) - expected = pd.Series(psr.to_numpy().astype(timedelta_dtype)) + + if PANDAS_GE_200: + expected = psr.astype(timedelta_dtype) + else: + expected = pd.Series(psr.to_numpy().astype(timedelta_dtype)) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 95ea4544917..0e29d2bfdcc 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -183,7 +183,17 @@ def func(row): ) gdf["a"] = 
gdf["a"].astype(dtype_l) gdf["b"] = gdf["b"].astype(dtype_r) - run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) + + pdf = gdf.to_pandas() + expect = op(pdf["a"], pdf["b"]) + obtain = gdf.apply(func, axis=1) + assert_eq(expect, obtain, check_dtype=False) + # TODO: After the following pandas issue is + # fixed, uncomment the following line and delete + # through `to_pandas()` statement. + # https://github.com/pandas-dev/pandas/issues/52411 + + # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) @pytest.mark.parametrize("op", comparison_ops) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 5a7b8bae980..68447f423a4 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. """ Helper functions for parameterized docstring @@ -126,15 +126,6 @@ def wrapper(func): exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. - datetime_is_numeric : bool, default False - For DataFrame input, this also controls whether datetime columns - are included by default. - - .. deprecated:: 23.04 - - `datetime_is_numeric` is deprecated and will be removed in - a future version of cudf. - Returns ------- output_frame : Series or DataFrame diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index df363b72909..8fa4a230e2c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -172,7 +172,7 @@ def cudf_dtype_from_pydata_dtype(dtype): Python dtype. """ - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): return cudf.core.dtypes.CategoricalDtype elif cudf.api.types.is_decimal32_dtype(dtype): return cudf.core.dtypes.Decimal32Dtype @@ -262,6 +262,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if isinstance(val, pd.Timestamp): if val.tz is not None: raise NotImplementedError(tz_error_msg) + val = val.to_datetime64() elif isinstance(val, pd.Timedelta): val = val.to_timedelta64() @@ -415,9 +416,9 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): - if cudf.api.types.is_categorical_dtype(lhs.dtype): + if cudf.api.types._is_categorical_dtype(lhs.dtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) - elif cudf.api.types.is_categorical_dtype(rhs.dtype): + elif cudf.api.types._is_categorical_dtype(rhs.dtype): return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) return (lhs.dtype == "object" and rhs.dtype != "object") or ( @@ -517,10 +518,10 @@ def find_common_type(dtypes): # Early exit for categoricals since they're not hashable and therefore # can't be put in a set. 
- if any(cudf.api.types.is_categorical_dtype(dtype) for dtype in dtypes): + if any(cudf.api.types._is_categorical_dtype(dtype) for dtype in dtypes): if all( ( - cudf.api.types.is_categorical_dtype(dtype) + cudf.api.types._is_categorical_dtype(dtype) and (not dtype.ordered if hasattr(dtype, "ordered") else True) ) for dtype in dtypes @@ -601,7 +602,7 @@ def find_common_type(dtypes): dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) - common_dtype = np.find_common_type(list(dtypes), []) + common_dtype = np.result_type(*dtypes) if common_dtype == np.dtype("float16"): return cudf.dtype("float32") return cudf.dtype(common_dtype) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 57e657eb5c1..feb02bac60d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -15,6 +15,7 @@ from pyarrow import PythonFile as ArrowPythonFile from pyarrow.lib import NativeFile +from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial try: @@ -1666,6 +1667,8 @@ def get_reader_filepath_or_buffer( allow_raw_text_input=False, storage_options=None, bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, + warn_on_raw_text_input=None, + warn_meta=None, ): """{docstring}""" @@ -1679,6 +1682,18 @@ def get_reader_filepath_or_buffer( path_or_data, storage_options ) if fs is None: + if warn_on_raw_text_input: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " + "deprecated and will be removed in a future version. " + "To read from a literal string, wrap it in a " + "'StringIO' object.", + FutureWarning, + ) return path_or_data, compression if _is_local_filesystem(fs): @@ -1691,6 +1706,30 @@ def get_reader_filepath_or_buffer( raise FileNotFoundError( f"{path_or_data} could not be resolved to any files" ) + elif warn_on_raw_text_input: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " + "deprecated and will be removed in a future version. " + "To read from a literal string, wrap it in a " + "'StringIO' object.", + FutureWarning, + ) + elif warn_on_raw_text_input: + # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." + warnings.warn( + f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " + "deprecated and will be removed in a future version. 
" + "To read from a literal string, wrap it in a " + "'StringIO' object.", + FutureWarning, + ) else: if len(paths) == 0: diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index df4bed0be0a..546f8df95f3 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -269,26 +269,6 @@ def test_rename_categories(): tm.assert_series_equal(psr, sr) -def test_rename_categories_inplace(): - psr = pd.Series([1, 2, 3], dtype="category") - sr = xpd.Series([1, 2, 3], dtype="category") - with pytest.warns(FutureWarning): - psr.cat.rename_categories({1: 5}, inplace=True) - sr.cat.rename_categories({1: 5}, inplace=True) - tm.assert_series_equal(psr, sr) - - -def test_rename_categories_inplace_after_copying_parent(): - s = xpd.Series([1, 2, 3], dtype="category") - # cudf does not define "rename_categories", - # so this copies `s` from device to host: - rename_categories = s.cat.rename_categories - _ = len(s) # trigger a copy of `s` from host to device: - with pytest.warns(FutureWarning): - rename_categories([5, 2, 3], inplace=True) - assert s.cat.categories.tolist() == [5, 2, 3] - - def test_column_rename(dataframe): pdf, df = dataframe pdf.columns = ["x", "y"] @@ -663,8 +643,7 @@ def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) result = df.rolling(2, win_type="boxcar").mean() - with pytest.warns(DeprecationWarning): - expected = pdf.rolling(2, win_type="boxcar").mean() + expected = pdf.rolling(2, win_type="boxcar").mean() tm.assert_equal(result, expected) @@ -930,6 +909,9 @@ def test_resample(): ) expected = ser.resample("D").max() result = xser.resample("D").max() + # TODO: See if as_unit can be avoided + expected.index = expected.index.as_unit("s") + result.index = result.index.as_unit("s") tm.assert_series_equal(result, expected) @@ -1017,12 +999,6 @@ def __init__(self, myinput): xpd.PeriodIndex, xpd.MultiIndex, xpd.IntervalIndex, - xpd.UInt64Index, - xpd.Int64Index, - xpd.Float64Index, - xpd.core.indexes.numeric.UInt64Index, - xpd.core.indexes.numeric.Int64Index, - xpd.core.indexes.numeric.Float64Index, ], ) def test_index_subclass(index_type): @@ -1032,22 +1008,6 @@ def test_index_subclass(index_type): assert not issubclass(xpd.Index, index_type) -def test_index_internal_subclass(): - # test that proxy index types that are not related by inheritance - # still appear to be so if the underlying slow types are related - # by inheritance: - assert issubclass( - xpd.Int64Index, - xpd.core.indexes.numeric.NumericIndex, - ) == issubclass( - pd.Int64Index, - pd.core.indexes.numeric.NumericIndex, - ) - assert isinstance( - xpd.Index([1, 2, 3]), xpd.core.indexes.numeric.NumericIndex - ) == isinstance(pd.Index([1, 2, 3]), pd.core.indexes.numeric.NumericIndex) - - def test_np_array_of_timestamps(): expected = np.array([pd.Timestamp(1)]) + pd.tseries.offsets.MonthEnd() got = np.array([xpd.Timestamp(1)]) + xpd.tseries.offsets.MonthEnd() @@ -1080,7 +1040,7 @@ def test_np_array_of_timestamps(): # Other types xpd.tseries.offsets.BDay(5), xpd.Timestamp("2001-01-01"), - xpd.Timestamp("2001-01-01", freq="D"), + xpd.Timestamp("2001-01-01", tz="UTC"), xpd.Timedelta("1 days"), xpd.Timedelta(1, "D"), ], @@ -1214,15 +1174,6 @@ def test_read_sas_context(): assert isinstance(df, xpd.DataFrame) -@pytest.mark.parametrize( - "idx_obj", ["Float64Index", "Int64Index", "UInt64Index"] -) -def test_pandas_module_getattr_objects(idx_obj): - # Objects that are behind 
pandas.__getattr__ (version 1.5 specific) - idx = getattr(xpd, idx_obj)([1, 2, 3]) - assert isinstance(idx, xpd.Index) - - def test_concat_fast(): pytest.importorskip("cudf") diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 840baa58355..c666a6851d0 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.3", "ninja", - "numpy>=1.21,<1.25", + "numpy>=1.21", "protoc-wheel", "pyarrow==14.0.1.*", "rmm==24.4.*", @@ -30,10 +30,10 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numba>=0.57", - "numpy>=1.21,<1.25", + "numpy>=1.21", "nvtx>=0.2.1", "packaging", - "pandas>=1.3,<1.6.0dev0", + "pandas>=2.0,<2.1.5dev0", "protobuf>=4.21,<5", "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index dc3c27233cb..bffad703ed0 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.3", "ninja", - "numpy>=1.21,<1.25", + "numpy>=1.21", "pyarrow==14.0.1.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 1a1fc84ef89..bae4b051cae 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. """ Tests for Streamz Dataframes (SDFs) built on top of cuDF DataFrames. @@ -749,7 +749,7 @@ def on_old(self, state, new): def test_groupby_aggregate_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example).groupby(["name"]) output0 = sdf.amount.sum(start=None).stream.gather().sink_to_list() output1 = ( @@ -771,7 +771,7 @@ def test_groupby_aggregate_with_start_state(stream): assert assert_eq(output1[0][1].reset_index(), out_df1) assert assert_eq(output2[0].reset_index(), out_df2) - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example).groupby(["name"]) output3 = sdf.amount.sum(start=output0[0]).stream.gather().sink_to_list() output4 = ( @@ -817,7 +817,7 @@ def test_reductions_with_start_state(stream): def test_rolling_aggs_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output0 = ( sdf.rolling(2, with_state=True, start=()) @@ -863,7 +863,7 @@ def test_rolling_aggs_with_start_state(stream): def test_window_aggs_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output0 = ( sdf.window(2, with_state=True, start=None) @@ -881,7 +881,7 @@ def test_window_aggs_with_start_state(stream): assert output0[-1][1] == 450 stream = Stream() - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) 
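# (editor's aside) On the recurring dtype="float64" pins in these custreamz
# hunks: under the new pandas-2 pin an empty column defaults to object dtype,
# which would break the numeric aggregations these tests exercise. Assumed
# behavior, shown with illustrative names:
import cudf

print(cudf.DataFrame({"amount": []})["amount"].dtype)   # object (pandas-2 default)
print(cudf.DataFrame({"amount": []}, dtype="float64")["amount"].dtype)  # float64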
output1 = ( sdf.window(2, with_state=True, start=output0[-1][0]) @@ -895,7 +895,7 @@ def test_window_aggs_with_start_state(stream): def test_windowed_groupby_aggs_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output0 = ( sdf.window(5, with_state=True, start=None) @@ -915,7 +915,7 @@ def test_windowed_groupby_aggs_with_start_state(stream): stream.emit(df) stream = Stream() - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output1 = ( sdf.window(5, with_state=True, start=output0[-1][0]) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 6e007b5cb87..454cce76ff2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -61,8 +61,6 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.StringIndex): - return cudf.StringIndex(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1 @@ -73,16 +71,18 @@ def _nonempty_index(idx): categories=categories, codes=codes, ordered=ordered ) return cudf.core.index.CategoricalIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.GenericIndex): - return cudf.core.index.GenericIndex( - np.arange(2, dtype=idx.dtype), name=idx.name - ) elif isinstance(idx, cudf.core.multiindex.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] codes = [[0, 0] for i in idx.levels] return cudf.core.multiindex.MultiIndex( levels=levels, codes=codes, names=idx.names ) + elif isinstance(idx._column, cudf.core.column.StringColumn): + return cudf.Index(["cat", "dog"], name=idx.name) + elif isinstance(idx, cudf.core.index.Index): + return cudf.core.index.Index( + np.arange(2, dtype=idx.dtype), name=idx.name + ) raise TypeError(f"Don't know how to handle index of type {type(idx)}") @@ -312,7 +312,7 @@ def tolist_cudf(obj): ) @_dask_cudf_nvtx_annotate def is_categorical_dtype_cudf(obj): - return cudf.api.types.is_categorical_dtype(obj) + return cudf.api.types._is_categorical_dtype(obj) @grouper_dispatch.register((cudf.Series, cudf.DataFrame)) @@ -333,7 +333,7 @@ def percentile_cudf(a, q, interpolation="linear"): if isinstance(q, Iterator): q = list(q) - if cudf.api.types.is_categorical_dtype(a.dtype): + if cudf.api.types._is_categorical_dtype(a.dtype): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index c2b2428bf14..b051b21790e 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -281,9 +281,12 @@ def var( dtype=None, out=None, naive=False, + numeric_only=False, ): axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) + meta = self._meta_nonempty.var( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) if axis == 1: result = map_partitions( M.var, @@ -293,6 +296,7 @@ def var( axis=axis, skipna=skipna, ddof=ddof, + numeric_only=numeric_only, ) return handle_out(out, result) elif naive: diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py 
b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 5f1aa98e888..a35a9f1be48 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -226,7 +226,19 @@ def test_read_csv_skiprows_error(csv_begin_bad_lines): def test_read_csv_skipfooter(csv_end_bad_lines): # Repro from Issue#13552 + with dask.config.set({"dataframe.convert-string": False}): + ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute() + ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute() + dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False) + + +def test_read_csv_skipfooter_arrow_string_fail(request, csv_end_bad_lines): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/14915", + ) + ) ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute() ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute() diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index fddbfb16e27..5e06832ed94 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import os @@ -80,7 +80,9 @@ def test_read_json_nested(tmp_path): } ) kwargs = dict(orient="records", lines=True) - with tmp_path / "data.json" as f: + with tmp_path / "data.json" as f, dask.config.set( + {"dataframe.convert-string": False} + ): df.to_json(f, **kwargs) # Ensure engine='cudf' is tested. actual = dask_cudf.read_json(f, engine="cudf", **kwargs) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 7b4e20012f7..583d4b07f6f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
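# (editor's aside) The csv test above moves the expected failure to a runtime
# xfail via request.applymarker, which, unlike the decorator form, can consult
# fixtures and parametrize values while the test runs. A minimal, illustrative
# sketch:
import pytest

def test_xfail_sketch(request):
    request.applymarker(
        pytest.mark.xfail(reason="tracked upstream", strict=False)
    )
    assert False  # reported as XFAIL rather than as a failure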
import glob import math @@ -13,6 +13,7 @@ from dask.utils import natural_sort_key import cudf +from cudf.core._compat import PANDAS_GE_200 import dask_cudf @@ -165,7 +166,9 @@ def test_dask_timeseries_from_pandas(tmpdir): pdf = ddf2.compute() pdf.to_parquet(fn, engine="pyarrow") read_df = dask_cudf.read_parquet(fn) - dd.assert_eq(ddf2, read_df.compute()) + # Workaround until following issue is fixed: + # https://github.com/apache/arrow/issues/33321 + dd.assert_eq(ddf2, read_df.compute(), check_index_type=not PANDAS_GE_200) @pytest.mark.parametrize("index", [False, None]) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 2e71202f151..d01ada92e33 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -17,7 +17,7 @@ from dask.utils import M import cudf as gd -from cudf.api.types import is_categorical_dtype +from cudf.api.types import _is_categorical_dtype from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported @@ -230,7 +230,7 @@ def quantile_divisions(df, by, npartitions): if ( len(columns) == 1 and df[columns[0]].dtype != "object" - and not is_categorical_dtype(df[columns[0]].dtype) + and not _is_categorical_dtype(df[columns[0]].dtype) ): dtype = df[columns[0]].dtype divisions = divisions[columns[0]].astype("int64") diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3a54672c1d3..a6a457d98a4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -5,6 +5,7 @@ import pytest from pandas.testing import assert_series_equal +import dask from dask import dataframe as dd from cudf import DataFrame, Series, date_range @@ -53,8 +54,8 @@ def test_dt_series(data, field): sr = Series(pdsr) dsr = dgd.from_cudf(sr, npartitions=5) base = getattr(pdsr.dt, field) - test = getattr(dsr.dt, field).compute().to_pandas().astype("int64") - assert_series_equal(base, test) + test = getattr(dsr.dt, field).compute() + assert_eq(base, test, check_dtype=False) @pytest.mark.parametrize("data", [data_dt_1()]) @@ -137,30 +138,30 @@ def test_categorical_basic(data): 4 a """ assert all(x == y for x, y in zip(string.split(), expect_str.split())) + with dask.config.set({"dataframe.convert-string": False}): + df = DataFrame() + df["a"] = ["xyz", "abc", "def"] * 10 - df = DataFrame() - df["a"] = ["xyz", "abc", "def"] * 10 + pdf = df.to_pandas() + cddf = dgd.from_cudf(df, 1) + cddf["b"] = cddf["a"].astype("category") - pdf = df.to_pandas() - cddf = dgd.from_cudf(df, 1) - cddf["b"] = cddf["a"].astype("category") - - ddf = dd.from_pandas(pdf, 1) - ddf["b"] = ddf["a"].astype("category") + ddf = dd.from_pandas(pdf, 1) + ddf["b"] = ddf["a"].astype("category") - assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"]) + assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"]) - with pytest.raises(NotImplementedError): - cddf["b"].cat.categories + with pytest.raises(NotImplementedError): + cddf["b"].cat.categories - with pytest.raises(NotImplementedError): - ddf["b"].cat.categories + with pytest.raises(NotImplementedError): + ddf["b"].cat.categories - cddf = cddf.categorize() - ddf = ddf.categorize() + cddf = cddf.categorize() + ddf = ddf.categorize() - assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories) - assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered) + assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories) + 
assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered) @pytest.mark.parametrize("data", [data_cat_1()]) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5b11b337f21..afe2a050695 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -782,14 +782,16 @@ def test_dataframe_set_index(): df["str"] = list("abcdefghijklmnopqrstuvwxyz") pdf = df.to_pandas() - ddf = dgd.from_cudf(df, npartitions=4) - ddf = ddf.set_index("str") + with dask.config.set({"dataframe.convert-string": False}): + ddf = dgd.from_cudf(df, npartitions=4) + ddf = ddf.set_index("str") - pddf = dd.from_pandas(pdf, npartitions=4) - pddf = pddf.set_index("str") - from cudf.testing._utils import assert_eq + pddf = dd.from_pandas(pdf, npartitions=4) + pddf = pddf.set_index("str") + + from cudf.testing._utils import assert_eq - assert_eq(ddf.compute(), pddf.compute()) + assert_eq(ddf.compute(), pddf.compute()) def test_series_describe(): @@ -803,7 +805,7 @@ def test_series_describe(): dd.assert_eq( dsr.describe(), pdsr.describe(), - check_less_precise=3, + rtol=1e-3, ) @@ -832,7 +834,7 @@ def test_zero_std_describe(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + dd.assert_eq(ddf.describe(), pddf.describe(), rtol=1e-3) def test_large_numbers_var(): @@ -847,7 +849,7 @@ def test_large_numbers_var(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.var(), pddf.var(), check_less_precise=3) + dd.assert_eq(ddf.var(), pddf.var(), rtol=1e-3) def test_index_map_partitions(): diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index c64e25fd437..76703206726 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. + +from datetime import datetime import numpy as np import pandas as pd @@ -82,6 +84,16 @@ def test_deterministic_tokenize(index): assert tokenize(df2) == tokenize(df2) +def test_deterministic_tokenize_multiindex(): + dt = datetime.strptime("1995-03-15", "%Y-%m-%d") + index = cudf.MultiIndex( + levels=[[1, 2], [dt]], + codes=[[0, 1], [0, 0]], + ) + df = cudf.DataFrame(index=index) + assert tokenize(df) == tokenize(df) + + @pytest.mark.parametrize("preserve_index", [True, False]) def test_pyarrow_schema_dispatch(preserve_index): from dask.dataframe.dispatch import ( diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 0dc57d8df55..c8cc6e65fa5 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -610,7 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): if as_index: # Groupby columns became the index. # Sorting the index should not change anything. - dd.assert_eq(gf.index, gf.sort_index().index) + dd.assert_eq(gf.index.to_frame(), gf.sort_index().index.to_frame()) else: # Groupby columns are did NOT become the index. # Sorting by these columns should not change anything. 
diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
index e966e58f46e..e347e8be9e4 100644
--- a/python/dask_cudf/dask_cudf/tests/test_reductions.py
+++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import pytest

+import dask
 from dask import dataframe as dd

 import cudf
@@ -69,11 +70,17 @@ def test_rowwise_reductions(data, op):
     gddf = dgd.from_cudf(data, npartitions=10)
     pddf = gddf.to_dask_dataframe()

-    if op in ("var", "std"):
-        expected = getattr(pddf, op)(axis=1, ddof=0)
-        got = getattr(gddf, op)(axis=1, ddof=0)
-    else:
-        expected = getattr(pddf, op)(axis=1)
-        got = getattr(pddf, op)(axis=1)
-
-    dd.assert_eq(expected.compute(), got.compute(), check_exact=False)
+    with dask.config.set({"dataframe.convert-string": False}):
+        if op in ("var", "std"):
+            expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0)
+            got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0)
+        else:
+            expected = getattr(pddf, op)(numeric_only=True, axis=1)
+            got = getattr(gddf, op)(numeric_only=True, axis=1)
+
+        dd.assert_eq(
+            expected,
+            got,
+            check_exact=False,
+            check_dtype=op not in ("var", "std"),
+        )
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index cac8d534e68..b46b6cc1362 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -21,8 +21,8 @@ dependencies = [
     "cudf==24.4.*",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numpy>=1.21,<1.25",
-    "pandas>=1.3,<1.6.0dev0",
+    "numpy>=1.21",
+    "pandas>=2.0,<2.1.5dev0",
     "rapids-dask-dependency==24.4.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [