diff --git a/build.sh b/build.sh index 8b3add1dddd..765a1b5325f 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* REPODIR=$(cd $(dirname $0); pwd) VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" -HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"\\\"] +HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only @@ -32,7 +32,6 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk -v - verbose build mode -g - build for debug -n - no install step - -l - build legacy tests --allgpuarch - build for all supported GPU architectures --disable_nvtx - disable inserting NVTX profiling ranges --show_depr_warn - show cmake deprecation warnings diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 178bdab0154..62eeb4d131b 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. ######################################### # cuDF GPU build and test script for CI # ######################################### @@ -98,11 +98,7 @@ conda list --show-channel-urls ################################################################################ logger "Build libcudf..." -if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds -else - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds -fi +"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds ################################################################################ # BENCHMARK - Run and parse libcudf and cuDF benchmarks diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index e6ef72d930c..f2f67e9e000 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -23,25 +23,13 @@ if [ -z "$MY_UPLOAD_KEY" ]; then return 0 fi -################################################################################ -# SETUP - Get conda file output locations -################################################################################ - -gpuci_logger "Get conda file output locations" - -export LIBCUDF_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf --output` -export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf_kafka --output` -export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output` -export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output` -export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output` -export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output` - ################################################################################ # UPLOAD - Conda packages ################################################################################ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then + export LIBCUDF_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf --output) test -e ${LIBCUDF_FILE} echo "Upload libcudf" echo ${LIBCUDF_FILE} @@ -49,16 +37,19 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then + export CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf --python=$PYTHON --output) test -e ${CUDF_FILE} echo "Upload cudf" echo ${CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE} --no-progress + export DASK_CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/dask-cudf --python=$PYTHON --output) test -e ${DASK_CUDF_FILE} echo "Upload dask-cudf" echo ${DASK_CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE} --no-progress + export CUSTREAMZ_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/custreamz --python=$PYTHON --output) test -e ${CUSTREAMZ_FILE} echo "Upload custreamz" echo ${CUSTREAMZ_FILE} @@ -66,6 +57,7 @@ if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then fi if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then + export LIBCUDF_KAFKA_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf_kafka --output) test -e ${LIBCUDF_KAFKA_FILE} echo "Upload libcudf_kafka" echo ${LIBCUDF_KAFKA_FILE} @@ -73,6 +65,7 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF_KAFKA" == "1" ]]; then + export CUDF_KAFKA_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf_kafka --python=$PYTHON --output) test -e ${CUDF_KAFKA_FILE} echo "Upload cudf_kafka" echo ${CUDF_KAFKA_FILE} diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 4acdc372817..a79ffa0fc47 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -128,11 +128,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds - else - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds - fi + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds ################################################################################ # TEST - Run GoogleTest @@ -226,11 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds - else - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds - fi + "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds fi diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 2e5c6ec8ca8..11eef015364 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -260,7 +260,7 @@ ConfigureBench( string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp - string/copy.cpp + string/copy.cu string/extract.cpp string/factory.cu string/filter.cpp diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 09c011cada1..b13835c15bb 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -135,12 +135,14 @@ static void BM_multibyte_split(benchmark::State& state) default: CUDF_FAIL(); } + auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::io::text::multibyte_split(*source, delim); } state.SetBytesProcessed(state.iterations() * device_input.size()); + state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); } class MultibyteSplitBenchmark : public cudf::benchmark { diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cu similarity index 83% rename from cpp/benchmarks/string/copy.cpp rename to cpp/benchmarks/string/copy.cu index d40b0e069bc..2f064e71c44 100644 --- a/cpp/benchmarks/string/copy.cpp +++ b/cpp/benchmarks/string/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,8 @@ * limitations under the License. */ -#include +#include "string_bench_args.hpp" + #include #include #include @@ -23,10 +24,7 @@ #include #include -#include -#include - -#include "string_bench_args.hpp" +#include class StringCopy : public cudf::benchmark { }; @@ -47,11 +45,14 @@ static void BM_copy(benchmark::State& state, copy_type ct) create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); // scatter indices - std::vector host_map_data(n_rows); - std::iota(host_map_data.begin(), host_map_data.end(), 0); - std::random_shuffle(host_map_data.begin(), host_map_data.end()); - cudf::test::fixed_width_column_wrapper index_map(host_map_data.begin(), - host_map_data.end()); + auto index_map_col = make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); + auto index_map = index_map_col->mutable_view(); + thrust::shuffle_copy(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(n_rows), + index_map.begin(), + thrust::default_random_engine()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 325f023f283..ba15e37f9ea 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,13 @@ #pragma once #include +#include +#include #include +#include +#include +#include #include /** @@ -375,6 +380,43 @@ class column_view : public detail::column_view_base { */ auto child_end() const noexcept { return _children.cend(); } + /** + * @brief Construct a column view from a device_span. + * + * Only numeric and chrono types are supported. + * + * @tparam T The device span type. Must be const and match the column view's type. + * @param data A typed device span containing the column view's data. + */ + template () or cudf::is_chrono())> + column_view(device_span data) + : column_view( + cudf::data_type{cudf::type_to_id()}, data.size(), data.data(), nullptr, 0, 0, {}) + { + CUDF_EXPECTS(data.size() < std::numeric_limits::max(), + "Data exceeds the maximum size of a column view."); + } + + /** + * @brief Converts a column view into a device span. + * + * Only numeric and chrono data types are supported. The column view must not + * be nullable. + * + * @tparam T The device span type. Must be const and match the column view's type. + * @throws cudf::logic_error if the column view type does not match the span type. + * @throws cudf::logic_error if the column view is nullable. + * @return A typed device span of the column view's data. + */ + template () or cudf::is_chrono())> + [[nodiscard]] operator device_span() const + { + CUDF_EXPECTS(type() == cudf::data_type{cudf::type_to_id()}, + "Device span type must match column view type."); + CUDF_EXPECTS(!nullable(), "A nullable column view cannot be converted to a device span."); + return device_span(data(), size()); + } + private: friend column_view bit_cast(column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index c919d814700..6cf1acd2f5a 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -130,7 +131,7 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - [[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs) + constexpr result_type hash_combine(result_type lhs, result_type rhs) const { result_type combined{lhs}; diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index cce05042917..e4e94074fb8 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include -#include namespace cudf { /** @@ -26,6 +25,22 @@ namespace cudf { * @file */ +/** + * @brief Identifies the hash function to be used + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3, ///< Murmur3 hash function + HASH_MD5, ///< MD5 hash function + HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function + HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function +}; + +/** + * @brief The default seed value for hash functions + */ +static constexpr uint32_t DEFAULT_HASH_SEED = 0; + /** * @brief Computes the hash value of each row in the input set of columns. * diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 6b1ad7db08b..3ffd9a87d39 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6222b2e680e..76e2589a5a9 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -326,21 +326,5 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh */ std::size_t size_of(data_type t); -/** - * @brief Identifies the hash function to be used - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_MD5, ///< MD5 hash function - HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function - HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function -}; - -/** - * @brief The default seed value for hash functions - */ -static constexpr uint32_t DEFAULT_HASH_SEED = 0; - /** @} */ } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9b4ed01605..f96edd3ce5a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -48,8 +48,13 @@ endfunction() # ################################################################################################## # * column tests ---------------------------------------------------------------------------------- ConfigureTest( - COLUMN_TEST column/bit_cast_test.cpp column/column_view_shallow_test.cpp column/column_test.cu - column/column_device_view_test.cu column/compound_test.cu + COLUMN_TEST + column/bit_cast_test.cpp + column/column_device_view_test.cu + column/column_test.cu + column/column_view_device_span_test.cpp + column/column_view_shallow_test.cpp + column/compound_test.cu ) # ################################################################################################## diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp new file mode 100644 index 00000000000..2b7ea3b3650 --- /dev/null +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template () or cudf::is_chrono())> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template +struct ColumnViewDeviceSpanTests : public cudf::test::BaseFixture { +}; + +using DeviceSpanTypes = cudf::test::FixedWidthTypesWithoutFixedPoint; +TYPED_TEST_SUITE(ColumnViewDeviceSpanTests, DeviceSpanTypes); + +TYPED_TEST(ColumnViewDeviceSpanTests, conversion_round_trip) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + + // Test implicit conversion, round trip + cudf::device_span device_span_from_col_view = col_view; + cudf::column_view col_view_from_device_span = device_span_from_col_view; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_view, col_view_from_device_span); +} + +struct ColumnViewDeviceSpanErrorTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnViewDeviceSpanErrorTests, type_mismatch) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} + +TEST_F(ColumnViewDeviceSpanErrorTests, nullable_column) +{ + auto col = example_column(); + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_NULL), col->size()); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index d1fa787e000..27a8be95e9b 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -141,3 +141,5 @@ TEST_F(MultibyteSplitTest, HandpickedInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); } + +CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index fd9992152a6..41d10b7b6da 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,18 +1,25 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + + ctypedef enum hash_id "cudf::hash_id": + HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" + HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" + HASH_MD5 "cudf::hash_id::HASH_MD5" + HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" + HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" + cdef unique_ptr[column] hash "cudf::hash" ( const table_view& input, - const libcudf_types.hash_id hash_function, + const hash_id hash_function, const uint32_t seed ) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 23727a20ec2..b1a257feedf 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -81,13 +81,6 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: DECIMAL64 "cudf::type_id::DECIMAL64" DECIMAL128 "cudf::type_id::DECIMAL128" - ctypedef enum hash_id "cudf::hash_id": - HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" - HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" - HASH_MD5 "cudf::hash_id::HASH_MD5" - HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" - HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" - cdef cppclass data_type: data_type() except + data_type(const data_type&) except + diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index adc48159aac..301f571f5fb 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp cimport bool @@ -10,7 +10,7 @@ from libcpp.vector cimport vector cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport hash as cpp_hash +from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -58,11 +58,11 @@ def hash(source_table, str method, int seed=0): cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) cdef unique_ptr[column] c_result - cdef libcudf_types.hash_id c_hash_function + cdef cpp_hash_id c_hash_function if method == "murmur3": - c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + c_hash_function = cpp_hash_id.HASH_MURMUR3 elif method == "md5": - c_hash_function = libcudf_types.hash_id.HASH_MD5 + c_hash_function = cpp_hash_id.HASH_MD5 else: raise ValueError(f"Unsupported hash function: {method}") with nogil: diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c96d940c378..7a9a17631a9 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1347,6 +1347,16 @@ def isin(self, values): array([ True, False, False]) """ + # To match pandas behavior, even though only list-like objects are + # supposed to be passed, only scalars throw errors. Other types (like + # dicts) just transparently return False (see the implementation of + # ColumnBase.isin). + if is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a {type(values).__name__}" + ) + return self._values.isin(values).values @classmethod diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a4de6db9bda..2596f90c59b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -43,6 +43,7 @@ from cudf.core import column, df_protocol, reshape from cudf.core.abc import Serializable from cudf.core.column import ( + CategoricalColumn, as_column, build_categorical_column, build_column, @@ -5169,82 +5170,81 @@ def isin(self, values): falcon True True dog False False """ - if isinstance(values, dict): - - result_df = DataFrame() - - for col in self._data.names: - if col in values: - val = values[col] - result_df[col] = self._data[col].isin(val) - else: - result_df[col] = column.full( - size=len(self), fill_value=False, dtype="bool" - ) - - result_df.index = self.index - return result_df - elif isinstance(values, Series): + # TODO: propagate nulls through isin + # https://github.com/rapidsai/cudf/issues/7556 + + fill_value = cudf.Scalar(False) + + def make_false_column_like_self(): + return column.full(len(self), fill_value, "bool") + + # Preprocess different input types into a mapping from column names to + # a list of values to check. + result = {} + if isinstance(values, IndexedFrame): + # Note: In the case where values is a Series, computing some + # information about the values column outside the loop may result + # in performance gains. However, since categorical conversion + # depends on the current column in the loop, using the correct + # precomputed variables inside the loop requires nontrivial logic. + # This optimization could be attempted if `isin` ever becomes a + # bottleneck. values = values.reindex(self.index) + other_cols = ( + values._data + if isinstance(values, DataFrame) + else {name: values._column for name in self._data} + ) + for col, self_col in self._data.items(): + if col in other_cols: + other_col = other_cols[col] + self_is_cat = isinstance(self_col, CategoricalColumn) + other_is_cat = isinstance(other_col, CategoricalColumn) + + if self_is_cat != other_is_cat: + # It is valid to compare the levels of a categorical + # column to a non-categorical column. + if self_is_cat: + self_col = self_col._get_decategorized_column() + else: + other_col = other_col._get_decategorized_column() - result = DataFrame() - # TODO: propagate nulls through isin - # https://github.com/rapidsai/cudf/issues/7556 - for col in self._data.names: - if isinstance( - self[col]._column, cudf.core.column.CategoricalColumn - ) and isinstance( - values._column, cudf.core.column.CategoricalColumn - ): - res = (self._data[col] == values._column).fillna(False) - result[col] = res - elif ( - isinstance( - self[col]._column, cudf.core.column.CategoricalColumn - ) - or np.issubdtype(self[col].dtype, cudf.dtype("object")) - ) or ( - isinstance( - values._column, cudf.core.column.CategoricalColumn - ) - or np.issubdtype(values.dtype, cudf.dtype("object")) - ): - result[col] = utils.scalar_broadcast_to(False, len(self)) + # We use the type checks from _before_ the conversion + # because if only one was categorical then it's already + # been converted and we have to check if they're strings. + if self_is_cat and other_is_cat: + self_is_str = other_is_str = False + else: + # These checks must happen after the conversions above + # since numpy can't handle categorical dtypes. + self_is_str = is_string_dtype(self_col.dtype) + other_is_str = is_string_dtype(other_col.dtype) + + if self_is_str != other_is_str: + # Strings can't compare to anything else. + result[col] = make_false_column_like_self() + else: + result[col] = (self_col == other_col).fillna(False) else: - result[col] = (self._data[col] == values._column).fillna( - False - ) - - result.index = self.index - return result - elif isinstance(values, DataFrame): - values = values.reindex(self.index) - - result = DataFrame() - for col in self._data.names: - if col in values.columns: - result[col] = ( - self._data[col] == values[col]._column - ).fillna(False) + result[col] = make_false_column_like_self() + elif is_dict_like(values): + for name, col in self._data.items(): + if name in values: + result[name] = col.isin(values[name]) else: - result[col] = utils.scalar_broadcast_to(False, len(self)) - result.index = self.index - return result + result[name] = make_false_column_like_self() + elif is_list_like(values): + for name, col in self._data.items(): + result[name] = col.isin(values) else: - if not is_list_like(values): - raise TypeError( - f"only list-like or dict-like objects are " - f"allowed to be passed to DataFrame.isin(), " - f"you passed a " - f"'{type(values).__name__}'" - ) - - result_df = DataFrame() + raise TypeError( + "only list-like or dict-like objects are " + "allowed to be passed to DataFrame.isin(), " + "you passed a " + f"'{type(values).__name__}'" + ) - for col in self._data.names: - result_df[col] = self._data[col].isin(values) - result_df.index = self.index - return result_df + return DataFrame._from_data(result, self.index) # # Stats diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 90ae7274a3f..8574a152c44 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2658,14 +2658,17 @@ def isin(self, values): dtype: bool """ + # Even though only list-like objects are supposed to be passed, only + # scalars throw errors. Other types (like dicts) just transparently + # return False (see the implementation of ColumnBase.isin). if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " f"to isin(), you passed a [{type(values).__name__}]" ) - return Series( - self._column.isin(values), index=self.index, name=self.name + return Series._from_data( + {self.name: self._column.isin(values)}, index=self.index ) def unique(self): diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index f1aad1af9e6..e4b4d5020ea 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,6 +1,8 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. import operator +import warnings +from contextlib import contextmanager from functools import reduce import cupy as cp @@ -17,6 +19,37 @@ ] +@contextmanager +def _hide_ufunc_warnings(ufunc): + # pandas raises warnings for some inputs to the following ufuncs: + name = ufunc.__name__ + if name in { + "arccos", + "arccosh", + "arcsin", + "arctanh", + "fmod", + "log", + "log10", + "log2", + "reciprocal", + }: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + f"invalid value encountered in {name}", + category=RuntimeWarning, + ) + warnings.filterwarnings( + "ignore", + f"divide by zero encountered in {name}", + category=RuntimeWarning, + ) + yield + else: + yield + + @pytest.mark.parametrize("ufunc", _UFUNCS) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @@ -76,7 +109,8 @@ def test_ufunc_series(ufunc, has_nulls, indexed): pytest.xfail(reason="Operation not supported by cupy") raise - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) try: if ufunc.nout > 1: @@ -256,7 +290,8 @@ def test_ufunc_dataframe(ufunc, has_nulls, indexed): pytest.xfail(reason="Operation not supported by cupy") raise - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) try: if ufunc.nout > 1: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f765c614907..acd9e28c661 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4250,272 +4250,6 @@ def test_value_counts(): ) -@pytest.mark.parametrize( - "data", - [ - [], - [0, 12, 14], - [0, 14, 12, 12, 3, 10, 12, 14], - np.random.randint(-100, 100, 200), - pd.Series([0.0, 1.0, None, 10.0]), - [None, None, None, None], - [np.nan, None, -1, 2, 3], - ], -) -@pytest.mark.parametrize( - "values", - [ - np.random.randint(-100, 100, 10), - [], - [np.nan, None, -1, 2, 3], - [1.0, 12.0, None, None, 120], - [0, 14, 12, 12, 3, 10, 12, 14, None], - [None, None, None], - ["0", "12", "14"], - ["0", "12", "14", "a"], - ], -) -def test_isin_numeric(data, values): - index = np.random.randint(0, 100, len(data)) - psr = cudf.utils.utils._create_pandas_series(data=data, index=index) - gsr = cudf.Series.from_pandas(psr, nan_as_null=False) - - expected = psr.isin(values) - got = gsr.isin(values) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series( - ["2018-01-01", "2019-04-03", None, "2019-12-30"], - dtype="datetime64[ns]", - ), - pd.Series( - [ - "2018-01-01", - "2019-04-03", - None, - "2019-12-30", - "2018-01-01", - "2018-01-01", - ], - dtype="datetime64[ns]", - ), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - [1514764800000000000, 1577664000000000000], - [ - 1514764800000000000, - 1577664000000000000, - 1577664000000000000, - 1577664000000000000, - 1514764800000000000, - ], - ["2019-04-03", "2019-12-30", "2012-01-01"], - [ - "2012-01-01", - "2012-01-01", - "2012-01-01", - "2019-04-03", - "2019-12-30", - "2012-01-01", - ], - ], -) -def test_isin_datetime(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series(["this", "is", None, "a", "test"]), - pd.Series(["test", "this", "test", "is", None, "test", "a", "test"]), - pd.Series(["0", "12", "14"]), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["this", "is"], - [None, None, None], - ["12", "14", "19"], - pytest.param( - [12, 14, 19], - marks=pytest.mark.xfail( - not PANDAS_GE_120, - reason="pandas's failure here seems like a bug(in < 1.2) " - "given the reverse succeeds", - ), - ), - ["is", "this", "is", "this", "is"], - ], -) -def test_isin_string(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series(["a", "b", "c", "c", "c", "d", "e"], dtype="category"), - pd.Series(["a", "b", None, "c", "d", "e"], dtype="category"), - pd.Series([0, 3, 10, 12], dtype="category"), - pd.Series([0, 3, 10, 12, 0, 10, 3, 0, 0, 3, 3], dtype="category"), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["a", "b", None, "f", "words"], - ["0", "12", None, "14"], - [0, 10, 12, None, 39, 40, 1000], - [0, 0, 0, 0, 3, 3, 3, None, 1, 2, 3], - ], -) -def test_isin_categorical(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series( - ["this", "is", None, "a", "test"], index=["a", "b", "c", "d", "e"] - ), - pd.Series([0, 15, 10], index=[0, None, 9]), - pd.Series( - range(25), - index=pd.date_range( - start="2019-01-01", end="2019-01-02", freq="H" - ), - ), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["this", "is"], - [0, 19, 13], - ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02"], - ], -) -def test_isin_index(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.index.isin(values) - expected = psr.index.isin(values) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - pd.MultiIndex.from_arrays( - [[1, 2, 3], ["red", "blue", "green"]], names=("number", "color") - ), - pd.MultiIndex.from_arrays([[], []], names=("number", "color")), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 10, 100], ["red", "blue", "green", "pink", "white"]], - names=("number", "color"), - ), - ], -) -@pytest.mark.parametrize( - "values,level,err", - [ - (["red", "orange", "yellow"], "color", None), - (["red", "white", "yellow"], "color", None), - ([0, 1, 2, 10, 11, 15], "number", None), - ([0, 1, 2, 10, 11, 15], None, TypeError), - (pd.Series([0, 1, 2, 10, 11, 15]), None, TypeError), - (pd.Index([0, 1, 2, 10, 11, 15]), None, TypeError), - (pd.Index([0, 1, 2, 8, 11, 15]), "number", None), - (pd.Index(["red", "white", "yellow"]), "color", None), - ([(1, "red"), (3, "red")], None, None), - (((1, "red"), (3, "red")), None, None), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3], ["red", "blue", "green"]], - names=("number", "color"), - ), - None, - None, - ), - ( - pd.MultiIndex.from_arrays([[], []], names=("number", "color")), - None, - None, - ), - ( - pd.MultiIndex.from_arrays( - [ - [1, 2, 3, 10, 100], - ["red", "blue", "green", "pink", "white"], - ], - names=("number", "color"), - ), - None, - None, - ), - ], -) -def test_isin_multiindex(data, values, level, err): - pmdx = data - gmdx = cudf.from_pandas(data) - - if err is None: - expected = pmdx.isin(values, level=level) - if isinstance(values, pd.MultiIndex): - values = cudf.from_pandas(values) - got = gmdx.isin(values, level=level) - - assert_eq(got, expected) - else: - assert_exceptions_equal( - lfunc=pmdx.isin, - rfunc=gmdx.isin, - lfunc_args_and_kwargs=([values], {"level": level}), - rfunc_args_and_kwargs=([values], {"level": level}), - check_exception_type=False, - expected_error_message=re.escape( - "values need to be a Multi-Index or set/list-like tuple " - "squences when `level=None`." - ), - ) - - @pytest.mark.parametrize( "data", [ @@ -4541,6 +4275,8 @@ def test_isin_multiindex(data, values, level, err): "num_wings": [2, 0, 2, 1, 2, 4, -1], } ), + pd.DataFrame({"a": ["a", "b", "c"]}, dtype="category"), + pd.DataFrame({"a": ["a", "b", "c"]}), ], ) @pytest.mark.parametrize( @@ -4569,6 +4305,9 @@ def test_isin_multiindex(data, values, level, err): pd.Series([1, 2, 3, 4, 5]), "abc", 123, + pd.Series(["a", "b", "c"]), + pd.Series(["a", "b", "c"], dtype="category"), + pd.DataFrame({"a": ["a", "b", "c"]}, dtype="category"), ], ) def test_isin_dataframe(data, values): @@ -4591,6 +4330,13 @@ def test_isin_dataframe(data, values): not PANDAS_GE_110, "https://github.com/pandas-dev/pandas/issues/34256", ) + except TypeError as e: + # Can't do isin with different categories + if str(e) == ( + "Categoricals can only be compared if 'categories' " + "are the same." + ): + return if isinstance(values, (pd.DataFrame, pd.Series)): values = cudf.from_pandas(values) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 6679725ae9a..faaa42ac7f8 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """ Test related to Index @@ -2528,3 +2528,115 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): if NA_idx is not None: assert idx[NA_idx] is cudf.NA + + +@pytest.mark.parametrize( + "data", + [ + [], + pd.Series( + ["this", "is", None, "a", "test"], index=["a", "b", "c", "d", "e"] + ), + pd.Series([0, 15, 10], index=[0, None, 9]), + pd.Series( + range(25), + index=pd.date_range( + start="2019-01-01", end="2019-01-02", freq="H" + ), + ), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + ["this", "is"], + [0, 19, 13], + ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02"], + ], +) +def test_isin_index(data, values): + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) + + got = gsr.index.isin(values) + expected = psr.index.isin(values) + + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3], ["red", "blue", "green"]], names=("number", "color") + ), + pd.MultiIndex.from_arrays([[], []], names=("number", "color")), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 10, 100], ["red", "blue", "green", "pink", "white"]], + names=("number", "color"), + ), + ], +) +@pytest.mark.parametrize( + "values,level,err", + [ + (["red", "orange", "yellow"], "color", None), + (["red", "white", "yellow"], "color", None), + ([0, 1, 2, 10, 11, 15], "number", None), + ([0, 1, 2, 10, 11, 15], None, TypeError), + (pd.Series([0, 1, 2, 10, 11, 15]), None, TypeError), + (pd.Index([0, 1, 2, 10, 11, 15]), None, TypeError), + (pd.Index([0, 1, 2, 8, 11, 15]), "number", None), + (pd.Index(["red", "white", "yellow"]), "color", None), + ([(1, "red"), (3, "red")], None, None), + (((1, "red"), (3, "red")), None, None), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3], ["red", "blue", "green"]], + names=("number", "color"), + ), + None, + None, + ), + ( + pd.MultiIndex.from_arrays([[], []], names=("number", "color")), + None, + None, + ), + ( + pd.MultiIndex.from_arrays( + [ + [1, 2, 3, 10, 100], + ["red", "blue", "green", "pink", "white"], + ], + names=("number", "color"), + ), + None, + None, + ), + ], +) +def test_isin_multiindex(data, values, level, err): + pmdx = data + gmdx = cudf.from_pandas(data) + + if err is None: + expected = pmdx.isin(values, level=level) + if isinstance(values, pd.MultiIndex): + values = cudf.from_pandas(values) + got = gmdx.isin(values, level=level) + + assert_eq(got, expected) + else: + assert_exceptions_equal( + lfunc=pmdx.isin, + rfunc=gmdx.isin, + lfunc_args_and_kwargs=([values], {"level": level}), + rfunc_args_and_kwargs=([values], {"level": level}), + check_exception_type=False, + expected_error_message=re.escape( + "values need to be a Multi-Index or set/list-like tuple " + "squences when `level=None`." + ), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 358484d79b9..3e3c5d1b053 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import operator import re @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_120 from cudf.testing._utils import ( NUMERIC_TYPES, TIMEDELTA_TYPES, @@ -1548,45 +1549,154 @@ def test_series_nunique_index(data): @pytest.mark.parametrize( - "fill_value,data", + "data", [ - (7, [6, 3, 4]), - ("x", ["a", "b", "c", "d", "e", "f"]), - (7, [6, 3, 4, 2, 1, 7, 8, 5]), - (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), - ("b", pd.Categorical(["a", "b", "c"])), - (None, [0.0, 1.0, 2.0, 3.0]), + [], + [0, 12, 14], + [0, 14, 12, 12, 3, 10, 12, 14], + np.random.randint(-100, 100, 200), + pd.Series([0.0, 1.0, None, 10.0]), + [None, None, None, None], + [np.nan, None, -1, 2, 3], ], ) @pytest.mark.parametrize( - "begin,end", + "values", [ - (0, -1), - (0, 4), - (1, -1), - (1, 4), - (-2, 1), - (-2, -1), - (10, 12), - (8, 10), - (10, 8), - (-10, -8), - (-2, 6), + np.random.randint(-100, 100, 10), + [], + [np.nan, None, -1, 2, 3], + [1.0, 12.0, None, None, 120], + [0, 14, 12, 12, 3, 10, 12, 14, None], + [None, None, None], + ["0", "12", "14"], + ["0", "12", "14", "a"], ], ) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fill(data, fill_value, begin, end, inplace): - gs = cudf.Series(data) - ps = gs.to_pandas() +def test_isin_numeric(data, values): + index = np.random.randint(0, 100, len(data)) + psr = cudf.utils.utils._create_pandas_series(data=data, index=index) + gsr = cudf.Series.from_pandas(psr, nan_as_null=False) + + expected = psr.isin(values) + got = gsr.isin(values) + + assert_eq(got, expected) - actual = gs - gs[begin:end] = fill_value - ps[begin:end] = fill_value - assert_eq(ps, actual) +@pytest.mark.parametrize( + "data", + [ + [], + pd.Series( + ["2018-01-01", "2019-04-03", None, "2019-12-30"], + dtype="datetime64[ns]", + ), + pd.Series( + [ + "2018-01-01", + "2019-04-03", + None, + "2019-12-30", + "2018-01-01", + "2018-01-01", + ], + dtype="datetime64[ns]", + ), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + [1514764800000000000, 1577664000000000000], + [ + 1514764800000000000, + 1577664000000000000, + 1577664000000000000, + 1577664000000000000, + 1514764800000000000, + ], + ["2019-04-03", "2019-12-30", "2012-01-01"], + [ + "2012-01-01", + "2012-01-01", + "2012-01-01", + "2019-04-03", + "2019-12-30", + "2012-01-01", + ], + ], +) +def test_isin_datetime(data, values): + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) + + got = gsr.isin(values) + expected = psr.isin(values) + assert_eq(got, expected) -@pytest.mark.xfail(raises=ValueError) -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" +@pytest.mark.parametrize( + "data", + [ + [], + pd.Series(["this", "is", None, "a", "test"]), + pd.Series(["test", "this", "test", "is", None, "test", "a", "test"]), + pd.Series(["0", "12", "14"]), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + ["this", "is"], + [None, None, None], + ["12", "14", "19"], + pytest.param( + [12, 14, 19], + marks=pytest.mark.xfail( + not PANDAS_GE_120, + reason="pandas's failure here seems like a bug(in < 1.2) " + "given the reverse succeeds", + ), + ), + ["is", "this", "is", "this", "is"], + ], +) +def test_isin_string(data, values): + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) + + got = gsr.isin(values) + expected = psr.isin(values) + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + [], + pd.Series(["a", "b", "c", "c", "c", "d", "e"], dtype="category"), + pd.Series(["a", "b", None, "c", "d", "e"], dtype="category"), + pd.Series([0, 3, 10, 12], dtype="category"), + pd.Series([0, 3, 10, 12, 0, 10, 3, 0, 0, 3, 3], dtype="category"), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + ["a", "b", None, "f", "words"], + ["0", "12", None, "14"], + [0, 10, 12, None, 39, 40, 1000], + [0, 0, 0, 0, 3, 3, 3, None, 1, 2, 3], + ], +) +def test_isin_categorical(data, values): + psr = cudf.utils.utils._create_pandas_series(data=data) + gsr = cudf.Series.from_pandas(psr) + + got = gsr.isin(values) + expected = psr.isin(values) + assert_eq(got, expected) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index cf845a5d525..4dadfede866 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -93,6 +93,8 @@ def wrapper(*args, **kwargs): return wrapper +# TODO: We should evaluate whether calls to this could be more easily replaced +# with column.full, which appears to be significantly faster in simple cases. def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)):