From 1e53776c0e17da1809fb9d12a8b3ab6ffbc669c3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 29 Jun 2021 18:40:02 -0500 Subject: [PATCH] Upgrade arrow to 4.0.1 (#7495) Fixes: https://github.com/rapidsai/cudf/issues/7224 This PR: - [x] Adds support for arrow 4.0.1 in cudf. - [x] Moves testing-related utilities to `cudf.testing` module. - [x] Fixes miscellaneous errors related to arrow upgrade. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Paul Taylor (https://github.com/trxcllnt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Jeremy Dyer (https://github.com/jdye64) - Paul Taylor (https://github.com/trxcllnt) - Dillon Cullinan (https://github.com/dillon-cullinan) - Devavret Makkar (https://github.com/devavret) - Keith Kraus (https://github.com/kkraus14) - Michael Wang (https://github.com/isVoid) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cudf/pull/7495 --- ci/gpu/build.sh | 2 +- conda/environments/cudf_dev_cuda11.0.yml | 4 +- conda/environments/cudf_dev_cuda11.2.yml | 4 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- conda/recipes/libcudf_kafka/meta.yaml | 4 +- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +- python/cudf/cudf/_fuzz_testing/avro.py | 2 +- python/cudf/cudf/_fuzz_testing/csv.py | 2 +- python/cudf/cudf/_fuzz_testing/json.py | 2 +- python/cudf/cudf/_fuzz_testing/orc.py | 2 +- python/cudf/cudf/_fuzz_testing/parquet.py | 2 +- .../cudf/_fuzz_testing/tests/fuzz_test_csv.py | 2 +- .../_fuzz_testing/tests/fuzz_test_json.py | 2 +- python/cudf/cudf/_fuzz_testing/utils.py | 2 +- python/cudf/cudf/_lib/gpuarrow.pyx | 4 +- python/cudf/cudf/_lib/utils.pyx | 16 +- python/cudf/cudf/core/dataframe.py | 11 +- .../{tests/utils.py => testing/_utils.py} | 0 .../{tests => testing}/dataset_generator.py | 0 python/cudf/cudf/tests/__init__.py | 0 python/cudf/cudf/tests/test_apply_rows.py | 2 +- python/cudf/cudf/tests/test_applymap.py | 2 +- python/cudf/cudf/tests/test_array_function.py | 2 +- python/cudf/cudf/tests/test_array_ufunc.py | 7 +- .../test_avro_reader_fastavro_integration.py | 2 +- python/cudf/cudf/tests/test_binops.py | 20 +- python/cudf/cudf/tests/test_categorical.py | 6 +- python/cudf/cudf/tests/test_column.py | 8 +- .../cudf/cudf/tests/test_column_accessor.py | 2 +- python/cudf/cudf/tests/test_concat.py | 6 +- python/cudf/cudf/tests/test_contains.py | 2 +- python/cudf/cudf/tests/test_copying.py | 2 +- python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_cuda_apply.py | 2 +- .../cudf/tests/test_cuda_array_interface.py | 5 +- .../cudf/cudf/tests/test_custom_accessor.py | 4 +- python/cudf/cudf/tests/test_cut.py | 7 +- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_dataframe_copy.py | 2 +- python/cudf/cudf/tests/test_datasets.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_decimal.py | 2 +- python/cudf/cudf/tests/test_dlpack.py | 2 +- python/cudf/cudf/tests/test_dropna.py | 2 +- python/cudf/cudf/tests/test_dtypes.py | 2 +- python/cudf/cudf/tests/test_duplicates.py | 2 +- python/cudf/cudf/tests/test_factorize.py | 2 +- python/cudf/cudf/tests/test_feather.py | 2 +- python/cudf/cudf/tests/test_fill.py | 2 +- python/cudf/cudf/tests/test_gcs.py | 2 +- .../cudf/cudf/tests/test_gpu_arrow_parser.py | 5 +- python/cudf/cudf/tests/test_groupby.py | 4 +- python/cudf/cudf/tests/test_hdf.py | 2 +- python/cudf/cudf/tests/test_hdfs.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 8 +- python/cudf/cudf/tests/test_interval.py | 2 +- python/cudf/cudf/tests/test_joining.py | 194 +++++++++++++++++- python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_monotonic.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 2 +- python/cudf/cudf/tests/test_numerical.py | 2 +- python/cudf/cudf/tests/test_numpy_interop.py | 2 +- python/cudf/cudf/tests/test_onehot.py | 2 +- python/cudf/cudf/tests/test_ops.py | 2 +- python/cudf/cudf/tests/test_orc.py | 6 +- python/cudf/cudf/tests/test_pandas_interop.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 4 +- python/cudf/cudf/tests/test_pickling.py | 2 +- python/cudf/cudf/tests/test_quantiles.py | 2 +- python/cudf/cudf/tests/test_query.py | 2 +- python/cudf/cudf/tests/test_query_mask.py | 2 +- python/cudf/cudf/tests/test_rank.py | 2 +- python/cudf/cudf/tests/test_reductions.py | 4 +- python/cudf/cudf/tests/test_replace.py | 4 +- python/cudf/cudf/tests/test_repr.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 2 +- python/cudf/cudf/tests/test_rolling.py | 2 +- python/cudf/cudf/tests/test_s3.py | 2 +- python/cudf/cudf/tests/test_scalar.py | 2 +- python/cudf/cudf/tests/test_scan.py | 7 +- python/cudf/cudf/tests/test_search.py | 2 +- python/cudf/cudf/tests/test_serialize.py | 4 +- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_seriesmap.py | 4 +- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_sorting.py | 2 +- python/cudf/cudf/tests/test_sparse_df.py | 2 +- python/cudf/cudf/tests/test_stats.py | 2 +- python/cudf/cudf/tests/test_string.py | 193 +---------------- python/cudf/cudf/tests/test_struct.py | 2 +- python/cudf/cudf/tests/test_testing.py | 2 +- python/cudf/cudf/tests/test_text.py | 2 +- python/cudf/cudf/tests/test_timedelta.py | 4 +- python/cudf/cudf/tests/test_transform.py | 2 +- python/cudf/cudf/tests/test_unaops.py | 2 +- .../custreamz/custreamz/tests/test_kafka.py | 2 +- .../dask_cudf/tests/test_accessor.py | 6 +- python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- .../dask_cudf/tests/test_distributed.py | 2 +- 102 files changed, 359 insertions(+), 338 deletions(-) rename python/cudf/cudf/{tests/utils.py => testing/_utils.py} (100%) rename python/cudf/cudf/{tests => testing}/dataset_generator.py (100%) delete mode 100644 python/cudf/cudf/tests/__init__.py diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1d0154aedc7..c854e67fbdf 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -217,7 +217,7 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 1568327f88c..5561a573609 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -44,8 +44,8 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz + - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - double-conversion - rapidjson diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 9d520ada253..6c8ae4cb9b0 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -44,8 +44,8 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz + - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - double-conversion - rapidjson diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index d1aaf924555..3da7c63857d 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - numba >=0.53.1 - dlpack>=0.5,<0.6.0a0 - - pyarrow 1.0.1 + - pyarrow 4.0.1 - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 147c1685ecc..6464013d646 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 1.0.1 + - arrow-cpp 4.0.1 - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index f1ec813a17f..6b15890e7c7 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -25,8 +25,8 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{ version }} - - librdkafka >=1.5.0,<1.5.3 + - libcudf {{version}} + - librdkafka >=1.6.0,<1.7.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 0eee5abd2f3..e15f3f7e16d 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -127,6 +127,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) endfunction() -set(CUDF_VERSION_Arrow 1.0.1) +set(CUDF_VERSION_Arrow 4.0.1) find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3}) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index a07e3acf416..4c167ac627f 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -15,7 +15,7 @@ pandas_to_avro, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg logging.basicConfig( format="%(asctime)s %(levelname)-8s %(message)s", diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 84346ed61ad..0acb9c8a471 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -12,7 +12,7 @@ _generate_rand_meta, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes logging.basicConfig( diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index 5ecb27f7665..df9226cf059 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -13,7 +13,7 @@ _generate_rand_meta, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes logging.basicConfig( diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 607294a49c9..2aa01eb3967 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -16,7 +16,7 @@ pandas_to_orc, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg logging.basicConfig( format="%(asctime)s %(levelname)-8s %(message)s", diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 8c63b12d972..5b00f96d88d 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -12,7 +12,7 @@ _generate_rand_meta, pyarrow_to_pandas, ) -from cudf.tests import dataset_generator as dg +from cudf.testing import dataset_generator as dg logging.basicConfig( format="%(asctime)s %(levelname)-8s %(message)s", diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py index e6a5d081980..9b6abeb1276 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py @@ -13,7 +13,7 @@ compare_content, run_test, ) -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pythonfuzz(data_handle=CSVReader) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py index f3da03f447b..2f5e6204f7c 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py @@ -9,7 +9,7 @@ from cudf._fuzz_testing.json import JSONReader, JSONWriter from cudf._fuzz_testing.main import pythonfuzz from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pythonfuzz(data_handle=JSONReader) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 71b5a35a225..fe9ed4d4934 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -9,7 +9,7 @@ import pyorc import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils.dtypes import ( pandas_dtypes_to_cudf_dtypes, pyarrow_dtypes_to_pandas_dtypes, diff --git a/python/cudf/cudf/_lib/gpuarrow.pyx b/python/cudf/cudf/_lib/gpuarrow.pyx index 6513cd59424..a7da22637b9 100644 --- a/python/cudf/cudf/_lib/gpuarrow.pyx +++ b/python/cudf/cudf/_lib/gpuarrow.pyx @@ -15,7 +15,7 @@ from pyarrow.includes.libarrow cimport ( CRecordBatchStreamReader ) from pyarrow.lib cimport ( - _CRecordBatchReader, + RecordBatchReader, Buffer, Schema, pyarrow_wrap_schema @@ -23,7 +23,7 @@ from pyarrow.lib cimport ( import pyarrow as pa -cdef class CudaRecordBatchStreamReader(_CRecordBatchReader): +cdef class CudaRecordBatchStreamReader(RecordBatchReader): cdef: CIpcReadOptions options diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 13eedb34c18..e5dfb5a5c35 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -138,12 +138,16 @@ cpdef generate_pandas_metadata(Table table, index): index_descriptors.append(descr) metadata = pa.pandas_compat.construct_metadata( - table, - col_names, - index_levels, - index_descriptors, - index, - types, + columns_to_convert=[ + col + for col in table._columns + ], + df=table, + column_names=col_names, + index_levels=index_levels, + index_descriptors=index_descriptors, + preserve_index=index, + types=types, ) md_dict = json.loads(metadata[b"pandas"]) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 436f14cf6e3..0901334396a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5652,11 +5652,12 @@ def to_arrow(self, preserve_index=True): out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - self, - out.schema.names, - [self.index], - index_descr, - preserve_index, + columns_to_convert=[self[col] for col in self._data.names], + df=self, + column_names=out.schema.names, + index_levels=[self.index], + index_descriptors=index_descr, + preserve_index=preserve_index, types=out.schema.types, ) diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/testing/_utils.py similarity index 100% rename from python/cudf/cudf/tests/utils.py rename to python/cudf/cudf/testing/_utils.py diff --git a/python/cudf/cudf/tests/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py similarity index 100% rename from python/cudf/cudf/tests/dataset_generator.py rename to python/cudf/cudf/testing/dataset_generator.py diff --git a/python/cudf/cudf/tests/__init__.py b/python/cudf/cudf/tests/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index 0ba80278fca..f025549971f 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -2,7 +2,7 @@ import cudf from cudf.core.column import column -from cudf.tests.utils import assert_eq, gen_rand_series +from cudf.testing._utils import assert_eq, gen_rand_series def _kernel_multiply(a, b, out): diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 1f35bc93c78..fa3c88a3551 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -7,7 +7,7 @@ import pytest from cudf import Series -from cudf.tests import utils +from cudf.testing import _utils as utils @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 03f9cf1d7e5..cd4dd28f179 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils.utils import IS_NEP18_ACTIVE missing_arrfunc_cond = not IS_NEP18_ACTIVE diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index f9e0bb2ce8a..8cfcf4d2b6d 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,9 +1,10 @@ -import cudf -import numpy as np import cupy as cp +import numpy as np import pandas as pd import pytest -from cudf.tests.utils import assert_eq + +import cudf +from cudf.testing._utils import assert_eq @pytest.fixture diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index a52ee937574..48e3b0ec42c 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -18,7 +18,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def cudf_from_avro_util(schema, records): diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 090e03c9403..1c97cbb10ff 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -14,7 +14,7 @@ import cudf from cudf.core import Series from cudf.core.index import as_index -from cudf.tests import utils +from cudf.testing import _utils as utils from cudf.utils.dtypes import ( BOOL_TYPES, DATETIME_TYPES, @@ -1742,12 +1742,6 @@ def test_binops_with_NA_consistent(dtype, op): assert result._column.null_count == len(data) -def _decimal_series(input, dtype): - return cudf.Series( - [x if x is None else decimal.Decimal(x) for x in input], dtype=dtype, - ) - - @pytest.mark.parametrize( "args", [ @@ -2080,10 +2074,10 @@ def _decimal_series(input, dtype): def test_binops_decimal(args): op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args - a = _decimal_series(lhs, l_dtype) - b = _decimal_series(rhs, r_dtype) + a = utils._decimal_series(lhs, l_dtype) + b = utils._decimal_series(rhs, r_dtype) expect = ( - _decimal_series(expect, expect_dtype) + utils._decimal_series(expect, expect_dtype) if isinstance(expect_dtype, cudf.Decimal64Dtype) else cudf.Series(expect, dtype=expect_dtype) ) @@ -2242,7 +2236,7 @@ def test_binops_decimal(args): ), ], ) -@pytest.mark.parametrize("integer_dtype", cudf.tests.utils.INTEGER_TYPES) +@pytest.mark.parametrize("integer_dtype", utils.INTEGER_TYPES) @pytest.mark.parametrize("reflected", [True, False]) def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): """ @@ -2258,7 +2252,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): else: op, ldata, ldtype, rdata, _, expected = args - lhs = _decimal_series(ldata, ldtype) + lhs = utils._decimal_series(ldata, ldtype) rhs = cudf.Series(rdata, dtype=integer_dtype) if reflected: @@ -2746,7 +2740,7 @@ def test_binops_decimal_scalar_compare(args, reflected): else: op, ldata, ldtype, rdata, _, expected = args - lhs = _decimal_series(ldata, ldtype) + lhs = utils._decimal_series(ldata, ldtype) rhs = rdata if reflected: diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 7b1aea174c8..6a23f568348 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -9,7 +9,11 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import NUMERIC_TYPES, assert_eq, assert_exceptions_equal +from cudf.testing._utils import ( + NUMERIC_TYPES, + assert_eq, + assert_exceptions_equal, +) @pytest.fixture diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 3ac6cc0bb44..f3387b3d27d 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -8,7 +8,7 @@ import cudf from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal from cudf.utils import dtypes as dtypeutils dtypes = sorted( @@ -140,8 +140,8 @@ def test_column_series_multi_dim(data): @pytest.mark.parametrize( ("data", "error"), [ - ([1, "1.0", "2", -3], TypeError), - ([np.nan, 0, "null", cp.nan], TypeError), + ([1, "1.0", "2", -3], pa.lib.ArrowInvalid), + ([np.nan, 0, "null", cp.nan], pa.lib.ArrowInvalid), ( [np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)], None, @@ -152,7 +152,7 @@ def test_column_mixed_dtype(data, error): if error is None: cudf.Series(data) else: - with pytest.raises(TypeError): + with pytest.raises(error): cudf.Series(data) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 86a7927dcac..99d4bdd9910 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -6,7 +6,7 @@ import cudf from cudf.core.column_accessor import ColumnAccessor -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq simple_test_data = [ {}, diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 5c4c121db4d..2578cb13bff 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,16 +1,16 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. import re +from decimal import Decimal import numpy as np import pandas as pd import pytest -from decimal import Decimal import cudf as gd -from cudf.tests.utils import assert_eq, assert_exceptions_equal -from cudf.utils.dtypes import is_categorical_dtype from cudf.core.dtypes import Decimal64Dtype +from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.utils.dtypes import is_categorical_dtype def make_frames(index=None, nulls="none"): diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index b669c40022e..b6650600261 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -6,7 +6,7 @@ from cudf import Series from cudf.core.index import RangeIndex, as_index -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index ed6a1169a2a..0965b5298a4 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -6,7 +6,7 @@ import cudf from cudf.core import Series -from cudf.tests.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 925369048cb..c19fde8b5d6 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -14,7 +14,7 @@ import cudf from cudf import read_csv -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal def make_numeric_dataframe(nrows, dtype): diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index fa880da6804..2604030097b 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -9,7 +9,7 @@ from numba import cuda from cudf import DataFrame -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 42e5ab38f50..ecf961f133b 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -10,7 +10,7 @@ from numba import cuda import cudf -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @@ -171,6 +171,9 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): def test_cuda_array_interface_pytorch(): torch = pytest.importorskip("torch") + if not torch.cuda.is_available(): + pytest.skip("need gpu version of pytorch to be installed") + series = cudf.Series([1, -1, 10, -56]) tensor = torch.tensor(series) got = cudf.Series(tensor) diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index d72b5875677..16e5b345ce2 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -2,9 +2,9 @@ import pandas as pd import pytest -import cudf as gd -from cudf.tests.utils import assert_eq +import cudf as gd +from cudf.testing._utils import assert_eq @gd.api.extensions.register_dataframe_accessor("point") diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py index 926826ac188..710df78e36b 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -4,11 +4,12 @@ Test related to Cut """ -import pandas as pd import numpy as np -from cudf.core.cut import cut +import pandas as pd import pytest -from cudf.tests.utils import assert_eq + +from cudf.core.cut import cut +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a89b9b58e6e..2b32471c30c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -20,8 +20,8 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.core.column import column -from cudf.tests import utils -from cudf.tests.utils import ( +from cudf.testing import _utils as utils +from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 35788e660ea..5b258c760b3 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -7,7 +7,7 @@ from numba import cuda from cudf.core.dataframe import DataFrame -from cudf.tests.utils import ALL_TYPES, assert_eq +from cudf.testing._utils import ALL_TYPES, assert_eq """ DataFrame copy expectations diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index a603a6b4658..b7bc89f008d 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,7 +1,7 @@ import numpy as np import cudf as gd -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def test_dataset_timeseries(): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 647ff5250ba..653ee8389fa 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -14,7 +14,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core.index import DatetimeIndex -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 4816094814a..d2de44b0c8f 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -10,7 +10,7 @@ import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn from cudf.core.dtypes import Decimal64Dtype -from cudf.tests.utils import ( +from cudf.testing._utils import ( FLOAT_TYPES, INTEGER_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index b8175d05137..4b2fca0d12d 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq nelems = [0, 3, 10] dtype = [np.uint16, np.int32, np.float64] diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index d01627309d6..e1d0c38c760 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index a5895caf49f..41d7f5d215e 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -14,7 +14,7 @@ ListDtype, StructDtype, ) -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils.dtypes import np_to_pa_dtype diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f721b7a28e5..f464ac1a6c2 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -9,7 +9,7 @@ import cudf from cudf import concat -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal # TODO: PANDAS 1.0 support # Revisit drop_duplicates() tests to update parameters like ignore_index. diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 48ffef4a11c..3df0031745e 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -7,7 +7,7 @@ import cudf from cudf.core import DataFrame, Index -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 525b88fc7ff..6c83ee3c458 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.tests.utils import NUMERIC_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, assert_eq if LooseVersion(pd.__version__) < LooseVersion("0.24"): try: diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py index 83d15b36e64..efbe2834486 100644 --- a/python/cudf/cudf/tests/test_fill.py +++ b/python/cudf/cudf/tests/test_fill.py @@ -2,7 +2,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 5d287a57df8..99d79e41520 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq gcsfs = pytest.importorskip("gcsfs") diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index e3c8e69695d..a088ae9f923 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -1,4 +1,5 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. + import logging import numpy as np @@ -8,7 +9,7 @@ import cudf from cudf.comm.gpuarrow import GpuArrowReader -from cudf.tests.utils import INTEGER_TYPES +from cudf.testing._utils import INTEGER_TYPES def make_gpu_parse_arrow_data_batch(): diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f346edb4304..e423a64fe4d 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -15,14 +15,14 @@ import cudf from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.dataset_generator import rand_dataframe -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, SIGNED_TYPES, TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, ) +from cudf.testing.dataset_generator import rand_dataframe _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index f908d5f51f5..1bf91a52c2f 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq try: import tables # noqa F401 diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index e3867c620fe..24554f113bb 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -11,7 +11,7 @@ from pyarrow import orc as orc import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq if not os.environ.get("RUN_HDFS_TESTS"): pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 23e04831176..f03454c479a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -21,7 +21,7 @@ RangeIndex, as_index, ) -from cudf.tests.utils import ( +from cudf.testing._utils import ( FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 1d34f7636da..58d39ff35a6 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -9,8 +9,12 @@ import cudf from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 -from cudf.tests import utils -from cudf.tests.utils import INTEGER_TYPES, assert_eq, assert_exceptions_equal +from cudf.testing import _utils as utils +from cudf.testing._utils import ( + INTEGER_TYPES, + assert_eq, + assert_exceptions_equal, +) index_dtypes = INTEGER_TYPES diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index c7eafedd409..fc193441113 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 2d8f451abb9..7b56f864272 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -7,7 +7,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype -from cudf.tests.utils import ( +from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, @@ -96,7 +96,7 @@ def assert_join_results_equal(expect, got, how, **kwargs): got.sort_values(got.columns.to_list()).reset_index(drop=True), **kwargs, ) - elif isinstance(expect, (pd.Index, cudf.BaseIndex)): + elif isinstance(expect, (pd.Index, cudf.Index)): return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) else: raise ValueError(f"Not a join result: {type(expect).__name__}") @@ -1922,3 +1922,193 @@ def test_join_merge_invalid_keys(on, how): with pytest.raises(KeyError): pd_left.merge(pd_right, on=on) gd_left.merge(gd_right, on=on) + + +@pytest.mark.parametrize( + "str_data", + [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], +) +@pytest.mark.parametrize("num_keys", [1, 2, 3]) +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_string_join_key(str_data, num_keys, how): + other_data = [1, 2, 3, 4, 5][: len(str_data)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + for i in range(num_keys): + pdf[i] = pd.Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") + pdf["a"] = other_data + gdf["a"] = other_data + + pdf2 = pdf.copy() + gdf2 = gdf.copy() + + expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) + got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] # reorder columns + + if how == "right": + got = got[expect.columns] # reorder columns + + assert_join_results_equal(expect, got, how=how) + + +@pytest.mark.parametrize( + "str_data_nulls", + [ + ["a", "b", "c"], + ["a", "b", "f", "g"], + ["f", "g", "h", "i", "j"], + ["f", "g", "h"], + [None, None, None, None, None], + [], + ], +) +def test_string_join_key_nulls(str_data_nulls): + str_data = ["a", "b", "c", "d", "e"] + other_data = [1, 2, 3, 4, 5] + + other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + pdf["key"] = pd.Series(str_data, dtype="str") + gdf["key"] = cudf.Series(str_data, dtype="str") + pdf["vals"] = other_data + gdf["vals"] = other_data + + pdf2 = pd.DataFrame() + gdf2 = cudf.DataFrame() + pdf2["key"] = pd.Series(str_data_nulls, dtype="str") + gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") + pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") + gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") + + expect = pdf.merge(pdf2, on="key", how="left") + got = gdf.merge(gdf2, on="key", how="left") + got["vals_y"] = got["vals_y"].fillna(-1) + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] + + expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") + + assert_join_results_equal(expect, got, how="left") + + +@pytest.mark.parametrize( + "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] +) +@pytest.mark.parametrize("num_cols", [1, 2, 3]) +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_string_join_non_key(str_data, num_cols, how): + other_data = [1, 2, 3, 4, 5][: len(str_data)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + for i in range(num_cols): + pdf[i] = pd.Series(str_data, dtype="str") + gdf[i] = cudf.Series(str_data, dtype="str") + pdf["a"] = other_data + gdf["a"] = other_data + + pdf2 = pdf.copy() + gdf2 = gdf.copy() + + expect = pdf.merge(pdf2, on=["a"], how=how) + got = gdf.merge(gdf2, on=["a"], how=how) + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] + + if how == "right": + got = got[expect.columns] # reorder columns + + assert_join_results_equal(expect, got, how=how) + + +@pytest.mark.parametrize( + "str_data_nulls", + [ + ["a", "b", "c"], + ["a", "b", "f", "g"], + ["f", "g", "h", "i", "j"], + ["f", "g", "h"], + [None, None, None, None, None], + [], + ], +) +def test_string_join_non_key_nulls(str_data_nulls): + str_data = ["a", "b", "c", "d", "e"] + other_data = [1, 2, 3, 4, 5] + + other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] + + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + pdf["vals"] = pd.Series(str_data, dtype="str") + gdf["vals"] = cudf.Series(str_data, dtype="str") + pdf["key"] = other_data + gdf["key"] = other_data + + pdf2 = pd.DataFrame() + gdf2 = cudf.DataFrame() + pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") + gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") + pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") + gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") + + expect = pdf.merge(pdf2, on="key", how="left") + got = gdf.merge(gdf2, on="key", how="left") + + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] + + assert_join_results_equal(expect, got, how="left") + + +def test_string_join_values_nulls(): + left_dict = [ + {"b": "MATCH 1", "a": 1.0}, + {"b": "MATCH 1", "a": 1.0}, + {"b": "LEFT NO MATCH 1", "a": -1.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "MATCH 1", "a": 1.0}, + {"b": "MATCH 1", "a": 1.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "MATCH 2", "a": 2.0}, + {"b": "LEFT NO MATCH 2", "a": -2.0}, + {"b": "MATCH 3", "a": 3.0}, + {"b": "MATCH 3", "a": 3.0}, + ] + + right_dict = [ + {"b": "RIGHT NO MATCH 1", "c": -1.0}, + {"b": "MATCH 3", "c": 3.0}, + {"b": "MATCH 2", "c": 2.0}, + {"b": "RIGHT NO MATCH 2", "c": -2.0}, + {"b": "RIGHT NO MATCH 3", "c": -3.0}, + {"b": "MATCH 1", "c": 1.0}, + ] + + left_pdf = pd.DataFrame(left_dict) + right_pdf = pd.DataFrame(right_dict) + + left_gdf = cudf.DataFrame.from_pandas(left_pdf) + right_gdf = cudf.DataFrame.from_pandas(right_pdf) + + expect = left_pdf.merge(right_pdf, how="left", on="b") + got = left_gdf.merge(right_gdf, how="left", on="b") + + expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) + got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) + + assert_join_results_equal(expect, got, how="left") diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 2da2cea164f..0b138f446ae 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -12,7 +12,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq def make_numeric_dataframe(nrows, dtype): diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 42541f1e8b1..a6a9ba97ef5 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -9,7 +9,7 @@ import cudf from cudf import NA from cudf._lib.copying import get_element -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index b26887ad6ae..e9c828ec0f5 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -16,7 +16,7 @@ RangeIndex, StringIndex, ) -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index bd78612d6c7..c8e5a9f071b 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -15,7 +15,7 @@ import cudf from cudf.core.column import as_column from cudf.core.index import as_index -from cudf.tests.utils import assert_eq, assert_exceptions_equal, assert_neq +from cudf.testing._utils import assert_eq, assert_exceptions_equal, assert_neq def test_multiindex_levels_codes_validation(): diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 12b17447268..7a766a49a62 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_100 -from cudf.tests.utils import NUMERIC_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, assert_eq from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py index 521840f8a8a..e5efe2f027d 100644 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ b/python/cudf/cudf/tests/test_numpy_interop.py @@ -2,7 +2,7 @@ import pytest from cudf.core import DataFrame, Series -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def test_to_records_noindex(): diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 61195faa4d0..bbec4594e15 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -8,7 +8,7 @@ import cudf from cudf.core import DataFrame, GenericIndex, Series -from cudf.tests import utils +from cudf.testing import _utils as utils def test_onehot_simple(): diff --git a/python/cudf/cudf/tests/test_ops.py b/python/cudf/cudf/tests/test_ops.py index 8cdef19d9ba..ac3f784ecd4 100644 --- a/python/cudf/cudf/tests/test_ops.py +++ b/python/cudf/cudf/tests/test_ops.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, gen_rand +from cudf.testing._utils import assert_eq, gen_rand def test_sqrt_float(): diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index bd8131d4673..213b7bf39d7 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -15,7 +15,11 @@ import cudf from cudf.core.dtypes import Decimal64Dtype from cudf.io.orc import ORCWriter -from cudf.tests.utils import assert_eq, gen_rand_series, supported_numpy_dtypes +from cudf.testing._utils import ( + assert_eq, + gen_rand_series, + supported_numpy_dtypes, +) @pytest.fixture(scope="module") diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index 24c60f12a2f..a8a45fc3c28 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -5,7 +5,7 @@ import cudf from cudf.core import DataFrame -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def test_to_pandas(): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 54bf17e4c2b..2d0a4006f44 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -18,8 +18,8 @@ import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata -from cudf.tests import dataset_generator as dg -from cudf.tests.utils import ( +from cudf.testing import dataset_generator as dg +from cudf.testing._utils import ( TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index ca819c7f59b..48a25fcfadb 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -8,7 +8,7 @@ from cudf.core import DataFrame, GenericIndex, Series from cudf.core.buffer import Buffer -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq if sys.version_info < (3, 8): try: diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 49a2603b9a3..4055485c49a 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -1,7 +1,7 @@ import pandas as pd import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def test_single_q(): diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index b6915a63947..8dc5df2dd7c 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -12,7 +12,7 @@ import cudf from cudf.core import DataFrame -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq from cudf.utils import queryutils _params_query_parser = [] diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py index 35479f8308c..ab1c085c6c0 100644 --- a/python/cudf/cudf/tests/test_query_mask.py +++ b/python/cudf/cudf/tests/test_query_mask.py @@ -3,7 +3,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq _data = [ {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]}, diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index c86b2c61aa5..3c98496def3 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -7,7 +7,7 @@ import pytest from cudf.core import DataFrame -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal @pytest.fixture diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 0fa09bc5df7..7cbc56f943c 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -13,8 +13,8 @@ import cudf from cudf.core import Series from cudf.core.dtypes import Decimal64Dtype -from cudf.tests import utils -from cudf.tests.utils import NUMERIC_TYPES, assert_eq, gen_rand +from cudf.testing import _utils as utils +from cudf.testing._utils import NUMERIC_TYPES, assert_eq, gen_rand params_dtype = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 6dca539b8d5..b59428779c1 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1,15 +1,15 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import re +from decimal import Decimal import numpy as np import pandas as pd import pytest -from decimal import Decimal import cudf from cudf.core.dtypes import Decimal64Dtype -from cudf.tests.utils import ( +from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 093be41275a..4906349ecba 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -10,7 +10,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests import utils +from cudf.testing import _utils as utils from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b030924779d..0c4313eb47c 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -9,7 +9,7 @@ import cudf from cudf import melt as cudf_melt from cudf.core._compat import PANDAS_GE_120 -from cudf.tests.utils import ( +from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index fcc5591adda..07e7f43c992 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 2eefcfef7d2..133597b8f19 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -14,7 +14,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq moto = pytest.importorskip("moto", minversion="1.3.14") boto3 = pytest.importorskip("boto3") diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 01e6b52f526..605005f41fc 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -11,7 +11,7 @@ import cudf from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element -from cudf.tests.utils import ( +from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index f7e8c5a8563..0ef7b89a606 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -5,8 +5,13 @@ import pytest import cudf -from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq, gen_rand from cudf.core.dtypes import Decimal64Dtype +from cudf.testing._utils import ( + INTEGER_TYPES, + NUMERIC_TYPES, + assert_eq, + gen_rand, +) params_sizes = [0, 1, 2, 5] diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 4c42e2cb50f..c16c6486cd4 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, gen_rand, random_bitmask +from cudf.testing._utils import assert_eq, gen_rand, random_bitmask @pytest.mark.parametrize("side", ["left", "right"]) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 4be5adf84de..b436825cf69 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -8,8 +8,8 @@ import pytest import cudf -from cudf.tests import utils -from cudf.tests.utils import assert_eq +from cudf.testing import _utils as utils +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d400a9ce8a9..f3da4275aea 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -9,7 +9,7 @@ import pytest import cudf -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 324074b6021..d4ef3ba235d 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -4,12 +4,12 @@ from math import floor import numpy as np -import cudf import pandas as pd import pytest +import cudf from cudf import Series -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal def test_series_map_basic(): diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 28cb2568908..c7429f3c246 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -6,7 +6,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120, PANDAS_LE_122 -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index b90aebc33dc..95942045654 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -9,7 +9,7 @@ from cudf.core import DataFrame, Series from cudf.core.column import NumericalColumn -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 4551f48845f..50c8f3f41a8 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -8,7 +8,7 @@ from cudf.comm.gpuarrow import GpuArrowReader from cudf.core import DataFrame, Series -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def read_data(): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 4e07c974280..d4e944848c9 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -9,7 +9,7 @@ import cudf from cudf.datasets import randomdata -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import assert_eq, assert_exceptions_equal params_dtypes = [np.int32, np.uint32, np.float32, np.float64] methods = ["min", "max", "sum", "mean", "var", "std"] diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 58b3996ab5c..3c153a16a13 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -18,8 +18,7 @@ from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index -from cudf.tests.test_joining import assert_join_results_equal -from cudf.tests.utils import ( +from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, assert_eq, @@ -919,196 +918,6 @@ def test_string_split(data, pat, n, expand): assert_eq(expect, got) -@pytest.mark.parametrize( - "str_data", - [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], -) -@pytest.mark.parametrize("num_keys", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_key(str_data, num_keys, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_keys): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) - got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] # reorder columns - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["key"] = pd.Series(str_data, dtype="str") - gdf["key"] = cudf.Series(str_data, dtype="str") - pdf["vals"] = other_data - gdf["vals"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["key"] = pd.Series(str_data_nulls, dtype="str") - gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - got["vals_y"] = got["vals_y"].fillna(-1) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") - - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_non_key(str_data, num_cols, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_cols): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=["a"], how=how) - got = gdf.merge(gdf2, on=["a"], how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_non_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["vals"] = pd.Series(str_data, dtype="str") - gdf["vals"] = cudf.Series(str_data, dtype="str") - pdf["key"] = other_data - gdf["key"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") - gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - assert_join_results_equal(expect, got, how="left") - - -def test_string_join_values_nulls(): - left_dict = [ - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "LEFT NO MATCH 1", "a": -1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "LEFT NO MATCH 2", "a": -2.0}, - {"b": "MATCH 3", "a": 3.0}, - {"b": "MATCH 3", "a": 3.0}, - ] - - right_dict = [ - {"b": "RIGHT NO MATCH 1", "c": -1.0}, - {"b": "MATCH 3", "c": 3.0}, - {"b": "MATCH 2", "c": 2.0}, - {"b": "RIGHT NO MATCH 2", "c": -2.0}, - {"b": "RIGHT NO MATCH 3", "c": -3.0}, - {"b": "MATCH 1", "c": 1.0}, - ] - - left_pdf = pd.DataFrame(left_dict) - right_pdf = pd.DataFrame(right_dict) - - left_gdf = cudf.DataFrame.from_pandas(left_pdf) - right_gdf = cudf.DataFrame.from_pandas(right_pdf) - - expect = left_pdf.merge(right_pdf, how="left", on="b") - got = left_gdf.merge(right_gdf, how="left", on="b") - - expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - - assert_join_results_equal(expect, got, how="left") - - @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 21542a6c415..da2af1469c0 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index eee7078433d..b2e5ea70ddc 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -10,7 +10,7 @@ assert_index_equal, assert_series_equal, ) -from cudf.tests.utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 072fc23abba..6c3fdd4640a 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq def test_tokenize(): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 3efc30af01e..a65fdeeb0dd 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -11,8 +11,8 @@ import cudf from cudf.core._compat import PANDAS_GE_120 -from cudf.tests import utils as utils -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.testing import _utils as utils +from cudf.testing._utils import assert_eq, assert_exceptions_equal _TIMEDELTA_DATA = [ [1000000, 200000, 3000000], diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index ed409de196e..582d5a43edf 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -6,7 +6,7 @@ import pytest from cudf.core import Series -from cudf.tests.utils import NUMERIC_TYPES +from cudf.testing._utils import NUMERIC_TYPES supported_types = NUMERIC_TYPES diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index f132271cfd8..2089f764724 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -10,7 +10,7 @@ import cudf from cudf.core import Series -from cudf.tests import utils +from cudf.testing import _utils as utils _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor] diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py index 059655d4ca0..d29ebf8db8b 100644 --- a/python/custreamz/custreamz/tests/test_kafka.py +++ b/python/custreamz/custreamz/tests/test_kafka.py @@ -2,7 +2,7 @@ import confluent_kafka as ck import pytest -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("commit_offset", [-1, 0, 1, 1000]) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 48e0d022a52..94e0169bdf9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -5,11 +5,11 @@ from dask import dataframe as dd -from cudf import DataFrame, Series -from cudf.tests.utils import assert_eq, does_not_raise - import dask_cudf as dgd +from cudf import DataFrame, Series +from cudf.testing._utils import assert_eq, does_not_raise + ############################################################################# # Datetime Accessor # ############################################################################# diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 2f73534b45a..cf5203a22e5 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -706,7 +706,7 @@ def test_dataframe_set_index(): pddf = dd.from_pandas(pdf, npartitions=4) pddf = pddf.set_index("str") - from cudf.tests.utils import assert_eq + from cudf.testing._utils import assert_eq assert_eq(ddf.compute(), pddf.compute()) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index 85354704902..876a66f78d7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -7,7 +7,7 @@ from distributed.utils_test import loop # noqa: F401 import cudf -from cudf.tests.utils import assert_eq +from cudf.testing._utils import assert_eq import dask_cudf