Merge branch 'branch-24.02' into impr-enum-class-fieldtype

rapidsai · Jan 9, 2024 · 91c90b6 · 91c90b6
2 parents 0f085d5 + 3a1601d
commit 91c90b6
Show file tree

Hide file tree

Showing 18 changed files with 184 additions and 130 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -116,7 +116,7 @@ jobs:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-dask-cudf:
-    needs: wheel-tests-cudf
+    needs: wheel-build-cudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -23,7 +23,7 @@ pyproject_file="${package_dir}/pyproject.toml"
 
 sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
 echo "${version}" > VERSION
-sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py"
+sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py"
 
 # For nightlies we want to ensure that we're pulling in alphas as well. The
 # easiest way to do so is to augment the spec with a constraint containing a
@@ -34,7 +34,7 @@ if ! rapids-is-release-build; then
     alpha_spec=',>=0.0.0a0'
 fi
 
-if [[ ${package_name} == "dask_cudf" ]]; then
+if [[ ${package_name} == "dask-cudf" ]]; then
     sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}

diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_dir="python/dask_cudf"
 
-./ci/build_wheel.sh dask_cudf ${package_dir}
+./ci/build_wheel.sh dask-cudf ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -51,7 +51,6 @@ dependencies:
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.2.*
 - make
-- mimesis>=4.1.0
 - moto>=4.0.8
 - msgpack-python
 - myst-nb

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -50,7 +50,6 @@ dependencies:
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.2.*
 - make
-- mimesis>=4.1.0
 - moto>=4.0.8
 - msgpack-python
 - myst-nb

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -603,7 +603,6 @@ dependencies:
           - cramjam
           - fastavro>=0.22.9
           - hypothesis
-          - mimesis>=4.1.0
           - pytest-benchmark
           - pytest-cases
           - python-snappy>=0.6.0
@@ -755,7 +754,12 @@ dependencies:
           - ipython
           - openpyxl
   notebook_cuda_version:
-    common:
-      - output_types: conda
-        packages:
-          - cuda-version=12.0
+    specific:
+       - output_types: conda
+         matrices:
+           - matrix: {cuda: "12.0"}
+             packages:
+               - cuda-version=12.0
+           - matrix: {cuda: "11.8"}
+             packages:
+               - cuda-version=11.8
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -1836,7 +1836,7 @@ def __array_function__(self, func, types, args, kwargs):
             return NotImplemented
 
     @classmethod
-    def from_pandas(cls, index, nan_as_null=no_default):
+    def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
         """
         Convert from a Pandas Index.
 

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -99,9 +99,14 @@ def _has_nulls(self):
 
     @_cudf_nvtx_annotate
     def serialize(self):
+        # TODO: See if self._data can be serialized outright
         header = {
             "type-serialized": pickle.dumps(type(self)),
             "column_names": pickle.dumps(tuple(self._data.names)),
+            "column_rangeindex": pickle.dumps(self._data.rangeindex),
+            "column_multiindex": pickle.dumps(self._data.multiindex),
+            "column_label_dtype": pickle.dumps(self._data.label_dtype),
+            "column_level_names": pickle.dumps(self._data._level_names),
         }
         header["columns"], frames = serialize_columns(self._columns)
         return header, frames
@@ -112,7 +117,20 @@ def deserialize(cls, header, frames):
         cls_deserialize = pickle.loads(header["type-serialized"])
         column_names = pickle.loads(header["column_names"])
         columns = deserialize_columns(header["columns"], frames)
-        return cls_deserialize._from_data(dict(zip(column_names, columns)))
+        kwargs = {}
+        for metadata in [
+            "rangeindex",
+            "multiindex",
+            "label_dtype",
+            "level_names",
+        ]:
+            key = f"column_{metadata}"
+            if key in header:
+                kwargs[metadata] = pickle.loads(header[key])
+        col_accessor = ColumnAccessor(
+            data=dict(zip(column_names, columns)), **kwargs
+        )
+        return cls_deserialize._from_data(col_accessor)
 
     @classmethod
     @_cudf_nvtx_annotate

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -1601,7 +1601,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_pandas(cls, multiindex, nan_as_null=no_default):
+    def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
         """
         Convert from a Pandas MultiIndex
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -57,7 +57,6 @@
     TimeDeltaColumn,
     arange,
     as_column,
-    column,
     full,
 )
 from cudf.core.column.categorical import (
@@ -202,7 +201,6 @@ def __getitem__(self, arg):
 
     @_cudf_nvtx_annotate
     def __setitem__(self, key, value):
-        from cudf.core.column import column
 
         if isinstance(key, tuple):
             key = list(key)
@@ -264,7 +262,7 @@ def __setitem__(self, key, value):
                 self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype)
             )
         ):
-            value = column.as_column(value)
+            value = as_column(value)
 
         if (
             (
@@ -568,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None):
         4      14
         dtype: int64
         """
-        col = column.as_column(data).set_mask(mask)
+        col = as_column(data).set_mask(mask)
         return cls(data=col)
 
     @_cudf_nvtx_annotate
@@ -593,73 +591,33 @@ def __init__(
                 "to silence this warning.",
                 FutureWarning,
             )
-        if isinstance(data, pd.Series):
-            if name is None:
-                name = data.name
-            if isinstance(data.index, pd.MultiIndex):
-                index = cudf.from_pandas(data.index)
-            else:
-                index = as_index(data.index)
-        elif isinstance(data, pd.Index):
-            if name is None:
-                name = data.name
-            data = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
-        elif isinstance(data, BaseIndex):
-            if name is None:
-                name = data.name
-            data = data._values
-            if dtype is not None:
-                data = data.astype(dtype)
+        index_from_data = None
+        name_from_data = None
+        if data is None:
+            data = {}
+
+        if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)):
+            if copy:
+                data = data.copy(deep=True)
+            name_from_data = data.name
+            column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
+            if isinstance(data, (pd.Series, Series)):
+                index_from_data = as_index(data.index)
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
                 "ColumnAccessor"
             )
-
-        if isinstance(data, Series):
-            if index is not None:
-                data = data.reindex(index)
-            else:
-                index = data._index
-            if name is None:
-                name = data.name
-            data = data._column
-            if copy:
-                data = data.copy(deep=True)
-            if dtype is not None:
-                data = data.astype(dtype)
-
-        if isinstance(data, dict):
+        elif isinstance(data, dict):
             if not data:
-                current_index = RangeIndex(0)
+                column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
+                index_from_data = RangeIndex(0)
             else:
-                current_index = data.keys()
-            if index is not None:
-                series = Series(
-                    list(data.values()),
-                    nan_as_null=nan_as_null,
-                    dtype=dtype,
-                    index=current_index,
-                )
-                new_index = as_index(index)
-                if not series.index.equals(new_index):
-                    series = series.reindex(new_index)
-                data = series._column
-                index = series._index
-            else:
-                data = column.as_column(
+                column = as_column(
                     list(data.values()), nan_as_null=nan_as_null, dtype=dtype
                 )
-                index = current_index
-        if data is None:
-            if index is not None:
-                data = column.column_empty(
-                    row_count=len(index), dtype=None, masked=True
-                )
-            else:
-                data = {}
-
-        if not isinstance(data, ColumnBase):
+                index_from_data = as_index(list(data.keys()))
+        else:
             # Using `getattr_static` to check if
             # `data` is on device memory and perform
             # a deep copy later. This is different
@@ -677,25 +635,42 @@ def __init__(
                 )
                 is property
             )
-            data = column.as_column(
+            column = as_column(
                 data,
                 nan_as_null=nan_as_null,
                 dtype=dtype,
                 length=len(index) if index is not None else None,
             )
             if copy and has_cai:
-                data = data.copy(deep=True)
-        else:
-            if dtype is not None:
-                data = data.astype(dtype)
+                column = column.copy(deep=True)
 
-        if index is not None and not isinstance(index, BaseIndex):
-            index = as_index(index)
+        assert isinstance(column, ColumnBase)
+
+        if dtype is not None:
+            column = column.astype(dtype)
 
-        assert isinstance(data, ColumnBase)
+        if name_from_data is not None and name is None:
+            name = name_from_data
 
-        super().__init__({name: data})
-        self._index = RangeIndex(len(data)) if index is None else index
+        if index is not None:
+            index = as_index(index)
+
+        if index_from_data is not None:
+            first_index = index_from_data
+            second_index = index
+        elif index is None:
+            first_index = RangeIndex(len(column))
+            second_index = None
+        else:
+            first_index = index
+            second_index = None
+
+        super().__init__({name: column}, index=first_index)
+        if second_index is not None:
+            # TODO: Is there a better way to do this?
+            reindexed = self.reindex(index=second_index, copy=False)
+            self._data = reindexed._data
+            self._index = second_index
         self._check_data_index_length_match()
 
     @classmethod
@@ -717,7 +692,7 @@ def __contains__(self, item):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_pandas(cls, s, nan_as_null=no_default):
+    def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
         """
         Convert from a Pandas Series.
 
@@ -760,7 +735,7 @@ def from_pandas(cls, s, nan_as_null=no_default):
                 False if cudf.get_option("mode.pandas_compatible") else None
             )
         with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
+            warnings.simplefilter("ignore", FutureWarning)
             result = cls(s, nan_as_null=nan_as_null)
         return result
 
@@ -5250,16 +5225,16 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
         b = b.reindex(a.index)
         index = as_index(a.index)
 
-    a_col = column.as_column(a)
+    a_col = as_column(a)
     a_array = cupy.asarray(a_col.data_array_view(mode="read"))
 
-    b_col = column.as_column(b)
+    b_col = as_column(b)
     b_array = cupy.asarray(b_col.data_array_view(mode="read"))
 
     result = cupy.isclose(
         a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan
     )
-    result_col = column.as_column(result)
+    result_col = as_column(result)
 
     if a_col.null_count and b_col.null_count:
         a_nulls = a_col.isnull()

diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import warnings
 
@@ -161,7 +161,7 @@ def to_numeric(arg, errors="raise", downcast=None):
                     break
 
     if isinstance(arg, (cudf.Series, pd.Series)):
-        return cudf.Series(col)
+        return cudf.Series(col, index=arg.index, name=arg.name)
     else:
         if col.has_nulls():
             # To match pandas, always return a floating type filled with nan.