Merge remote-tracking branch 'upstream/branch-24.02' into cccl-2.2.0

rapidsai · Dec 11, 2023 · 218992e
2 parents 1f4cefe + 759a1c8
commit 218992e
Show file tree

Hide file tree

Showing 13 changed files with 33 additions and 244 deletions.
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
@@ -2137,7 +2137,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
       }
     }
   }
-  stripe_dicts.host_to_device_async(stream);
+  // Synchronize to ensure the copy is complete before we clear `map_slots`
+  stripe_dicts.host_to_device_sync(stream);
 
   gpu::collect_map_entries(stripe_dicts, stream);
   gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream);

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
@@ -205,8 +205,9 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t)
       }
     }
   }
+
+  auto const total_len = block_reduce(reduce_storage).Sum(len);
   __syncthreads();
-  auto const total_len   = block_reduce(reduce_storage).Sum(len);
   auto const total_valid = block_reduce(reduce_storage).Sum(num_valid);
 
   if (t == 0) {

diff --git a/pyproject.toml b/pyproject.toml
@@ -39,7 +39,6 @@ ignore_missing_imports = true
 follow_imports = "skip"
 exclude = [
     "cudf/_lib/",
-    "cudf/cudf/benchmarks/",
     "cudf/cudf/tests/",
     "cudf/cudf/utils/metadata/orc_column_statistics_pb2.py",
     "custreamz/custreamz/tests/",

diff --git a/python/cudf/cudf/benchmarks/README.md b/python/cudf/cudf/benchmarks/README.md
diff --git a/python/cudf/cudf/benchmarks/bench_cudf_io.py b/python/cudf/cudf/benchmarks/bench_cudf_io.py
diff --git a/python/cudf/cudf/benchmarks/conftest.py b/python/cudf/cudf/benchmarks/conftest.py
diff --git a/python/cudf/cudf/benchmarks/get_datasets.py b/python/cudf/cudf/benchmarks/get_datasets.py
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -17,11 +17,10 @@
 from cudf import _lib as libcudf
 from cudf._lib.transform import bools_to_mask
 from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
-from cudf.api.types import is_interval_dtype
 from cudf.core.buffer import Buffer
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
-from cudf.core.dtypes import CategoricalDtype
+from cudf.core.dtypes import CategoricalDtype, IntervalDtype
 from cudf.utils.dtypes import (
     is_mixed_with_object_dtype,
     min_signed_type,
@@ -997,7 +996,7 @@ def to_pandas(
             .fillna(_DEFAULT_CATEGORICAL_VALUE)
             .values_host
         )
-        if is_interval_dtype(col.categories.dtype):
+        if isinstance(col.categories.dtype, IntervalDtype):
             # leaving out dropna because it temporarily changes an interval
             # index into a struct and throws off results.
             # TODO: work on interval index dropna

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -1128,7 +1128,12 @@ def is_interval_dtype(obj):
         or (
             isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name
         )
-        or (hasattr(obj, "dtype") and is_interval_dtype(obj.dtype))
+        or (
+            isinstance(
+                getattr(obj, "dtype", None),
+                (pd.IntervalDtype, cudf.core.dtypes.IntervalDtype),
+            )
+        )
     )
 
 

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -34,7 +34,6 @@
     _is_non_decimal_numeric_dtype,
     is_dtype_equal,
     is_integer,
-    is_interval_dtype,
     is_list_like,
     is_scalar,
     is_signed_integer_dtype,
@@ -3192,7 +3191,9 @@ def __init__(
 
         if isinstance(data, IntervalColumn):
             data = data
-        elif isinstance(data, pd.Series) and (is_interval_dtype(data.dtype)):
+        elif isinstance(data, pd.Series) and isinstance(
+            data.dtype, pd.IntervalDtype
+        ):
             data = column.as_column(data, data.dtype)
         elif isinstance(data, (pd.Interval, pd.IntervalIndex)):
             data = column.as_column(

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -2661,13 +2661,17 @@ def _reindex(
             )
             for name in names
         }
+        if column_names is None:
+            level_names = self._data.level_names
+        elif isinstance(column_names, pd.Index):
+            level_names = tuple(column_names.names)
+        else:
+            level_names = None
         result = self.__class__._from_data(
             data=cudf.core.column_accessor.ColumnAccessor(
                 cols,
                 multiindex=self._data.multiindex,
-                level_names=tuple(column_names.names)
-                if isinstance(column_names, pd.Index)
-                else None,
+                level_names=level_names,
             ),
             index=index,
         )

diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
@@ -13,7 +13,6 @@
 from cudf.api.types import (
     is_categorical_dtype,
     is_decimal_dtype,
-    is_interval_dtype,
     is_list_dtype,
     is_numeric_dtype,
     is_string_dtype,
@@ -30,7 +29,7 @@ def dtype_can_compare_equal_to_other(dtype):
         or is_list_dtype(dtype)
         or is_struct_dtype(dtype)
         or is_decimal_dtype(dtype)
-        or is_interval_dtype(dtype)
+        or isinstance(dtype, cudf.IntervalDtype)
     )
 
 
@@ -235,7 +234,7 @@ def assert_column_equal(
         )
         or (
             is_numeric_dtype(left)
-            and not dtype_can_compare_equal_to_other(right)
+            and not dtype_can_compare_equal_to_other(right.dtype)
         )
     ):
         try:

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -10754,6 +10754,15 @@ def test_dataframe_series_dot():
     assert_eq(expected, actual)
 
 
+def test_dataframe_reindex_keep_colname():
+    gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo"))
+    result = gdf.reindex(index=[0, 1])
+    expected = cudf.DataFrame(
+        [1, None], columns=cudf.Index([1], name="foo"), index=[0, 1]
+    )
+    assert_eq(result, expected)
+
+
 def test_dataframe_duplicate_index_reindex():
     gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
     pdf = gdf.to_pandas()