Merge branch 'branch-24.08' into cudf-polars-str-contains
wence- authored Jun 13, 2024
2 parents 5533e5b + af09d3e commit ee42757
Showing 28 changed files with 333 additions and 54 deletions.
4 changes: 1 addition & 3 deletions ci/build_cpp.sh
@@ -13,12 +13,10 @@ export CMAKE_GENERATOR=Ninja
 
 rapids-print-env
 
-version=$(rapids-generate-version)
-
 rapids-logger "Begin cpp build"
 
 # With boa installed conda build forward to boa
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
   conda/recipes/libcudf
 
 rapids-upload-conda-to-s3 cpp
2 changes: 1 addition & 1 deletion ci/build_docs.sh
@@ -14,7 +14,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key docs \
+  --file-key docs \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs
2 changes: 1 addition & 1 deletion ci/check_style.sh
@@ -10,7 +10,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key checks \
+  --file-key checks \
  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks
2 changes: 1 addition & 1 deletion ci/configure_cpp_static.sh
@@ -12,7 +12,7 @@ REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt"
 
 rapids-dependency-file-generator \
   --output requirements \
-  --file_key test_static_build \
+  --file-key test_static_build \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}"
 
 python -m pip install -r "${REQUIREMENTS_FILE}"
2 changes: 1 addition & 1 deletion ci/test_cpp_common.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_cpp \
+  --file-key test_cpp \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
2 changes: 1 addition & 1 deletion ci/test_java.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_java \
+  --file-key test_java \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
2 changes: 1 addition & 1 deletion ci/test_notebooks.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_notebooks \
+  --file-key test_notebooks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
2 changes: 1 addition & 1 deletion ci/test_python_common.sh
@@ -13,7 +13,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_python \
+  --file-key test_python \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
42 changes: 32 additions & 10 deletions cpp/src/interop/from_arrow.cu
@@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
     }
   }
   case arrow::Type::STRING: return data_type(type_id::STRING);
+  case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
   case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
   case arrow::Type::LIST: return data_type(type_id::LIST);
   case arrow::Type::DECIMAL: {
@@ -276,21 +277,42 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
   rmm::device_async_resource_ref mr)
 {
   if (array.length() == 0) { return make_empty_column(type_id::STRING); }
-  auto str_array = static_cast<arrow::StringArray const*>(&array);
-  auto offset_array = std::make_unique<arrow::Int32Array>(
-    str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
-  auto char_array = std::make_unique<arrow::Int8Array>(
-    str_array->value_data()->size(), str_array->value_data(), nullptr);
+  std::unique_ptr<column> offsets_column;
+  std::unique_ptr<arrow::Array> char_array;
 
-  auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
-    *offset_array, data_type(type_id::INT32), true, stream, mr);
-  auto chars_column = dispatch_to_cudf_column{}.operator()<int8_t>(
-    *char_array, data_type(type_id::INT8), true, stream, mr);
+  if (array.type_id() == arrow::Type::LARGE_STRING) {
+    auto str_array = static_cast<arrow::LargeStringArray const*>(&array);
+    auto offset_array = std::make_unique<arrow::Int64Array>(
+      str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
+    offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
+      *offset_array, data_type(type_id::INT64), true, stream, mr);
+    char_array = std::make_unique<arrow::Int8Array>(
+      str_array->value_data()->size(), str_array->value_data(), nullptr);
+  } else if (array.type_id() == arrow::Type::STRING) {
+    auto str_array = static_cast<arrow::StringArray const*>(&array);
+    auto offset_array = std::make_unique<arrow::Int32Array>(
+      str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
+    offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
+      *offset_array, data_type(type_id::INT32), true, stream, mr);
+    char_array = std::make_unique<arrow::Int8Array>(
+      str_array->value_data()->size(), str_array->value_data(), nullptr);
+  } else {
+    throw std::runtime_error("Unsupported array type");
+  }
+
+  rmm::device_buffer chars(char_array->length(), stream, mr);
+  auto data_buffer = char_array->data()->buffers[1];
+  CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
+                                reinterpret_cast<uint8_t const*>(data_buffer->address()),
+                                chars.size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
 
   auto const num_rows = offsets_column->size() - 1;
   auto out_col = make_strings_column(num_rows,
                                      std::move(offsets_column),
-                                     std::move(chars_column->release().data.release()[0]),
+                                     std::move(chars),
                                      array.null_count(),
                                      std::move(*get_mask_buffer(array, stream, mr)));
 
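
For context: Arrow's string and large_string layouts differ only in the width of the offsets buffer (int32 vs. int64), which is why the LARGE_STRING branch above reads the offsets as an Int64Array before handing the character data to the same copy path. A minimal pyarrow sketch (illustrative only, not part of the change):

    import pyarrow as pa

    small = pa.array(["", "abc", "def"])   # pa.string(): 32-bit offsets
    large = small.cast(pa.large_string())  # same values, 64-bit offsets

    assert small.type == pa.string()
    assert large.type == pa.large_string()
    # The character data is identical; only the offset width changes,
    # so both map to libcudf's single STRING type (type_id::STRING).
    assert small.to_pylist() == large.to_pylist() == ["", "abc", "def"]
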
18 changes: 13 additions & 5 deletions cpp/src/interop/to_arrow.cu
@@ -306,11 +306,19 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
     static_cast<std::size_t>(sview.chars_size(stream))},
     ar_mr,
     stream);
-  return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
-                                              offset_buffer,
-                                              data_buffer,
-                                              fetch_mask_buffer(input_view, ar_mr, stream),
-                                              static_cast<int64_t>(input_view.null_count()));
+  if (sview.offsets().type().id() == cudf::type_id::INT64) {
+    return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
+                                                     offset_buffer,
+                                                     data_buffer,
+                                                     fetch_mask_buffer(input_view, ar_mr, stream),
+                                                     static_cast<int64_t>(input_view.null_count()));
+  } else {
+    return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
+                                                offset_buffer,
+                                                data_buffer,
+                                                fetch_mask_buffer(input_view, ar_mr, stream),
+                                                static_cast<int64_t>(input_view.null_count()));
+  }
 }
 
 template <>
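
Together with the from_arrow change, this makes the conversion round-trip preserve the Arrow string type: a column holding 64-bit offsets now exports as a LargeStringArray. A sketch mirroring the new Python test further down (assumes a CUDA-capable environment with cudf installed):

    import pyarrow as pa
    import cudf

    arr = pa.array(["a", "b", "c"]).cast(pa.large_string())
    s = cudf.Series(arr)
    assert arr.equals(s.to_arrow())  # type and data both preserved
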
42 changes: 39 additions & 3 deletions cpp/tests/interop/from_arrow_test.cpp
@@ -50,13 +50,36 @@ std::unique_ptr<cudf::table> get_cudf_table()
     {true, false, true, true, true});
   columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
   columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
-    {true, false, true, false, true}, {true, false, true, true, false})
+    {true, false, true, false, true}, {true, false, true, true, false}).release());
+  columns.emplace_back(cudf::test::strings_column_wrapper(
+    {
+      "",
+      "abc",
+      "def",
+      "1",
+      "2",
+    },
+    {0, 1, 1, 1, 1})
+    .release());
   // columns.emplace_back(cudf::test::lists_column_wrapper<int>({{1, 2}, {3, 4}, {}, {6}, {7, 8,
   // 9}}).release());
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
+std::shared_ptr<arrow::LargeStringArray> get_arrow_large_string_array(
+  std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
+{
+  std::shared_ptr<arrow::LargeStringArray> large_string_array;
+  arrow::LargeStringBuilder large_string_builder;
+
+  CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(),
+               "Failed to append values to string builder");
+  CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(),
+               "Failed to create arrow string array");
+
+  return large_string_array;
+}
+
 struct FromArrowTest : public cudf::test::BaseFixture {};
 
 template <typename T>
@@ -294,6 +317,15 @@ TEST_F(FromArrowTest, ChunkedArray)
       "ccc",
     },
     {0, 1});
+  auto large_string_array_1 = get_arrow_large_string_array(
+    {
+      "",
+      "abc",
+      "def",
+      "1",
+      "2",
+    },
+    {0, 1, 1, 1, 1});
   auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1});
   auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3});
 
@@ -307,21 +339,25 @@ TEST_F(FromArrowTest, ChunkedArray)
   auto boolean_array =
     get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
   auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
+  auto large_string_chunked_array = std::make_shared<arrow::ChunkedArray>(
+    std::vector<std::shared_ptr<arrow::Array>>{large_string_array_1});
 
   std::vector<std::shared_ptr<arrow::Field>> schema_vector(
     {arrow::field("a", int32_chunked_array->type()),
      arrow::field("b", int64array->type()),
      arrow::field("c", string_array_1->type()),
      arrow::field("d", dict_chunked_array->type()),
-     arrow::field("e", boolean_chunked_array->type())});
+     arrow::field("e", boolean_chunked_array->type()),
+     arrow::field("f", large_string_array_1->type())});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
   auto arrow_table = arrow::Table::Make(schema,
                                         {int32_chunked_array,
                                          int64_chunked_array,
                                          string_chunked_array,
                                          dict_chunked_array,
-                                         boolean_chunked_array});
+                                         boolean_chunked_array,
+                                         large_string_chunked_array});
 
   auto expected_cudf_table = get_cudf_table();
 
6 changes: 0 additions & 6 deletions python/cudf/cudf/core/column/column.py
@@ -334,12 +334,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
             )
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
-        elif pa.types.is_large_string(array.type):
-            # Pandas-2.2+: Pandas defaults to `large_string` type
-            # instead of `string` without data-introspection.
-            # Temporary workaround until cudf has native
-            # support for `LARGE_STRING` i.e., 64 bit offsets
-            array = array.cast(pa.string())
 
         data = pa.table([array], [None])
 
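
With LARGE_STRING now handled natively in the interop layer, this Python-level downcast is redundant; the old `array.cast(pa.string())` would also fail once the character data exceeded the 32-bit offset range. A short sketch of the now-direct path (assumes cudf is installed):

    import pyarrow as pa
    import cudf

    # pandas 2.2+ defaults to large_string; this now reaches libcudf
    # without an intermediate cast to pa.string().
    s = cudf.Series(pa.array(["a", "b"], type=pa.large_string()))
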
11 changes: 7 additions & 4 deletions python/cudf/cudf/tests/test_series.py
@@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data):
     assert_eq(result, expected)
 
 
-def test_series_from_large_string():
-    pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string())
-    got = cudf.Series(pa_large_string_array)
-    expected = pd.Series(pa_large_string_array)
+@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string])
+def test_series_from_large_string(pa_type):
+    pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type())
+    got = cudf.Series(pa_string_array)
+    expected = pd.Series(pa_string_array)
 
     assert_eq(expected, got)
 
+    assert pa_string_array.equals(got.to_arrow())
+
 
 @pytest.mark.parametrize(
     "scalar",
1 change: 0 additions & 1 deletion python/cudf/pyproject.toml
@@ -119,7 +119,6 @@ skip = [
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
-commit-file = "cudf/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4",
1 change: 0 additions & 1 deletion python/cudf_kafka/pyproject.toml
@@ -99,7 +99,6 @@ regex = "(?P<value>.*)"
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
-commit-file = "cudf_kafka/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4",
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -70,7 +70,7 @@ def num_columns(self) -> int:
     @cached_property
     def num_rows(self) -> int:
         """Number of rows."""
-        return self.table.num_rows()
+        return 0 if len(self.columns) == 0 else self.table.num_rows()
 
     @classmethod
     def from_cudf(cls, df: cudf.DataFrame) -> Self:
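
The guard covers tables with no columns, where the underlying table may not be able to infer a row count. A standalone sketch of the pattern (hypothetical class, not the real wrapper):

    from functools import cached_property

    class Frame:
        def __init__(self, columns):
            self.columns = columns

        @cached_property
        def num_rows(self):
            # With no columns there is nothing to measure: report 0
            # rather than asking the (empty) underlying table.
            return 0 if len(self.columns) == 0 else len(self.columns[0])

    assert Frame([]).num_rows == 0
    assert Frame([[1, 2, 3]]).num_rows == 3
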
19 changes: 11 additions & 8 deletions python/cudf_polars/cudf_polars/dsl/expr.py
@@ -134,14 +134,14 @@ def is_equal(self, other: Any) -> bool:
         True if the two expressions are equal, false otherwise.
         """
         if type(self) is not type(other):
-            return False
+            return False  # pragma: no cover; __eq__ trips first
         return self._ctor_arguments(self.children) == other._ctor_arguments(
             other.children
         )
 
     def __eq__(self, other: Any) -> bool:
         """Equality of expressions."""
-        if type(self) != type(other) or hash(self) != hash(other):
+        if type(self) is not type(other) or hash(self) != hash(other):
             return False
         else:
             return self.is_equal(other)
@@ -196,7 +196,9 @@ def do_evaluate(
         are returned during translation to the IR, but for now we
         are not perfect.
         """
-        raise NotImplementedError(f"Evaluation of {type(self).__name__}")
+        raise NotImplementedError(
+            f"Evaluation of expression {type(self).__name__}"
+        )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
         self,
@@ -266,7 +268,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         """
         raise NotImplementedError(
             f"Collecting aggregation info for {type(self).__name__}"
-        )
+        )  # pragma: no cover; check_agg trips first
 
 
 class NamedExpr:
@@ -287,7 +289,7 @@ def __hash__(self) -> int:
 
     def __repr__(self) -> str:
         """Repr of the expression."""
-        return f"NamedExpr({self.name}, {self.value}"
+        return f"NamedExpr({self.name}, {self.value})"
 
     def __eq__(self, other: Any) -> bool:
         """Equality of two expressions."""
@@ -740,8 +742,9 @@ def do_evaluate(
         elif self.name == pl_expr.StringFunction.StartsWith:
             column, suffix = columns
             return Column(plc.strings.find.starts_with(column.obj, suffix.obj))
-        else:
-            raise NotImplementedError(f"StringFunction {self.name}")
+        raise NotImplementedError(
+            f"StringFunction {self.name}"
+        )  # pragma: no cover; handled by init raising
 
 
 class Sort(Expr):
@@ -851,7 +854,7 @@ def do_evaluate(
             obj = plc.replace.replace_nulls(
                 indices.obj,
                 plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type()))
+                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
                 ),
             )
         else:
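
Besides the `# pragma: no cover` annotations and the missing closing parenthesis in NamedExpr.__repr__, the substantive fix here is comparing types by identity rather than equality. A tiny illustration (not from the codebase):

    class A: ...
    class B(A): ...

    # Class objects are unique, so identity gives an exact-type check;
    # pycodestyle flags `type(x) != type(y)` (E721) in favour of `is not`.
    assert type(A()) is type(A())
    assert type(B()) is not type(A())
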
8 changes: 6 additions & 2 deletions python/cudf_polars/cudf_polars/dsl/ir.py
@@ -165,6 +165,10 @@ class PythonScan(IR):
     predicate: expr.NamedExpr | None
     """Filter to apply to the constructed dataframe before returning it."""
 
+    def __post_init__(self):
+        """Validate preconditions."""
+        raise NotImplementedError("PythonScan not implemented")
+
 
 @dataclasses.dataclass(slots=True)
 class Scan(IR):
@@ -933,10 +937,10 @@ class Union(IR):
     """Optional slice to apply after concatenation."""
 
     def __post_init__(self) -> None:
-        """Validated preconditions."""
+        """Validate preconditions."""
         schema = self.dfs[0].schema
         if not all(s.schema == schema for s in self.dfs[1:]):
-            raise ValueError("Schema mismatch")
+            raise NotImplementedError("Schema mismatch")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
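
Raising NotImplementedError from `__post_init__` rejects a plan node as soon as it is constructed during translation, which is how cudf_polars signals that a query is not (yet) supported. A condensed, hypothetical sketch of the pattern (not the real IR classes):

    import dataclasses

    @dataclasses.dataclass(slots=True)
    class PythonScan:
        options: tuple

        def __post_init__(self):
            raise NotImplementedError("PythonScan not implemented")

    try:
        PythonScan(options=())
    except NotImplementedError:
        pass  # the caller treats the whole query as unsupported
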
(Diff truncated: the remaining changed files in this commit are not shown here.)