Merge branch 'branch-24.08' into cudf-polars-str-contains
wence- authored Jun 13, 2024
2 parents 5533e5b + af09d3e commit ee42757
Showing 28 changed files with 333 additions and 54 deletions.
4 changes: 1 addition & 3 deletions ci/build_cpp.sh
@@ -13,12 +13,10 @@ export CMAKE_GENERATOR=Ninja
 
 rapids-print-env
 
-version=$(rapids-generate-version)
-
 rapids-logger "Begin cpp build"
 
 # With boa installed conda build forward to boa
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
   conda/recipes/libcudf
 
 rapids-upload-conda-to-s3 cpp
2 changes: 1 addition & 1 deletion ci/build_docs.sh
@@ -14,7 +14,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key docs \
+  --file-key docs \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs
2 changes: 1 addition & 1 deletion ci/check_style.sh
@@ -10,7 +10,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key checks \
+  --file-key checks \
  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks
2 changes: 1 addition & 1 deletion ci/configure_cpp_static.sh
@@ -12,7 +12,7 @@ REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt"
 
 rapids-dependency-file-generator \
   --output requirements \
-  --file_key test_static_build \
+  --file-key test_static_build \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}"
 
 python -m pip install -r "${REQUIREMENTS_FILE}"
2 changes: 1 addition & 1 deletion ci/test_cpp_common.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_cpp \
+  --file-key test_cpp \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
2 changes: 1 addition & 1 deletion ci/test_java.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_java \
+  --file-key test_java \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
2 changes: 1 addition & 1 deletion ci/test_notebooks.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_notebooks \
+  --file-key test_notebooks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
2 changes: 1 addition & 1 deletion ci/test_python_common.sh
@@ -13,7 +13,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_python \
+  --file-key test_python \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
42 changes: 32 additions & 10 deletions cpp/src/interop/from_arrow.cu
@@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
     }
   }
   case arrow::Type::STRING: return data_type(type_id::STRING);
+  case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
   case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
   case arrow::Type::LIST: return data_type(type_id::LIST);
   case arrow::Type::DECIMAL: {
@@ -276,21 +277,42 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
   rmm::device_async_resource_ref mr)
 {
   if (array.length() == 0) { return make_empty_column(type_id::STRING); }
-  auto str_array = static_cast<arrow::StringArray const*>(&array);
-  auto offset_array = std::make_unique<arrow::Int32Array>(
-    str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
-  auto char_array = std::make_unique<arrow::Int8Array>(
-    str_array->value_data()->size(), str_array->value_data(), nullptr);
+  std::unique_ptr<column> offsets_column;
+  std::unique_ptr<arrow::Array> char_array;
 
-  auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
-    *offset_array, data_type(type_id::INT32), true, stream, mr);
-  auto chars_column = dispatch_to_cudf_column{}.operator()<int8_t>(
-    *char_array, data_type(type_id::INT8), true, stream, mr);
+  if (array.type_id() == arrow::Type::LARGE_STRING) {
+    auto str_array = static_cast<arrow::LargeStringArray const*>(&array);
+    auto offset_array = std::make_unique<arrow::Int64Array>(
+      str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
+    offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
+      *offset_array, data_type(type_id::INT64), true, stream, mr);
+    char_array = std::make_unique<arrow::Int8Array>(
+      str_array->value_data()->size(), str_array->value_data(), nullptr);
+  } else if (array.type_id() == arrow::Type::STRING) {
+    auto str_array = static_cast<arrow::StringArray const*>(&array);
+    auto offset_array = std::make_unique<arrow::Int32Array>(
+      str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
+    offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
+      *offset_array, data_type(type_id::INT32), true, stream, mr);
+    char_array = std::make_unique<arrow::Int8Array>(
+      str_array->value_data()->size(), str_array->value_data(), nullptr);
+  } else {
+    throw std::runtime_error("Unsupported array type");
+  }
+
+  rmm::device_buffer chars(char_array->length(), stream, mr);
+  auto data_buffer = char_array->data()->buffers[1];
+  CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
+                                reinterpret_cast<uint8_t const*>(data_buffer->address()),
+                                chars.size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
 
   auto const num_rows = offsets_column->size() - 1;
   auto out_col = make_strings_column(num_rows,
                                      std::move(offsets_column),
-                                     std::move(chars_column->release().data.release()[0]),
+                                     std::move(chars),
                                      array.null_count(),
                                      std::move(*get_mask_buffer(array, stream, mr)));
 
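
For context: Arrow's string and large_string layouts differ only in the width of the offsets buffer (int32 vs. int64), which is why the LARGE_STRING branch above reads the offsets as an Int64Array before handing the character data to the same copy path. A minimal pyarrow sketch (illustrative only, not part of the change):

    import pyarrow as pa

    small = pa.array(["", "abc", "def"])   # pa.string(): 32-bit offsets
    large = small.cast(pa.large_string())  # same values, 64-bit offsets

    assert small.type == pa.string()
    assert large.type == pa.large_string()
    # The character data is identical; only the offset width changes,
    # so both map to libcudf's single STRING type (type_id::STRING).
    assert small.to_pylist() == large.to_pylist() == ["", "abc", "def"]
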
18 changes: 13 additions & 5 deletions cpp/src/interop/to_arrow.cu
@@ -306,11 +306,19 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
     static_cast<std::size_t>(sview.chars_size(stream))},
     ar_mr,
     stream);
-  return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
-                                              offset_buffer,
-                                              data_buffer,
-                                              fetch_mask_buffer(input_view, ar_mr, stream),
-                                              static_cast<int64_t>(input_view.null_count()));
+  if (sview.offsets().type().id() == cudf::type_id::INT64) {
+    return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
+                                                     offset_buffer,
+                                                     data_buffer,
+                                                     fetch_mask_buffer(input_view, ar_mr, stream),
+                                                     static_cast<int64_t>(input_view.null_count()));
+  } else {
+    return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
+                                                offset_buffer,
+                                                data_buffer,
+                                                fetch_mask_buffer(input_view, ar_mr, stream),
+                                                static_cast<int64_t>(input_view.null_count()));
+  }
 }
 
 template <>
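
Together with the from_arrow change, this makes the conversion round-trip preserve the Arrow string type: a column holding 64-bit offsets now exports as a LargeStringArray. A sketch mirroring the new Python test further down (assumes a CUDA-capable environment with cudf installed):

    import pyarrow as pa
    import cudf

    arr = pa.array(["a", "b", "c"]).cast(pa.large_string())
    s = cudf.Series(arr)
    assert arr.equals(s.to_arrow())  # type and data both preserved
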
42 changes: 39 additions & 3 deletions cpp/tests/interop/from_arrow_test.cpp
@@ -50,13 +50,36 @@ std::unique_ptr<cudf::table> get_cudf_table()
     {true, false, true, true, true});
   columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
   columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
-    {true, false, true, false, true}, {true, false, true, true, false})
+    {true, false, true, false, true}, {true, false, true, true, false}).release());
+  columns.emplace_back(cudf::test::strings_column_wrapper(
+    {
+      "",
+      "abc",
+      "def",
+      "1",
+      "2",
+    },
+    {0, 1, 1, 1, 1})
+    .release());
   // columns.emplace_back(cudf::test::lists_column_wrapper<int>({{1, 2}, {3, 4}, {}, {6}, {7, 8,
   // 9}}).release());
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
+std::shared_ptr<arrow::LargeStringArray> get_arrow_large_string_array(
+  std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
+{
+  std::shared_ptr<arrow::LargeStringArray> large_string_array;
+  arrow::LargeStringBuilder large_string_builder;
+
+  CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(),
+               "Failed to append values to string builder");
+  CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(),
+               "Failed to create arrow string array");
+
+  return large_string_array;
+}
+
 struct FromArrowTest : public cudf::test::BaseFixture {};
 
 template <typename T>
@@ -294,6 +317,15 @@ TEST_F(FromArrowTest, ChunkedArray)
       "ccc",
     },
     {0, 1});
+  auto large_string_array_1 = get_arrow_large_string_array(
+    {
+      "",
+      "abc",
+      "def",
+      "1",
+      "2",
+    },
+    {0, 1, 1, 1, 1});
   auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1});
   auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3});
 
@@ -307,21 +339,25 @@ TEST_F(FromArrowTest, ChunkedArray)
   auto boolean_array =
     get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
   auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
+  auto large_string_chunked_array = std::make_shared<arrow::ChunkedArray>(
+    std::vector<std::shared_ptr<arrow::Array>>{large_string_array_1});
 
   std::vector<std::shared_ptr<arrow::Field>> schema_vector(
     {arrow::field("a", int32_chunked_array->type()),
      arrow::field("b", int64array->type()),
      arrow::field("c", string_array_1->type()),
      arrow::field("d", dict_chunked_array->type()),
-     arrow::field("e", boolean_chunked_array->type())});
+     arrow::field("e", boolean_chunked_array->type()),
+     arrow::field("f", large_string_array_1->type())});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
   auto arrow_table = arrow::Table::Make(schema,
                                         {int32_chunked_array,
                                          int64_chunked_array,
                                          string_chunked_array,
                                          dict_chunked_array,
-                                         boolean_chunked_array});
+                                         boolean_chunked_array,
+                                         large_string_chunked_array});
 
   auto expected_cudf_table = get_cudf_table();
 
6 changes: 0 additions & 6 deletions python/cudf/cudf/core/column/column.py
@@ -334,12 +334,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
             )
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
-        elif pa.types.is_large_string(array.type):
-            # Pandas-2.2+: Pandas defaults to `large_string` type
-            # instead of `string` without data-introspection.
-            # Temporary workaround until cudf has native
-            # support for `LARGE_STRING` i.e., 64 bit offsets
-            array = array.cast(pa.string())
 
         data = pa.table([array], [None])
 
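
With LARGE_STRING now handled natively in the interop layer, this Python-level downcast is redundant; the old `array.cast(pa.string())` would also fail once the character data exceeded the 32-bit offset range. A short sketch of the now-direct path (assumes cudf is installed):

    import pyarrow as pa
    import cudf

    # pandas 2.2+ defaults to large_string; this now reaches libcudf
    # without an intermediate cast to pa.string().
    s = cudf.Series(pa.array(["a", "b"], type=pa.large_string()))
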
11 changes: 7 additions & 4 deletions python/cudf/cudf/tests/test_series.py
@@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data):
     assert_eq(result, expected)
 
 
-def test_series_from_large_string():
-    pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string())
-    got = cudf.Series(pa_large_string_array)
-    expected = pd.Series(pa_large_string_array)
+@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string])
+def test_series_from_large_string(pa_type):
+    pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type())
+    got = cudf.Series(pa_string_array)
+    expected = pd.Series(pa_string_array)
 
     assert_eq(expected, got)
 
+    assert pa_string_array.equals(got.to_arrow())
+
 
 @pytest.mark.parametrize(
     "scalar",
1 change: 0 additions & 1 deletion python/cudf/pyproject.toml
@@ -119,7 +119,6 @@ skip = [
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
-commit-file = "cudf/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4",
1 change: 0 additions & 1 deletion python/cudf_kafka/pyproject.toml
@@ -99,7 +99,6 @@ regex = "(?P<value>.*)"
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
-commit-file = "cudf_kafka/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4",
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -70,7 +70,7 @@ def num_columns(self) -> int:
     @cached_property
     def num_rows(self) -> int:
         """Number of rows."""
-        return self.table.num_rows()
+        return 0 if len(self.columns) == 0 else self.table.num_rows()
 
     @classmethod
     def from_cudf(cls, df: cudf.DataFrame) -> Self:
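
The guard covers tables with no columns, where the underlying table may not be able to infer a row count. A standalone sketch of the pattern (hypothetical class, not the real wrapper):

    from functools import cached_property

    class Frame:
        def __init__(self, columns):
            self.columns = columns

        @cached_property
        def num_rows(self):
            # With no columns there is nothing to measure: report 0
            # rather than asking the (empty) underlying table.
            return 0 if len(self.columns) == 0 else len(self.columns[0])

    assert Frame([]).num_rows == 0
    assert Frame([[1, 2, 3]]).num_rows == 3
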
19 changes: 11 additions & 8 deletions python/cudf_polars/cudf_polars/dsl/expr.py
@@ -134,14 +134,14 @@ def is_equal(self, other: Any) -> bool:
         True if the two expressions are equal, false otherwise.
         """
         if type(self) is not type(other):
-            return False
+            return False  # pragma: no cover; __eq__ trips first
         return self._ctor_arguments(self.children) == other._ctor_arguments(
             other.children
         )
 
     def __eq__(self, other: Any) -> bool:
         """Equality of expressions."""
-        if type(self) != type(other) or hash(self) != hash(other):
+        if type(self) is not type(other) or hash(self) != hash(other):
             return False
         else:
             return self.is_equal(other)
@@ -196,7 +196,9 @@ def do_evaluate(
         are returned during translation to the IR, but for now we
         are not perfect.
         """
-        raise NotImplementedError(f"Evaluation of {type(self).__name__}")
+        raise NotImplementedError(
+            f"Evaluation of expression {type(self).__name__}"
+        )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
         self,
@@ -266,7 +268,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         """
         raise NotImplementedError(
             f"Collecting aggregation info for {type(self).__name__}"
-        )
+        )  # pragma: no cover; check_agg trips first
 
 
 class NamedExpr:
@@ -287,7 +289,7 @@ def __hash__(self) -> int:
 
     def __repr__(self) -> str:
         """Repr of the expression."""
-        return f"NamedExpr({self.name}, {self.value}"
+        return f"NamedExpr({self.name}, {self.value})"
 
     def __eq__(self, other: Any) -> bool:
         """Equality of two expressions."""
@@ -740,8 +742,9 @@ def do_evaluate(
         elif self.name == pl_expr.StringFunction.StartsWith:
             column, suffix = columns
             return Column(plc.strings.find.starts_with(column.obj, suffix.obj))
-        else:
-            raise NotImplementedError(f"StringFunction {self.name}")
+        raise NotImplementedError(
+            f"StringFunction {self.name}"
+        )  # pragma: no cover; handled by init raising
 
 
 class Sort(Expr):
@@ -851,7 +854,7 @@ def do_evaluate(
             obj = plc.replace.replace_nulls(
                 indices.obj,
                 plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type()))
+                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
                 ),
             )
         else:
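
Besides the `# pragma: no cover` annotations and the missing closing parenthesis in NamedExpr.__repr__, the substantive fix here is comparing types by identity rather than equality. A tiny illustration (not from the codebase):

    class A: ...
    class B(A): ...

    # Class objects are unique, so identity gives an exact-type check;
    # pycodestyle flags `type(x) != type(y)` (E721) in favour of `is not`.
    assert type(A()) is type(A())
    assert type(B()) is not type(A())
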
8 changes: 6 additions & 2 deletions python/cudf_polars/cudf_polars/dsl/ir.py
@@ -165,6 +165,10 @@ class PythonScan(IR):
     predicate: expr.NamedExpr | None
     """Filter to apply to the constructed dataframe before returning it."""
 
+    def __post_init__(self):
+        """Validate preconditions."""
+        raise NotImplementedError("PythonScan not implemented")
+
 
 @dataclasses.dataclass(slots=True)
 class Scan(IR):
@@ -933,10 +937,10 @@ class Union(IR):
     """Optional slice to apply after concatenation."""
 
     def __post_init__(self) -> None:
-        """Validated preconditions."""
+        """Validate preconditions."""
         schema = self.dfs[0].schema
         if not all(s.schema == schema for s in self.dfs[1:]):
-            raise ValueError("Schema mismatch")
+            raise NotImplementedError("Schema mismatch")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
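
Raising NotImplementedError from `__post_init__` rejects a plan node as soon as it is constructed during translation, which is how cudf_polars signals that a query is not (yet) supported. A condensed, hypothetical sketch of the pattern (not the real IR classes):

    import dataclasses

    @dataclasses.dataclass(slots=True)
    class PythonScan:
        options: tuple

        def __post_init__(self):
            raise NotImplementedError("PythonScan not implemented")

    try:
        PythonScan(options=())
    except NotImplementedError:
        pass  # the caller treats the whole query as unsupported
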
(Diff truncated: the remaining changed files in this commit are not shown here.)