Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate column factories to pylibcudf #15257

Merged
Merged
Show file tree
Hide file tree
Changes from 52 commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
70386b8
begin column factorires
brandon-b-miller Mar 8, 2024
38ca43d
updates
brandon-b-miller Mar 8, 2024
6055670
progress
brandon-b-miller Mar 13, 2024
535c812
moving things around re: fused types
brandon-b-miller Mar 19, 2024
b5c888d
compiles
brandon-b-miller Mar 19, 2024
4b4d7b3
add back the rest of the column factories
brandon-b-miller Mar 19, 2024
7857097
cleanup
brandon-b-miller Mar 19, 2024
5b41c2e
Merge branch 'branch-24.04' into pylibcudf-column-factories
brandon-b-miller Mar 20, 2024
4f78361
add TypeId back in
brandon-b-miller Mar 20, 2024
b1bce3e
Merge branch 'branch-24.06' into pylibcudf-column-factories
brandon-b-miller Apr 11, 2024
ba490b1
fix up make_empty_column
brandon-b-miller Apr 11, 2024
d0eb39f
test_make_empty_column
brandon-b-miller Apr 17, 2024
eec143c
Merge branch 'branch-24.06' into pylibcudf-column-factories
brandon-b-miller Apr 17, 2024
d0e1ed5
few more tests
brandon-b-miller Apr 17, 2024
dfdbc77
add more make_numeric_column tests
brandon-b-miller Apr 17, 2024
3271b41
more tests
brandon-b-miller Apr 18, 2024
20b6208
fix decimal tests
brandon-b-miller Apr 18, 2024
688cd36
add error cases, refactor
brandon-b-miller Apr 18, 2024
8a929ec
use fixture
brandon-b-miller Apr 18, 2024
220d615
ids=repr
brandon-b-miller Apr 19, 2024
54b2b43
Merge branch 'branch-24.06' into pylibcudf-column-factories
brandon-b-miller Apr 22, 2024
a80a69f
dont cimport pylibcudf in column.pyx
brandon-b-miller Apr 22, 2024
98f2acf
Merge branch 'branch-24.06' into pylibcudf-column-factories
brandon-b-miller Apr 25, 2024
07f83d7
update expected error in cpp tests
brandon-b-miller Apr 25, 2024
30259d5
remove unused struct
brandon-b-miller Apr 30, 2024
ddce37d
fix minor doc issue
brandon-b-miller Apr 30, 2024
62d54c0
remove extra reference to struct
brandon-b-miller Apr 30, 2024
e8f1003
guard against invalid mask arguments
brandon-b-miller Apr 30, 2024
68c9039
move maskstate namespace
brandon-b-miller Apr 30, 2024
fd91f30
reorganize import flow
brandon-b-miller Apr 30, 2024
d2873c1
Apply suggestions from code review
brandon-b-miller Apr 30, 2024
9a0e833
renaming
brandon-b-miller Apr 30, 2024
db2c10d
Merge branch 'branch-24.06' into pylibcudf-column-factories
brandon-b-miller May 3, 2024
db8c8b3
short circuit empty type in cpp factories
brandon-b-miller May 6, 2024
2ef1498
update fixed point tests
brandon-b-miller May 7, 2024
fc4b23e
merge latest, resolve conflicts, fix
brandon-b-miller May 17, 2024
5cf04d3
error if a mask is passed for now
brandon-b-miller May 17, 2024
427cf3c
Merge branch 'branch-24.06' into pylibcudf-column-factories
brandon-b-miller May 21, 2024
836eb65
create interop to/from arrow for DataType
brandon-b-miller May 22, 2024
2ba382b
refactor
brandon-b-miller May 22, 2024
c4f874f
plumbing, fixes
brandon-b-miller May 22, 2024
9cadc1c
to_arrow updates
brandon-b-miller May 22, 2024
3a478be
small test fixes
brandon-b-miller May 22, 2024
b3e934c
use explicit mappings
brandon-b-miller May 22, 2024
10b07b8
dont validate the values themselves
brandon-b-miller May 22, 2024
1e73dfe
Update python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factorie…
brandon-b-miller May 22, 2024
092a2a5
listify parameterization
brandon-b-miller May 22, 2024
abda755
style
brandon-b-miller May 22, 2024
f09afa1
Merge branch 'branch-24.08' into pylibcudf-column-factories
brandon-b-miller May 30, 2024
bffe500
fix up to_arrow for datatype and add some tests
brandon-b-miller May 30, 2024
1dfbea4
Apply suggestions from code review
brandon-b-miller Jun 3, 2024
498a002
style fix
brandon-b-miller Jun 3, 2024
8c67671
Translate date32
wence- Jun 4, 2024
dba2ab3
Minor fixes
wence- Jun 4, 2024
463ed02
Fix whitespace
wence- Jun 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions cpp/src/column/column_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type)
std::unique_ptr<column> make_empty_column(data_type type)
{
CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type),
"make_empty_column is invalid to call on nested types");
"make_empty_column is invalid to call on nested types",
cudf::data_type_error);
return std::make_unique<column>(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
}

Expand All @@ -80,7 +81,9 @@ std::unique_ptr<column> make_numeric_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type),
"Invalid, non-numeric type.",
cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -100,7 +103,7 @@ std::unique_ptr<column> make_fixed_point_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -120,7 +123,7 @@ std::unique_ptr<column> make_timestamp_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -140,7 +143,7 @@ std::unique_ptr<column> make_duration_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -160,7 +163,9 @@ std::unique_ptr<column> make_fixed_width_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type),
"Invalid, non-fixed-width type.",
cudf::data_type_error);

// clang-format off
if (is_timestamp (type)) return make_timestamp_column (type, size, state, stream, mr);
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/column/factories_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow)
auto column = cudf::make_numeric_column(
cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED);
};
EXPECT_THROW(construct(), cudf::logic_error);
EXPECT_THROW(construct(), cudf::data_type_error);
}

INSTANTIATE_TEST_CASE_P(NonNumeric,
Expand Down Expand Up @@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow)
auto column = cudf::make_fixed_width_column(
cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED);
};
EXPECT_THROW(construct(), cudf::logic_error);
EXPECT_THROW(construct(), cudf::data_type_error);
}

INSTANTIATE_TEST_CASE_P(NonFixedWidth,
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/fixed_point/fixed_point_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper)
TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID)
{
EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0),
cudf::logic_error);
cudf::data_type_error);
}

TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
================
column_factories
================

.. automodule:: cudf._lib.pylibcudf.column_factories
:members:
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf.
aggregation
binaryop
column
column_factories
concatenate
copying
filling
Expand Down
21 changes: 6 additions & 15 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,10 @@ from cudf._lib.types cimport (
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf._lib.types import dtype_from_pylibcudf_column

# TODO: We currently need this for "casting" empty pylibcudf columns in
# from_pylibcudf by instead creating an empty numeric column. We will be able
# to remove this once column factories are exposed to pylibcudf.

cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying
cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary
from cudf._lib.pylibcudf cimport Column as plc_Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
make_column_from_scalar as cpp_make_column_from_scalar,
Expand Down Expand Up @@ -623,22 +619,17 @@ cdef class Column:
pylibcudf.Column
A new pylibcudf.Column referencing the same data.
"""
cdef libcudf_types.data_type new_dtype
if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS:
col = pylibcudf.unary.cast(
col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS)
)
elif col.type().id() == pylibcudf.TypeId.EMPTY:
new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8)
# TODO: This function call is what requires cimporting pylibcudf.
# We can remove the cimport once we can directly do
# pylibcudf.column_factories.make_numeric_column or equivalent.
col = plc_Column.from_libcudf(
move(
make_numeric_column(
new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL
)
)
new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8)

col = pylibcudf.column_factories.make_numeric_column(
new_dtype,
col.size(),
pylibcudf.column_factories.MaskState.ALL_NULL
)

dtype = dtype_from_pylibcudf_column(col)
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(cython_sources
aggregation.pyx
binaryop.pyx
column.pyx
column_factories.pyx
concatenate.pyx
copying.pyx
filling.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from . cimport (
aggregation,
binaryop,
column_factories,
concatenate,
copying,
filling,
Expand Down Expand Up @@ -39,6 +40,7 @@ __all__ = [
"binaryop",
"concatenate",
"copying",
"column_factories",
"filling",
"gpumemoryview",
"groupby",
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from . import (
aggregation,
binaryop,
column_factories,
concatenate,
copying,
filling,
Expand All @@ -26,7 +27,7 @@
from .gpumemoryview import gpumemoryview
from .scalar import Scalar
from .table import Table
from .types import DataType, TypeId
from .types import DataType, MaskState, TypeId

__all__ = [
"Column",
Expand All @@ -38,6 +39,7 @@
"binaryop",
"concatenate",
"copying",
"column_factories",
"filling",
"gpumemoryview",
"groupby",
Expand Down
52 changes: 52 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type

from .column cimport Column
from .types cimport DataType, size_type, type_id

# Fused type for the single argument of make_empty_column. Cython specializes
# the function for each member: a pylibcudf DataType, a type_id value, or a
# generic Python object.
ctypedef fused MakeEmptyColumnOperand:
DataType
type_id
object

# Fused type for the `mask` argument of the make_*_column factories declared
# in this file: either a mask_state value or a generic Python object.
# NOTE(review): which Python objects are actually accepted is decided by the
# implementation, which is not visible here -- confirm there.
ctypedef fused MaskArg:
mask_state
object


# Declaration of make_empty_column.
#   type_or_id: a DataType, a type_id, or a Python object
#               (see MakeEmptyColumnOperand).
#   Returns a Column.
# NOTE(review): presumably wraps libcudf's make_empty_column -- confirm in the
# matching implementation file.
cpdef Column make_empty_column(
MakeEmptyColumnOperand type_or_id
)

# Declaration of make_numeric_column.
#   type_: DataType of the column to create.
#   size:  size of the column, as a size_type.
#   mask:  a mask_state or a Python object (see MaskArg).
#   Returns a Column.
# NOTE(review): presumably wraps libcudf's make_numeric_column -- confirm in
# the matching implementation file.
cpdef Column make_numeric_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_fixed_point_column.
#   type_: DataType of the column to create.
#   size:  size of the column, as a size_type.
#   mask:  a mask_state or a Python object (see MaskArg).
#   Returns a Column.
# NOTE(review): presumably wraps libcudf's make_fixed_point_column -- confirm
# in the matching implementation file.
cpdef Column make_fixed_point_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_timestamp_column.
#   type_: DataType of the column to create.
#   size:  size of the column, as a size_type.
#   mask:  a mask_state or a Python object (see MaskArg).
#   Returns a Column.
# NOTE(review): presumably wraps libcudf's make_timestamp_column -- confirm in
# the matching implementation file.
cpdef Column make_timestamp_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_duration_column.
#   type_: DataType of the column to create.
#   size:  size of the column, as a size_type.
#   mask:  a mask_state or a Python object (see MaskArg).
#   Returns a Column.
# NOTE(review): presumably wraps libcudf's make_duration_column -- confirm in
# the matching implementation file.
cpdef Column make_duration_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_fixed_width_column.
#   type_: DataType of the column to create.
#   size:  size of the column, as a size_type.
#   mask:  a mask_state or a Python object (see MaskArg).
#   Returns a Column.
# NOTE(review): presumably wraps libcudf's make_fixed_width_column -- confirm
# in the matching implementation file.
cpdef Column make_fixed_width_column(
DataType type_,
size_type size,
MaskArg mask,
)
Loading
Loading