Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf._lib.labeling in favor of inlining pylibcudf #17346

Merged
merged 1 commit on
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ set(cython_sources
interop.pyx
join.pyx
json.pyx
labeling.pyx
lists.pyx
merge.pyx
null_mask.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
interop,
join,
json,
labeling,
merge,
null_mask,
nvtext,
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/column.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

from __future__ import annotations

from typing import Literal

from typing_extensions import Self

import pylibcudf as plc

from cudf._typing import Dtype, DtypeObj, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
Expand Down Expand Up @@ -71,3 +75,8 @@ class Column:
# TODO: The val parameter should be Scalar, not ScalarLike
# Static constructor: accepts a scalar-like value and an integer size and
# returns a ColumnBase.
# NOTE(review): presumably yields a column of `size` copies of `val` --
# confirm against the Cython implementation.
@staticmethod
def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
# Static constructor: accepts a pylibcudf column (plc.Column) and returns a
# cudf ColumnBase.
# NOTE(review): `data_ptr_exposed` defaults to False; its exact meaning is
# defined by the Cython implementation -- confirm before relying on it.
@staticmethod
def from_pylibcudf(
col: plc.Column, data_ptr_exposed: bool = False
) -> ColumnBase: ...
# Returns this column as a pylibcudf column (plc.Column). `mode` is required
# and must be the literal "read" or "write".
# NOTE(review): presumably `mode` selects read-only vs. writable access to
# the underlying data -- confirm against the Cython implementation.
def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
24 changes: 0 additions & 24 deletions python/cudf/cudf/_lib/labeling.pyx

This file was deleted.

39 changes: 23 additions & 16 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,18 @@
import pandas as pd
import pyarrow as pa

import pylibcudf as plc

import cudf
from cudf import _lib as libcudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
get_compatible_timezone,
get_tz_data,
)
from cudf.core.buffer import Buffer
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column import ColumnBase, as_column, column, string
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
Expand Down Expand Up @@ -818,13 +819,16 @@ def _find_ambiguous_and_nonexistent(
# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
self,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
self.to_pylibcudf(mode="read"),
ambiguous_begin.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES,
ambiguous_end.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.NO,
)
ambiguous = libcudf.column.Column.from_pylibcudf(plc_column)
ambiguous = ambiguous.notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
Expand All @@ -834,13 +838,16 @@ def _find_ambiguous_and_nonexistent(
# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
self,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
self.to_pylibcudf(mode="read"),
nonexistent_begin.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES,
nonexistent_end.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.NO,
)
nonexistent = libcudf.column.Column.from_pylibcudf(plc_column)
nonexistent = nonexistent.notnull()

return ambiguous, nonexistent

Expand Down
22 changes: 18 additions & 4 deletions python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
import numpy as np
import pandas as pd

import pylibcudf as plc

import cudf
from cudf._lib.column import Column
from cudf.api.types import is_list_like
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import as_column
from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
from cudf.core.index import IntervalIndex, interval_range
Expand Down Expand Up @@ -256,9 +260,19 @@ def cut(
# the input arr must be changed to the same type as the edges
input_arr = input_arr.astype(left_edges.dtype)
# get the indexes for the appropriate number
index_labels = cudf._lib.labeling.label_bins(
input_arr, left_edges, left_inclusive, right_edges, right_inclusive
)
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
input_arr.to_pylibcudf(mode="read"),
left_edges.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if left_inclusive
else plc.labeling.Inclusive.NO,
right_edges.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if right_inclusive
else plc.labeling.Inclusive.NO,
)
index_labels = Column.from_pylibcudf(plc_column)

if labels is False:
# if labels is false we return the index labels, we return them
Expand All @@ -283,7 +297,7 @@ def cut(
# should allow duplicate categories.
return interval_labels[index_labels]

index_labels = as_unsigned_codes(len(interval_labels), index_labels)
index_labels = as_unsigned_codes(len(interval_labels), index_labels) # type: ignore[arg-type]

col = CategoricalColumn(
data=None,
Expand Down
32 changes: 20 additions & 12 deletions python/cudf/cudf/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
import numpy as np
import pandas as pd

import pylibcudf as plc

import cudf
import cudf._lib.labeling
import cudf.core.index
from cudf._lib.column import Column
from cudf.core.buffer import acquire_spill_lock
from cudf.core.groupby.groupby import (
DataFrameGroupBy,
GroupBy,
Expand All @@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
)
if len(self.grouping.bin_labels) != len(result):
index = cudf.core.index.Index(
index = cudf.Index(
self.grouping.bin_labels, name=self.grouping.names[0]
)
return result._align_to_index(
Expand Down Expand Up @@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy):


class _ResampleGrouping(_Grouping):
bin_labels: cudf.core.index.Index
bin_labels: cudf.Index

def __init__(self, obj, by=None, level=None):
self._freq = getattr(by, "freq", None)
Expand Down Expand Up @@ -170,7 +172,7 @@ def deserialize(cls, header, frames):
out.names = names
out._named_columns = _named_columns
out._key_columns = key_columns
out.bin_labels = cudf.core.index.Index.deserialize(
out.bin_labels = cudf.Index.deserialize(
header["__bin_labels"], frames[-header["__bin_labels_count"] :]
)
out._freq = header["_freq"]
Expand Down Expand Up @@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by):
cast_bin_labels = bin_labels.astype(result_type)

# bin the key column:
bin_numbers = cudf._lib.labeling.label_bins(
cast_key_column,
left_edges=cast_bin_labels[:-1]._column,
left_inclusive=(closed == "left"),
right_edges=cast_bin_labels[1:]._column,
right_inclusive=(closed == "right"),
)
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
cast_key_column.to_pylibcudf(mode="read"),
cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if closed == "left"
else plc.labeling.Inclusive.NO,
cast_bin_labels[1:]._column.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if closed == "right"
else plc.labeling.Inclusive.NO,
)
bin_numbers = Column.from_pylibcudf(plc_column)

if label == "right":
cast_bin_labels = cast_bin_labels[1:]
Expand Down
Loading