Skip to content

Commit

Permalink
Use Lists of Columns for Various Files (#10463)
Browse files Browse the repository at this point in the history
This PR covers many low hanging fruits for #10153. All API accepting Frames now accepts a list of columns in the following files:

- hash.pyx
- interop.pyx
- join.pyx
- partitioning.pyx
- quantiles.pyx
- reshape.pyx
- search.pyx
- transform.pyx
- lists.pyx
- string/combine.pyx

This PR covers point 5 in the follow-ups to #9889.
Also, in `join.pyx`, gil was not released when dispatching workload to libcudf. This PR fixes that.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #10463
  • Loading branch information
isVoid authored Apr 19, 2022
1 parent 17d49fa commit 9dc728a
Show file tree
Hide file tree
Showing 22 changed files with 299 additions and 371 deletions.
32 changes: 10 additions & 22 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def hash_partition(source_table, object columns_to_hash,
int num_partitions, bool keep_index=True):
def hash_partition(list source_columns, object columns_to_hash,
int num_partitions):
cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
cdef int c_num_partitions = num_partitions
cdef table_view c_source_view = table_view_from_table(
source_table, not keep_index
)
cdef table_view c_source_view = table_view_from_columns(source_columns)

cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
with nogil:
Expand All @@ -36,27 +34,17 @@ def hash_partition(source_table, object columns_to_hash,
)

# Note that the offsets (`c_result.second`) may be empty when
# the original table (`source_table`) is empty. We need to
# the original table (`source_columns`) is empty. We need to
# return a list of zeros in this case.
return (
*data_from_unique_ptr(
move(c_result.first),
column_names=source_table._column_names,
index_names=(
source_table._index_names
if keep_index is True
else None
)

),
list(c_result.second) if c_result.second.size()
else [0] * num_partitions
columns_from_unique_ptr(move(c_result.first)),
list(c_result.second)
if c_result.second.size() else [0] * num_partitions
)


def hash(source_table, str method, int seed=0):
cdef table_view c_source_view = table_view_from_table(
source_table, ignore_index=True)
def hash(list source_columns, str method, int seed=0):
cdef table_view c_source_view = table_view_from_columns(source_columns)
cdef unique_ptr[column] c_result
cdef cpp_hash_id c_hash_function
if method == "murmur3":
Expand Down
65 changes: 22 additions & 43 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import cudf

Expand All @@ -20,12 +20,12 @@ from cudf._lib.cpp.interop cimport (
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def from_dlpack(dlpack_capsule):
"""
Converts a DLPack Tensor PyCapsule into a cudf Frame object.
Converts a DLPack Tensor PyCapsule into a list of columns.
DLPack Tensor PyCapsule is expected to have the name "dltensor".
"""
Expand All @@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule):
cpp_from_dlpack(dlpack_tensor)
)

res = data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
)
res = columns_from_unique_ptr(move(c_result))
dlpack_tensor.deleter(dlpack_tensor)
return res


def to_dlpack(source_table):
def to_dlpack(list source_columns):
"""
Converts a cudf Frame into a DLPack Tensor PyCapsule.
Converts a list of columns into a DLPack Tensor PyCapsule.
DLPack Tensor PyCapsule will have the name "dltensor".
"""
for column in source_table._columns:
if column.null_count:
raise ValueError(
"Cannot create a DLPack tensor with null values. \
Input is required to have null count as zero."
)
if any(column.null_count for column in source_columns):
raise ValueError(
"Cannot create a DLPack tensor with null values. \
Input is required to have null count as zero."
)

cdef DLManagedTensor *dlpack_tensor
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index=True
)
cdef table_view source_table_view = table_view_from_columns(source_columns)

with nogil:
dlpack_tensor = cpp_to_dlpack(
Expand Down Expand Up @@ -110,27 +104,22 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *:
raise ValueError("Malformed metadata has been encountered")


def to_arrow(input_table,
object metadata,
bool keep_index=True):
"""Convert from cudf Frame to PyArrow Table.
def to_arrow(list source_columns, object metadata):
"""Convert a list of columns from
cudf Frame to a PyArrow Table.
Parameters
----------
input_table : cudf table
column_names : names for the pyarrow arrays
field_names : field names for nested type arrays
keep_index : whether index needs to be part of arrow table
source_columns : a list of columns to convert
metadata : a list of metadata, see `gather_metadata` for layout
Returns
-------
pyarrow table
"""

cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata)
cdef table_view input_table_view = (
table_view_from_table(input_table, not keep_index)
)
cdef table_view input_table_view = table_view_from_columns(source_columns)

cdef shared_ptr[CTable] cpp_arrow_table
with nogil:
Expand All @@ -141,22 +130,16 @@ def to_arrow(input_table,
return pyarrow_wrap_table(cpp_arrow_table)


def from_arrow(
object input_table,
object column_names=None,
object index_names=None
):
"""Convert from PyArrow Table to cudf Frame.
def from_arrow(object input_table):
"""Convert from PyArrow Table to a list of columns.
Parameters
----------
input_table : PyArrow table
column_names : names for the cudf table data columns
index_names : names for the cudf table index columns
Returns
-------
cudf Frame
A list of columns to construct Frame object
"""
cdef shared_ptr[CTable] cpp_arrow_table = (
pyarrow_unwrap_table(input_table)
Expand All @@ -166,8 +149,4 @@ def from_arrow(
with nogil:
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return data_from_unique_ptr(
move(c_result),
column_names=column_names,
index_names=index_names
)
return columns_from_unique_ptr(move(c_result))
51 changes: 19 additions & 32 deletions python/cudf/cudf/_lib/join.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from itertools import chain

Expand All @@ -16,31 +16,25 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.utils cimport table_view_from_table
from cudf._lib.utils cimport table_view_from_columns

# The functions below return the *gathermaps* that represent
# the join result when joining on the keys `lhs` and `rhs`.

cpdef join(lhs, rhs, how=None):
cpdef join(list lhs, list rhs, how=None):
cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
cdef table_view c_lhs = table_view_from_table(lhs)
cdef table_view c_rhs = table_view_from_table(rhs)
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "inner":
c_result = move(cpp_join.inner_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.inner_join(c_lhs, c_rhs))
elif how == "left":
c_result = move(cpp_join.left_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.left_join(c_lhs, c_rhs))
elif how == "outer":
c_result = move(cpp_join.full_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.full_join(c_lhs, c_rhs))
else:
raise ValueError(f"Invalid join type {how}")

Expand All @@ -49,30 +43,23 @@ cpdef join(lhs, rhs, how=None):
return left_rows, right_rows


cpdef semi_join(lhs, rhs, how=None):
cpdef semi_join(list lhs, list rhs, how=None):
# left-semi and left-anti joins
cdef cpp_join.gather_map_type c_result
cdef table_view c_lhs = table_view_from_table(lhs)
cdef table_view c_rhs = table_view_from_table(rhs)
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "leftsemi":
c_result = move(cpp_join.left_semi_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs))
elif how == "leftanti":
c_result = move(cpp_join.left_anti_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs))
else:
raise ValueError(f"Invalid join type {how}")

cdef Column left_rows = _gather_map_as_column(move(c_result))
return (
left_rows,
None
)
return left_rows, None


cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):
Expand Down
21 changes: 9 additions & 12 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ from cudf.core.dtypes import ListDtype

from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of
from cudf._lib.cpp.lists.extract cimport extract_list_element
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def count_elements(Column col):
Expand All @@ -61,20 +61,18 @@ def count_elements(Column col):
return result


def explode_outer(tbl, int explode_column_idx, bool ignore_index=False):
cdef table_view c_table_view = table_view_from_table(tbl, ignore_index)
def explode_outer(
list source_columns, int explode_column_idx
):
cdef table_view c_table_view = table_view_from_columns(source_columns)
cdef size_type c_explode_column_idx = explode_column_idx

cdef unique_ptr[table] c_result

with nogil:
c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx))

return data_from_unique_ptr(
move(c_result),
column_names=tbl._column_names,
index_names=None if ignore_index else tbl._index_names
)
return columns_from_unique_ptr(move(c_result))


def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal):
Expand Down Expand Up @@ -197,18 +195,17 @@ def index_of(Column col, object py_search_key):
return Column.from_unique_ptr(move(c_result))


def concatenate_rows(tbl):
def concatenate_rows(list source_columns):
cdef unique_ptr[column] c_result

cdef table_view c_table_view = table_view_from_table(tbl)
cdef table_view c_table_view = table_view_from_columns(source_columns)

with nogil:
c_result = move(cpp_concatenate_rows(
c_table_view,
))

result = Column.from_unique_ptr(move(c_result))
return result
return Column.from_unique_ptr(move(c_result))


def concatenate_list_elements(Column input_column, dropna=False):
Expand Down
22 changes: 6 additions & 16 deletions python/cudf/cudf/_lib/partitioning.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.partitioning cimport partition as cpp_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count

cimport cudf._lib.cpp.types as libcudf_types


def partition(source_table, Column partition_map,
object num_partitions, bool keep_index=True):
def partition(list source_columns, Column partition_map,
object num_partitions):

if num_partitions is None:
num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
cdef int c_num_partitions = num_partitions
cdef table_view c_source_view = table_view_from_table(
source_table, not keep_index
)
cdef table_view c_source_view = table_view_from_columns(source_columns)

cdef column_view c_partition_map_view = partition_map.view()

Expand All @@ -40,13 +38,5 @@ def partition(source_table, Column partition_map,
)

return (
*data_from_unique_ptr(
move(c_result.first),
column_names=source_table._column_names,
index_names=source_table._index_names if(
keep_index is True)
else None

),
list(c_result.second)
columns_from_unique_ptr(move(c_result.first)), list(c_result.second)
)
Loading

0 comments on commit 9dc728a

Please sign in to comment.