Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Lists of Columns for Various Files #10463

Merged
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 10 additions & 22 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def hash_partition(source_table, object columns_to_hash,
int num_partitions, bool keep_index=True):
def hash_partition(list source_columns, object columns_to_hash,
int num_partitions):
cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
cdef int c_num_partitions = num_partitions
cdef table_view c_source_view = table_view_from_table(
source_table, not keep_index
)
cdef table_view c_source_view = table_view_from_columns(source_columns)

cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
with nogil:
Expand All @@ -36,27 +34,17 @@ def hash_partition(source_table, object columns_to_hash,
)

# Note that the offsets (`c_result.second`) may be empty when
# the original table (`source_table`) is empty. We need to
# the original table (`source_columns`) is empty. We need to
# return a list of zeros in this case.
return (
*data_from_unique_ptr(
move(c_result.first),
column_names=source_table._column_names,
index_names=(
source_table._index_names
if keep_index is True
else None
)

),
list(c_result.second) if c_result.second.size()
else [0] * num_partitions
columns_from_unique_ptr(move(c_result.first)),
list(c_result.second)
if c_result.second.size() else [0] * num_partitions
)


def hash(source_table, str method, int seed=0):
cdef table_view c_source_view = table_view_from_table(
source_table, ignore_index=True)
def hash(list source_columns, str method, int seed=0):
cdef table_view c_source_view = table_view_from_columns(source_columns)
cdef unique_ptr[column] c_result
cdef cpp_hash_id c_hash_function
if method == "murmur3":
Expand Down
65 changes: 22 additions & 43 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import cudf

Expand All @@ -20,12 +20,12 @@ from cudf._lib.cpp.interop cimport (
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def from_dlpack(dlpack_capsule):
"""
Converts a DLPack Tensor PyCapsule into a cudf Frame object.
Converts a DLPack Tensor PyCapsule into a list of columns.

DLPack Tensor PyCapsule is expected to have the name "dltensor".
"""
Expand All @@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule):
cpp_from_dlpack(dlpack_tensor)
)

res = data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
)
res = columns_from_unique_ptr(move(c_result))
isVoid marked this conversation as resolved.
Show resolved Hide resolved
dlpack_tensor.deleter(dlpack_tensor)
return res


def to_dlpack(source_table):
def to_dlpack(list source_columns):
"""
Converts a cudf Frame into a DLPack Tensor PyCapsule.
Converts a list of columns into a DLPack Tensor PyCapsule.

DLPack Tensor PyCapsule will have the name "dltensor".
"""
for column in source_table._columns:
if column.null_count:
raise ValueError(
"Cannot create a DLPack tensor with null values. \
Input is required to have null count as zero."
)
if any(column.null_count for column in source_columns):
raise ValueError(
"Cannot create a DLPack tensor with null values. \
Input is required to have null count as zero."
)

cdef DLManagedTensor *dlpack_tensor
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index=True
)
cdef table_view source_table_view = table_view_from_columns(source_columns)

with nogil:
dlpack_tensor = cpp_to_dlpack(
Expand Down Expand Up @@ -110,27 +104,22 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *:
raise ValueError("Malformed metadata has been encountered")


def to_arrow(input_table,
object metadata,
bool keep_index=True):
"""Convert from cudf Frame to PyArrow Table.
def to_arrow(list source_columns, object metadata):
"""Convert a list of columns from
cudf Frame to a PyArrow Table.

Parameters
----------
input_table : cudf table
column_names : names for the pyarrow arrays
field_names : field names for nested type arrays
keep_index : whether index needs to be part of arrow table
source_columns : a list of columns to convert
metadata : a list of metadata, see `gather_metadata` for layout

Returns
-------
pyarrow table
"""

cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata)
cdef table_view input_table_view = (
table_view_from_table(input_table, not keep_index)
)
cdef table_view input_table_view = table_view_from_columns(source_columns)

cdef shared_ptr[CTable] cpp_arrow_table
with nogil:
Expand All @@ -141,22 +130,16 @@ def to_arrow(input_table,
return pyarrow_wrap_table(cpp_arrow_table)


def from_arrow(
object input_table,
object column_names=None,
object index_names=None
):
"""Convert from PyArrow Table to cudf Frame.
def from_arrow(object input_table):
"""Convert from PyArrow Table to a list of columns.

Parameters
----------
input_table : PyArrow table
column_names : names for the cudf table data columns
index_names : names for the cudf table index columns

isVoid marked this conversation as resolved.
Show resolved Hide resolved
Returns
-------
cudf Frame
A list of columns to construct Frame object
"""
cdef shared_ptr[CTable] cpp_arrow_table = (
pyarrow_unwrap_table(input_table)
Expand All @@ -166,8 +149,4 @@ def from_arrow(
with nogil:
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return data_from_unique_ptr(
move(c_result),
column_names=column_names,
index_names=index_names
)
return columns_from_unique_ptr(move(c_result))
51 changes: 19 additions & 32 deletions python/cudf/cudf/_lib/join.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from itertools import chain

Expand All @@ -16,31 +16,25 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.utils cimport table_view_from_table
from cudf._lib.utils cimport table_view_from_columns

# The functions below return the *gathermaps* that represent
# the join result when joining on the keys `lhs` and `rhs`.

cpdef join(lhs, rhs, how=None):
cpdef join(list lhs, list rhs, how=None):
cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
cdef table_view c_lhs = table_view_from_table(lhs)
cdef table_view c_rhs = table_view_from_table(rhs)
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "inner":
c_result = move(cpp_join.inner_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.inner_join(c_lhs, c_rhs))
elif how == "left":
c_result = move(cpp_join.left_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.left_join(c_lhs, c_rhs))
elif how == "outer":
c_result = move(cpp_join.full_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.full_join(c_lhs, c_rhs))
else:
raise ValueError(f"Invalid join type {how}")

Expand All @@ -49,30 +43,23 @@ cpdef join(lhs, rhs, how=None):
return left_rows, right_rows


cpdef semi_join(lhs, rhs, how=None):
cpdef semi_join(list lhs, list rhs, how=None):
# left-semi and left-anti joins
cdef cpp_join.gather_map_type c_result
cdef table_view c_lhs = table_view_from_table(lhs)
cdef table_view c_rhs = table_view_from_table(rhs)
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "leftsemi":
c_result = move(cpp_join.left_semi_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs))
elif how == "leftanti":
c_result = move(cpp_join.left_anti_join(
c_lhs,
c_rhs
))
with nogil:
c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs))
else:
raise ValueError(f"Invalid join type {how}")

cdef Column left_rows = _gather_map_as_column(move(c_result))
return (
left_rows,
None
)
return left_rows, None


cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):
Expand Down
21 changes: 9 additions & 12 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ from cudf.core.dtypes import ListDtype

from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of
from cudf._lib.cpp.lists.extract cimport extract_list_element
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def count_elements(Column col):
Expand All @@ -61,20 +61,18 @@ def count_elements(Column col):
return result


def explode_outer(tbl, int explode_column_idx, bool ignore_index=False):
cdef table_view c_table_view = table_view_from_table(tbl, ignore_index)
def explode_outer(
list source_columns, int explode_column_idx
):
cdef table_view c_table_view = table_view_from_columns(source_columns)
cdef size_type c_explode_column_idx = explode_column_idx

cdef unique_ptr[table] c_result

with nogil:
c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx))

return data_from_unique_ptr(
move(c_result),
column_names=tbl._column_names,
index_names=None if ignore_index else tbl._index_names
)
return columns_from_unique_ptr(move(c_result))


def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal):
Expand Down Expand Up @@ -197,18 +195,17 @@ def index_of(Column col, object py_search_key):
return Column.from_unique_ptr(move(c_result))


def concatenate_rows(tbl):
def concatenate_rows(list source_columns):
cdef unique_ptr[column] c_result

cdef table_view c_table_view = table_view_from_table(tbl)
cdef table_view c_table_view = table_view_from_columns(source_columns)

with nogil:
c_result = move(cpp_concatenate_rows(
c_table_view,
))

result = Column.from_unique_ptr(move(c_result))
return result
return Column.from_unique_ptr(move(c_result))


def concatenate_list_elements(Column input_column, dropna=False):
Expand Down
22 changes: 6 additions & 16 deletions python/cudf/cudf/_lib/partitioning.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.partitioning cimport partition as cpp_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count

cimport cudf._lib.cpp.types as libcudf_types


def partition(source_table, Column partition_map,
object num_partitions, bool keep_index=True):
def partition(list source_columns, Column partition_map,
object num_partitions):

if num_partitions is None:
num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
cdef int c_num_partitions = num_partitions
cdef table_view c_source_view = table_view_from_table(
source_table, not keep_index
)
cdef table_view c_source_view = table_view_from_columns(source_columns)

cdef column_view c_partition_map_view = partition_map.view()

Expand All @@ -40,13 +38,5 @@ def partition(source_table, Column partition_map,
)

return (
*data_from_unique_ptr(
move(c_result.first),
column_names=source_table._column_names,
index_names=source_table._index_names if(
keep_index is True)
else None

),
list(c_result.second)
columns_from_unique_ptr(move(c_result.first)), list(c_result.second)
)
Loading